TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
quantization.cpp
#include "quantization.h"
#include "model_macros.h"

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "logger.h"
#include "gguf_parser.h"

// 6-bit lookup tables shared by the K-quant formats: a stored index (0..63)
// selects the multiplier applied to a block's fp16 super-scale (d) or
// super-min (dmin).
constexpr float K_SCALE_VALUES[64] = {
    1.0f, 1.0625f, 1.125f, 1.1875f, 1.25f, 1.3125f, 1.375f, 1.4375f,
    1.5f, 1.5625f, 1.625f, 1.6875f, 1.75f, 1.8125f, 1.875f, 1.9375f,
    2.0f, 2.125f, 2.25f, 2.375f, 2.5f, 2.625f, 2.75f, 2.875f,
    3.0f, 3.125f, 3.25f, 3.375f, 3.5f, 3.625f, 3.75f, 3.875f,
    4.0f, 4.25f, 4.5f, 4.75f, 5.0f, 5.25f, 5.5f, 5.75f,
    6.0f, 6.25f, 6.5f, 6.75f, 7.0f, 7.25f, 7.5f, 7.75f,
    8.0f, 8.5f, 9.0f, 9.5f, 10.0f, 10.5f, 11.0f, 11.5f,
    12.0f, 12.5f, 13.0f, 13.5f, 14.0f, 14.5f, 15.0f, 15.5f};

constexpr float K_MIN_VALUES[64] = {
    0.0f, -0.0078125f, -0.015625f, -0.0234375f, -0.03125f, -0.0390625f,
    -0.046875f, -0.0546875f, -0.0625f, -0.0703125f, -0.078125f, -0.0859375f,
    -0.09375f, -0.1015625f, -0.109375f, -0.1171875f, -0.125f, -0.140625f,
    -0.15625f, -0.171875f, -0.1875f, -0.203125f, -0.21875f, -0.234375f,
    -0.25f, -0.265625f, -0.28125f, -0.296875f, -0.3125f, -0.328125f,
    -0.34375f, -0.359375f, -0.375f, -0.40625f, -0.4375f, -0.46875f,
    -0.5f, -0.53125f, -0.5625f, -0.59375f, -0.625f, -0.65625f,
    -0.6875f, -0.71875f, -0.75f, -0.78125f, -0.8125f, -0.84375f,
    -0.875f, -0.9375f, -1.0f, -1.0625f, -1.125f, -1.1875f,
    -1.25f, -1.3125f, -1.375f, -1.4375f, -1.5f, -1.5625f,
    -1.625f, -1.6875f, -1.75f, -1.8125f};

static std::atomic<int> g_vec_dot_q4_k_q8_k_log_count{0};

float fp16_to_fp32(uint16_t h, bool is_gguf_scale_field) {
  uint32_t sign = (h >> 15) & 1;
  uint32_t exp_fp16 = (h >> 10) & 0x1f;
  uint32_t mant_fp16 = h & 0x3ff;
  uint32_t x;

  if (exp_fp16 == 0) {
    if (mant_fp16 == 0) {
      // Signed zero.
      x = (sign << 31);
    } else {
      // Subnormal fp16: renormalize the mantissa into fp32 range.
      exp_fp16 = 1;
      while ((mant_fp16 & 0x400) == 0) {
        mant_fp16 <<= 1;
        exp_fp16--;
      }
      mant_fp16 &= ~0x400;
      uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
      uint32_t mant_fp32 = mant_fp16 << 13;
      x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
    }
  } else if (exp_fp16 == 0x1f) {
    // Inf / NaN.
    x = (sign << 31) | (0xff << 23) | (mant_fp16 << 13);
  } else {
    // Normalized value: rebias the exponent and widen the mantissa.
    uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
    uint32_t mant_fp32 = mant_fp16 << 13;
    x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
  }

  float f;
  std::memcpy(&f, &x, sizeof(float));

  // Scale fields read from GGUF are forced non-negative (finite values only).
  if (is_gguf_scale_field && f < 0.0f && !(std::isnan(f) || std::isinf(f))) {
    f = std::abs(f);
  }

  return f;
}

uint16_t fp32_to_fp16(float f) {
  uint32_t x;
  std::memcpy(&x, &f, sizeof(float));

  uint32_t sign = (x >> 31) & 1;
  uint32_t exp_fp32 = (x >> 23) & 0xff;
  uint32_t mant_fp32 = x & 0x7fffff;

  uint16_t u;

  if (exp_fp32 == 0xff) {
    // Inf / NaN (keep a payload bit set so NaNs stay NaNs).
    u = (sign << 15) | 0x7c00 | (mant_fp32 != 0 ? 0x200 : 0);
  } else {
    int exp_fp16 = (int)exp_fp32 - 127 + 15;

    if (exp_fp16 >= 0x1f) {
      // Overflow: saturate to infinity.
      u = (sign << 15) | 0x7c00;
    } else if (exp_fp16 <= 0) {
      if (exp_fp16 < -10) {
        // Underflow below the smallest subnormal: flush to signed zero.
        u = (sign << 15);
      } else {
        // Subnormal result: shift the implicit-1 mantissa into place.
        mant_fp32 = (mant_fp32 | 0x800000) >> (1 - exp_fp16);

        // Round half up on the highest dropped bit (bit 12).
        if ((mant_fp32 >> 12) & 1) {
          mant_fp32 += (1 << 13);
        }
        u = (sign << 15) | (mant_fp32 >> 13);
      }
    } else {
      // Normal result; round half up and handle mantissa carry-out.
      if ((mant_fp32 >> 12) & 1) {
        mant_fp32 += (1 << 13);
        if ((mant_fp32 >> 23) == 1) {
          mant_fp32 = 0;
          exp_fp16++;
          if (exp_fp16 >= 0x1f) {
            u = (sign << 15) | 0x7c00;
            return u;
          }
        }
      }
      u = (sign << 15) | (exp_fp16 << 10) | (mant_fp32 >> 13);
    }
  }
  return u;
}

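// Round-trip sketch (illustrative only; nothing in this file calls it):
//
//   uint16_t h = fp32_to_fp16(3.14159f);
//   float f = fp16_to_fp32(h);  // ~3.1406: fp16 keeps ~3 significant digits
//   float d = fp16_to_fp32(raw_scale, /*is_gguf_scale_field=*/true);
//
// `raw_scale` stands in for an fp16 scale read from a GGUF file; with the
// flag set, finite negative inputs are folded to their absolute value.
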
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d_val, uint8_t * m_val) {
  // j is the index of a 32-element group (0 to 7 within a QK_K block);
  // q points to block_q4_K::scales, a 12-byte array of packed 6-bit values.
  if (j < 4) {
    // Groups 0..3 (first 128 elements): low 6 bits of scales[0..3] hold the
    // scale, low 6 bits of scales[4..7] hold the min.
    *d_val = q[j] & 63;
    *m_val = q[j + 4] & 63;
  } else {
    // Groups 4..7 (next 128 elements): low 4 bits come from scales[8..11];
    // the top 2 bits are recovered from the high bits of scales[0..3]
    // (scale) and scales[4..7] (min).
    *d_val = (q[j+4] & 0x0F) | ((q[j-4] >> 6) << 4);
    *m_val = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
  }
}

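// Worked example (values assumed): for group j = 5 the branch above reads
//
//   *d_val = (q[9] & 0x0F) | ((q[1] >> 6) << 4);
//   *m_val = (q[9] >> 4)   | ((q[5] >> 6) << 4);
//
// i.e. the spare top 2 bits of scales[0..7] extend the nibbles packed in
// scales[8..11] back to full 6-bit values.
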
static inline void get_scale_min_indices_q4_K(
    int j,
    const uint8_t* scales,
    uint8_t* scale_index,
    uint8_t* min_index) {
  assert(j >= 0 && j < 16);

  // 16 4-bit scale indices are packed into scales[0..7]: low nibble for
  // j < 8, high nibble for j >= 8.
  *scale_index = scales[j % 8] >> (4 * (j / 8));
  *scale_index &= 0x0F;

  // Only scales[8..11] are available for min indices, so just the first 8
  // sub-blocks carry a stored min; for j >= 8 the shift moves past the byte
  // and the index decodes as 0 (K_MIN_VALUES[0] == 0.0f).
  *min_index = scales[j % 4 + 8] >> (4 * (j / 4));
  *min_index &= 0x0F;
}

void dequantize_q4_k_m(const block_q4_K* qblock, float* output,
                       int num_weights_in_block, bool log_this_block) {
  if (num_weights_in_block != GGML_QK_K) {
    std::cout
        << "Warning: dequantize_q4_k_m called with num_weights != GGML_QK_K ("
        << num_weights_in_block << ")" << std::endl;
    std::memset(output, 0, num_weights_in_block * sizeof(float));
    return;
  }

  const float d_super_scale = fp16_to_fp32(qblock->d, false);
  const float d_super_min = fp16_to_fp32(qblock->dmin, false);

  const uint8_t * q_bytes_ptr = qblock->qs; // Start of the block's quantized data
  float * y_ptr = output;                   // Start of the block's output floats

  int scale_group_idx = 0; // The 'is' index in the llama.cpp reference (0, 2, 4, 6)

  // Loop 4 times, processing 64 elements (two 32-element halves) per
  // iteration. This mirrors the outer j loop of dequantize_row_q4_K in
  // llama.cpp (j from 0 to QK_K in steps of 64).
  for (int sixtyfour_chunk_idx = 0; sixtyfour_chunk_idx < GGML_QK_K / 64; ++sixtyfour_chunk_idx) {
    uint8_t sc_val1, m_val1;
    uint8_t sc_val2, m_val2;

    // Get scales/mins for the two 32-element halves of this 64-element chunk.
    // The `j` passed to get_scale_min_k4 is the 32-element group index (0-7).
    get_scale_min_k4(scale_group_idx + 0, qblock->scales, &sc_val1, &m_val1);
    get_scale_min_k4(scale_group_idx + 1, qblock->scales, &sc_val2, &m_val2);

    const float d1 = d_super_scale * static_cast<float>(sc_val1);
    const float m1 = d_super_min * static_cast<float>(m_val1);
    const float d2 = d_super_scale * static_cast<float>(sc_val2);
    const float m2 = d_super_min * static_cast<float>(m_val2);

    // Dequantize the first 32 elements of this chunk (low nibbles).
    for (int l = 0; l < 32; ++l) {
      uint8_t quant_nibble = (q_bytes_ptr[l] & 0x0F);
      *y_ptr++ = d1 * static_cast<float>(quant_nibble) - m1;
    }

    // Dequantize the second 32 elements (high nibbles of the same bytes).
    for (int l = 0; l < 32; ++l) {
      uint8_t quant_nibble = (q_bytes_ptr[l] >> 4);
      *y_ptr++ = d2 * static_cast<float>(quant_nibble) - m2;
    }

    q_bytes_ptr += 32;    // 32 bytes consumed -> 64 quants
    scale_group_idx += 2; // Two 32-element groups consumed
  }
}

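// Usage sketch (illustrative): expand one Q4_K block back into 256 floats.
//
//   block_q4_K blk;  // e.g. copied out of a GGUF tensor
//   float out[GGML_QK_K];
//   dequantize_q4_k_m(&blk, out, GGML_QK_K, /*log_this_block=*/false);
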
void dequantize_q6_k(const block_q6_K* qblock, float* output,
                     int num_weights_in_block, bool log_this_block) {
  if (num_weights_in_block != GGML_QK_K) {
    std::cout
        << "Warning: dequantize_q6_k called with num_weights != GGML_QK_K ("
        << num_weights_in_block << ")" << std::endl;
    std::memset(output, 0, num_weights_in_block * sizeof(float));
    return;
  }

  const float d = fp16_to_fp32(qblock->d, false);

  // Pointers to the start of the whole block's data.
  const uint8_t * p_ql = qblock->ql;
  const uint8_t * p_qh = qblock->qh;
  const int8_t * p_sc = qblock->scales;
  float * p_y = output;

  // Process the 256 elements of the block. The llama.cpp code structure
  // processes it in two 128-element chunks.
  for (int half_idx = 0; half_idx < 2; ++half_idx) {
    // Set up pointers for the current 128-element half.
    const uint8_t * ql = p_ql + (half_idx * 64); // Each half uses 64 bytes of ql
    const uint8_t * qh = p_qh + (half_idx * 32); // Each half uses 32 bytes of qh
    const int8_t * sc = p_sc + (half_idx * 8);   // Each half uses 8 scales
    float * y = p_y + (half_idx * 128);          // Output pointer for this half

    // Inner loop processes 32 sets of 4 values = 128 floats.
    for (int l = 0; l < 32; ++l) {
      int is = l / 16; // Scale sub-group index (0 for l=0..15, 1 for l=16..31)

      // Extract the four 6-bit quantized values, already offset by -32.
      const int8_t q1 = (int8_t)(((ql[l + 0] & 0x0F) | (((qh[l] >> 0) & 0x03) << 4))) - 32;
      const int8_t q2 = (int8_t)(((ql[l + 32] & 0x0F) | (((qh[l] >> 2) & 0x03) << 4))) - 32;
      const int8_t q3 = (int8_t)(((ql[l + 0] >> 4) | (((qh[l] >> 4) & 0x03) << 4))) - 32;
      const int8_t q4 = (int8_t)(((ql[l + 32] >> 4) | (((qh[l] >> 6) & 0x03) << 4))) - 32;

      y[l + 0] = d * sc[is + 0] * q1;
      y[l + 32] = d * sc[is + 2] * q2;
      y[l + 64] = d * sc[is + 4] * q3;
      y[l + 96] = d * sc[is + 6] * q4;
    }
  }
}

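// Bit-layout sketch (illustrative): each 6-bit Q6_K quant q is stored as
// q = (high2 << 4) | low4, with low4 in a nibble of ql and high2 in a 2-bit
// field of qh. For half 0 and l = 0, the four values recovered above are:
//
//   q1 = ((ql[0]  & 0x0F) | (((qh[0] >> 0) & 3) << 4)) - 32   // -> y[0]
//   q2 = ((ql[32] & 0x0F) | (((qh[0] >> 2) & 3) << 4)) - 32   // -> y[32]
//   q3 = ((ql[0]  >> 4)   | (((qh[0] >> 4) & 3) << 4)) - 32   // -> y[64]
//   q4 = ((ql[32] >> 4)   | (((qh[0] >> 6) & 3) << 4)) - 32   // -> y[96]
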
void handle_i8_tensor(const void* input_data, float* output_data,
                      size_t num_elements) {
  const int8_t* input_ptr = static_cast<const int8_t*>(input_data);
  for (size_t i = 0; i < num_elements; ++i) {
    output_data[i] = static_cast<float>(input_ptr[i]);
  }
}

void quantize_q4_k_m(const float* input, void* output_qblock_void,
                     int num_elements) {
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q4_k_m currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q4_K* output_qblock = static_cast<block_q4_K*>(output_qblock_void);

  std::memset(output_qblock->scales, 0, sizeof(output_qblock->scales));
  std::memset(output_qblock->qs, 0, sizeof(output_qblock->qs));

  float block_min_val = std::numeric_limits<float>::max();
  float block_max_val = std::numeric_limits<float>::lowest();
  for (int i = 0; i < num_elements; ++i) {
    block_min_val = SAFE_MIN(block_min_val, input[i]);
    block_max_val = SAFE_MAX(block_max_val, input[i]);
  }

  if (block_max_val == block_min_val) {
    block_max_val = block_min_val + GGUF_SMALL_VAL;
  }
  if (block_max_val < GGUF_EPSILON && block_max_val > -GGUF_EPSILON) {
    block_max_val = GGUF_SMALL_VAL;
    block_min_val = 0.0f;
  }

  const float d_super_scale_candidate = (block_max_val - block_min_val) / Q4K_SCALE_FACTOR;
  const float d_super =
      d_super_scale_candidate > GGUF_EPSILON ? d_super_scale_candidate : GGUF_EPSILON;
  const float min_super = block_min_val;

  output_qblock->d = fp32_to_fp16(d_super);
  output_qblock->dmin = fp32_to_fp16(min_super);

  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    const float* sub_block_input = input + j * 16;

    float sub_min_val = sub_block_input[0];
    float sub_max_val = sub_block_input[0];
    for (int i = 1; i < 16; ++i) {
      sub_min_val = SAFE_MIN(sub_min_val, sub_block_input[i]);
      sub_max_val = SAFE_MAX(sub_max_val, sub_block_input[i]);
    }

    float ideal_scale = 0.0f;
    if (sub_max_val > sub_min_val + GGUF_EPSILON) {
      ideal_scale = (sub_max_val - sub_min_val) / Q4K_SCALE_FACTOR;
    }
    float ideal_min = sub_min_val;

    // Only the first 16 lookup entries are searched: the stored index is
    // 4 bits wide.
    uint8_t best_scale_idx = 0;
    float min_scale_err = std::numeric_limits<float>::max();
    if (d_super > GGUF_EPSILON) {
      for (uint8_t k = 0; k < 16; ++k) {
        float candidate_scale = d_super * K_SCALE_VALUES[k];
        float err = std::abs(candidate_scale - ideal_scale);
        if (err < min_scale_err) {
          min_scale_err = err;
          best_scale_idx = k;
        }
      }
    }

    uint8_t best_min_idx = 0;
    float min_min_err = std::numeric_limits<float>::max();

    for (uint8_t l = 0; l < 16; ++l) {
      float candidate_min = min_super * K_MIN_VALUES[l];
      float err = std::abs(candidate_min - ideal_min);
      if (err < min_min_err) {
        min_min_err = err;
        best_min_idx = l;
      }
    }

    int scale_byte_idx = j % 8;
    int scale_shift = 4 * (j / 8);
    output_qblock->scales[scale_byte_idx] |= (best_scale_idx << scale_shift);

    // Matches get_scale_min_indices_q4_K: only j < 8 has room for a stored
    // min index; for j >= 8 the shifted nibble falls outside the byte and
    // the min index is effectively dropped (decodes as 0).
    int min_byte_idx = (j % 4) + 8;
    int min_shift = 4 * (j / 4);
    output_qblock->scales[min_byte_idx] |= (best_min_idx << min_shift);

    float actual_scale = d_super * K_SCALE_VALUES[best_scale_idx];
    float actual_min = min_super * K_MIN_VALUES[best_min_idx];
    float inv_actual_scale = (actual_scale > GGUF_EPSILON || actual_scale < -GGUF_EPSILON)
                                 ? 1.0f / actual_scale
                                 : 0.0f;

    uint8_t packed_qs[8];

    std::memset(packed_qs, 0, sizeof(packed_qs));

    for (int i = 0; i < 16; ++i) {
      float val = sub_block_input[i];

      int quant_val = 0;
      if (inv_actual_scale != 0.0f) {
        quant_val =
            static_cast<int>(std::round((val - actual_min) * inv_actual_scale)) + Q4K_OFFSET;
      }
      quant_val = SAFE_MAX(0, SAFE_MIN(15, quant_val));

      int byte_idx_qs = i / 2;
      int shift_qs = (i % 2) * 4;
      packed_qs[byte_idx_qs] |= (static_cast<uint8_t>(quant_val) << shift_qs);
    }

    uint8_t* qs_target = output_qblock->qs + j * 8;
    for (int i = 0; i < 8; ++i) {
      uint8_t low_nibble_val = packed_qs[i] & 0x0F;
      uint8_t high_nibble_val = (packed_qs[i] >> 4) & 0x0F;
      qs_target[i] = low_nibble_val | (high_nibble_val << 4);
    }
  }
}

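// Usage sketch (illustrative): pack one block of 256 floats. The 4-bit
// scale/min indices written here are the ones vec_dot_q4_k_q8_k_cpu() reads
// back via get_scale_min_indices_q4_K().
//
//   float src[GGML_QK_K] = { /* ... */ };
//   block_q4_K blk;
//   quantize_q4_k_m(src, &blk, GGML_QK_K);
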
void dequantize_q2_k(const void* qblock_void, float* output,
                     int num_weights_in_block) {
  if (num_weights_in_block != GGML_QK_K) {
    throw std::invalid_argument(
        "dequantize_q2_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  const block_q2_K* qblock = static_cast<const block_q2_K*>(qblock_void);

  const float d_float_raw = fp16_to_fp32(qblock->d);
  const float dmin_float_raw = fp16_to_fp32(qblock->dmin);

  const float d_float = (!std::isfinite(d_float_raw)) ? 0.0f : d_float_raw;
  const float dmin_float =
      (!std::isfinite(dmin_float_raw)) ? 0.0f : dmin_float_raw;

  const float d_float_clamped = SAFE_MIN(SAFE_MAX(d_float, TENSOR_SCALE_MIN), TENSOR_SCALE_MAX);
  const float dmin_float_clamped = SAFE_MIN(SAFE_MAX(dmin_float, TENSOR_SCALE_MIN), TENSOR_SCALE_MAX);

  const uint8_t* scales_ptr = qblock->scales;
  const uint8_t* qs_ptr = qblock->qs;
  int weight_index = 0;
  float dequantized_scales[16];

  for (int i = 0; i < 8; ++i) {
    uint8_t packed_scales = scales_ptr[i];
    uint8_t scale_low = packed_scales & 0x0F;
    uint8_t scale_high = packed_scales >> 4;

    dequantized_scales[i * 2 + 0] =
        d_float_clamped * static_cast<float>(scale_low);
    dequantized_scales[i * 2 + 1] =
        d_float_clamped * static_cast<float>(scale_high);

    dequantized_scales[i * 2 + 0] =
        SAFE_MIN(SAFE_MAX(dequantized_scales[i * 2 + 0], TENSOR_SCALE_MIN), TENSOR_SCALE_MAX);
    dequantized_scales[i * 2 + 1] =
        SAFE_MIN(SAFE_MAX(dequantized_scales[i * 2 + 1], TENSOR_SCALE_MIN), TENSOR_SCALE_MAX);
  }

  weight_index = 0;
  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    float sub_block_scale = dequantized_scales[j];

    const uint8_t* qs_subblock_ptr = qs_ptr + j * 4;

    for (int i = 0; i < 4; ++i) {
      uint8_t packed_weights = qs_subblock_ptr[i];

      uint8_t q0 = (packed_weights >> 0) & 0x03;
      uint8_t q1 = (packed_weights >> 2) & 0x03;
      uint8_t q2 = (packed_weights >> 4) & 0x03;
      uint8_t q3 = (packed_weights >> 6) & 0x03;

      float val0 =
          sub_block_scale * static_cast<float>(q0) + dmin_float_clamped;
      float val1 =
          sub_block_scale * static_cast<float>(q1) + dmin_float_clamped;
      float val2 =
          sub_block_scale * static_cast<float>(q2) + dmin_float_clamped;
      float val3 =
          sub_block_scale * static_cast<float>(q3) + dmin_float_clamped;

      output[weight_index++] = val0;
      output[weight_index++] = val1;
      output[weight_index++] = val2;
      output[weight_index++] = val3;
    }
  }
  assert(weight_index == GGML_QK_K);
}

void dequantize_q3_k(const void* qblock_void, float* output,
                     int num_weights_in_block) {
  if (num_weights_in_block != GGML_QK_K) {
    throw std::invalid_argument(
        "dequantize_q3_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  const block_q3_K* qblock = static_cast<const block_q3_K*>(qblock_void);

  const float d_float_raw = fp16_to_fp32(qblock->d);
  const float dmin_float_raw = fp16_to_fp32(qblock->dmin);

  const float d_float = (!std::isfinite(d_float_raw)) ? 0.0f : d_float_raw;
  const float dmin_float =
      (!std::isfinite(dmin_float_raw)) ? 0.0f : dmin_float_raw;

  const uint8_t* hmask_ptr = qblock->hmask;
  const uint8_t* qs_ptr = qblock->qs;
  const uint8_t* scales_ptr = qblock->scales;

  int weight_index = 0;

  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    // The 12 scale bytes hold 16 packed 6-bit indices (96 bits exactly);
    // extract field j from the little-endian bit stream, staying inside the
    // 12-byte array.
    const int bit = 6 * j;
    uint32_t packed = scales_ptr[bit >> 3];
    if ((bit & 7) > 2) {
      packed |= static_cast<uint32_t>(scales_ptr[(bit >> 3) + 1]) << 8;
    }
    const uint8_t scale_idx = (packed >> (bit & 7)) & 0x3F;

    assert(scale_idx < 64 && "Scale index out of bounds for Q3_K lookup");
    const float sub_block_scale_factor = K_SCALE_VALUES[scale_idx];

    const float final_sub_block_scale = d_float * sub_block_scale_factor;
    const float final_sub_block_min = dmin_float;

    for (int i = 0; i < 4; ++i) {
      uint8_t qs_byte = qs_ptr[j * 4 + i];

      for (int bit_pos = 0; bit_pos < 8; bit_pos += 2) {
        uint8_t lower_bits = (qs_byte >> bit_pos) & 0x3;

        // One high bit per weight: bit (w % 8) of hmask[w / 8] for weight w.
        uint8_t high_bit =
            (hmask_ptr[weight_index >> 3] >> (weight_index & 7)) & 0x1;

        uint8_t q_val = (high_bit << 2) | lower_bits;

        float val = final_sub_block_scale * static_cast<float>(q_val) +
                    final_sub_block_min;

        if (!std::isfinite(val)) {
          val = 0.0f;
        }

        output[weight_index++] = val;
      }
    }
  }

  if (weight_index != GGML_QK_K) {
    std::cout << "ERROR: Processed " << weight_index << " weights instead of "
              << GGML_QK_K << std::endl;

    while (weight_index < GGML_QK_K) {
      output[weight_index++] = 0.0f;
    }
  }
}

void quantize_q6_k(const float* input, void* output_qblock_void,
                   int num_elements) {
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q6_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q6_K* output_qblock = static_cast<block_q6_K*>(output_qblock_void);

  uint8_t* ql = output_qblock->ql;
  uint8_t* qh = output_qblock->qh;
  int8_t* scales = output_qblock->scales;
  std::memset(ql, 0, GGML_QK_K / 2);
  std::memset(qh, 0, GGML_QK_K / 4);

  float amax = 0.0f;
  for (int i = 0; i < num_elements; ++i) {
    amax = SAFE_MAX(amax, std::abs(input[i]));
  }

  const float d_float = (amax > GGUF_EPSILON) ? (amax / Q6K_SCALE_FACTOR) : GGUF_EPSILON;
  output_qblock->d = fp32_to_fp16(d_float);

  for (int sub = 0; sub < GGML_QK_K / 16; ++sub) {
    const float* sub_in = input + sub * 16;

    float sub_amax = 0.0f;
    for (int i = 0; i < 16; ++i) {
      sub_amax = SAFE_MAX(sub_amax, std::abs(sub_in[i]));
    }

    // Each 16-element sub-block stores an int8 multiplier of the block scale.
    int8_t scale = (d_float > 0.0f)
                       ? static_cast<int8_t>(std::round(sub_amax / d_float))
                       : 1;
    if (scale == 0) scale = 1;
    scales[sub] = scale;

    for (int i = 0; i < 16; ++i) {
      float val = sub_in[i];
      int q = static_cast<int>(std::round(val / (d_float * scale))) + Q6K_OFFSET;
      q = SAFE_MAX(0, SAFE_MIN(63, q));

      int idx = sub * 16 + i;
      int ql_idx = idx / 2;
      int ql_shift = (idx % 2) * 4;
      ql[ql_idx] |= (q & 0x0F) << ql_shift;
      int qh_idx = idx / 4;
      int qh_shift = (idx % 4) * 2;
      qh[qh_idx] |= ((q >> 4) & 0x03) << qh_shift;
    }
  }
}

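// Usage sketch (illustrative): pack one 256-element block to Q6_K. The block
// stores one fp16 super-scale d plus sixteen int8 sub-block multipliers, so
// each weight costs 6 bits of quant data plus the shared scale bytes.
//
//   float src[GGML_QK_K] = { /* ... */ };
//   block_q6_K blk;
//   quantize_q6_k(src, &blk, GGML_QK_K);
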
const char* ggml_type_name(GGMLType type) {
  switch (type) {
    case GGML_TYPE_F32:
      return "F32";
    case GGML_TYPE_F16:
      return "F16";
    case GGML_TYPE_Q4_0:
      return "Q4_0";
    case GGML_TYPE_Q4_1:
      return "Q4_1";
    case GGML_TYPE_Q5_0:
      return "Q5_0";
    case GGML_TYPE_Q5_1:
      return "Q5_1";
    case GGML_TYPE_Q8_0:
      return "Q8_0";
    case GGML_TYPE_Q8_1:
      return "Q8_1";
    case GGML_TYPE_Q2_K:
      return "Q2_K";
    case GGML_TYPE_Q3_K:
      return "Q3_K";
    case GGML_TYPE_Q4_K:
      return "Q4_K";
    case GGML_TYPE_Q5_K:
      return "Q5_K";
    case GGML_TYPE_Q6_K:
      return "Q6_K";
    case GGML_TYPE_Q8_K:
      return "Q8_K";
    case GGML_TYPE_I8:
      return "I8";
    case GGML_TYPE_I16:
      return "I16";
    case GGML_TYPE_I32:
      return "I32";
    case GGML_TYPE_BF16:
      return "BF16";
    case GGML_TYPE_COUNT:
      return "COUNT";
    default:
      return "Unknown";
  }
}

size_t ggml_type_size(GGMLType type) {
  switch (type) {
    case GGML_TYPE_F32:
      return sizeof(float);
    case GGML_TYPE_F16:
      return sizeof(uint16_t);
    case GGML_TYPE_I8:
      return sizeof(int8_t);
    case GGML_TYPE_Q4_K:
      return sizeof(block_q4_K);
    case GGML_TYPE_Q2_K:
      return sizeof(block_q2_K);
    case GGML_TYPE_Q3_K:
      return sizeof(block_q3_K);
    case GGML_TYPE_Q6_K:
      return sizeof(block_q6_K);
    case GGML_TYPE_Q4_0:
      return 18;  // fp16 d + 16 bytes of packed nibbles

    case GGML_TYPE_Q8_0:
      return 34;  // fp16 d + 32 int8 quants
    case GGML_TYPE_Q8_1:
      return 40;
    case GGML_TYPE_Q5_K:
      return 116;
    case GGML_TYPE_Q8_K:
      return 290;  // fp16 d + 256 int8 quants + 16 int16 bsums
    case GGML_TYPE_I16:
      return sizeof(int16_t);
    case GGML_TYPE_I32:
      return sizeof(int32_t);
    case GGML_TYPE_BF16:
      return sizeof(uint16_t);
    default:
      std::cout << " UNKNOWN GGML TYPE: " << static_cast<int>(type)
                << std::endl;
      throw std::invalid_argument("Unknown GGML type in ggml_type_size: " +
                                  std::to_string(static_cast<int>(type)));
  }
}

size_t ggml_type_block_size(GGMLType type) {
  switch (type) {
    case GGML_TYPE_Q2_K:
    case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q4_K:
    case GGML_TYPE_Q5_K:
    case GGML_TYPE_Q6_K:
    case GGML_TYPE_Q8_K:
      return GGML_QK_K;

    case GGML_TYPE_Q4_0:
    case GGML_TYPE_Q4_1:
    case GGML_TYPE_Q5_0:
    case GGML_TYPE_Q5_1:
    case GGML_TYPE_Q8_0:
    case GGML_TYPE_Q8_1:
      return 32;

    case GGML_TYPE_F32:
    case GGML_TYPE_F16:
    case GGML_TYPE_BF16:
    case GGML_TYPE_I8:
    case GGML_TYPE_I16:
    case GGML_TYPE_I32:
      return 1;

    default:
      std::cout << "Warning: Unknown GGMLType in ggml_type_block_size: "
                << static_cast<int>(type) << std::endl;
      return 0;
  }
}

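// Size computation sketch (illustrative): bytes needed to store a tensor of
// `num_elements` values of type `t`, combining the two helpers above:
//
//   size_t bs = ggml_type_block_size(t);
//   size_t bytes = ((num_elements + bs - 1) / bs) * ggml_type_size(t);
//
// e.g. a 256-element Q6_K row is a single block of sizeof(block_q6_K) bytes.
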
std::vector<block_q8_K> quantize_fp32_to_q8_K(
    const std::vector<float>& f_data) {
  if (f_data.size() % GGML_QK_K != 0) {
    throw std::runtime_error(
        "Input vector size must be a multiple of GGML_QK_K (" +
        std::to_string(GGML_QK_K) + ")");
  }

  size_t num_blocks = f_data.size() / GGML_QK_K;
  std::vector<block_q8_K> q_data(num_blocks);
  const float* x = f_data.data();
  block_q8_K* y = q_data.data();

  static std::atomic<int> log_count_q8k_quant_scales{0};

  for (size_t i = 0; i < num_blocks; ++i) {
    float amax = 0.0f;
    for (int j = 0; j < GGML_QK_K; ++j) {
      amax = SAFE_MAX(amax, std::abs(x[j]));
    }

    const float d_fp32 = amax / Q8K_SCALE_FACTOR;
    const float id = (d_fp32 != 0.f) ? 1.0f / d_fp32 : 0.0f;
    y[i].d = fp32_to_fp16(d_fp32);

    if (log_count_q8k_quant_scales < 10) {
      std::stringstream q8k_scale_log_ss;
      q8k_scale_log_ss << "[Q8K_QUANT_SCALES] Block #" << i
                       << " Input amax=" << amax << " -> d_fp32=" << d_fp32
                       << " -> Stored d_fp16=0x" << std::hex << y[i].d
                       << std::dec;
      Logger::debug(q8k_scale_log_ss.str());
      log_count_q8k_quant_scales++;
    }

    int16_t block_sum[16] = {0};
    for (int j = 0; j < GGML_QK_K; ++j) {
      const float val_scaled = x[j] * id;

      int8_t q_val = static_cast<int8_t>(
          SAFE_MAX(-128.0f, SAFE_MIN(127.0f, std::round(val_scaled))));
      y[i].qs[j] = q_val;
      block_sum[j / 16] += q_val;
    }

    std::memcpy(y[i].bsums, block_sum, sizeof(block_sum));

    x += GGML_QK_K;
  }

  return q_data;
}

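// Usage sketch (illustrative): quantize an activation vector ahead of a
// quantized matrix-vector product (cols must be a multiple of GGML_QK_K).
//
//   std::vector<float> act(cols);
//   std::vector<block_q8_K> act_q8 = quantize_fp32_to_q8_K(act);
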
float vec_dot_q6_k_q8_k_cpu(int n, const std::vector<block_q6_K>& x_vec,
                            const std::vector<block_q8_K>& y_vec,
                            bool log_this_call) {
  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: n must be multiple of QK_K");
  }
  size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: vector block count mismatch");
  }

  const block_q6_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  int8_t aux8[GGML_QK_K];
  int16_t aux16[8];
  float sums[8];
  int32_t aux32[8];
  std::memset(sums, 0, 8 * sizeof(float));

  float sumf = 0.0f;

  static std::atomic<int> log_count_dot{0};
  bool should_log_this_block = log_this_call && log_count_dot < 5;

  for (size_t i = 0; i < nb; ++i) {
    const uint8_t* ql = x[i].ql;
    const uint8_t* qh = x[i].qh;
    const int8_t* q8 = y[i].qs;
    std::memset(aux32, 0, 8 * sizeof(int32_t));

    int8_t* a = aux8;
    for (int j = 0; j < GGML_QK_K; j += 128) {
      for (int l = 0; l < 32; ++l) {
        a[l + 0] = static_cast<int8_t>(
            ((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32);
        a[l + 32] = static_cast<int8_t>(
            ((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32);
        a[l + 64] = static_cast<int8_t>(
            ((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32);
        a[l + 96] = static_cast<int8_t>(
            ((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
      }
      a += 128;
      ql += 64;
      qh += 32;
    }

    a = aux8;
    int is = 0;
    for (int j = 0; j < GGML_QK_K / 16; ++j) {
      int scale = x[i].scales[is++];
      for (int l = 0; l < 8; ++l)
        aux16[l] = static_cast<int16_t>(q8[l]) * static_cast<int16_t>(a[l]);
      for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
      q8 += 8;
      a += 8;
      for (int l = 0; l < 8; ++l)
        aux16[l] = static_cast<int16_t>(q8[l]) * static_cast<int16_t>(a[l]);
      for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
      q8 += 8;
      a += 8;
    }

    int32_t sumi_mins = 0;
    for (int j = 0; j < GGML_QK_K / 16; ++j) {
      sumi_mins += static_cast<int32_t>(y[i].bsums[j]) *
                   static_cast<int32_t>(x[i].scales[j]);
    }

    const float d_q6 = fp16_to_fp32(x[i].d);
    const float d_q8 = fp16_to_fp32(y[i].d);
    const float d = d_q6 * d_q8;

    float block_contribution = 0.0f;
    for (int l = 0; l < 8; ++l) {
      float term = d * (aux32[l] - 32 * sumi_mins / 8);
      sums[l] += term;
      block_contribution += term;
    }

    if (i == 0 && should_log_this_block) {
      std::stringstream ss_log;
      ss_log << "[DOT_Q6K_Q8K] Call #" << (log_count_dot.load() + 1)
             << ", Block #0:";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Scale d_q6: " << d_q6 << " (Raw FP16: 0x" << std::hex
             << x[i].d << std::dec << ")";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q8_K Scale d_q8: " << d_q8;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Combined Scale d: " << d;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Sub-scales (int8): ";
      for (int k = 0; k < 16; ++k) ss_log << (int)x[i].scales[k] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Int32 Sums (aux32, before compensation): ";
      for (int l = 0; l < 8; ++l) ss_log << aux32[l] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Compensation term (sumi_mins): " << sumi_mins
             << ", -32 * sumi_mins: " << (-32 * sumi_mins);
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Block #0 Contribution to Sums (after compensation): "
             << block_contribution;
      Logger::debug(ss_log.str());
    }
  }

  for (int l = 0; l < 8; ++l) {
    sumf += sums[l];
  }

  if (should_log_this_block) {
    log_count_dot++;
  }
  return sumf;
}

void matvec_q6k_q8k_cpu(const std::vector<block_q6_K>& mat_q6k,
                        const std::vector<block_q8_K>& vec_q8k,
                        std::vector<float>& out_f32, int rows, int cols,
                        bool log_calls) {
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q6k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }
  size_t blocks_per_row = cols / GGML_QK_K;
  if (mat_q6k.size() != (size_t)rows * blocks_per_row) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: mat_q6k size mismatch");
  }
  if (vec_q8k.size() != blocks_per_row) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: vec_q8k size mismatch");
  }
  out_f32.resize(rows);
  for (int r = 0; r < rows; ++r) {
    const std::vector<block_q6_K> row_q6k(
        mat_q6k.begin() + r * blocks_per_row,
        mat_q6k.begin() + (r + 1) * blocks_per_row);

    out_f32[r] = vec_dot_q6_k_q8_k_cpu(cols, row_q6k, vec_q8k, log_calls);
  }
}

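// Usage sketch (illustrative): y = W * x with Q6_K weights and a Q8_K
// quantized input, the pairing used by the CPU inference path.
//
//   std::vector<block_q8_K> x_q8 = quantize_fp32_to_q8_K(x_f32);
//   std::vector<float> y;
//   matvec_q6k_q8k_cpu(w_q6k, x_q8, y, rows, cols, /*log_calls=*/false);
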
float vec_dot_q4_k_q8_k_cpu(int n, const std::vector<block_q4_K>& x_vec,
                            const std::vector<block_q8_K>& y_vec,
                            bool log_this_call) {
  int log_count_now = g_vec_dot_q4_k_q8_k_log_count.fetch_add(1);
  if (log_count_now >= 5) log_this_call = false;

  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: n must be multiple of QK_K");
  }
  size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: vector block count mismatch");
  }

  const block_q4_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  float sumf = 0.0f;
  for (size_t i = 0; i < nb; ++i) {
    int8_t q4_vals[GGML_QK_K];
    const uint8_t* q4 = x[i].qs;
    for (int j = 0; j < GGML_QK_K / 2; ++j) {
      q4_vals[2 * j + 0] = static_cast<int8_t>(q4[j] & 0xF);
      q4_vals[2 * j + 1] = static_cast<int8_t>(q4[j] >> 4);
    }

    const int8_t* q8 = y[i].qs;

    for (int sub = 0; sub < 16; ++sub) {
      uint8_t scale_idx, min_idx;
      get_scale_min_indices_q4_K(sub, x[i].scales, &scale_idx, &min_idx);
      float scale = fp16_to_fp32(x[i].d) * K_SCALE_VALUES[scale_idx];
      float minv = fp16_to_fp32(x[i].dmin) * K_MIN_VALUES[min_idx];
      for (int k = 0; k < 16; ++k) {
        int idx = sub * 16 + k;
        float q4_val = static_cast<float>(q4_vals[idx]) - 8.0f;
        float q8_val = static_cast<float>(q8[idx]);
        sumf += (scale * q4_val + minv) * q8_val;
      }
    }

    if (i == 0 && log_this_call) {
      std::stringstream ss;
      ss << "[Q4K_Q8K] Block #0: d: " << fp16_to_fp32(x[i].d)
         << ", dmin: " << fp16_to_fp32(x[i].dmin);
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q8_K input (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q8[k] << " ";
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q4_K unpacked (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q4_vals[k] << " ";
      Logger::debug(ss.str());
      ss.str("");
    }
  }
  return sumf;
}

void matvec_q4k_q8k_cpu(const std::vector<block_q4_K>& mat_q4k,
                        const std::vector<block_q8_K>& vec_q8k,
                        std::vector<float>& out_f32, int rows, int cols,
                        bool log_calls) {
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q4k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }
  size_t blocks_per_row = cols / GGML_QK_K;
  if (mat_q4k.size() != (size_t)rows * blocks_per_row) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: mat_q4k size mismatch");
  }
  if (vec_q8k.size() != blocks_per_row) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: vec_q8k size mismatch");
  }
  out_f32.resize(rows);

#pragma omp parallel for
  for (int r = 0; r < rows; ++r) {
    const std::vector<block_q4_K> row_q4k(
        mat_q4k.begin() + r * blocks_per_row,
        mat_q4k.begin() + (r + 1) * blocks_per_row);

    out_f32[r] = vec_dot_q4_k_q8_k_cpu(cols, row_q4k, vec_q8k, log_calls);
  }
}

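// Note on the loop above: every row's dot product is independent, so the
// `#pragma omp parallel for` lets builds with OpenMP enabled (e.g. -fopenmp)
// split rows across threads; compilers without OpenMP simply ignore the
// pragma and the loop runs serially.
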
void dequantize_q8_k(const std::vector<block_q8_K>& q_data,
                     std::vector<float>& x, int n, bool log_this_block) {
  if (n % GGML_QK_K != 0) {
    std::cerr
        << "Error: n must be a multiple of GGML_QK_K for Q8_K dequantization."
        << std::endl;
    return;
  }
  size_t num_blocks = n / GGML_QK_K;
  if (q_data.size() < num_blocks) {
    std::cerr << "Error: Not enough Q8_K blocks provided for dequantization."
              << std::endl;
    return;
  }
  // Make sure the destination can hold all n floats.
  if (x.size() < static_cast<size_t>(n)) {
    x.resize(n);
  }

  static std::atomic<int> log_count_q8k_dequant_scales{0};

  for (size_t i = 0; i < num_blocks; ++i) {
    const block_q8_K* qblock = &q_data[i];
    float* x_block = &x[i * GGML_QK_K];

    const float d = fp16_to_fp32(qblock->d, true);

    if (log_this_block && log_count_q8k_dequant_scales < 10) {
      std::stringstream scale_log_ss;
      scale_log_ss << "[Q8K_DEQUANT_SCALES] Block #"
                   << (log_count_q8k_dequant_scales.load()) << " Raw_d_fp16=0x"
                   << std::hex << qblock->d << std::dec << " -> d=" << d;
      Logger::debug(scale_log_ss.str());
      log_count_q8k_dequant_scales++;
    }

    for (int j = 0; j < GGML_QK_K; ++j) {
      x_block[j] = d * static_cast<float>(qblock->qs[j]);
    }
  }
}

void dequantize_q8_0_block(const block_q8_0* qblock, float* output) {
  const float d_fp32 = fp16_to_fp32(qblock->d, true);
  for (int i = 0; i < GGML_QK8_0; ++i) {
    output[i] = d_fp32 * static_cast<float>(qblock->qs[i]);
  }
}

void dequantize_vector_q6k_to_f32(const std::vector<block_q6_K>& q_weights,
                                  std::vector<float>& f32_weights,
                                  size_t total_num_elements,
                                  int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q6K] Input Q6_K weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);
  size_t expected_blocks = (total_num_elements + GGML_QK_K - 1) / GGML_QK_K;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q6K] Mismatch in Q6_K block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q6_K* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK_K;

    if (elements_processed + GGML_QK_K > total_num_elements) {
      elements_in_this_block = total_num_elements - elements_processed;
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q6K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    bool log_this_specific_block = (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

    if (elements_in_this_block == GGML_QK_K) {
      dequantize_q6_k(current_block_ptr, current_output_ptr,
                      elements_in_this_block, log_this_specific_block);
    } else {
      // Partial final block: dequantize into a scratch buffer and copy out
      // only the elements that belong to the tensor (dequantize_q6_k itself
      // only handles full GGML_QK_K blocks).
      float temp_block[GGML_QK_K];
      dequantize_q6_k(current_block_ptr, temp_block, GGML_QK_K,
                      log_this_specific_block);
      std::memcpy(current_output_ptr, temp_block,
                  elements_in_this_block * sizeof(float));
    }

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q6K] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

void dequantize_vector_q4k_to_f32(const std::vector<block_q4_K>& q_weights,
                                  std::vector<float>& f32_weights,
                                  size_t total_num_elements,
                                  int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q4K] Input Q4_K weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);
  size_t expected_blocks = (total_num_elements + GGML_QK_K - 1) / GGML_QK_K;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q4K] Mismatch in Q4_K block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q4_K* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK_K;

    if (elements_processed + GGML_QK_K > total_num_elements) {
      elements_in_this_block = total_num_elements - elements_processed;
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q4K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    bool log_this_specific_block = (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

    // Call the Q4_K specific single-block dequantizer.
    if (elements_in_this_block == GGML_QK_K) {
      dequantize_q4_k_m(current_block_ptr, current_output_ptr,
                        elements_in_this_block, log_this_specific_block);
    } else {
      // Partial final block: dequantize into a scratch buffer and copy out
      // only the elements that belong to the tensor.
      float temp_block[GGML_QK_K];
      dequantize_q4_k_m(current_block_ptr, temp_block, GGML_QK_K,
                        log_this_specific_block);
      std::memcpy(current_output_ptr, temp_block,
                  elements_in_this_block * sizeof(float));
    }

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q4K] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

void dequantize_vector_q8_0_to_f32(const std::vector<block_q8_0>& q_weights,
                                   std::vector<float>& f32_weights,
                                   size_t total_num_elements,
                                   int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q8_0] Input Q8_0 weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);

  size_t expected_blocks = (total_num_elements + GGML_QK8_0 - 1) / GGML_QK8_0;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q8_0] Mismatch in Q8_0 block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q8_0* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK8_0;

    if (elements_processed + GGML_QK8_0 > total_num_elements) {
      elements_in_this_block = total_num_elements - elements_processed;
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q8_0] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    if (elements_in_this_block == GGML_QK8_0) {
      dequantize_q8_0_block(current_block_ptr, current_output_ptr);
    } else {
      // Handle a partial final block via a scratch buffer.
      float temp_block[GGML_QK8_0];
      dequantize_q8_0_block(current_block_ptr, temp_block);
      std::memcpy(current_output_ptr, temp_block, elements_in_this_block * sizeof(float));
    }

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q8_0] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}