23 1.0f, 1.0625f, 1.125f, 1.1875f, 1.25f, 1.3125f, 1.375f, 1.4375f,
24 1.5f, 1.5625f, 1.625f, 1.6875f, 1.75f, 1.8125f, 1.875f, 1.9375f,
25 2.0f, 2.125f, 2.25f, 2.375f, 2.5f, 2.625f, 2.75f, 2.875f,
26 3.0f, 3.125f, 3.25f, 3.375f, 3.5f, 3.625f, 3.75f, 3.875f,
27 4.0f, 4.25f, 4.5f, 4.75f, 5.0f, 5.25f, 5.5f, 5.75f,
28 6.0f, 6.25f, 6.5f, 6.75f, 7.0f, 7.25f, 7.5f, 7.75f,
29 8.0f, 8.5f, 9.0f, 9.5f, 10.0f, 10.5f, 11.0f, 11.5f,
30 12.0f, 12.5f, 13.0f, 13.5f, 14.0f, 14.5f, 15.0f, 15.5f};
33 0.0f, -0.0078125f, -0.015625f, -0.0234375f, -0.03125f, -0.0390625f,
34 -0.046875f, -0.0546875f, -0.0625f, -0.0703125f, -0.078125f, -0.0859375f,
35 -0.09375f, -0.1015625f, -0.109375f, -0.1171875f, -0.125f, -0.140625f,
36 -0.15625f, -0.171875f, -0.1875f, -0.203125f, -0.21875f, -0.234375f,
37 -0.25f, -0.265625f, -0.28125f, -0.296875f, -0.3125f, -0.328125f,
38 -0.34375f, -0.359375f, -0.375f, -0.40625f, -0.4375f, -0.46875f,
39 -0.5f, -0.53125f, -0.5625f, -0.59375f, -0.625f, -0.65625f,
40 -0.6875f, -0.71875f, -0.75f, -0.78125f, -0.8125f, -0.84375f,
41 -0.875f, -0.9375f, -1.0f, -1.0625f, -1.125f, -1.1875f,
42 -1.25f, -1.3125f, -1.375f, -1.4375f, -1.5f, -1.5625f,
43 -1.625f, -1.6875f, -1.75f, -1.8125f};
48 uint16_t h_to_convert = h;
49 bool original_sign_bit_was_set = (h & 0x8000);
50 uint32_t sign = (h_to_convert >> 15) & 1;
51 uint32_t exp_fp16 = (h_to_convert >> 10) & 0x1f;
52 uint32_t mant_fp16 = h_to_convert & 0x3ff;
61 while ((mant_fp16 & 0x400) == 0) {
66 uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
67 uint32_t mant_fp32 = mant_fp16 << 13;
68 x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
70 }
else if (exp_fp16 == 0x1f) {
71 x = (sign << 31) | (0xff << 23) | (mant_fp16 << 13);
73 uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
74 uint32_t mant_fp32 = mant_fp16 << 13;
75 x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
79 std::memcpy(&f, &x,
sizeof(
float));
81 if (is_gguf_scale_field) {
82 static std::atomic<int> q8_scale_f_log_count{0};
85 if (is_gguf_scale_field && f < 0.0f && !(std::isnan(f) || std::isinf(f))) {
94 std::memcpy(&x, &f,
sizeof(
float));
96 uint32_t sign = (x >> 31) & 1;
97 uint32_t exp_fp32 = (x >> 23) & 0xff;
98 uint32_t mant_fp32 = x & 0x7fffff;
102 if (exp_fp32 == 0xff) {
103 u = (sign << 15) | 0x7c00 | (mant_fp32 != 0 ? 0x200 : 0);
105 int exp_fp16 = (int)exp_fp32 - 127 + 15;
107 if (exp_fp16 >= 0x1f) {
108 u = (sign << 15) | 0x7c00;
109 }
else if (exp_fp16 <= 0) {
110 if (exp_fp16 < -10) {
113 mant_fp32 = (mant_fp32 | 0x800000) >> (1 - exp_fp16);
115 if ((mant_fp32 >> 13) & 1) {
116 mant_fp32 += (1 << 13);
118 u = (sign << 15) | (mant_fp32 >> 13);
121 if ((mant_fp32 >> 13) & 1) {
122 mant_fp32 += (1 << 13);
123 if ((mant_fp32 >> 23) == 1) {
126 if (exp_fp16 >= 0x1f) {
127 u = (sign << 15) | 0x7c00;
132 u = (sign << 15) | (exp_fp16 << 10) | (mant_fp32 >> 13);
140std::vector<float> k_lookup_table_scale;
141std::vector<float> k_lookup_table_min;
145static inline void get_scale_min_k4(
int j,
const uint8_t * q, uint8_t * d_val, uint8_t * m_val) {
150 *m_val = q[j + 4] & 63;
152 *d_val = (q[j+4] & 0x0F) | ((q[j-4] >> 6) << 4);
153 *m_val = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
159 const uint8_t* scales,
160 uint8_t* scale_index,
161 uint8_t* min_index) {
162 assert(j >= 0 && j < 16);
164 *scale_index = scales[j % 8] >> (4 * (j / 8));
165 *scale_index &= 0x0F;
167 *min_index = scales[j % 4 + 8] >> (4 * (j / 4));
172 int num_weights_in_block,
bool log_this_block) {
175 <<
"Warning: dequantize_q4_k_m called with num_weights != GGML_QK_K ("
176 << num_weights_in_block <<
")" << std::endl;
177 std::memset(output, 0, num_weights_in_block *
sizeof(
float));
184 const uint8_t * q_bytes_ptr = qblock->
qs;
185 float * y_ptr = output;
187 int scale_group_idx = 0;
191 for (
int sixtyfour_chunk_idx = 0; sixtyfour_chunk_idx <
GGML_QK_K / 64; ++sixtyfour_chunk_idx) {
192 uint8_t sc_val1, m_val1;
193 uint8_t sc_val2, m_val2;
200 const float d1 = d_super_scale *
static_cast<float>(sc_val1);
201 const float m1 = d_super_min *
static_cast<float>(m_val1);
202 const float d2 = d_super_scale *
static_cast<float>(sc_val2);
203 const float m2 = d_super_min *
static_cast<float>(m_val2);
206 for (
int l = 0; l < 32; ++l) {
208 uint8_t quant_nibble = (q_bytes_ptr[l] & 0x0F);
209 *y_ptr++ = d1 *
static_cast<float>(quant_nibble) - m1;
213 for (
int l = 0; l < 32; ++l) {
214 uint8_t quant_nibble = (q_bytes_ptr[l] >> 4);
215 *y_ptr++ = d2 *
static_cast<float>(quant_nibble) - m2;
219 scale_group_idx += 2;
224 int num_weights_in_block,
bool log_this_block) {
227 <<
"Warning: dequantize_q6_k called with num_weights != GGML_QK_K ("
228 << num_weights_in_block <<
")" << std::endl;
229 std::memset(output, 0, num_weights_in_block *
sizeof(
float));
236 const uint8_t * p_ql = qblock->
ql;
237 const uint8_t * p_qh = qblock->
qh;
238 const int8_t * p_sc = qblock->
scales;
239 float * p_y = output;
243 for (
int half_idx = 0; half_idx < 2; ++half_idx) {
245 const uint8_t * ql = p_ql + (half_idx * 64);
246 const uint8_t * qh = p_qh + (half_idx * 32);
247 const int8_t * sc = p_sc + (half_idx * 8);
248 float * y = p_y + (half_idx * 128);
251 for (
int l = 0; l < 32; ++l) {
255 const int8_t q1 = (int8_t)(((ql[l + 0] & 0x0F) | (((qh[l] >> 0) & 0x03) << 4))) - 32;
256 const int8_t q2 = (int8_t)(((ql[l + 32] & 0x0F) | (((qh[l] >> 2) & 0x03) << 4))) - 32;
257 const int8_t q3 = (int8_t)(((ql[l + 0] >> 4) | (((qh[l] >> 4) & 0x03) << 4))) - 32;
258 const int8_t q4 = (int8_t)(((ql[l + 32] >> 4) | (((qh[l] >> 6) & 0x03) << 4))) - 32;
260 y[l + 0] = d * sc[is + 0] * q1;
261 y[l + 32] = d * sc[is + 2] * q2;
262 y[l + 64] = d * sc[is + 4] * q3;
263 y[l + 96] = d * sc[is + 6] * q4;
269 size_t num_elements) {
270 const int8_t* input_ptr =
static_cast<const int8_t*
>(input_data);
271 for (
size_t i = 0; i < num_elements; ++i) {
272 output_data[i] =
static_cast<float>(input_ptr[i]);
279 throw std::invalid_argument(
280 "quantize_q4_k_m currently only supports block size " +
286 std::memset(output_qblock->
scales, 0,
sizeof(output_qblock->
scales));
287 std::memset(output_qblock->
qs, 0,
sizeof(output_qblock->
qs));
289 float block_min_val = std::numeric_limits<float>::max();
290 float block_max_val = std::numeric_limits<float>::lowest();
291 for (
int i = 0; i < num_elements; ++i) {
292 block_min_val =
SAFE_MIN(block_min_val, input[i]);
293 block_max_val =
SAFE_MAX(block_max_val, input[i]);
296 if (block_max_val == block_min_val) {
299 if (block_max_val < GGUF_EPSILON && block_max_val > -
GGUF_EPSILON) {
301 block_min_val = 0.0f;
304 const float d_super_scale_candidate = (block_max_val - block_min_val) /
Q4K_SCALE_FACTOR;
305 const float d_super =
307 const float min_super = block_min_val;
312 for (
int j = 0; j <
GGML_QK_K / 16; ++j) {
313 const float* sub_block_input = input + j * 16;
315 float sub_min_val = sub_block_input[0];
316 float sub_max_val = sub_block_input[0];
317 for (
int i = 1; i < 16; ++i) {
318 sub_min_val =
SAFE_MIN(sub_min_val, sub_block_input[i]);
319 sub_max_val =
SAFE_MAX(sub_max_val, sub_block_input[i]);
322 float ideal_scale = 0.0f;
326 float ideal_min = sub_min_val;
328 uint8_t best_scale_idx = 0;
329 float min_scale_err = std::numeric_limits<float>::max();
331 for (uint8_t k = 0; k < 16; ++k) {
333 float err = std::abs(candidate_scale - ideal_scale);
334 if (err < min_scale_err) {
341 uint8_t best_min_idx = 0;
342 float min_min_err = std::numeric_limits<float>::max();
344 for (uint8_t l = 0; l < 16; ++l) {
346 float err = std::abs(candidate_min - ideal_min);
347 if (err < min_min_err) {
353 int scale_byte_idx = j % 8;
354 int scale_shift = 4 * (j / 8);
355 output_qblock->
scales[scale_byte_idx] |= (best_scale_idx << scale_shift);
357 int min_byte_idx = (j % 4) + 8;
358 int min_shift = 4 * (j / 4);
359 output_qblock->
scales[min_byte_idx] |= (best_min_idx << min_shift);
362 float actual_min = min_super *
K_MIN_VALUES[best_min_idx];
364 ? 1.0f / actual_scale
367 uint8_t packed_qs[8];
369 std::memset(packed_qs, 0,
sizeof(packed_qs));
371 for (
int i = 0; i < 16; ++i) {
372 float val = sub_block_input[i];
375 if (inv_actual_scale != 0.0f) {
377 static_cast<int>(std::round((val - actual_min) * inv_actual_scale)) +
Q4K_OFFSET;
381 int byte_idx_qs = i / 2;
382 int shift_qs = (i % 2) * 4;
383 packed_qs[byte_idx_qs] |= (
static_cast<uint8_t
>(quant_val) << shift_qs);
386 uint8_t* qs_target = output_qblock->
qs + j * 8;
387 for (
int i = 0; i < 8; ++i) {
388 uint8_t low_nibble_val = packed_qs[i] & 0x0F;
389 uint8_t high_nibble_val = (packed_qs[i] >> 4) & 0x0F;
390 qs_target[i] = low_nibble_val | (high_nibble_val << 4);
396 int num_weights_in_block) {
398 throw std::invalid_argument(
399 "dequantize_q2_k currently only supports block size " +
408 const float d_float = (!std::isfinite(d_float_raw)) ? 0.0f : d_float_raw;
409 const float dmin_float =
410 (!std::isfinite(dmin_float_raw)) ? 0.0f : dmin_float_raw;
415 const uint8_t* scales_ptr = qblock->
scales;
416 const uint8_t* qs_ptr = qblock->
qs;
417 int weight_index = 0;
418 float dequantized_scales[16];
420 for (
int i = 0; i < 8; ++i) {
421 uint8_t packed_scales = scales_ptr[i];
422 uint8_t scale_low = packed_scales & 0x0F;
423 uint8_t scale_high = packed_scales >> 4;
425 dequantized_scales[i * 2 + 0] =
426 d_float_clamped *
static_cast<float>(scale_low);
427 dequantized_scales[i * 2 + 1] =
428 d_float_clamped *
static_cast<float>(scale_high);
430 dequantized_scales[i * 2 + 0] =
432 dequantized_scales[i * 2 + 1] =
437 for (
int j = 0; j <
GGML_QK_K / 16; ++j) {
438 float sub_block_scale = dequantized_scales[j];
440 const uint8_t* qs_subblock_ptr = qs_ptr + j * 4;
442 for (
int i = 0; i < 4; ++i) {
443 uint8_t packed_weights = qs_subblock_ptr[i];
445 uint8_t q0 = (packed_weights >> 0) & 0x03;
446 uint8_t q1 = (packed_weights >> 2) & 0x03;
447 uint8_t q2 = (packed_weights >> 4) & 0x03;
448 uint8_t q3 = (packed_weights >> 6) & 0x03;
451 sub_block_scale *
static_cast<float>(q0) + dmin_float_clamped;
453 sub_block_scale *
static_cast<float>(q1) + dmin_float_clamped;
455 sub_block_scale *
static_cast<float>(q2) + dmin_float_clamped;
457 sub_block_scale *
static_cast<float>(q3) + dmin_float_clamped;
464 output[weight_index++] = val0;
466 output[weight_index++] = val1;
468 output[weight_index++] = val2;
470 output[weight_index++] = val3;
477 int num_weights_in_block) {
479 throw std::invalid_argument(
480 "dequantize_q3_k currently only supports block size " +
489 const float d_float = (!std::isfinite(d_float_raw)) ? 0.0f : d_float_raw;
490 const float dmin_float =
491 (!std::isfinite(dmin_float_raw)) ? 0.0f : dmin_float_raw;
493 const uint8_t* hmask_ptr = qblock->
hmask;
494 const uint8_t* qs_ptr = qblock->
qs;
495 const uint8_t* scales_ptr = qblock->
scales;
497 int weight_index = 0;
499 for (
int j = 0; j <
GGML_QK_K / 16; ++j) {
503 scale_idx = scales_ptr[j] & 0x3F;
505 scale_idx = scales_ptr[j + 4] & 0x3F;
508 assert(scale_idx < 64 &&
"Scale index out of bounds for Q3_K lookup");
511 const float final_sub_block_scale = d_float * sub_block_scale_factor;
512 const float final_sub_block_min = dmin_float;
514 for (
int i = 0; i < 4; ++i) {
515 uint8_t qs_byte = qs_ptr[j * 4 + i];
516 uint8_t hmask_byte = hmask_ptr[j];
518 for (
int bit_pos = 0; bit_pos < 8; bit_pos += 2) {
519 uint8_t lower_bits = (qs_byte >> bit_pos) & 0x3;
521 int hmask_bit_idx = (i * 4) + (bit_pos / 2);
523 uint8_t high_bit = (hmask_byte >> hmask_bit_idx) & 0x1;
525 uint8_t q_val = (high_bit << 2) | lower_bits;
527 float val = final_sub_block_scale *
static_cast<float>(q_val) +
530 if (!std::isfinite(val)) {
534 output[weight_index++] = val;
540 std::cout <<
"ERROR: Processed " << weight_index <<
" weights instead of "
544 output[weight_index++] = 0.0f;
552 throw std::invalid_argument(
553 "quantize_q6_k currently only supports block size " +
559 uint8_t* ql = output_qblock->
ql;
560 uint8_t* qh = output_qblock->
qh;
561 int8_t* scales = output_qblock->
scales;
566 for (
int i = 0; i < num_elements; ++i) {
567 amax =
SAFE_MAX(amax, std::abs(input[i]));
573 for (
int sub = 0; sub <
GGML_QK_K / 16; ++sub) {
574 const float* sub_in = input + sub * 16;
576 float sub_amax = 0.0f;
577 for (
int i = 0; i < 16; ++i) {
578 sub_amax =
SAFE_MAX(sub_amax, std::abs(sub_in[i]));
581 int8_t scale = (d_float > 0.0f) ? std::round(sub_amax / d_float) : 1;
582 if (scale == 0) scale = 1;
585 for (
int i = 0; i < 16; ++i) {
586 float val = sub_in[i];
587 int q =
static_cast<int>(std::round(val / (d_float * scale))) +
Q6K_OFFSET;
590 int idx = sub * 16 + i;
591 int ql_idx = idx / 2;
592 int ql_shift = (idx % 2) * 4;
593 ql[ql_idx] |= (q & 0x0F) << ql_shift;
594 int qh_idx = idx / 4;
595 int qh_shift = (idx % 4) * 2;
596 qh[qh_idx] |= ((q >> 4) & 0x03) << qh_shift;
649 return sizeof(float);
651 return sizeof(uint16_t);
653 return sizeof(int8_t);
674 return sizeof(int16_t);
676 return sizeof(int32_t);
678 return sizeof(uint16_t);
681 std::cout <<
" UNKNOWN GGML TYPE: " <<
static_cast<int>(type)
683 throw std::invalid_argument(
"Unknown GGML type in ggml_type_size: " +
684 std::to_string(
static_cast<int>(type)));
711 std::cout <<
"Warning: Unknown GGMLType in ggml_type_block_size: "
712 <<
static_cast<int>(type) << std::endl;
720 const std::vector<float>& f_data) {
722 throw std::runtime_error(
723 "Input vector size must be a multiple of GGML_QK_K (" +
727 size_t num_blocks = f_data.size() /
GGML_QK_K;
728 std::vector<block_q8_K> q_data(num_blocks);
729 const float* x = f_data.data();
732 static std::atomic<int> log_count_q8k_quant_scales = 0;
734 for (
size_t i = 0; i < num_blocks; ++i) {
737 amax =
SAFE_MAX(amax, std::abs(x[j]));
741 const float id = (d_fp32 != 0.f) ? 1.0f / d_fp32 : 0.0f;
744 if (log_count_q8k_quant_scales < 10) {
745 std::stringstream q8k_scale_log_ss;
746 q8k_scale_log_ss <<
"[Q8K_QUANT_SCALES] Block #" << i
747 <<
" Input amax=" << amax <<
" -> d_fp32=" << d_fp32
748 <<
" -> Stored d_fp16=0x" << std::hex << y[i].
d
751 log_count_q8k_quant_scales++;
754 int16_t block_sum[16] = {0};
756 const float val_scaled = x[j] * id;
758 int8_t q_val =
static_cast<int8_t
>(
761 block_sum[j / 16] += q_val;
764 std::memcpy(y[i].bsums, block_sum,
sizeof(block_sum));
773 const std::vector<block_q8_K>& y_vec,
774 bool log_this_call) {
776 throw std::runtime_error(
"vec_dot_q6_k_q8_k: n must be multiple of QK_K");
779 if (x_vec.size() != nb || y_vec.size() != nb) {
780 throw std::runtime_error(
"vec_dot_q6_k_q8_k: vector block count mismatch");
790 std::memset(sums, 0, 8 *
sizeof(
float));
794 static std::atomic<int> log_count_dot = 0;
795 bool should_log_this_block = log_this_call && log_count_dot < 5;
797 for (
size_t i = 0; i < nb; ++i) {
798 const uint8_t* ql = x[i].
ql;
799 const uint8_t* qh = x[i].
qh;
800 const int8_t* q8 = y[i].
qs;
801 std::memset(aux32, 0, 8 *
sizeof(int32_t));
804 for (
int j = 0; j <
GGML_QK_K; j += 128) {
805 for (
int l = 0; l < 32; ++l) {
806 a[l + 0] =
static_cast<int8_t
>(
807 ((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32);
808 a[l + 32] =
static_cast<int8_t
>(
809 ((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32);
810 a[l + 64] =
static_cast<int8_t
>(
811 ((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32);
812 a[l + 96] =
static_cast<int8_t
>(
813 ((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
822 for (
int j = 0; j <
GGML_QK_K / 16; ++j) {
823 int scale = x[i].
scales[is++];
824 for (
int l = 0; l < 8; ++l)
825 aux16[l] =
static_cast<int16_t
>(q8[l]) *
static_cast<int16_t
>(a[l]);
826 for (
int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
829 for (
int l = 0; l < 8; ++l)
830 aux16[l] =
static_cast<int16_t
>(q8[l]) *
static_cast<int16_t
>(a[l]);
831 for (
int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
836 int32_t sumi_mins = 0;
837 for (
int j = 0; j <
GGML_QK_K / 16; ++j) {
838 sumi_mins +=
static_cast<int32_t
>(y[i].
bsums[j]) *
839 static_cast<int32_t
>(x[i].scales[j]);
844 const float d = d_q6 * d_q8;
846 float block_contribution = 0.0f;
847 for (
int l = 0; l < 8; ++l) {
848 float term = d * (aux32[l] - 32 * sumi_mins / 8);
850 block_contribution += term;
853 if (i == 0 && should_log_this_block) {
854 std::stringstream ss_log;
855 ss_log <<
"[DOT_Q6K_Q8K] Call #" << (log_count_dot.load() + 1)
859 ss_log <<
" Q6_K Scale d_q6: " << d_q6 <<
" (Raw FP16: 0x" << std::hex
860 << x[i].
d << std::dec <<
")";
863 ss_log <<
" Q8_K Scale d_q8: " << d_q8;
866 ss_log <<
" Combined Scale d: " << d;
869 ss_log <<
" Q6_K Sub-scales (int8): ";
870 for (
int k = 0; k < 16; ++k) ss_log << (
int)x[i].
scales[k] <<
" ";
873 ss_log <<
" Int32 Sums (aux32, before compensation): ";
874 for (
int l = 0; l < 8; ++l) ss_log << aux32[l] <<
" ";
877 ss_log <<
" Compensation term (sumi_mins): " << sumi_mins
878 <<
", -32 * sumi_mins: " << (-32 * sumi_mins);
881 ss_log <<
" Block #0 Contribution to Sums (after compensation): "
882 << block_contribution;
887 for (
int l = 0; l < 8; ++l) {
891 if (should_log_this_block) {
898 const std::vector<block_q8_K>& vec_q8k,
899 std::vector<float>& out_f32,
int rows,
int cols,
902 throw std::runtime_error(
903 "matvec_q6k_q8k_cpu: cols must be divisible by GGML_QK_K");
905 size_t blocks_per_row = cols /
GGML_QK_K;
906 if (mat_q6k.size() != (
size_t)rows * blocks_per_row) {
907 throw std::runtime_error(
"matvec_q6k_q8k_cpu: mat_q6k size mismatch");
909 if (vec_q8k.size() != blocks_per_row) {
910 throw std::runtime_error(
"matvec_q6k_q8k_cpu: vec_q8k size mismatch");
912 out_f32.resize(rows);
913 for (
int r = 0; r < rows; ++r) {
914 const std::vector<block_q6_K> row_q6k(
915 mat_q6k.begin() + r * blocks_per_row,
916 mat_q6k.begin() + (r + 1) * blocks_per_row);
923 const std::vector<block_q8_K>& y_vec,
924 bool log_this_call) {
926 if (log_count_now >= 5) log_this_call =
false;
929 throw std::runtime_error(
"vec_dot_q4_k_q8_k: n must be multiple of QK_K");
932 if (x_vec.size() != nb || y_vec.size() != nb) {
933 throw std::runtime_error(
"vec_dot_q4_k_q8_k: vector block count mismatch");
940 for (
size_t i = 0; i < nb; ++i) {
942 const uint8_t* q4 = x[i].
qs;
943 for (
int j = 0; j <
GGML_QK_K / 2; ++j) {
944 q4_vals[2 * j + 0] =
static_cast<int8_t
>(q4[j] & 0xF);
945 q4_vals[2 * j + 1] =
static_cast<int8_t
>(q4[j] >> 4);
948 const int8_t* q8 = y[i].
qs;
950 for (
int sub = 0; sub < 16; ++sub) {
951 uint8_t scale_idx, min_idx;
955 for (
int k = 0; k < 16; ++k) {
956 int idx = sub * 16 + k;
957 float q4_val =
static_cast<float>(q4_vals[idx]) - 8.0f;
958 float q8_val =
static_cast<float>(q8[idx]);
959 sumf += (scale * q4_val + minv) * q8_val;
963 if (i == 0 && log_this_call) {
964 std::stringstream ss;
969 ss <<
"[Q4K_Q8K] Block #0: Q8_K input (first 16): ";
970 for (
int k = 0; k < 16; ++k) ss << (
int)q8[k] <<
" ";
973 ss <<
"[Q4K_Q8K] Block #0: Q4_K unpacked (first 16): ";
974 for (
int k = 0; k < 16; ++k) ss << (
int)q4_vals[k] <<
" ";
983 const std::vector<block_q8_K>& vec_q8k,
984 std::vector<float>& out_f32,
int rows,
int cols,
987 throw std::runtime_error(
988 "matvec_q4k_q8k_cpu: cols must be divisible by GGML_QK_K");
990 size_t blocks_per_row = cols /
GGML_QK_K;
991 if (mat_q4k.size() != (
size_t)rows * blocks_per_row) {
992 throw std::runtime_error(
"matvec_q4k_q8k_cpu: mat_q4k size mismatch");
994 if (vec_q8k.size() != blocks_per_row) {
995 throw std::runtime_error(
"matvec_q4k_q8k_cpu: vec_q8k size mismatch");
997 out_f32.resize(rows);
999#pragma omp parallel for
1000 for (
int r = 0; r < rows; ++r) {
1001 const std::vector<block_q4_K> row_q4k(
1002 mat_q4k.begin() + r * blocks_per_row,
1003 mat_q4k.begin() + (r + 1) * blocks_per_row);
1010 std::vector<float>& x,
int n,
bool log_this_block) {
1013 <<
"Error: n must be a multiple of GGML_QK_K for Q8_K dequantization."
1018 if (q_data.size() < num_blocks) {
1019 std::cerr <<
"Error: Not enough Q8_K blocks provided for dequantization."
1024 static std::atomic<int> log_count_q8k_dequant_scales = 0;
1026 for (
size_t i = 0; i < num_blocks; ++i) {
1032 if (log_this_block && log_count_q8k_dequant_scales < 10) {
1033 std::stringstream scale_log_ss;
1034 scale_log_ss <<
"[Q8K_DEQUANT_SCALES] Block #"
1035 << (log_count_q8k_dequant_scales.load()) <<
" Raw_d_fp16=0x"
1036 << std::hex << qblock->
d << std::dec <<
" -> d=" << d;
1038 log_count_q8k_dequant_scales++;
1042 x_block[j] = d *
static_cast<float>(qblock->
qs[j]);
1050 output[i] = d_fp32 *
static_cast<float>(qblock->
qs[i]);
1055 std::vector<float>& f32_weights,
1056 size_t total_num_elements,
1057 int log_first_n_blocks) {
1058 if (q_weights.empty()) {
1059 Logger::warning(
"[DEQUANT_VEC_Q6K] Input Q6_K weight vector is empty. Output float vector will be empty.");
1060 f32_weights.clear();
1064 f32_weights.resize(total_num_elements);
1067 if (q_weights.size() != expected_blocks) {
1068 Logger::error(
"[DEQUANT_VEC_Q6K] Mismatch in Q6_K block count. Expected: " +
1069 std::to_string(expected_blocks) +
", Got: " + std::to_string(q_weights.size()) +
1070 ". Total elements: " + std::to_string(total_num_elements));
1074 float* current_output_ptr = f32_weights.data();
1075 size_t elements_processed = 0;
1077 for (
size_t i = 0; i < q_weights.size(); ++i) {
1078 const block_q6_K* current_block_ptr = &q_weights[i];
1081 if (elements_processed +
GGML_QK_K > total_num_elements) {
1082 elements_in_this_block = total_num_elements - elements_processed;
1085 if (elements_in_this_block <= 0) {
1086 Logger::warning(
"[DEQUANT_VEC_Q6K] Zero or negative elements requested for block " + std::to_string(i) +
". Skipping.");
1090 bool log_this_specific_block = (log_first_n_blocks > 0 &&
static_cast<int>(i) < log_first_n_blocks);
1093 static std::atomic<bool> first_call_ever{
true};
1094 bool is_first_call = first_call_ever.exchange(
false);
1096 dequantize_q6_k(current_block_ptr, current_output_ptr, elements_in_this_block, log_this_specific_block);
1098 current_output_ptr += elements_in_this_block;
1099 elements_processed += elements_in_this_block;
1103 if (elements_processed != total_num_elements) {
1104 Logger::warning(
"[DEQUANT_VEC_Q6K] Processed " + std::to_string(elements_processed) +
1105 " elements, but expected " + std::to_string(total_num_elements) +
".");
1110 std::vector<float>& f32_weights,
1111 size_t total_num_elements,
1112 int log_first_n_blocks) {
1113 if (q_weights.empty()) {
1114 Logger::warning(
"[DEQUANT_VEC_Q4K] Input Q4_K weight vector is empty. Output float vector will be empty.");
1115 f32_weights.clear();
1119 f32_weights.resize(total_num_elements);
1122 if (q_weights.size() != expected_blocks) {
1123 Logger::error(
"[DEQUANT_VEC_Q4K] Mismatch in Q4_K block count. Expected: " +
1124 std::to_string(expected_blocks) +
", Got: " + std::to_string(q_weights.size()) +
1125 ". Total elements: " + std::to_string(total_num_elements));
1128 float* current_output_ptr = f32_weights.data();
1129 size_t elements_processed = 0;
1131 for (
size_t i = 0; i < q_weights.size(); ++i) {
1132 const block_q4_K* current_block_ptr = &q_weights[i];
1135 if (elements_processed +
GGML_QK_K > total_num_elements) {
1136 elements_in_this_block = total_num_elements - elements_processed;
1139 if (elements_in_this_block <= 0) {
1140 Logger::warning(
"[DEQUANT_VEC_Q4K] Zero or negative elements requested for block " + std::to_string(i) +
". Skipping.");
1144 bool log_this_specific_block = (log_first_n_blocks > 0 &&
static_cast<int>(i) < log_first_n_blocks);
1147 static std::atomic<bool> first_call_ever{
true};
1148 bool is_first_call = first_call_ever.exchange(
false);
1151 dequantize_q4_k_m(current_block_ptr, current_output_ptr, elements_in_this_block, log_this_specific_block);
1154 current_output_ptr += elements_in_this_block;
1155 elements_processed += elements_in_this_block;
1159 if (elements_processed != total_num_elements) {
1160 Logger::warning(
"[DEQUANT_VEC_Q4K] Processed " + std::to_string(elements_processed) +
1161 " elements, but expected " + std::to_string(total_num_elements) +
".");
1166 std::vector<float>& f32_weights,
1167 size_t total_num_elements,
1168 int log_first_n_blocks) {
1169 if (q_weights.empty()) {
1170 Logger::warning(
"[DEQUANT_VEC_Q8_0] Input Q8_0 weight vector is empty. Output float vector will be empty.");
1171 f32_weights.clear();
1175 f32_weights.resize(total_num_elements);
1179 if (q_weights.size() != expected_blocks) {
1180 Logger::error(
"[DEQUANT_VEC_Q8_0] Mismatch in Q8_0 block count. Expected: " +
1181 std::to_string(expected_blocks) +
", Got: " + std::to_string(q_weights.size()) +
1182 ". Total elements: " + std::to_string(total_num_elements));
1185 float* current_output_ptr = f32_weights.data();
1186 size_t elements_processed = 0;
1188 for (
size_t i = 0; i < q_weights.size(); ++i) {
1189 const block_q8_0* current_block_ptr = &q_weights[i];
1192 if (elements_processed +
GGML_QK8_0 > total_num_elements) {
1193 elements_in_this_block = total_num_elements - elements_processed;
1196 if (elements_in_this_block <= 0) {
1197 Logger::warning(
"[DEQUANT_VEC_Q8_0] Zero or negative elements requested for block " + std::to_string(i) +
". Skipping.");
1201 bool log_this_specific_block = (log_first_n_blocks > 0 &&
static_cast<int>(i) < log_first_n_blocks);
1203 static std::atomic<bool> first_call_ever{
true};
1204 bool is_first_call = first_call_ever.exchange(
false);
1212 std::memcpy(current_output_ptr, temp_block, elements_in_this_block *
sizeof(
float));
1215 current_output_ptr += elements_in_this_block;
1216 elements_processed += elements_in_this_block;
1219 if (elements_processed != total_num_elements) {
1220 Logger::warning(
"[DEQUANT_VEC_Q8_0] Processed " + std::to_string(elements_processed) +
1221 " elements, but expected " + std::to_string(total_num_elements) +
".");
static void debug(const std::string &message)
static void warning(const std::string &message)
static void error(const std::string &message)
GGMLType
Enumeration of GGML tensor data types.
Parser for GGUF (GPT-Generated Unified Format) files.
constexpr float TENSOR_SCALE_MAX
Constants for tensor value validation.
constexpr int8_t Q4K_OFFSET
Offset values for quantization methods.
constexpr int8_t Q6K_OFFSET
constexpr float Q8K_SCALE_FACTOR
constexpr float Q4K_SCALE_FACTOR
Scale factors for different quantization methods.
constexpr float TENSOR_SCALE_MIN
constexpr float Q6K_SCALE_FACTOR
constexpr float GGUF_EPSILON
Constants for numeric stability in calculations.
constexpr size_t GGML_QK8_0
constexpr size_t GGML_QK_K
Block size constants for different quantization formats.
constexpr float GGUF_SMALL_VAL
Logging utilities for the TinyLlama implementation.
uint16_t fp32_to_fp16(float f)
Converts a 32-bit float to 16-bit floating point.
void dequantize_q3_k(const void *qblock_void, float *output, int num_weights_in_block)
Dequantizes a Q3_K quantized block to float32.
size_t ggml_type_block_size(GGMLType type)
Gets the block size for a GGML type.
void dequantize_q4_k_m(const block_q4_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
static void get_scale_min_k4(int j, const uint8_t *q, uint8_t *d_val, uint8_t *m_val)
size_t ggml_type_size(GGMLType type)
Gets the size in bytes of a GGML type.
void matvec_q6k_q8k_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
Computes matrix-vector product between Q6_K matrix and Q8_K vector on CPU.
const char * ggml_type_name(GGMLType type)
Gets the string name of a GGML type.
void matvec_q4k_q8k_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
Computes matrix-vector product between Q4_K matrix and Q8_K vector on CPU.
float vec_dot_q6_k_q8_k_cpu(int n, const std::vector< block_q6_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)
Computes dot product between Q6_K and Q8_K vectors on CPU.
void handle_i8_tensor(const void *input_data, float *output_data, size_t num_elements)
Handles conversion of int8 tensor data to float32.
void quantize_q4_k_m(const float *input, void *output_qblock_void, int num_elements)
Quantizes float32 data to Q4_K format.
static void get_scale_min_indices_q4_K(int j, const uint8_t *scales, uint8_t *scale_index, uint8_t *min_index)
void dequantize_vector_q6k_to_f32(const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q6_K blocks to a vector of float32.
void dequantize_q2_k(const void *qblock_void, float *output, int num_weights_in_block)
std::vector< block_q8_K > quantize_fp32_to_q8_K(const std::vector< float > &f_data)
Quantizes float32 data to Q8_K format.
float fp16_to_fp32(uint16_t h, bool is_gguf_scale_field)
Converts a 16-bit floating point number to 32-bit float.
static std::atomic< int > g_vec_dot_q4_k_q8_k_log_count
void dequantize_vector_q8_0_to_f32(const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q8_0 blocks to a vector of float32.
void quantize_q6_k(const float *input, void *output_qblock_void, int num_elements)
Quantizes float32 data to Q6_K format.
float vec_dot_q4_k_q8_k_cpu(int n, const std::vector< block_q4_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)
Computes dot product between Q4_K and Q8_K vectors on CPU.
constexpr float K_MIN_VALUES[64]
void dequantize_vector_q4k_to_f32(const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q4_K blocks to a vector of float32.
void dequantize_q8_0_block(const block_q8_0 *qblock, float *output)
Dequantizes a Q8_0 block to float32.
void dequantize_q8_k(const std::vector< block_q8_K > &q_data, std::vector< float > &x, int n, bool log_this_block)
constexpr float K_SCALE_VALUES[64]
void dequantize_q6_k(const block_q6_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
Weight quantization structures and functions for model compression.
2-bit K-quantized block structure
uint8_t scales[GGML_QK_K/16]
3-bit K-quantized block structure
uint8_t hmask[GGML_QK_K/8]
4-bit K-quantized block structure
6-bit K-quantized block structure
int8_t scales[GGML_QK_K/16]
Simple 8-bit quantized block structure.
8-bit K-quantized block structure with block sums
int16_t bsums[GGML_QK_K/16]