TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
quantization.h File Reference

Weight quantization structures and functions for model compression.

#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>
#include "ggml_types.h"
#include "gguf_parser.h"

Classes

struct block_q4_K
    4-bit K-quantized block structure.

struct block_q6_K
    6-bit K-quantized block structure.

struct block_q2_K
    2-bit K-quantized block structure.

struct block_q3_K
    3-bit K-quantized block structure.

struct block_q8_K
    8-bit K-quantized block structure with block sums.

struct block_q8_0
    Simple 8-bit quantized block structure.

Macros

#define RESTRICT   __restrict__
 

Functions

float fp16_to_fp32 (uint16_t h, bool is_gguf_scale_field=false)
 Converts a 16-bit floating point number to 32-bit float.
 
uint16_t fp32_to_fp16 (float f)
 Converts a 32-bit float to 16-bit floating point.
 
const char * ggml_type_name (GGMLType type)
 Gets the string name of a GGML type.
 
size_t ggml_type_size (GGMLType type)
 Gets the size in bytes of a GGML type.
 
size_t ggml_type_block_size (GGMLType type)
 Gets the block size for a GGML type.
 
void dequantize_q2_k (const void *q_data, float *f_data, int num_weights_in_block, bool log_details_for_this_block=false)
 Dequantizes a Q2_K quantized block to float32.
 
void dequantize_q4_k_m (const block_q4_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block=false)
 Dequantizes a Q4_K quantized block to float32.
 
void dequantize_q6_k (const block_q6_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block=false)
 Dequantizes a Q6_K quantized block to float32.
 
void dequantize_vector_q6k_to_f32 (const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks=0)
 Dequantizes a vector of Q6_K blocks to a vector of float32.
 
void dequantize_q3_k (const void *q_data, float *f_data, int num_weights_in_block)
 Dequantizes a Q3_K quantized block to float32.
 
void handle_i8_tensor (const void *i8_data, float *f_data, size_t num_elements)
 Handles conversion of int8 tensor data to float32.
 
void quantize_q4_k_m (const float *f_data, void *q_data, int num_elements)
 Quantizes float32 data to Q4_K format.
 
void quantize_q6_k (const float *f_data, void *q_data, int num_elements)
 Quantizes float32 data to Q6_K format.
 
std::vector< block_q8_K > quantize_fp32_to_q8_K (const std::vector< float > &f_data)
 Quantizes float32 data to Q8_K format.
 
float vec_dot_q6_k_q8_k_cpu (int n, const std::vector< block_q6_K > &x, const std::vector< block_q8_K > &y, bool log_this_call)
 Computes dot product between Q6_K and Q8_K vectors on CPU.
 
void matvec_q6k_q8k_cpu (const std::vector< block_q6_K > &mat_q6k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
 Computes matrix-vector product between Q6_K matrix and Q8_K vector on CPU.
 
float vec_dot_q4_k_q8_k_cpu (int n, const std::vector< block_q4_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)
 Computes dot product between Q4_K and Q8_K vectors on CPU.
 
void matvec_q4k_q8k_cpu (const std::vector< block_q4_K > &mat_q4k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
 Computes matrix-vector product between Q4_K matrix and Q8_K vector on CPU.
 
void dequantize_q8_0_block (const block_q8_0 *qblock, float *output)
 Dequantizes a Q8_0 block to float32.
 
void dequantize_vector_q4k_to_f32 (const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks=0)
 Dequantizes a vector of Q4_K blocks to a vector of float32.
 
void dequantize_vector_q8_0_to_f32 (const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks=0)
 Dequantizes a vector of Q8_0 blocks to a vector of float32.
 

Detailed Description

Weight quantization structures and functions for model compression.

This file contains the definitions for various quantization formats and functions to convert between them. The quantization methods include Q2_K, Q3_K, Q4_K, Q6_K, and Q8_0 formats, each offering different compression ratios and precision tradeoffs.

Definition in file quantization.h.
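
For orientation, here is a minimal usage sketch, assuming a Q6_K weight tensor has already been parsed out of a GGUF file into a vector of blocks (the shapes and names are illustrative, not taken from the model code):

#include <vector>
#include "quantization.h"

// Two ways to consume a quantized weight matrix of rows x cols elements.
void usage_sketch(const std::vector<block_q6_K>& w_q6k,
                  const std::vector<float>& x_f32,  // activations, cols elements
                  int rows, int cols) {
  // Path 1: dequantize the whole tensor to float32 up front.
  std::vector<float> w_f32;
  dequantize_vector_q6k_to_f32(w_q6k, w_f32,
                               static_cast<size_t>(rows) * cols);

  // Path 2: stay quantized. Quantize the activations to Q8_K and use
  // the integer matvec kernel (cols must be a multiple of GGML_QK_K).
  std::vector<block_q8_K> x_q8k = quantize_fp32_to_q8_K(x_f32);
  std::vector<float> y_f32;
  matvec_q6k_q8k_cpu(w_q6k, x_q8k, y_f32, rows, cols, /*log_calls=*/false);
}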

Macro Definition Documentation

◆ RESTRICT

#define RESTRICT   __restrict__

Expands to the compiler's __restrict__ qualifier, telling the optimizer that the annotated pointer does not alias other pointers in the same scope.

Definition at line 15 of file quantization.h.

Function Documentation

◆ dequantize_q2_k()

void dequantize_q2_k (const void *q_data, float *f_data, int num_weights_in_block, bool log_details_for_this_block = false)

Dequantizes a Q2_K quantized block to float32.

Parameters
q_data: Pointer to quantized data
f_data: Output float array
num_weights_in_block: Number of weights to dequantize
log_details_for_this_block: Whether to log dequantization details

◆ dequantize_q3_k()

void dequantize_q3_k (const void *q_data, float *f_data, int num_weights_in_block)

Dequantizes a Q3_K quantized block to float32.

Parameters
q_data: Pointer to quantized data
f_data: Output float array
num_weights_in_block: Number of weights to dequantize

Definition at line 476 of file quantization.cpp.

void dequantize_q3_k(const void* q_data, float* f_data, int num_weights_in_block) {
  if (num_weights_in_block != GGML_QK_K) {
    throw std::invalid_argument(
        "dequantize_q3_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  const block_q3_K* qblock = static_cast<const block_q3_K*>(q_data);

  const float d_float_raw = fp16_to_fp32(qblock->d);
  const float dmin_float_raw = fp16_to_fp32(qblock->dmin);

  // Replace non-finite scales with zero so one bad block cannot poison the output.
  const float d_float = (!std::isfinite(d_float_raw)) ? 0.0f : d_float_raw;
  const float dmin_float =
      (!std::isfinite(dmin_float_raw)) ? 0.0f : dmin_float_raw;

  const uint8_t* hmask_ptr = qblock->hmask;
  const uint8_t* qs_ptr = qblock->qs;
  const uint8_t* scales_ptr = qblock->scales;

  int weight_index = 0;

  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    uint8_t scale_idx;

    if (j < 8) {
      scale_idx = scales_ptr[j] & 0x3F;
    } else {
      scale_idx = scales_ptr[j + 4] & 0x3F;
    }

    assert(scale_idx < 64 && "Scale index out of bounds for Q3_K lookup");
    const float sub_block_scale_factor = K_SCALE_VALUES[scale_idx];

    const float final_sub_block_scale = d_float * sub_block_scale_factor;
    const float final_sub_block_min = dmin_float;

    for (int i = 0; i < 4; ++i) {
      uint8_t qs_byte = qs_ptr[j * 4 + i];
      uint8_t hmask_byte = hmask_ptr[j];

      // Each byte holds four 2-bit low parts; the high bit comes from hmask.
      for (int bit_pos = 0; bit_pos < 8; bit_pos += 2) {
        uint8_t lower_bits = (qs_byte >> bit_pos) & 0x3;
        int hmask_bit_idx = (i * 4) + (bit_pos / 2);
        uint8_t high_bit = (hmask_byte >> hmask_bit_idx) & 0x1;
        uint8_t q_val = (high_bit << 2) | lower_bits;

        float val = final_sub_block_scale * static_cast<float>(q_val) +
                    final_sub_block_min;

        if (!std::isfinite(val)) {
          val = 0.0f;
        }

        f_data[weight_index++] = val;
      }
    }
  }

  if (weight_index != GGML_QK_K) {
    std::cout << "ERROR: Processed " << weight_index << " weights instead of "
              << GGML_QK_K << std::endl;

    while (weight_index < GGML_QK_K) {
      f_data[weight_index++] = 0.0f;
    }
  }
}

References block_q3_K::d, block_q3_K::dmin, fp16_to_fp32(), GGML_QK_K, block_q3_K::hmask, K_SCALE_VALUES, block_q3_K::qs, and block_q3_K::scales.

◆ dequantize_q4_k_m()

void dequantize_q4_k_m (const block_q4_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block = false)

Dequantizes a Q4_K quantized block to float32.

Parameters
qblock: Pointer to Q4_K block
output_f32: Output float array
num_elements: Number of elements to dequantize
log_this_block: Whether to log dequantization details

◆ dequantize_q6_k()

void dequantize_q6_k (const block_q6_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block = false)

Dequantizes a Q6_K quantized block to float32.

Parameters
qblock: Pointer to Q6_K block
output_f32: Output float array
num_elements: Number of elements to dequantize
log_this_block: Whether to log dequantization details

◆ dequantize_q8_0_block()

void dequantize_q8_0_block (const block_q8_0 *qblock, float *output)

Dequantizes a Q8_0 block to float32.

Parameters
qblock: Pointer to Q8_0 block
output: Output float array

Definition at line 1047 of file quantization.cpp.

void dequantize_q8_0_block(const block_q8_0* qblock, float* output) {
  const float d_fp32 = fp16_to_fp32(qblock->d, true);
  for (int i = 0; i < GGML_QK8_0; ++i) {
    output[i] = d_fp32 * static_cast<float>(qblock->qs[i]);
  }
}

References block_q8_0::d, fp16_to_fp32(), GGML_QK8_0, and block_q8_0::qs.

Referenced by dequantize_vector_q8_0_to_f32(), TinyLlamaModel::initialize_gpu_and_rope(), TinyLlamaModel::lookup_embedding(), and matvec_q8_0_f32_vector_cpu().
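
A by-hand sketch of one Q8_0 block, with made-up values; it assumes GGML_QK8_0 == 32 as declared in gguf_parser.h and that block_q8_0 is a plain aggregate:

#include <cstdio>
#include "quantization.h"

int main() {
  block_q8_0 blk{};
  blk.d = fp32_to_fp16(0.05f);  // per-block scale, stored as fp16
  for (int i = 0; i < (int)GGML_QK8_0; ++i)
    blk.qs[i] = static_cast<int8_t>(i - 16);  // arbitrary int8 payload

  float out[GGML_QK8_0];
  dequantize_q8_0_block(&blk, out);

  // Each output is d * qs[i]; e.g. out[0] is roughly 0.05 * -16 = -0.8.
  std::printf("out[0] = %f\n", out[0]);
}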

◆ dequantize_vector_q4k_to_f32()

void dequantize_vector_q4k_to_f32 (const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks = 0)

Dequantizes a vector of Q4_K blocks to a vector of float32.

Parameters
q_weights: Input vector of Q4_K blocks
f32_weights: Output vector of float32 values (will be resized)
total_num_elements: Total number of float elements expected after dequantization
log_first_n_blocks: Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1109 of file quantization.cpp.

void dequantize_vector_q4k_to_f32(const std::vector<block_q4_K>& q_weights,
                                  std::vector<float>& f32_weights,
                                  size_t total_num_elements,
                                  int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q4K] Input Q4_K weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);
  size_t expected_blocks = (total_num_elements + GGML_QK_K - 1) / GGML_QK_K;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q4K] Mismatch in Q4_K block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q4_K* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK_K;

    // The final block may be partial.
    if (elements_processed + GGML_QK_K > total_num_elements) {
      elements_in_this_block = static_cast<int>(total_num_elements - elements_processed);
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q4K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    bool log_this_specific_block = (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

    // Call the Q4_K specific single-block dequantizer.
    dequantize_q4_k_m(current_block_ptr, current_output_ptr, elements_in_this_block, log_this_specific_block);

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q4K] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

References dequantize_q4_k_m(), Logger::error(), GGML_QK_K, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().
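
Typical call pattern, a sketch assuming the element count comes from the tensor's GGUF metadata:

#include <vector>
#include "quantization.h"

// Expand a whole Q4_K tensor to float32; f32_weights is resized by the callee.
std::vector<float> q4k_tensor_to_f32(const std::vector<block_q4_K>& q_weights,
                                     size_t total_elements) {
  std::vector<float> f32_weights;
  dequantize_vector_q4k_to_f32(q_weights, f32_weights, total_elements);
  return f32_weights;
}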

◆ dequantize_vector_q6k_to_f32()

void dequantize_vector_q6k_to_f32 (const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks = 0)

Dequantizes a vector of Q6_K blocks to a vector of float32.

Parameters
q_weights: Input vector of Q6_K blocks
f32_weights: Output vector of float32 values (will be resized)
total_num_elements: Total number of float elements expected after dequantization
log_first_n_blocks: Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1054 of file quantization.cpp.

void dequantize_vector_q6k_to_f32(const std::vector<block_q6_K>& q_weights,
                                  std::vector<float>& f32_weights,
                                  size_t total_num_elements,
                                  int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q6K] Input Q6_K weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);
  size_t expected_blocks = (total_num_elements + GGML_QK_K - 1) / GGML_QK_K;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q6K] Mismatch in Q6_K block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q6_K* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK_K;

    // The final block may be partial.
    if (elements_processed + GGML_QK_K > total_num_elements) {
      elements_in_this_block = static_cast<int>(total_num_elements - elements_processed);
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q6K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    bool log_this_specific_block = (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

    dequantize_q6_k(current_block_ptr, current_output_ptr, elements_in_this_block, log_this_specific_block);

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q6K] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

References dequantize_q6_k(), Logger::error(), GGML_QK_K, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().

◆ dequantize_vector_q8_0_to_f32()

void dequantize_vector_q8_0_to_f32 (const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks = 0)

Dequantizes a vector of Q8_0 blocks to a vector of float32.

Parameters
q_weights: Input vector of Q8_0 blocks
f32_weights: Output vector of float32 values (will be resized)
total_num_elements: Total number of float elements expected after dequantization
log_first_n_blocks: Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1165 of file quantization.cpp.

void dequantize_vector_q8_0_to_f32(const std::vector<block_q8_0>& q_weights,
                                   std::vector<float>& f32_weights,
                                   size_t total_num_elements,
                                   int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q8_0] Input Q8_0 weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);

  size_t expected_blocks = (total_num_elements + GGML_QK8_0 - 1) / GGML_QK8_0;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q8_0] Mismatch in Q8_0 block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q8_0* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK8_0;

    // The final block may be partial.
    if (elements_processed + GGML_QK8_0 > total_num_elements) {
      elements_in_this_block = static_cast<int>(total_num_elements - elements_processed);
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q8_0] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    if (elements_in_this_block == GGML_QK8_0) {
      dequantize_q8_0_block(current_block_ptr, current_output_ptr);
    } else {
      // Partial final block: dequantize into scratch, then copy what is needed.
      float temp_block[GGML_QK8_0];
      dequantize_q8_0_block(current_block_ptr, temp_block);
      std::memcpy(current_output_ptr, temp_block, elements_in_this_block * sizeof(float));
    }

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q8_0] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

References dequantize_q8_0_block(), Logger::error(), GGML_QK8_0, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().

◆ fp16_to_fp32()

float fp16_to_fp32 (uint16_t h, bool is_gguf_scale_field = false)

Converts a 16-bit floating point number to 32-bit float.

Parameters
h: The 16-bit float value to convert
is_gguf_scale_field: Whether this value is from a GGUF scale field
Returns
The converted 32-bit float value

Definition at line 47 of file quantization.cpp.

float fp16_to_fp32(uint16_t h, bool is_gguf_scale_field) {
  uint32_t sign = (h >> 15) & 1;
  uint32_t exp_fp16 = (h >> 10) & 0x1f;
  uint32_t mant_fp16 = h & 0x3ff;
  uint32_t x;

  if (exp_fp16 == 0) {
    if (mant_fp16 == 0) {
      // Signed zero.
      x = (sign << 31);
    } else {
      // Subnormal: renormalize the mantissa before rebiasing.
      exp_fp16 = 1;
      while ((mant_fp16 & 0x400) == 0) {
        mant_fp16 <<= 1;
        exp_fp16--;
      }
      mant_fp16 &= ~0x400;
      uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
      uint32_t mant_fp32 = mant_fp16 << 13;
      x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
    }
  } else if (exp_fp16 == 0x1f) {
    // Inf / NaN.
    x = (sign << 31) | (0xff << 23) | (mant_fp16 << 13);
  } else {
    // Normal number: rebias the exponent from 15 to 127.
    uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
    uint32_t mant_fp32 = mant_fp16 << 13;
    x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
  }

  float f;
  std::memcpy(&f, &x, sizeof(float));

  // GGUF scale fields are expected to be non-negative; flip a stray sign bit.
  if (is_gguf_scale_field && f < 0.0f && !(std::isnan(f) || std::isinf(f))) {
    f = std::abs(f);
  }

  return f;
}

Referenced by dequantize_q2_k(), dequantize_q3_k(), dequantize_q4_k_m(), dequantize_q6_k(), dequantize_q8_0_block(), dequantize_q8_k(), vec_dot_q4_k_q8_k_cpu(), and vec_dot_q6_k_q8_k_cpu().

◆ fp32_to_fp16()

uint16_t fp32_to_fp16 (float f)

Converts a 32-bit float to 16-bit floating point.

Parameters
f: The 32-bit float value to convert
Returns
The converted 16-bit float value

Definition at line 92 of file quantization.cpp.

uint16_t fp32_to_fp16(float f) {
  uint32_t x;
  std::memcpy(&x, &f, sizeof(float));

  uint32_t sign = (x >> 31) & 1;
  uint32_t exp_fp32 = (x >> 23) & 0xff;
  uint32_t mant_fp32 = x & 0x7fffff;

  uint16_t u;

  if (exp_fp32 == 0xff) {
    // Inf / NaN (a NaN keeps a nonzero mantissa bit).
    u = (sign << 15) | 0x7c00 | (mant_fp32 != 0 ? 0x200 : 0);
  } else {
    int exp_fp16 = (int)exp_fp32 - 127 + 15;

    if (exp_fp16 >= 0x1f) {
      // Overflow: saturate to infinity.
      u = (sign << 15) | 0x7c00;
    } else if (exp_fp16 <= 0) {
      if (exp_fp16 < -10) {
        // Too small even for a subnormal: flush to signed zero.
        u = (sign << 15);
      } else {
        // Subnormal: shift the implicit leading bit into the mantissa.
        mant_fp32 = (mant_fp32 | 0x800000) >> (1 - exp_fp16);
        if ((mant_fp32 >> 13) & 1) {
          mant_fp32 += (1 << 13);  // round to nearest
        }
        u = (sign << 15) | (mant_fp32 >> 13);
      }
    } else {
      // Normal number: round, handling mantissa carry into the exponent.
      if ((mant_fp32 >> 13) & 1) {
        mant_fp32 += (1 << 13);
        if ((mant_fp32 >> 23) == 1) {
          mant_fp32 = 0;
          exp_fp16++;
          if (exp_fp16 >= 0x1f) {
            u = (sign << 15) | 0x7c00;
            return u;
          }
        }
      }
      u = (sign << 15) | (exp_fp16 << 10) | (mant_fp32 >> 13);
    }
  }
  return u;
}

Referenced by quantize_fp32_to_q8_K(), quantize_q4_k_m(), and quantize_q6_k().
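
A quick sanity sketch of the two converters; the bit patterns follow IEEE 754 half precision, and values exactly representable in fp16 survive a round trip:

#include <cassert>
#include "quantization.h"

int main() {
  assert(fp16_to_fp32(0x3C00) == 1.0f);   // exponent 15, mantissa 0
  assert(fp16_to_fp32(0xC000) == -2.0f);  // sign bit set, exponent 16

  uint16_t h = fp32_to_fp16(0.5f);
  assert(fp16_to_fp32(h) == 0.5f);
}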

◆ ggml_type_block_size()

size_t ggml_type_block_size (GGMLType type)

Gets the block size for a GGML type.

Parameters
type: The GGML type
Returns
Block size in elements

Definition at line 688 of file quantization.cpp.

size_t ggml_type_block_size(GGMLType type) {
  switch (type) {
    case GGML_TYPE_Q2_K:
    case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q4_K:
    case GGML_TYPE_Q6_K:
      return GGML_QK_K;

    case GGML_TYPE_Q4_0:
    case GGML_TYPE_Q8_0:
      return 32;

    case GGML_TYPE_F32:
    case GGML_TYPE_F16:
    case GGML_TYPE_BF16:
    case GGML_TYPE_I8:
    case GGML_TYPE_I16:
    case GGML_TYPE_I32:
      return 1;

    default:
      std::cout << "Warning: Unknown GGMLType in ggml_type_block_size: "
                << static_cast<int>(type) << std::endl;
      return 0;
  }
}

References GGML_QK_K, GGML_TYPE_BF16, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, and GGML_TYPE_Q8_0.

Referenced by load_gguf_meta().

◆ ggml_type_name()

const char * ggml_type_name (GGMLType type)

Gets the string name of a GGML type.

Parameters
type: The GGML type
Returns
String representation of the type

Definition at line 601 of file quantization.cpp.

const char* ggml_type_name(GGMLType type) {
  switch (type) {
    case GGML_TYPE_F32:   return "F32";
    case GGML_TYPE_F16:   return "F16";
    case GGML_TYPE_Q4_0:  return "Q4_0";
    case GGML_TYPE_Q4_1:  return "Q4_1";
    case GGML_TYPE_Q5_0:  return "Q5_0";
    case GGML_TYPE_Q5_1:  return "Q5_1";
    case GGML_TYPE_Q8_0:  return "Q8_0";
    case GGML_TYPE_Q8_1:  return "Q8_1";
    case GGML_TYPE_Q2_K:  return "Q2_K";
    case GGML_TYPE_Q3_K:  return "Q3_K";
    case GGML_TYPE_Q4_K:  return "Q4_K";
    case GGML_TYPE_Q5_K:  return "Q5_K";
    case GGML_TYPE_Q6_K:  return "Q6_K";
    case GGML_TYPE_Q8_K:  return "Q8_K";
    case GGML_TYPE_I8:    return "I8";
    case GGML_TYPE_I16:   return "I16";
    case GGML_TYPE_I32:   return "I32";
    case GGML_TYPE_BF16:  return "BF16";
    case GGML_TYPE_COUNT: return "COUNT";
    default:              return "Unknown";
  }
}

References GGML_TYPE_BF16, GGML_TYPE_COUNT, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, GGML_TYPE_Q8_1, and GGML_TYPE_Q8_K.

Referenced by load_gguf_meta().

◆ ggml_type_size()

size_t ggml_type_size (GGMLType type)

Gets the size in bytes of a GGML type.

Parameters
type: The GGML type
Returns
Size in bytes

Definition at line 646 of file quantization.cpp.

size_t ggml_type_size(GGMLType type) {
  switch (type) {
    case GGML_TYPE_F32:  return sizeof(float);
    case GGML_TYPE_F16:  return sizeof(uint16_t);
    case GGML_TYPE_I8:   return sizeof(int8_t);
    case GGML_TYPE_Q4_K: return sizeof(block_q4_K);
    case GGML_TYPE_Q2_K: return sizeof(block_q2_K);
    case GGML_TYPE_Q3_K: return sizeof(block_q3_K);
    case GGML_TYPE_Q6_K: return sizeof(block_q6_K);
    case GGML_TYPE_Q4_0: return 18;
    case GGML_TYPE_Q8_0: return 34;
    case GGML_TYPE_Q8_1: return 40;
    case GGML_TYPE_Q5_K: return 116;
    case GGML_TYPE_Q8_K: return 290;
    case GGML_TYPE_I16:  return sizeof(int16_t);
    case GGML_TYPE_I32:  return sizeof(int32_t);
    case GGML_TYPE_BF16: return sizeof(uint16_t);
    case GGML_TYPE_COUNT:
    default:
      std::cout << " UNKNOWN GGML TYPE: " << static_cast<int>(type)
                << std::endl;
      throw std::invalid_argument("Unknown GGML type in ggml_type_size: " +
                                  std::to_string(static_cast<int>(type)));
  }
}

References GGML_TYPE_BF16, GGML_TYPE_COUNT, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, GGML_TYPE_Q8_1, and GGML_TYPE_Q8_K.

Referenced by load_gguf_meta().
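
Together with ggml_type_block_size, this gives the byte footprint of a tensor: elements are grouped into blocks, and each block occupies ggml_type_size bytes. A sketch (the rounding up only matters if the element count is not block-aligned):

#include <cstddef>
#include "quantization.h"

// Bytes occupied by num_elements values of the given quantized type.
size_t tensor_nbytes(GGMLType type, size_t num_elements) {
  size_t block_elems = ggml_type_block_size(type);  // e.g. GGML_QK_K for K-quants
  size_t block_bytes = ggml_type_size(type);        // e.g. sizeof(block_q4_K)
  size_t num_blocks = (num_elements + block_elems - 1) / block_elems;
  return num_blocks * block_bytes;
}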

◆ handle_i8_tensor()

void handle_i8_tensor (const void *i8_data, float *f_data, size_t num_elements)

Handles conversion of int8 tensor data to float32.

Parameters
i8_data: Input int8 data
f_data: Output float array
num_elements: Number of elements to convert

Definition at line 268 of file quantization.cpp.

void handle_i8_tensor(const void* i8_data, float* f_data, size_t num_elements) {
  const int8_t* input_ptr = static_cast<const int8_t*>(i8_data);
  for (size_t i = 0; i < num_elements; ++i) {
    f_data[i] = static_cast<float>(input_ptr[i]);
  }
}

◆ matvec_q4k_q8k_cpu()

void matvec_q4k_q8k_cpu (const std::vector< block_q4_K > &mat_q4k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)

Computes matrix-vector product between Q4_K matrix and Q8_K vector on CPU.

Parameters
mat_q4k: Q4_K matrix
vec_q8k: Q8_K vector
out_f32: Output float vector
rows: Number of matrix rows
cols: Number of matrix columns
log_calls: Whether to log computation details

Definition at line 982 of file quantization.cpp.

void matvec_q4k_q8k_cpu(const std::vector<block_q4_K>& mat_q4k,
                        const std::vector<block_q8_K>& vec_q8k,
                        std::vector<float>& out_f32,
                        int rows, int cols, bool log_calls) {
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q4k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }
  size_t blocks_per_row = cols / GGML_QK_K;
  if (mat_q4k.size() != (size_t)rows * blocks_per_row) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: mat_q4k size mismatch");
  }
  if (vec_q8k.size() != blocks_per_row) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: vec_q8k size mismatch");
  }
  out_f32.resize(rows);

#pragma omp parallel for
  for (int r = 0; r < rows; ++r) {
    const std::vector<block_q4_K> row_q4k(
        mat_q4k.begin() + r * blocks_per_row,
        mat_q4k.begin() + (r + 1) * blocks_per_row);

    out_f32[r] = vec_dot_q4_k_q8_k_cpu(cols, row_q4k, vec_q8k, log_calls);
  }
}

References GGML_QK_K, and vec_dot_q4_k_q8_k_cpu().
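
End-to-end sketch with made-up dimensions; the function throws unless cols is a multiple of GGML_QK_K and mat_q4k holds exactly rows * cols / GGML_QK_K blocks:

#include <vector>
#include "quantization.h"

void matvec_sketch(const std::vector<block_q4_K>& w,  // rows * cols / GGML_QK_K blocks
                   const std::vector<float>& x) {     // cols floats
  const int rows = 64;                                // illustrative
  const int cols = static_cast<int>(x.size());

  std::vector<block_q8_K> x_q8k = quantize_fp32_to_q8_K(x);
  std::vector<float> y;
  matvec_q4k_q8k_cpu(w, x_q8k, y, rows, cols, /*log_calls=*/false);
  // y[r] is the dot product of row r with x.
}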

◆ matvec_q6k_q8k_cpu()

void matvec_q6k_q8k_cpu (const std::vector< block_q6_K > &mat_q6k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)

Computes matrix-vector product between Q6_K matrix and Q8_K vector on CPU.

Parameters
mat_q6k: Q6_K matrix
vec_q8k: Q8_K vector
out_f32: Output float vector
rows: Number of matrix rows
cols: Number of matrix columns
log_calls: Whether to log computation details

Definition at line 897 of file quantization.cpp.

void matvec_q6k_q8k_cpu(const std::vector<block_q6_K>& mat_q6k,
                        const std::vector<block_q8_K>& vec_q8k,
                        std::vector<float>& out_f32,
                        int rows, int cols, bool log_calls) {
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q6k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }
  size_t blocks_per_row = cols / GGML_QK_K;
  if (mat_q6k.size() != (size_t)rows * blocks_per_row) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: mat_q6k size mismatch");
  }
  if (vec_q8k.size() != blocks_per_row) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: vec_q8k size mismatch");
  }
  out_f32.resize(rows);
  for (int r = 0; r < rows; ++r) {
    const std::vector<block_q6_K> row_q6k(
        mat_q6k.begin() + r * blocks_per_row,
        mat_q6k.begin() + (r + 1) * blocks_per_row);

    out_f32[r] = vec_dot_q6_k_q8_k_cpu(cols, row_q6k, vec_q8k, log_calls);
  }
}

References GGML_QK_K, and vec_dot_q6_k_q8_k_cpu().

◆ quantize_fp32_to_q8_K()

std::vector< block_q8_K > quantize_fp32_to_q8_K (const std::vector< float > &f_data)

Quantizes float32 data to Q8_K format.

Parameters
f_data: Input float vector
Returns
Vector of Q8_K blocks

Definition at line 719 of file quantization.cpp.

std::vector<block_q8_K> quantize_fp32_to_q8_K(const std::vector<float>& f_data) {
  if (f_data.size() % GGML_QK_K != 0) {
    throw std::runtime_error(
        "Input vector size must be a multiple of GGML_QK_K (" +
        std::to_string(GGML_QK_K) + ")");
  }

  size_t num_blocks = f_data.size() / GGML_QK_K;
  std::vector<block_q8_K> q_data(num_blocks);
  const float* x = f_data.data();
  block_q8_K* y = q_data.data();

  static std::atomic<int> log_count_q8k_quant_scales = 0;

  for (size_t i = 0; i < num_blocks; ++i) {
    // Per-block scale derived from the absolute maximum.
    float amax = 0.0f;
    for (int j = 0; j < GGML_QK_K; ++j) {
      amax = SAFE_MAX(amax, std::abs(x[j]));
    }

    const float d_fp32 = amax / Q8K_SCALE_FACTOR;
    const float id = (d_fp32 != 0.f) ? 1.0f / d_fp32 : 0.0f;
    y[i].d = fp32_to_fp16(d_fp32);

    if (log_count_q8k_quant_scales < 10) {
      std::stringstream q8k_scale_log_ss;
      q8k_scale_log_ss << "[Q8K_QUANT_SCALES] Block #" << i
                       << " Input amax=" << amax << " -> d_fp32=" << d_fp32
                       << " -> Stored d_fp16=0x" << std::hex << y[i].d
                       << std::dec;
      Logger::debug(q8k_scale_log_ss.str());
      log_count_q8k_quant_scales++;
    }

    // Quantize to int8 and accumulate 16-element group sums for the dot kernels.
    int16_t block_sum[16] = {0};
    for (int j = 0; j < GGML_QK_K; ++j) {
      const float val_scaled = x[j] * id;
      int8_t q_val = static_cast<int8_t>(
          SAFE_MAX(-128.0f, SAFE_MIN(127.0f, std::round(val_scaled))));
      y[i].qs[j] = q_val;
      block_sum[j / 16] += q_val;
    }

    std::memcpy(y[i].bsums, block_sum, sizeof(block_sum));

    x += GGML_QK_K;
  }

  return q_data;
}

References block_q8_K::d, Logger::debug(), fp32_to_fp16(), GGML_QK_K, Q8K_SCALE_FACTOR, block_q8_K::qs, SAFE_MAX, and SAFE_MIN.
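
A small sketch that inspects the first produced block; the input is arbitrary, and GGML_QK_K is assumed to be 256 as in gguf_parser.h:

#include <vector>
#include "quantization.h"

int main() {
  std::vector<float> data(GGML_QK_K, 0.5f);
  data[0] = -4.0f;  // make amax nontrivial

  std::vector<block_q8_K> q = quantize_fp32_to_q8_K(data);
  // q[0].d is the fp16 scale (amax / Q8K_SCALE_FACTOR), q[0].qs holds the
  // int8 values, and q[0].bsums the 16 per-group sums used by the dot kernels.
  float d = fp16_to_fp32(q[0].d, /*is_gguf_scale_field=*/true);
  (void)d;
}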

◆ quantize_q4_k_m()

void quantize_q4_k_m (const float *f_data, void *q_data, int num_elements)

Quantizes float32 data to Q4_K format.

Parameters
f_data: Input float array
q_data: Output quantized data
num_elements: Number of elements to quantize

Definition at line 276 of file quantization.cpp.

void quantize_q4_k_m(const float* f_data, void* q_data, int num_elements) {
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q4_k_m currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q4_K* output_qblock = static_cast<block_q4_K*>(q_data);

  std::memset(output_qblock->scales, 0, sizeof(output_qblock->scales));
  std::memset(output_qblock->qs, 0, sizeof(output_qblock->qs));

  // Block-wide range drives the super scale and super minimum.
  float block_min_val = std::numeric_limits<float>::max();
  float block_max_val = std::numeric_limits<float>::lowest();
  for (int i = 0; i < num_elements; ++i) {
    block_min_val = SAFE_MIN(block_min_val, f_data[i]);
    block_max_val = SAFE_MAX(block_max_val, f_data[i]);
  }

  if (block_max_val == block_min_val) {
    block_max_val = block_min_val + GGUF_SMALL_VAL;
  }
  if (block_max_val < GGUF_EPSILON && block_max_val > -GGUF_EPSILON) {
    block_max_val = GGUF_SMALL_VAL;
    block_min_val = 0.0f;
  }

  const float d_super_scale_candidate = (block_max_val - block_min_val) / Q4K_SCALE_FACTOR;
  const float d_super =
      d_super_scale_candidate > GGUF_EPSILON ? d_super_scale_candidate : GGUF_EPSILON;
  const float min_super = block_min_val;

  output_qblock->d = fp32_to_fp16(d_super);
  output_qblock->dmin = fp32_to_fp16(min_super);

  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    const float* sub_block_input = f_data + j * 16;

    float sub_min_val = sub_block_input[0];
    float sub_max_val = sub_block_input[0];
    for (int i = 1; i < 16; ++i) {
      sub_min_val = SAFE_MIN(sub_min_val, sub_block_input[i]);
      sub_max_val = SAFE_MAX(sub_max_val, sub_block_input[i]);
    }

    float ideal_scale = 0.0f;
    if (sub_max_val > sub_min_val + GGUF_EPSILON) {
      ideal_scale = (sub_max_val - sub_min_val) / Q4K_SCALE_FACTOR;
    }
    float ideal_min = sub_min_val;

    // Pick the 4-bit scale index whose effective scale best matches the ideal.
    uint8_t best_scale_idx = 0;
    float min_scale_err = std::numeric_limits<float>::max();
    if (d_super > GGUF_EPSILON) {
      for (uint8_t k = 0; k < 16; ++k) {
        float candidate_scale = d_super * K_SCALE_VALUES[k];
        float err = std::abs(candidate_scale - ideal_scale);
        if (err < min_scale_err) {
          min_scale_err = err;
          best_scale_idx = k;
        }
      }
    }

    uint8_t best_min_idx = 0;
    float min_min_err = std::numeric_limits<float>::max();
    for (uint8_t l = 0; l < 16; ++l) {
      float candidate_min = min_super * K_MIN_VALUES[l];
      float err = std::abs(candidate_min - ideal_min);
      if (err < min_min_err) {
        min_min_err = err;
        best_min_idx = l;
      }
    }

    int scale_byte_idx = j % 8;
    int scale_shift = 4 * (j / 8);
    output_qblock->scales[scale_byte_idx] |= (best_scale_idx << scale_shift);

    int min_byte_idx = (j % 4) + 8;
    int min_shift = 4 * (j / 4);
    output_qblock->scales[min_byte_idx] |= (best_min_idx << min_shift);

    float actual_scale = d_super * K_SCALE_VALUES[best_scale_idx];
    float actual_min = min_super * K_MIN_VALUES[best_min_idx];
    float inv_actual_scale = (actual_scale > GGUF_EPSILON || actual_scale < -GGUF_EPSILON)
                                 ? 1.0f / actual_scale
                                 : 0.0f;

    uint8_t packed_qs[8];
    std::memset(packed_qs, 0, sizeof(packed_qs));

    // Quantize each value to 4 bits and pack two values per byte.
    for (int i = 0; i < 16; ++i) {
      float val = sub_block_input[i];

      int quant_val = 0;
      if (inv_actual_scale != 0.0f) {
        quant_val =
            static_cast<int>(std::round((val - actual_min) * inv_actual_scale)) + Q4K_OFFSET;
      }
      quant_val = SAFE_MAX(0, SAFE_MIN(15, quant_val));

      int byte_idx_qs = i / 2;
      int shift_qs = (i % 2) * 4;
      packed_qs[byte_idx_qs] |= (static_cast<uint8_t>(quant_val) << shift_qs);
    }

    uint8_t* qs_target = output_qblock->qs + j * 8;
    for (int i = 0; i < 8; ++i) {
      uint8_t low_nibble_val = packed_qs[i] & 0x0F;
      uint8_t high_nibble_val = (packed_qs[i] >> 4) & 0x0F;
      qs_target[i] = low_nibble_val | (high_nibble_val << 4);
    }
  }
}

References block_q4_K::d, block_q4_K::dmin, fp32_to_fp16(), GGML_QK_K, GGUF_EPSILON, GGUF_SMALL_VAL, K_MIN_VALUES, K_SCALE_VALUES, Q4K_OFFSET, Q4K_SCALE_FACTOR, block_q4_K::qs, SAFE_MAX, SAFE_MIN, and block_q4_K::scales.
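
Round-trip sketch for one 256-element block; the reconstruction error depends on the data, so the printed value is only indicative of 4-bit precision:

#include <cmath>
#include <cstdio>
#include "quantization.h"

int main() {
  float input[GGML_QK_K];
  for (int i = 0; i < (int)GGML_QK_K; ++i)
    input[i] = 0.01f * i;  // arbitrary smooth ramp

  block_q4_K blk;
  quantize_q4_k_m(input, &blk, (int)GGML_QK_K);

  float output[GGML_QK_K];
  dequantize_q4_k_m(&blk, output, (int)GGML_QK_K);

  float max_err = 0.0f;
  for (int i = 0; i < (int)GGML_QK_K; ++i)
    max_err = std::fmax(max_err, std::fabs(output[i] - input[i]));
  std::printf("max abs error: %f\n", max_err);
}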

◆ quantize_q6_k()

void quantize_q6_k (const float *f_data, void *q_data, int num_elements)

Quantizes float32 data to Q6_K format.

Parameters
f_data: Input float array
q_data: Output quantized data
num_elements: Number of elements to quantize

Definition at line 549 of file quantization.cpp.

void quantize_q6_k(const float* f_data, void* q_data, int num_elements) {
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q6_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q6_K* output_qblock = static_cast<block_q6_K*>(q_data);

  uint8_t* ql = output_qblock->ql;
  uint8_t* qh = output_qblock->qh;
  int8_t* scales = output_qblock->scales;
  std::memset(ql, 0, GGML_QK_K / 2);
  std::memset(qh, 0, GGML_QK_K / 4);

  float amax = 0.0f;
  for (int i = 0; i < num_elements; ++i) {
    amax = SAFE_MAX(amax, std::abs(f_data[i]));
  }

  const float d_float = (amax > GGUF_EPSILON) ? (amax / Q6K_SCALE_FACTOR) : GGUF_EPSILON;
  output_qblock->d = fp32_to_fp16(d_float);

  for (int sub = 0; sub < GGML_QK_K / 16; ++sub) {
    const float* sub_in = f_data + sub * 16;

    float sub_amax = 0.0f;
    for (int i = 0; i < 16; ++i) {
      sub_amax = SAFE_MAX(sub_amax, std::abs(sub_in[i]));
    }

    int8_t scale = (d_float > 0.0f)
                       ? static_cast<int8_t>(std::round(sub_amax / d_float))
                       : 1;
    if (scale == 0) scale = 1;
    scales[sub] = scale;

    for (int i = 0; i < 16; ++i) {
      float val = sub_in[i];
      int q = static_cast<int>(std::round(val / (d_float * scale))) + Q6K_OFFSET;
      q = SAFE_MAX(0, SAFE_MIN(63, q));

      // Pack the low 4 bits into ql and the high 2 bits into qh.
      int idx = sub * 16 + i;
      int ql_idx = idx / 2;
      int ql_shift = (idx % 2) * 4;
      ql[ql_idx] |= (q & 0x0F) << ql_shift;
      int qh_idx = idx / 4;
      int qh_shift = (idx % 4) * 2;
      qh[qh_idx] |= ((q >> 4) & 0x03) << qh_shift;
    }
  }
}

References block_q6_K::d, fp32_to_fp16(), GGML_QK_K, GGUF_EPSILON, Q6K_OFFSET, Q6K_SCALE_FACTOR, block_q6_K::qh, block_q6_K::ql, SAFE_MAX, SAFE_MIN, and block_q6_K::scales.

◆ vec_dot_q4_k_q8_k_cpu()

float vec_dot_q4_k_q8_k_cpu (int n, const std::vector< block_q4_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)

Computes dot product between Q4_K and Q8_K vectors on CPU.

Parameters
n: Number of elements (must be a multiple of GGML_QK_K)
x_vec: Q4_K vector
y_vec: Q8_K vector
log_this_call: Whether to log computation details
Returns
Dot product result

Definition at line 922 of file quantization.cpp.

float vec_dot_q4_k_q8_k_cpu(int n, const std::vector<block_q4_K>& x_vec,
                            const std::vector<block_q8_K>& y_vec,
                            bool log_this_call) {
  int log_count_now = g_vec_dot_q4_k_q8_k_log_count.fetch_add(1);
  if (log_count_now >= 5) log_this_call = false;

  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: n must be multiple of QK_K");
  }
  size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: vector block count mismatch");
  }

  const block_q4_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  float sumf = 0.0f;
  for (size_t i = 0; i < nb; ++i) {
    // Unpack the 4-bit nibbles into one int8 value per weight.
    int8_t q4_vals[GGML_QK_K];
    const uint8_t* q4 = x[i].qs;
    for (int j = 0; j < GGML_QK_K / 2; ++j) {
      q4_vals[2 * j + 0] = static_cast<int8_t>(q4[j] & 0xF);
      q4_vals[2 * j + 1] = static_cast<int8_t>(q4[j] >> 4);
    }

    const int8_t* q8 = y[i].qs;

    for (int sub = 0; sub < 16; ++sub) {
      uint8_t scale_idx, min_idx;
      get_scale_min_indices_q4_K(sub, x[i].scales, &scale_idx, &min_idx);
      float scale = fp16_to_fp32(x[i].d) * K_SCALE_VALUES[scale_idx];
      float minv = fp16_to_fp32(x[i].dmin) * K_MIN_VALUES[min_idx];
      for (int k = 0; k < 16; ++k) {
        int idx = sub * 16 + k;
        float q4_val = static_cast<float>(q4_vals[idx]) - 8.0f;
        float q8_val = static_cast<float>(q8[idx]);
        sumf += (scale * q4_val + minv) * q8_val;
      }
    }

    if (i == 0 && log_this_call) {
      std::stringstream ss;
      ss << "[Q4K_Q8K] Block #0: d: " << fp16_to_fp32(x[i].d)
         << ", dmin: " << fp16_to_fp32(x[i].dmin);
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q8_K input (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q8[k] << " ";
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q4_K unpacked (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q4_vals[k] << " ";
      Logger::debug(ss.str());
    }
  }
  return sumf;
}

References Logger::debug(), fp16_to_fp32(), g_vec_dot_q4_k_q8_k_log_count, get_scale_min_indices_q4_K(), GGML_QK_K, K_MIN_VALUES, K_SCALE_VALUES, block_q4_K::qs, and block_q8_K::qs.

Referenced by matvec_q4k_q8k_cpu().

◆ vec_dot_q6_k_q8_k_cpu()

float vec_dot_q6_k_q8_k_cpu (int n, const std::vector< block_q6_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)

Computes dot product between Q6_K and Q8_K vectors on CPU.

Parameters
n: Number of elements (must be a multiple of GGML_QK_K)
x_vec: Q6_K vector
y_vec: Q8_K vector
log_this_call: Whether to log computation details
Returns
Dot product result

Definition at line 772 of file quantization.cpp.

float vec_dot_q6_k_q8_k_cpu(int n, const std::vector<block_q6_K>& x_vec,
                            const std::vector<block_q8_K>& y_vec,
                            bool log_this_call) {
  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: n must be multiple of QK_K");
  }
  size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: vector block count mismatch");
  }

  const block_q6_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  int8_t aux8[GGML_QK_K];
  int16_t aux16[8];
  float sums[8];
  int32_t aux32[8];
  std::memset(sums, 0, 8 * sizeof(float));

  float sumf = 0.0f;

  static std::atomic<int> log_count_dot = 0;
  bool should_log_this_block = log_this_call && log_count_dot < 5;

  for (size_t i = 0; i < nb; ++i) {
    const uint8_t* ql = x[i].ql;
    const uint8_t* qh = x[i].qh;
    const int8_t* q8 = y[i].qs;
    std::memset(aux32, 0, 8 * sizeof(int32_t));

    // Unpack 6-bit weights: low 4 bits from ql, high 2 bits from qh,
    // recentered from [0, 63] to [-32, 31].
    int8_t* a = aux8;
    for (int j = 0; j < GGML_QK_K; j += 128) {
      for (int l = 0; l < 32; ++l) {
        a[l + 0] = static_cast<int8_t>(
            ((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32);
        a[l + 32] = static_cast<int8_t>(
            ((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32);
        a[l + 64] = static_cast<int8_t>(
            ((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32);
        a[l + 96] = static_cast<int8_t>(
            ((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
      }
      a += 128;
      ql += 64;
      qh += 32;
    }

    // Accumulate scaled int16 products per 16-element sub-block.
    a = aux8;
    int is = 0;
    for (int j = 0; j < GGML_QK_K / 16; ++j) {
      int scale = x[i].scales[is++];
      for (int l = 0; l < 8; ++l)
        aux16[l] = static_cast<int16_t>(q8[l]) * static_cast<int16_t>(a[l]);
      for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
      q8 += 8;
      a += 8;
      for (int l = 0; l < 8; ++l)
        aux16[l] = static_cast<int16_t>(q8[l]) * static_cast<int16_t>(a[l]);
      for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
      q8 += 8;
      a += 8;
    }

    // Compensation for the -32 recentering, using the Q8_K per-group sums.
    int32_t sumi_mins = 0;
    for (int j = 0; j < GGML_QK_K / 16; ++j) {
      sumi_mins += static_cast<int32_t>(y[i].bsums[j]) *
                   static_cast<int32_t>(x[i].scales[j]);
    }

    const float d_q6 = fp16_to_fp32(x[i].d);
    const float d_q8 = fp16_to_fp32(y[i].d);
    const float d = d_q6 * d_q8;

    float block_contribution = 0.0f;
    for (int l = 0; l < 8; ++l) {
      float term = d * (aux32[l] - 32 * sumi_mins / 8);
      sums[l] += term;
      block_contribution += term;
    }

    if (i == 0 && should_log_this_block) {
      std::stringstream ss_log;
      ss_log << "[DOT_Q6K_Q8K] Call #" << (log_count_dot.load() + 1)
             << ", Block #0:";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Scale d_q6: " << d_q6 << " (Raw FP16: 0x" << std::hex
             << x[i].d << std::dec << ")";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q8_K Scale d_q8: " << d_q8;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Combined Scale d: " << d;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Sub-scales (int8): ";
      for (int k = 0; k < 16; ++k) ss_log << (int)x[i].scales[k] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Int32 Sums (aux32, before compensation): ";
      for (int l = 0; l < 8; ++l) ss_log << aux32[l] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Compensation term (sumi_mins): " << sumi_mins
             << ", -32 * sumi_mins: " << (-32 * sumi_mins);
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Block #0 Contribution to Sums (after compensation): "
             << block_contribution;
      Logger::debug(ss_log.str());
    }
  }

  for (int l = 0; l < 8; ++l) {
    sumf += sums[l];
  }

  if (should_log_this_block) {
    log_count_dot++;
  }
  return sumf;
}

References block_q8_K::bsums, block_q6_K::d, Logger::debug(), fp16_to_fp32(), GGML_QK_K, block_q6_K::qh, block_q6_K::ql, block_q8_K::qs, and block_q6_K::scales.

Referenced by matvec_q6k_q8k_cpu().
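
As a consistency sketch, the quantized dot product should closely track a float reference built from the dequantized operands (exact agreement is not expected, since the kernel accumulates in integer space):

#include <vector>
#include "quantization.h"

float reference_dot(const std::vector<block_q6_K>& x,
                    const std::vector<block_q8_K>& y, int n) {
  std::vector<float> xf;
  dequantize_vector_q6k_to_f32(x, xf, static_cast<size_t>(n));

  float acc = 0.0f;
  for (size_t b = 0; b < y.size(); ++b) {
    float d = fp16_to_fp32(y[b].d, /*is_gguf_scale_field=*/true);
    for (size_t j = 0; j < GGML_QK_K; ++j)
      acc += xf[b * GGML_QK_K + j] * d * static_cast<float>(y[b].qs[j]);
  }
  return acc;  // compare against vec_dot_q6_k_q8_k_cpu(n, x, y, false)
}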