TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
quantization.h File Reference

Weight quantization structures and functions for model compression.

#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>
#include "ggml_types.h"
#include "gguf_parser.h"

Classes

struct block_q4_K
    4-bit K-quantized block structure.

struct block_q6_K
    6-bit K-quantized block structure.

struct block_q2_K
    2-bit K-quantized block structure.

struct block_q3_K
    3-bit K-quantized block structure.

struct block_q8_K
    8-bit K-quantized block structure with block sums.

struct block_q8_0
    Simple 8-bit quantized block structure.

Macros

#define RESTRICT   __restrict__
 

Functions

float fp16_to_fp32 (uint16_t h, bool is_gguf_scale_field=false)
 Converts a 16-bit floating point number to 32-bit float.
 
uint16_t fp32_to_fp16 (float f)
 Converts a 32-bit float to 16-bit floating point.
 
const char * ggml_type_name (GGMLType type)
 Gets the string name of a GGML type.
 
size_t ggml_type_size (GGMLType type)
 Gets the size in bytes of a GGML type.
 
size_t ggml_type_block_size (GGMLType type)
 Gets the block size for a GGML type.
 
void dequantize_q2_k (const void *q_data, float *f_data, int num_weights_in_block, bool log_details_for_this_block=false)
 Dequantizes a Q2_K quantized block to float32.
 
void dequantize_q4_k_m (const block_q4_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block=false)
 Dequantizes a Q4_K quantized block to float32.
 
void dequantize_q6_k (const block_q6_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block=false)
 Dequantizes a Q6_K quantized block to float32.
 
void dequantize_vector_q6k_to_f32 (const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks=0)
 Dequantizes a vector of Q6_K blocks to a vector of float32.
 
void dequantize_q3_k (const void *q_data, float *f_data, int num_weights_in_block)
 Dequantizes a Q3_K quantized block to float32.
 
void handle_i8_tensor (const void *i8_data, float *f_data, size_t num_elements)
 Handles conversion of int8 tensor data to float32.
 
void quantize_q4_k_m (const float *f_data, void *q_data, int num_elements)
 Quantizes float32 data to Q4_K format.
 
void quantize_q6_k (const float *f_data, void *q_data, int num_elements)
 Quantizes float32 data to Q6_K format.
 
std::vector< block_q8_K > quantize_fp32_to_q8_K (const std::vector< float > &f_data)
 Quantizes float32 data to Q8_K format.
 
float vec_dot_q6_k_q8_k_cpu (int n, const std::vector< block_q6_K > &x, const std::vector< block_q8_K > &y, bool log_this_call)
 Computes dot product between Q6_K and Q8_K vectors on CPU.
 
void matvec_q6k_q8k_cpu (const std::vector< block_q6_K > &mat_q6k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
 Computes matrix-vector product between Q6_K matrix and Q8_K vector on CPU.
 
float vec_dot_q4_k_q8_k_cpu (int n, const std::vector< block_q4_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)
 Computes dot product between Q4_K and Q8_K vectors on CPU.
 
void matvec_q4k_q8k_cpu (const std::vector< block_q4_K > &mat_q4k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
 Computes matrix-vector product between Q4_K matrix and Q8_K vector on CPU.
 
void dequantize_q8_0_block (const block_q8_0 *qblock, float *output)
 Dequantizes a Q8_0 block to float32.
 
void dequantize_vector_q4k_to_f32 (const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks=0)
 Dequantizes a vector of Q4_K blocks to a vector of float32.
 
void dequantize_vector_q8_0_to_f32 (const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks=0)
 Dequantizes a vector of Q8_0 blocks to a vector of float32.
 

Detailed Description

Weight quantization structures and functions for model compression.

This file contains the definitions for various quantization formats and functions to convert between them. The quantization methods include Q2_K, Q3_K, Q4_K, Q6_K, and Q8_0 formats, each offering different compression ratios and precision tradeoffs.

Definition in file quantization.h.
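
For orientation, here is a minimal usage sketch, assuming a Q6_K weight tensor has already been parsed out of a GGUF file into a vector of blocks (the shapes and names are illustrative, not taken from the model code):

#include <vector>
#include "quantization.h"

// Two ways to consume a quantized weight matrix of rows x cols elements.
void usage_sketch(const std::vector<block_q6_K>& w_q6k,
                  const std::vector<float>& x_f32,  // activations, cols elements
                  int rows, int cols) {
  // Path 1: dequantize the whole tensor to float32 up front.
  std::vector<float> w_f32;
  dequantize_vector_q6k_to_f32(w_q6k, w_f32,
                               static_cast<size_t>(rows) * cols);

  // Path 2: stay quantized. Quantize the activations to Q8_K and use
  // the integer matvec kernel (cols must be a multiple of GGML_QK_K).
  std::vector<block_q8_K> x_q8k = quantize_fp32_to_q8_K(x_f32);
  std::vector<float> y_f32;
  matvec_q6k_q8k_cpu(w_q6k, x_q8k, y_f32, rows, cols, /*log_calls=*/false);
}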

Macro Definition Documentation

◆ RESTRICT

#define RESTRICT   __restrict__

Expands to the compiler's __restrict__ qualifier, telling the optimizer that the annotated pointer does not alias other pointers in the same scope.

Definition at line 15 of file quantization.h.

Function Documentation

◆ dequantize_q2_k()

void dequantize_q2_k (const void *q_data, float *f_data, int num_weights_in_block, bool log_details_for_this_block = false)

Dequantizes a Q2_K quantized block to float32.

Parameters
q_data: Pointer to quantized data
f_data: Output float array
num_weights_in_block: Number of weights to dequantize
log_details_for_this_block: Whether to log dequantization details

◆ dequantize_q3_k()

void dequantize_q3_k (const void *q_data, float *f_data, int num_weights_in_block)

Dequantizes a Q3_K quantized block to float32.

Parameters
q_data: Pointer to quantized data
f_data: Output float array
num_weights_in_block: Number of weights to dequantize

Definition at line 476 of file quantization.cpp.

void dequantize_q3_k(const void* q_data, float* f_data, int num_weights_in_block) {
  if (num_weights_in_block != GGML_QK_K) {
    throw std::invalid_argument(
        "dequantize_q3_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  const block_q3_K* qblock = static_cast<const block_q3_K*>(q_data);

  const float d_float_raw = fp16_to_fp32(qblock->d);
  const float dmin_float_raw = fp16_to_fp32(qblock->dmin);

  // Replace non-finite scales with zero so one bad block cannot poison the output.
  const float d_float = (!std::isfinite(d_float_raw)) ? 0.0f : d_float_raw;
  const float dmin_float =
      (!std::isfinite(dmin_float_raw)) ? 0.0f : dmin_float_raw;

  const uint8_t* hmask_ptr = qblock->hmask;
  const uint8_t* qs_ptr = qblock->qs;
  const uint8_t* scales_ptr = qblock->scales;

  int weight_index = 0;

  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    uint8_t scale_idx;

    if (j < 8) {
      scale_idx = scales_ptr[j] & 0x3F;
    } else {
      scale_idx = scales_ptr[j + 4] & 0x3F;
    }

    assert(scale_idx < 64 && "Scale index out of bounds for Q3_K lookup");
    const float sub_block_scale_factor = K_SCALE_VALUES[scale_idx];

    const float final_sub_block_scale = d_float * sub_block_scale_factor;
    const float final_sub_block_min = dmin_float;

    for (int i = 0; i < 4; ++i) {
      uint8_t qs_byte = qs_ptr[j * 4 + i];
      uint8_t hmask_byte = hmask_ptr[j];

      // Each byte holds four 2-bit low parts; the high bit comes from hmask.
      for (int bit_pos = 0; bit_pos < 8; bit_pos += 2) {
        uint8_t lower_bits = (qs_byte >> bit_pos) & 0x3;
        int hmask_bit_idx = (i * 4) + (bit_pos / 2);
        uint8_t high_bit = (hmask_byte >> hmask_bit_idx) & 0x1;
        uint8_t q_val = (high_bit << 2) | lower_bits;

        float val = final_sub_block_scale * static_cast<float>(q_val) +
                    final_sub_block_min;

        if (!std::isfinite(val)) {
          val = 0.0f;
        }

        f_data[weight_index++] = val;
      }
    }
  }

  if (weight_index != GGML_QK_K) {
    std::cout << "ERROR: Processed " << weight_index << " weights instead of "
              << GGML_QK_K << std::endl;

    while (weight_index < GGML_QK_K) {
      f_data[weight_index++] = 0.0f;
    }
  }
}

References block_q3_K::d, block_q3_K::dmin, fp16_to_fp32(), GGML_QK_K, block_q3_K::hmask, K_SCALE_VALUES, block_q3_K::qs, and block_q3_K::scales.

◆ dequantize_q4_k_m()

void dequantize_q4_k_m (const block_q4_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block = false)

Dequantizes a Q4_K quantized block to float32.

Parameters
qblock: Pointer to Q4_K block
output_f32: Output float array
num_elements: Number of elements to dequantize
log_this_block: Whether to log dequantization details

◆ dequantize_q6_k()

void dequantize_q6_k (const block_q6_K *qblock, float *RESTRICT output_f32, int num_elements, bool log_this_block = false)

Dequantizes a Q6_K quantized block to float32.

Parameters
qblock: Pointer to Q6_K block
output_f32: Output float array
num_elements: Number of elements to dequantize
log_this_block: Whether to log dequantization details

◆ dequantize_q8_0_block()

void dequantize_q8_0_block (const block_q8_0 *qblock, float *output)

Dequantizes a Q8_0 block to float32.

Parameters
qblock: Pointer to Q8_0 block
output: Output float array

Definition at line 1047 of file quantization.cpp.

void dequantize_q8_0_block(const block_q8_0* qblock, float* output) {
  const float d_fp32 = fp16_to_fp32(qblock->d, true);
  for (int i = 0; i < GGML_QK8_0; ++i) {
    output[i] = d_fp32 * static_cast<float>(qblock->qs[i]);
  }
}

References block_q8_0::d, fp16_to_fp32(), GGML_QK8_0, and block_q8_0::qs.

Referenced by dequantize_vector_q8_0_to_f32(), TinyLlamaModel::initialize_gpu_and_rope(), TinyLlamaModel::lookup_embedding(), and matvec_q8_0_f32_vector_cpu().
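
A by-hand sketch of one Q8_0 block, with made-up values; it assumes GGML_QK8_0 == 32 as declared in gguf_parser.h and that block_q8_0 is a plain aggregate:

#include <cstdio>
#include "quantization.h"

int main() {
  block_q8_0 blk{};
  blk.d = fp32_to_fp16(0.05f);  // per-block scale, stored as fp16
  for (int i = 0; i < (int)GGML_QK8_0; ++i)
    blk.qs[i] = static_cast<int8_t>(i - 16);  // arbitrary int8 payload

  float out[GGML_QK8_0];
  dequantize_q8_0_block(&blk, out);

  // Each output is d * qs[i]; e.g. out[0] is roughly 0.05 * -16 = -0.8.
  std::printf("out[0] = %f\n", out[0]);
}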

◆ dequantize_vector_q4k_to_f32()

void dequantize_vector_q4k_to_f32 (const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks = 0)

Dequantizes a vector of Q4_K blocks to a vector of float32.

Parameters
q_weights: Input vector of Q4_K blocks
f32_weights: Output vector of float32 values (will be resized)
total_num_elements: Total number of float elements expected after dequantization
log_first_n_blocks: Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1109 of file quantization.cpp.

void dequantize_vector_q4k_to_f32(const std::vector<block_q4_K>& q_weights,
                                  std::vector<float>& f32_weights,
                                  size_t total_num_elements,
                                  int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q4K] Input Q4_K weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);
  size_t expected_blocks = (total_num_elements + GGML_QK_K - 1) / GGML_QK_K;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q4K] Mismatch in Q4_K block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q4_K* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK_K;

    // The final block may be partial.
    if (elements_processed + GGML_QK_K > total_num_elements) {
      elements_in_this_block = static_cast<int>(total_num_elements - elements_processed);
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q4K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    bool log_this_specific_block = (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

    // Call the Q4_K specific single-block dequantizer.
    dequantize_q4_k_m(current_block_ptr, current_output_ptr, elements_in_this_block, log_this_specific_block);

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q4K] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

References dequantize_q4_k_m(), Logger::error(), GGML_QK_K, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().
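
Typical call pattern, a sketch assuming the element count comes from the tensor's GGUF metadata:

#include <vector>
#include "quantization.h"

// Expand a whole Q4_K tensor to float32; f32_weights is resized by the callee.
std::vector<float> q4k_tensor_to_f32(const std::vector<block_q4_K>& q_weights,
                                     size_t total_elements) {
  std::vector<float> f32_weights;
  dequantize_vector_q4k_to_f32(q_weights, f32_weights, total_elements);
  return f32_weights;
}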

◆ dequantize_vector_q6k_to_f32()

void dequantize_vector_q6k_to_f32 (const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks = 0)

Dequantizes a vector of Q6_K blocks to a vector of float32.

Parameters
q_weights: Input vector of Q6_K blocks
f32_weights: Output vector of float32 values (will be resized)
total_num_elements: Total number of float elements expected after dequantization
log_first_n_blocks: Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1054 of file quantization.cpp.

void dequantize_vector_q6k_to_f32(const std::vector<block_q6_K>& q_weights,
                                  std::vector<float>& f32_weights,
                                  size_t total_num_elements,
                                  int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q6K] Input Q6_K weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);
  size_t expected_blocks = (total_num_elements + GGML_QK_K - 1) / GGML_QK_K;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q6K] Mismatch in Q6_K block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q6_K* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK_K;

    // The final block may be partial.
    if (elements_processed + GGML_QK_K > total_num_elements) {
      elements_in_this_block = static_cast<int>(total_num_elements - elements_processed);
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q6K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    bool log_this_specific_block = (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

    dequantize_q6_k(current_block_ptr, current_output_ptr, elements_in_this_block, log_this_specific_block);

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q6K] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

References dequantize_q6_k(), Logger::error(), GGML_QK_K, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().

◆ dequantize_vector_q8_0_to_f32()

void dequantize_vector_q8_0_to_f32 (const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks = 0)

Dequantizes a vector of Q8_0 blocks to a vector of float32.

Parameters
q_weights: Input vector of Q8_0 blocks
f32_weights: Output vector of float32 values (will be resized)
total_num_elements: Total number of float elements expected after dequantization
log_first_n_blocks: Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1165 of file quantization.cpp.

void dequantize_vector_q8_0_to_f32(const std::vector<block_q8_0>& q_weights,
                                   std::vector<float>& f32_weights,
                                   size_t total_num_elements,
                                   int log_first_n_blocks) {
  if (q_weights.empty()) {
    Logger::warning("[DEQUANT_VEC_Q8_0] Input Q8_0 weight vector is empty. Output float vector will be empty.");
    f32_weights.clear();
    return;
  }

  f32_weights.resize(total_num_elements);

  size_t expected_blocks = (total_num_elements + GGML_QK8_0 - 1) / GGML_QK8_0;

  if (q_weights.size() != expected_blocks) {
    Logger::error("[DEQUANT_VEC_Q8_0] Mismatch in Q8_0 block count. Expected: " +
                  std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                  ". Total elements: " + std::to_string(total_num_elements));
  }

  float* current_output_ptr = f32_weights.data();
  size_t elements_processed = 0;

  for (size_t i = 0; i < q_weights.size(); ++i) {
    const block_q8_0* current_block_ptr = &q_weights[i];
    int elements_in_this_block = GGML_QK8_0;

    // The final block may be partial.
    if (elements_processed + GGML_QK8_0 > total_num_elements) {
      elements_in_this_block = static_cast<int>(total_num_elements - elements_processed);
    }

    if (elements_in_this_block <= 0) {
      Logger::warning("[DEQUANT_VEC_Q8_0] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
      continue;
    }

    if (elements_in_this_block == GGML_QK8_0) {
      dequantize_q8_0_block(current_block_ptr, current_output_ptr);
    } else {
      // Partial final block: dequantize into scratch, then copy what is needed.
      float temp_block[GGML_QK8_0];
      dequantize_q8_0_block(current_block_ptr, temp_block);
      std::memcpy(current_output_ptr, temp_block, elements_in_this_block * sizeof(float));
    }

    current_output_ptr += elements_in_this_block;
    elements_processed += elements_in_this_block;
  }

  if (elements_processed != total_num_elements) {
    Logger::warning("[DEQUANT_VEC_Q8_0] Processed " + std::to_string(elements_processed) +
                    " elements, but expected " + std::to_string(total_num_elements) + ".");
  }
}

References dequantize_q8_0_block(), Logger::error(), GGML_QK8_0, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().

◆ fp16_to_fp32()

float fp16_to_fp32 (uint16_t h, bool is_gguf_scale_field = false)

Converts a 16-bit floating point number to 32-bit float.

Parameters
h: The 16-bit float value to convert
is_gguf_scale_field: Whether this value is from a GGUF scale field
Returns
The converted 32-bit float value

Definition at line 47 of file quantization.cpp.

float fp16_to_fp32(uint16_t h, bool is_gguf_scale_field) {
  uint32_t sign = (h >> 15) & 1;
  uint32_t exp_fp16 = (h >> 10) & 0x1f;
  uint32_t mant_fp16 = h & 0x3ff;
  uint32_t x;

  if (exp_fp16 == 0) {
    if (mant_fp16 == 0) {
      // Signed zero.
      x = (sign << 31);
    } else {
      // Subnormal: renormalize the mantissa before rebiasing.
      exp_fp16 = 1;
      while ((mant_fp16 & 0x400) == 0) {
        mant_fp16 <<= 1;
        exp_fp16--;
      }
      mant_fp16 &= ~0x400;
      uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
      uint32_t mant_fp32 = mant_fp16 << 13;
      x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
    }
  } else if (exp_fp16 == 0x1f) {
    // Inf / NaN.
    x = (sign << 31) | (0xff << 23) | (mant_fp16 << 13);
  } else {
    // Normal number: rebias the exponent from 15 to 127.
    uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
    uint32_t mant_fp32 = mant_fp16 << 13;
    x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
  }

  float f;
  std::memcpy(&f, &x, sizeof(float));

  // GGUF scale fields are expected to be non-negative; flip a stray sign bit.
  if (is_gguf_scale_field && f < 0.0f && !(std::isnan(f) || std::isinf(f))) {
    f = std::abs(f);
  }

  return f;
}

Referenced by dequantize_q2_k(), dequantize_q3_k(), dequantize_q4_k_m(), dequantize_q6_k(), dequantize_q8_0_block(), dequantize_q8_k(), vec_dot_q4_k_q8_k_cpu(), and vec_dot_q6_k_q8_k_cpu().

◆ fp32_to_fp16()

uint16_t fp32_to_fp16 (float f)

Converts a 32-bit float to 16-bit floating point.

Parameters
f: The 32-bit float value to convert
Returns
The converted 16-bit float value

Definition at line 92 of file quantization.cpp.

uint16_t fp32_to_fp16(float f) {
  uint32_t x;
  std::memcpy(&x, &f, sizeof(float));

  uint32_t sign = (x >> 31) & 1;
  uint32_t exp_fp32 = (x >> 23) & 0xff;
  uint32_t mant_fp32 = x & 0x7fffff;

  uint16_t u;

  if (exp_fp32 == 0xff) {
    // Inf / NaN (a NaN keeps a nonzero mantissa bit).
    u = (sign << 15) | 0x7c00 | (mant_fp32 != 0 ? 0x200 : 0);
  } else {
    int exp_fp16 = (int)exp_fp32 - 127 + 15;

    if (exp_fp16 >= 0x1f) {
      // Overflow: saturate to infinity.
      u = (sign << 15) | 0x7c00;
    } else if (exp_fp16 <= 0) {
      if (exp_fp16 < -10) {
        // Too small even for a subnormal: flush to signed zero.
        u = (sign << 15);
      } else {
        // Subnormal: shift the implicit leading bit into the mantissa.
        mant_fp32 = (mant_fp32 | 0x800000) >> (1 - exp_fp16);
        if ((mant_fp32 >> 13) & 1) {
          mant_fp32 += (1 << 13);  // round to nearest
        }
        u = (sign << 15) | (mant_fp32 >> 13);
      }
    } else {
      // Normal number: round, handling mantissa carry into the exponent.
      if ((mant_fp32 >> 13) & 1) {
        mant_fp32 += (1 << 13);
        if ((mant_fp32 >> 23) == 1) {
          mant_fp32 = 0;
          exp_fp16++;
          if (exp_fp16 >= 0x1f) {
            u = (sign << 15) | 0x7c00;
            return u;
          }
        }
      }
      u = (sign << 15) | (exp_fp16 << 10) | (mant_fp32 >> 13);
    }
  }
  return u;
}

Referenced by quantize_fp32_to_q8_K(), quantize_q4_k_m(), and quantize_q6_k().
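
A quick sanity sketch of the two converters; the bit patterns follow IEEE 754 half precision, and values exactly representable in fp16 survive a round trip:

#include <cassert>
#include "quantization.h"

int main() {
  assert(fp16_to_fp32(0x3C00) == 1.0f);   // exponent 15, mantissa 0
  assert(fp16_to_fp32(0xC000) == -2.0f);  // sign bit set, exponent 16

  uint16_t h = fp32_to_fp16(0.5f);
  assert(fp16_to_fp32(h) == 0.5f);
}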

◆ ggml_type_block_size()

size_t ggml_type_block_size (GGMLType type)

Gets the block size for a GGML type.

Parameters
type: The GGML type
Returns
Block size in elements

Definition at line 688 of file quantization.cpp.

size_t ggml_type_block_size(GGMLType type) {
  switch (type) {
    case GGML_TYPE_Q2_K:
    case GGML_TYPE_Q3_K:
    case GGML_TYPE_Q4_K:
    case GGML_TYPE_Q6_K:
      return GGML_QK_K;

    case GGML_TYPE_Q4_0:
    case GGML_TYPE_Q8_0:
      return 32;

    case GGML_TYPE_F32:
    case GGML_TYPE_F16:
    case GGML_TYPE_BF16:
    case GGML_TYPE_I8:
    case GGML_TYPE_I16:
    case GGML_TYPE_I32:
      return 1;

    default:
      std::cout << "Warning: Unknown GGMLType in ggml_type_block_size: "
                << static_cast<int>(type) << std::endl;
      return 0;
  }
}

References GGML_QK_K, GGML_TYPE_BF16, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, and GGML_TYPE_Q8_0.

Referenced by load_gguf_meta().

◆ ggml_type_name()

const char * ggml_type_name (GGMLType type)

Gets the string name of a GGML type.

Parameters
type: The GGML type
Returns
String representation of the type

Definition at line 601 of file quantization.cpp.

const char* ggml_type_name(GGMLType type) {
  switch (type) {
    case GGML_TYPE_F32:   return "F32";
    case GGML_TYPE_F16:   return "F16";
    case GGML_TYPE_Q4_0:  return "Q4_0";
    case GGML_TYPE_Q4_1:  return "Q4_1";
    case GGML_TYPE_Q5_0:  return "Q5_0";
    case GGML_TYPE_Q5_1:  return "Q5_1";
    case GGML_TYPE_Q8_0:  return "Q8_0";
    case GGML_TYPE_Q8_1:  return "Q8_1";
    case GGML_TYPE_Q2_K:  return "Q2_K";
    case GGML_TYPE_Q3_K:  return "Q3_K";
    case GGML_TYPE_Q4_K:  return "Q4_K";
    case GGML_TYPE_Q5_K:  return "Q5_K";
    case GGML_TYPE_Q6_K:  return "Q6_K";
    case GGML_TYPE_Q8_K:  return "Q8_K";
    case GGML_TYPE_I8:    return "I8";
    case GGML_TYPE_I16:   return "I16";
    case GGML_TYPE_I32:   return "I32";
    case GGML_TYPE_BF16:  return "BF16";
    case GGML_TYPE_COUNT: return "COUNT";
    default:              return "Unknown";
  }
}

References GGML_TYPE_BF16, GGML_TYPE_COUNT, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, GGML_TYPE_Q8_1, and GGML_TYPE_Q8_K.

Referenced by load_gguf_meta().

◆ ggml_type_size()

size_t ggml_type_size (GGMLType type)

Gets the size in bytes of a GGML type.

Parameters
type: The GGML type
Returns
Size in bytes

Definition at line 646 of file quantization.cpp.

size_t ggml_type_size(GGMLType type) {
  switch (type) {
    case GGML_TYPE_F32:  return sizeof(float);
    case GGML_TYPE_F16:  return sizeof(uint16_t);
    case GGML_TYPE_I8:   return sizeof(int8_t);
    case GGML_TYPE_Q4_K: return sizeof(block_q4_K);
    case GGML_TYPE_Q2_K: return sizeof(block_q2_K);
    case GGML_TYPE_Q3_K: return sizeof(block_q3_K);
    case GGML_TYPE_Q6_K: return sizeof(block_q6_K);
    case GGML_TYPE_Q4_0: return 18;
    case GGML_TYPE_Q8_0: return 34;
    case GGML_TYPE_Q8_1: return 40;
    case GGML_TYPE_Q5_K: return 116;
    case GGML_TYPE_Q8_K: return 290;
    case GGML_TYPE_I16:  return sizeof(int16_t);
    case GGML_TYPE_I32:  return sizeof(int32_t);
    case GGML_TYPE_BF16: return sizeof(uint16_t);
    case GGML_TYPE_COUNT:
    default:
      std::cout << " UNKNOWN GGML TYPE: " << static_cast<int>(type)
                << std::endl;
      throw std::invalid_argument("Unknown GGML type in ggml_type_size: " +
                                  std::to_string(static_cast<int>(type)));
  }
}

References GGML_TYPE_BF16, GGML_TYPE_COUNT, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, GGML_TYPE_Q8_1, and GGML_TYPE_Q8_K.

Referenced by load_gguf_meta().
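
Together with ggml_type_block_size, this gives the byte footprint of a tensor: elements are grouped into blocks, and each block occupies ggml_type_size bytes. A sketch (the rounding up only matters if the element count is not block-aligned):

#include <cstddef>
#include "quantization.h"

// Bytes occupied by num_elements values of the given quantized type.
size_t tensor_nbytes(GGMLType type, size_t num_elements) {
  size_t block_elems = ggml_type_block_size(type);  // e.g. GGML_QK_K for K-quants
  size_t block_bytes = ggml_type_size(type);        // e.g. sizeof(block_q4_K)
  size_t num_blocks = (num_elements + block_elems - 1) / block_elems;
  return num_blocks * block_bytes;
}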

◆ handle_i8_tensor()

void handle_i8_tensor (const void *i8_data, float *f_data, size_t num_elements)

Handles conversion of int8 tensor data to float32.

Parameters
i8_data: Input int8 data
f_data: Output float array
num_elements: Number of elements to convert

Definition at line 268 of file quantization.cpp.

void handle_i8_tensor(const void* i8_data, float* f_data, size_t num_elements) {
  const int8_t* input_ptr = static_cast<const int8_t*>(i8_data);
  for (size_t i = 0; i < num_elements; ++i) {
    f_data[i] = static_cast<float>(input_ptr[i]);
  }
}

◆ matvec_q4k_q8k_cpu()

void matvec_q4k_q8k_cpu (const std::vector< block_q4_K > &mat_q4k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)

Computes matrix-vector product between Q4_K matrix and Q8_K vector on CPU.

Parameters
mat_q4k: Q4_K matrix
vec_q8k: Q8_K vector
out_f32: Output float vector
rows: Number of matrix rows
cols: Number of matrix columns
log_calls: Whether to log computation details

Definition at line 982 of file quantization.cpp.

void matvec_q4k_q8k_cpu(const std::vector<block_q4_K>& mat_q4k,
                        const std::vector<block_q8_K>& vec_q8k,
                        std::vector<float>& out_f32,
                        int rows, int cols, bool log_calls) {
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q4k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }
  size_t blocks_per_row = cols / GGML_QK_K;
  if (mat_q4k.size() != (size_t)rows * blocks_per_row) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: mat_q4k size mismatch");
  }
  if (vec_q8k.size() != blocks_per_row) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: vec_q8k size mismatch");
  }
  out_f32.resize(rows);

#pragma omp parallel for
  for (int r = 0; r < rows; ++r) {
    const std::vector<block_q4_K> row_q4k(
        mat_q4k.begin() + r * blocks_per_row,
        mat_q4k.begin() + (r + 1) * blocks_per_row);

    out_f32[r] = vec_dot_q4_k_q8_k_cpu(cols, row_q4k, vec_q8k, log_calls);
  }
}

References GGML_QK_K, and vec_dot_q4_k_q8_k_cpu().
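
End-to-end sketch with made-up dimensions; the function throws unless cols is a multiple of GGML_QK_K and mat_q4k holds exactly rows * cols / GGML_QK_K blocks:

#include <vector>
#include "quantization.h"

void matvec_sketch(const std::vector<block_q4_K>& w,  // rows * cols / GGML_QK_K blocks
                   const std::vector<float>& x) {     // cols floats
  const int rows = 64;                                // illustrative
  const int cols = static_cast<int>(x.size());

  std::vector<block_q8_K> x_q8k = quantize_fp32_to_q8_K(x);
  std::vector<float> y;
  matvec_q4k_q8k_cpu(w, x_q8k, y, rows, cols, /*log_calls=*/false);
  // y[r] is the dot product of row r with x.
}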

◆ matvec_q6k_q8k_cpu()

void matvec_q6k_q8k_cpu (const std::vector< block_q6_K > &mat_q6k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)

Computes matrix-vector product between Q6_K matrix and Q8_K vector on CPU.

Parameters
mat_q6k: Q6_K matrix
vec_q8k: Q8_K vector
out_f32: Output float vector
rows: Number of matrix rows
cols: Number of matrix columns
log_calls: Whether to log computation details

Definition at line 897 of file quantization.cpp.

void matvec_q6k_q8k_cpu(const std::vector<block_q6_K>& mat_q6k,
                        const std::vector<block_q8_K>& vec_q8k,
                        std::vector<float>& out_f32,
                        int rows, int cols, bool log_calls) {
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q6k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }
  size_t blocks_per_row = cols / GGML_QK_K;
  if (mat_q6k.size() != (size_t)rows * blocks_per_row) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: mat_q6k size mismatch");
  }
  if (vec_q8k.size() != blocks_per_row) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: vec_q8k size mismatch");
  }
  out_f32.resize(rows);
  for (int r = 0; r < rows; ++r) {
    const std::vector<block_q6_K> row_q6k(
        mat_q6k.begin() + r * blocks_per_row,
        mat_q6k.begin() + (r + 1) * blocks_per_row);

    out_f32[r] = vec_dot_q6_k_q8_k_cpu(cols, row_q6k, vec_q8k, log_calls);
  }
}

References GGML_QK_K, and vec_dot_q6_k_q8_k_cpu().

◆ quantize_fp32_to_q8_K()

std::vector< block_q8_K > quantize_fp32_to_q8_K (const std::vector< float > &f_data)

Quantizes float32 data to Q8_K format.

Parameters
f_data: Input float vector
Returns
Vector of Q8_K blocks

Definition at line 719 of file quantization.cpp.

std::vector<block_q8_K> quantize_fp32_to_q8_K(const std::vector<float>& f_data) {
  if (f_data.size() % GGML_QK_K != 0) {
    throw std::runtime_error(
        "Input vector size must be a multiple of GGML_QK_K (" +
        std::to_string(GGML_QK_K) + ")");
  }

  size_t num_blocks = f_data.size() / GGML_QK_K;
  std::vector<block_q8_K> q_data(num_blocks);
  const float* x = f_data.data();
  block_q8_K* y = q_data.data();

  static std::atomic<int> log_count_q8k_quant_scales = 0;

  for (size_t i = 0; i < num_blocks; ++i) {
    // Per-block scale derived from the absolute maximum.
    float amax = 0.0f;
    for (int j = 0; j < GGML_QK_K; ++j) {
      amax = SAFE_MAX(amax, std::abs(x[j]));
    }

    const float d_fp32 = amax / Q8K_SCALE_FACTOR;
    const float id = (d_fp32 != 0.f) ? 1.0f / d_fp32 : 0.0f;
    y[i].d = fp32_to_fp16(d_fp32);

    if (log_count_q8k_quant_scales < 10) {
      std::stringstream q8k_scale_log_ss;
      q8k_scale_log_ss << "[Q8K_QUANT_SCALES] Block #" << i
                       << " Input amax=" << amax << " -> d_fp32=" << d_fp32
                       << " -> Stored d_fp16=0x" << std::hex << y[i].d
                       << std::dec;
      Logger::debug(q8k_scale_log_ss.str());
      log_count_q8k_quant_scales++;
    }

    // Quantize to int8 and accumulate 16-element group sums for the dot kernels.
    int16_t block_sum[16] = {0};
    for (int j = 0; j < GGML_QK_K; ++j) {
      const float val_scaled = x[j] * id;
      int8_t q_val = static_cast<int8_t>(
          SAFE_MAX(-128.0f, SAFE_MIN(127.0f, std::round(val_scaled))));
      y[i].qs[j] = q_val;
      block_sum[j / 16] += q_val;
    }

    std::memcpy(y[i].bsums, block_sum, sizeof(block_sum));

    x += GGML_QK_K;
  }

  return q_data;
}

References block_q8_K::d, Logger::debug(), fp32_to_fp16(), GGML_QK_K, Q8K_SCALE_FACTOR, block_q8_K::qs, SAFE_MAX, and SAFE_MIN.
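
A small sketch that inspects the first produced block; the input is arbitrary, and GGML_QK_K is assumed to be 256 as in gguf_parser.h:

#include <vector>
#include "quantization.h"

int main() {
  std::vector<float> data(GGML_QK_K, 0.5f);
  data[0] = -4.0f;  // make amax nontrivial

  std::vector<block_q8_K> q = quantize_fp32_to_q8_K(data);
  // q[0].d is the fp16 scale (amax / Q8K_SCALE_FACTOR), q[0].qs holds the
  // int8 values, and q[0].bsums the 16 per-group sums used by the dot kernels.
  float d = fp16_to_fp32(q[0].d, /*is_gguf_scale_field=*/true);
  (void)d;
}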

◆ quantize_q4_k_m()

void quantize_q4_k_m (const float *f_data, void *q_data, int num_elements)

Quantizes float32 data to Q4_K format.

Parameters
f_data: Input float array
q_data: Output quantized data
num_elements: Number of elements to quantize

Definition at line 276 of file quantization.cpp.

void quantize_q4_k_m(const float* f_data, void* q_data, int num_elements) {
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q4_k_m currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q4_K* output_qblock = static_cast<block_q4_K*>(q_data);

  std::memset(output_qblock->scales, 0, sizeof(output_qblock->scales));
  std::memset(output_qblock->qs, 0, sizeof(output_qblock->qs));

  // Block-wide range drives the super scale and super minimum.
  float block_min_val = std::numeric_limits<float>::max();
  float block_max_val = std::numeric_limits<float>::lowest();
  for (int i = 0; i < num_elements; ++i) {
    block_min_val = SAFE_MIN(block_min_val, f_data[i]);
    block_max_val = SAFE_MAX(block_max_val, f_data[i]);
  }

  if (block_max_val == block_min_val) {
    block_max_val = block_min_val + GGUF_SMALL_VAL;
  }
  if (block_max_val < GGUF_EPSILON && block_max_val > -GGUF_EPSILON) {
    block_max_val = GGUF_SMALL_VAL;
    block_min_val = 0.0f;
  }

  const float d_super_scale_candidate = (block_max_val - block_min_val) / Q4K_SCALE_FACTOR;
  const float d_super =
      d_super_scale_candidate > GGUF_EPSILON ? d_super_scale_candidate : GGUF_EPSILON;
  const float min_super = block_min_val;

  output_qblock->d = fp32_to_fp16(d_super);
  output_qblock->dmin = fp32_to_fp16(min_super);

  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    const float* sub_block_input = f_data + j * 16;

    float sub_min_val = sub_block_input[0];
    float sub_max_val = sub_block_input[0];
    for (int i = 1; i < 16; ++i) {
      sub_min_val = SAFE_MIN(sub_min_val, sub_block_input[i]);
      sub_max_val = SAFE_MAX(sub_max_val, sub_block_input[i]);
    }

    float ideal_scale = 0.0f;
    if (sub_max_val > sub_min_val + GGUF_EPSILON) {
      ideal_scale = (sub_max_val - sub_min_val) / Q4K_SCALE_FACTOR;
    }
    float ideal_min = sub_min_val;

    // Pick the 4-bit scale index whose effective scale best matches the ideal.
    uint8_t best_scale_idx = 0;
    float min_scale_err = std::numeric_limits<float>::max();
    if (d_super > GGUF_EPSILON) {
      for (uint8_t k = 0; k < 16; ++k) {
        float candidate_scale = d_super * K_SCALE_VALUES[k];
        float err = std::abs(candidate_scale - ideal_scale);
        if (err < min_scale_err) {
          min_scale_err = err;
          best_scale_idx = k;
        }
      }
    }

    uint8_t best_min_idx = 0;
    float min_min_err = std::numeric_limits<float>::max();
    for (uint8_t l = 0; l < 16; ++l) {
      float candidate_min = min_super * K_MIN_VALUES[l];
      float err = std::abs(candidate_min - ideal_min);
      if (err < min_min_err) {
        min_min_err = err;
        best_min_idx = l;
      }
    }

    int scale_byte_idx = j % 8;
    int scale_shift = 4 * (j / 8);
    output_qblock->scales[scale_byte_idx] |= (best_scale_idx << scale_shift);

    int min_byte_idx = (j % 4) + 8;
    int min_shift = 4 * (j / 4);
    output_qblock->scales[min_byte_idx] |= (best_min_idx << min_shift);

    float actual_scale = d_super * K_SCALE_VALUES[best_scale_idx];
    float actual_min = min_super * K_MIN_VALUES[best_min_idx];
    float inv_actual_scale = (actual_scale > GGUF_EPSILON || actual_scale < -GGUF_EPSILON)
                                 ? 1.0f / actual_scale
                                 : 0.0f;

    uint8_t packed_qs[8];
    std::memset(packed_qs, 0, sizeof(packed_qs));

    // Quantize each value to 4 bits and pack two values per byte.
    for (int i = 0; i < 16; ++i) {
      float val = sub_block_input[i];

      int quant_val = 0;
      if (inv_actual_scale != 0.0f) {
        quant_val =
            static_cast<int>(std::round((val - actual_min) * inv_actual_scale)) + Q4K_OFFSET;
      }
      quant_val = SAFE_MAX(0, SAFE_MIN(15, quant_val));

      int byte_idx_qs = i / 2;
      int shift_qs = (i % 2) * 4;
      packed_qs[byte_idx_qs] |= (static_cast<uint8_t>(quant_val) << shift_qs);
    }

    uint8_t* qs_target = output_qblock->qs + j * 8;
    for (int i = 0; i < 8; ++i) {
      uint8_t low_nibble_val = packed_qs[i] & 0x0F;
      uint8_t high_nibble_val = (packed_qs[i] >> 4) & 0x0F;
      qs_target[i] = low_nibble_val | (high_nibble_val << 4);
    }
  }
}

References block_q4_K::d, block_q4_K::dmin, fp32_to_fp16(), GGML_QK_K, GGUF_EPSILON, GGUF_SMALL_VAL, K_MIN_VALUES, K_SCALE_VALUES, Q4K_OFFSET, Q4K_SCALE_FACTOR, block_q4_K::qs, SAFE_MAX, SAFE_MIN, and block_q4_K::scales.
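
Round-trip sketch for one 256-element block; the reconstruction error depends on the data, so the printed value is only indicative of 4-bit precision:

#include <cmath>
#include <cstdio>
#include "quantization.h"

int main() {
  float input[GGML_QK_K];
  for (int i = 0; i < (int)GGML_QK_K; ++i)
    input[i] = 0.01f * i;  // arbitrary smooth ramp

  block_q4_K blk;
  quantize_q4_k_m(input, &blk, (int)GGML_QK_K);

  float output[GGML_QK_K];
  dequantize_q4_k_m(&blk, output, (int)GGML_QK_K);

  float max_err = 0.0f;
  for (int i = 0; i < (int)GGML_QK_K; ++i)
    max_err = std::fmax(max_err, std::fabs(output[i] - input[i]));
  std::printf("max abs error: %f\n", max_err);
}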

◆ quantize_q6_k()

void quantize_q6_k (const float *f_data, void *q_data, int num_elements)

Quantizes float32 data to Q6_K format.

Parameters
f_data: Input float array
q_data: Output quantized data
num_elements: Number of elements to quantize

Definition at line 549 of file quantization.cpp.

void quantize_q6_k(const float* f_data, void* q_data, int num_elements) {
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q6_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q6_K* output_qblock = static_cast<block_q6_K*>(q_data);

  uint8_t* ql = output_qblock->ql;
  uint8_t* qh = output_qblock->qh;
  int8_t* scales = output_qblock->scales;
  std::memset(ql, 0, GGML_QK_K / 2);
  std::memset(qh, 0, GGML_QK_K / 4);

  float amax = 0.0f;
  for (int i = 0; i < num_elements; ++i) {
    amax = SAFE_MAX(amax, std::abs(f_data[i]));
  }

  const float d_float = (amax > GGUF_EPSILON) ? (amax / Q6K_SCALE_FACTOR) : GGUF_EPSILON;
  output_qblock->d = fp32_to_fp16(d_float);

  for (int sub = 0; sub < GGML_QK_K / 16; ++sub) {
    const float* sub_in = f_data + sub * 16;

    float sub_amax = 0.0f;
    for (int i = 0; i < 16; ++i) {
      sub_amax = SAFE_MAX(sub_amax, std::abs(sub_in[i]));
    }

    int8_t scale = (d_float > 0.0f)
                       ? static_cast<int8_t>(std::round(sub_amax / d_float))
                       : 1;
    if (scale == 0) scale = 1;
    scales[sub] = scale;

    for (int i = 0; i < 16; ++i) {
      float val = sub_in[i];
      int q = static_cast<int>(std::round(val / (d_float * scale))) + Q6K_OFFSET;
      q = SAFE_MAX(0, SAFE_MIN(63, q));

      // Pack the low 4 bits into ql and the high 2 bits into qh.
      int idx = sub * 16 + i;
      int ql_idx = idx / 2;
      int ql_shift = (idx % 2) * 4;
      ql[ql_idx] |= (q & 0x0F) << ql_shift;
      int qh_idx = idx / 4;
      int qh_shift = (idx % 4) * 2;
      qh[qh_idx] |= ((q >> 4) & 0x03) << qh_shift;
    }
  }
}

References block_q6_K::d, fp32_to_fp16(), GGML_QK_K, GGUF_EPSILON, Q6K_OFFSET, Q6K_SCALE_FACTOR, block_q6_K::qh, block_q6_K::ql, SAFE_MAX, SAFE_MIN, and block_q6_K::scales.

◆ vec_dot_q4_k_q8_k_cpu()

float vec_dot_q4_k_q8_k_cpu (int n, const std::vector< block_q4_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)

Computes dot product between Q4_K and Q8_K vectors on CPU.

Parameters
n: Number of elements (must be a multiple of GGML_QK_K)
x_vec: Q4_K vector
y_vec: Q8_K vector
log_this_call: Whether to log computation details
Returns
Dot product result

Definition at line 922 of file quantization.cpp.

float vec_dot_q4_k_q8_k_cpu(int n, const std::vector<block_q4_K>& x_vec,
                            const std::vector<block_q8_K>& y_vec,
                            bool log_this_call) {
  int log_count_now = g_vec_dot_q4_k_q8_k_log_count.fetch_add(1);
  if (log_count_now >= 5) log_this_call = false;

  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: n must be multiple of QK_K");
  }
  size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: vector block count mismatch");
  }

  const block_q4_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  float sumf = 0.0f;
  for (size_t i = 0; i < nb; ++i) {
    // Unpack the 4-bit nibbles into one int8 value per weight.
    int8_t q4_vals[GGML_QK_K];
    const uint8_t* q4 = x[i].qs;
    for (int j = 0; j < GGML_QK_K / 2; ++j) {
      q4_vals[2 * j + 0] = static_cast<int8_t>(q4[j] & 0xF);
      q4_vals[2 * j + 1] = static_cast<int8_t>(q4[j] >> 4);
    }

    const int8_t* q8 = y[i].qs;

    for (int sub = 0; sub < 16; ++sub) {
      uint8_t scale_idx, min_idx;
      get_scale_min_indices_q4_K(sub, x[i].scales, &scale_idx, &min_idx);
      float scale = fp16_to_fp32(x[i].d) * K_SCALE_VALUES[scale_idx];
      float minv = fp16_to_fp32(x[i].dmin) * K_MIN_VALUES[min_idx];
      for (int k = 0; k < 16; ++k) {
        int idx = sub * 16 + k;
        float q4_val = static_cast<float>(q4_vals[idx]) - 8.0f;
        float q8_val = static_cast<float>(q8[idx]);
        sumf += (scale * q4_val + minv) * q8_val;
      }
    }

    if (i == 0 && log_this_call) {
      std::stringstream ss;
      ss << "[Q4K_Q8K] Block #0: d: " << fp16_to_fp32(x[i].d)
         << ", dmin: " << fp16_to_fp32(x[i].dmin);
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q8_K input (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q8[k] << " ";
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q4_K unpacked (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q4_vals[k] << " ";
      Logger::debug(ss.str());
    }
  }
  return sumf;
}

References Logger::debug(), fp16_to_fp32(), g_vec_dot_q4_k_q8_k_log_count, get_scale_min_indices_q4_K(), GGML_QK_K, K_MIN_VALUES, K_SCALE_VALUES, block_q4_K::qs, and block_q8_K::qs.

Referenced by matvec_q4k_q8k_cpu().

◆ vec_dot_q6_k_q8_k_cpu()

float vec_dot_q6_k_q8_k_cpu (int n, const std::vector< block_q6_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)

Computes dot product between Q6_K and Q8_K vectors on CPU.

Parameters
n: Number of elements (must be a multiple of GGML_QK_K)
x_vec: Q6_K vector
y_vec: Q8_K vector
log_this_call: Whether to log computation details
Returns
Dot product result

Definition at line 772 of file quantization.cpp.

float vec_dot_q6_k_q8_k_cpu(int n, const std::vector<block_q6_K>& x_vec,
                            const std::vector<block_q8_K>& y_vec,
                            bool log_this_call) {
  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: n must be multiple of QK_K");
  }
  size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: vector block count mismatch");
  }

  const block_q6_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  int8_t aux8[GGML_QK_K];
  int16_t aux16[8];
  float sums[8];
  int32_t aux32[8];
  std::memset(sums, 0, 8 * sizeof(float));

  float sumf = 0.0f;

  static std::atomic<int> log_count_dot = 0;
  bool should_log_this_block = log_this_call && log_count_dot < 5;

  for (size_t i = 0; i < nb; ++i) {
    const uint8_t* ql = x[i].ql;
    const uint8_t* qh = x[i].qh;
    const int8_t* q8 = y[i].qs;
    std::memset(aux32, 0, 8 * sizeof(int32_t));

    // Unpack 6-bit weights: low 4 bits from ql, high 2 bits from qh,
    // recentered from [0, 63] to [-32, 31].
    int8_t* a = aux8;
    for (int j = 0; j < GGML_QK_K; j += 128) {
      for (int l = 0; l < 32; ++l) {
        a[l + 0] = static_cast<int8_t>(
            ((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32);
        a[l + 32] = static_cast<int8_t>(
            ((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32);
        a[l + 64] = static_cast<int8_t>(
            ((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32);
        a[l + 96] = static_cast<int8_t>(
            ((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
      }
      a += 128;
      ql += 64;
      qh += 32;
    }

    // Accumulate scaled int16 products per 16-element sub-block.
    a = aux8;
    int is = 0;
    for (int j = 0; j < GGML_QK_K / 16; ++j) {
      int scale = x[i].scales[is++];
      for (int l = 0; l < 8; ++l)
        aux16[l] = static_cast<int16_t>(q8[l]) * static_cast<int16_t>(a[l]);
      for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
      q8 += 8;
      a += 8;
      for (int l = 0; l < 8; ++l)
        aux16[l] = static_cast<int16_t>(q8[l]) * static_cast<int16_t>(a[l]);
      for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
      q8 += 8;
      a += 8;
    }

    // Compensation for the -32 recentering, using the Q8_K per-group sums.
    int32_t sumi_mins = 0;
    for (int j = 0; j < GGML_QK_K / 16; ++j) {
      sumi_mins += static_cast<int32_t>(y[i].bsums[j]) *
                   static_cast<int32_t>(x[i].scales[j]);
    }

    const float d_q6 = fp16_to_fp32(x[i].d);
    const float d_q8 = fp16_to_fp32(y[i].d);
    const float d = d_q6 * d_q8;

    float block_contribution = 0.0f;
    for (int l = 0; l < 8; ++l) {
      float term = d * (aux32[l] - 32 * sumi_mins / 8);
      sums[l] += term;
      block_contribution += term;
    }

    if (i == 0 && should_log_this_block) {
      std::stringstream ss_log;
      ss_log << "[DOT_Q6K_Q8K] Call #" << (log_count_dot.load() + 1)
             << ", Block #0:";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Scale d_q6: " << d_q6 << " (Raw FP16: 0x" << std::hex
             << x[i].d << std::dec << ")";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q8_K Scale d_q8: " << d_q8;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Combined Scale d: " << d;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Sub-scales (int8): ";
      for (int k = 0; k < 16; ++k) ss_log << (int)x[i].scales[k] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Int32 Sums (aux32, before compensation): ";
      for (int l = 0; l < 8; ++l) ss_log << aux32[l] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Compensation term (sumi_mins): " << sumi_mins
             << ", -32 * sumi_mins: " << (-32 * sumi_mins);
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Block #0 Contribution to Sums (after compensation): "
             << block_contribution;
      Logger::debug(ss_log.str());
    }
  }

  for (int l = 0; l < 8; ++l) {
    sumf += sums[l];
  }

  if (should_log_this_block) {
    log_count_dot++;
  }
  return sumf;
}

References block_q8_K::bsums, block_q6_K::d, Logger::debug(), fp16_to_fp32(), GGML_QK_K, block_q6_K::qh, block_q6_K::ql, block_q8_K::qs, and block_q6_K::scales.

Referenced by matvec_q6k_q8k_cpu().
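
As a consistency sketch, the quantized dot product should closely track a float reference built from the dequantized operands (exact agreement is not expected, since the kernel accumulates in integer space):

#include <vector>
#include "quantization.h"

float reference_dot(const std::vector<block_q6_K>& x,
                    const std::vector<block_q8_K>& y, int n) {
  std::vector<float> xf;
  dequantize_vector_q6k_to_f32(x, xf, static_cast<size_t>(n));

  float acc = 0.0f;
  for (size_t b = 0; b < y.size(); ++b) {
    float d = fp16_to_fp32(y[b].d, /*is_gguf_scale_field=*/true);
    for (size_t j = 0; j < GGML_QK_K; ++j)
      acc += xf[b * GGML_QK_K + j] * d * static_cast<float>(y[b].qs[j]);
  }
  return acc;  // compare against vec_dot_q6_k_q8_k_cpu(n, x, y, false)
}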