#include "quantization.h"
#include "model_macros.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "logger.h"
#include "gguf_parser.h"

Include dependency graph for quantization.cpp:

Functions
float	fp16_to_fp32 (uint16_t h, bool is_gguf_scale_field)
	Converts a 16-bit floating point number to 32-bit float.

uint16_t	fp32_to_fp16 (float f)
	Converts a 32-bit float to 16-bit floating point.

static void	get_scale_min_k4 (int j, const uint8_t *q, uint8_t *d_val, uint8_t *m_val)

static void	get_scale_min_indices_q4_K (int j, const uint8_t *scales, uint8_t *scale_index, uint8_t *min_index)

void	dequantize_q4_k_m (const block_q4_K *qblock, float *output, int num_weights_in_block, bool log_this_block)

void	dequantize_q6_k (const block_q6_K *qblock, float *output, int num_weights_in_block, bool log_this_block)

void	handle_i8_tensor (const void *input_data, float *output_data, size_t num_elements)
	Handles conversion of int8 tensor data to float32.

void	quantize_q4_k_m (const float *input, void *output_qblock_void, int num_elements)
	Quantizes float32 data to Q4_K format.

void	dequantize_q2_k (const void *qblock_void, float *output, int num_weights_in_block)

void	dequantize_q3_k (const void *qblock_void, float *output, int num_weights_in_block)
	Dequantizes a Q3_K quantized block to float32.

void	quantize_q6_k (const float *input, void *output_qblock_void, int num_elements)
	Quantizes float32 data to Q6_K format.

const char *	ggml_type_name (GGMLType type)
	Gets the string name of a GGML type.

size_t	ggml_type_size (GGMLType type)
	Gets the size in bytes of a GGML type.

size_t	ggml_type_block_size (GGMLType type)
	Gets the block size for a GGML type.

std::vector< block_q8_K >	quantize_fp32_to_q8_K (const std::vector< float > &f_data)
	Quantizes float32 data to Q8_K format.

float	vec_dot_q6_k_q8_k_cpu (int n, const std::vector< block_q6_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)
	Computes dot product between Q6_K and Q8_K vectors on CPU.

void	matvec_q6k_q8k_cpu (const std::vector< block_q6_K > &mat_q6k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
	Computes matrix-vector product between Q6_K matrix and Q8_K vector on CPU.

float	vec_dot_q4_k_q8_k_cpu (int n, const std::vector< block_q4_K > &x_vec, const std::vector< block_q8_K > &y_vec, bool log_this_call)
	Computes dot product between Q4_K and Q8_K vectors on CPU.

void	matvec_q4k_q8k_cpu (const std::vector< block_q4_K > &mat_q4k, const std::vector< block_q8_K > &vec_q8k, std::vector< float > &out_f32, int rows, int cols, bool log_calls)
	Computes matrix-vector product between Q4_K matrix and Q8_K vector on CPU.

void	dequantize_q8_k (const std::vector< block_q8_K > &q_data, std::vector< float > &x, int n, bool log_this_block)

void	dequantize_q8_0_block (const block_q8_0 *qblock, float *output)
	Dequantizes a Q8_0 block to float32.

void	dequantize_vector_q6k_to_f32 (const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
	Dequantizes a vector of Q6_K blocks to a vector of float32.

void	dequantize_vector_q4k_to_f32 (const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
	Dequantizes a vector of Q4_K blocks to a vector of float32.

void	dequantize_vector_q8_0_to_f32 (const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
	Dequantizes a vector of Q8_0 blocks to a vector of float32.

Variables
constexpr float	K_SCALE_VALUES [64]

constexpr float	K_MIN_VALUES [64]

static std::atomic< int >	g_vec_dot_q4_k_q8_k_log_count {0}

Function Documentation

◆ dequantize_q2_k()

void dequantize_q2_k	(	const void *	qblock_void,
		float *	output,
		int	num_weights_in_block
	)

Definition at line 395 of file quantization.cpp.

                                               {
  // Expands one Q2_K block of GGML_QK_K weights into `output`.
  //
  // qblock_void:          pointer reinterpreted as `const block_q2_K*`.
  // output:               receives GGML_QK_K floats.
  // num_weights_in_block: must equal GGML_QK_K, otherwise
  //                       std::invalid_argument is thrown.
  //
  // Every intermediate scale and every produced weight is clamped to
  // [TENSOR_SCALE_MIN, TENSOR_SCALE_MAX].
  if (num_weights_in_block != GGML_QK_K) {
    throw std::invalid_argument(
        "dequantize_q2_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  const auto* blk = static_cast<const block_q2_K*>(qblock_void);

  // Shared clamp used for the super-scales, the sub-block scales and the
  // final weights.
  const auto clamp_to_tensor_range = [](float v) -> float {
    return SAFE_MIN(SAFE_MAX(v, TENSOR_SCALE_MIN), TENSOR_SCALE_MAX);
  };

  // Decode the fp16 super-scale and offset; non-finite values become 0.
  const float raw_scale = fp16_to_fp32(blk->d);
  const float raw_offset = fp16_to_fp32(blk->dmin);
  const float super_scale =
      clamp_to_tensor_range(std::isfinite(raw_scale) ? raw_scale : 0.0f);
  const float offset =
      clamp_to_tensor_range(std::isfinite(raw_offset) ? raw_offset : 0.0f);

  // Each of the first 8 scale bytes packs two 4-bit multipliers
  // (low nibble first), giving 16 sub-block scales.
  float sub_block_scales[16];
  for (int byte_idx = 0; byte_idx < 8; ++byte_idx) {
    const uint8_t packed = blk->scales[byte_idx];
    sub_block_scales[2 * byte_idx + 0] =
        clamp_to_tensor_range(super_scale * static_cast<float>(packed & 0x0F));
    sub_block_scales[2 * byte_idx + 1] =
        clamp_to_tensor_range(super_scale * static_cast<float>(packed >> 4));
  }

  // Each sub-block consumes 4 quant bytes; every byte holds four 2-bit
  // values, emitted from the least significant pair upwards.
  int out_idx = 0;
  for (int sub = 0; sub < GGML_QK_K / 16; ++sub) {
    const float scale = sub_block_scales[sub];
    const uint8_t* quant_bytes = blk->qs + sub * 4;

    for (int b = 0; b < 4; ++b) {
      const uint8_t packed = quant_bytes[b];
      for (int shift = 0; shift < 8; shift += 2) {
        const float q = static_cast<float>((packed >> shift) & 0x03);
        output[out_idx++] = clamp_to_tensor_range(scale * q + offset);
      }
    }
  }
  assert(out_idx == GGML_QK_K);
}

References block_q2_K::d, block_q2_K::dmin, fp16_to_fp32(), GGML_QK_K, block_q2_K::qs, SAFE_MAX, SAFE_MIN, block_q2_K::scales, TENSOR_SCALE_MAX, and TENSOR_SCALE_MIN.

◆ dequantize_q3_k()

void dequantize_q3_k	(	const void *	q_data,
		float *	f_data,
		int	num_weights_in_block
	)

Dequantizes a Q3_K quantized block to float32.

Parameters

q_data	Pointer to quantized data
f_data	Output float array
num_weights_in_block	Number of weights to dequantize

Definition at line 476 of file quantization.cpp.

                                               {
  // Dequantizes one Q3_K block into `output`.
  //
  // The definition names its parameters `qblock_void` (input block) and
  // `output` (destination floats); the documented declaration calls the same
  // parameters `q_data` and `f_data`.
  //
  // Throws std::invalid_argument unless num_weights_in_block == GGML_QK_K.
  if (num_weights_in_block != GGML_QK_K) {
    throw std::invalid_argument(
        "dequantize_q3_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  const block_q3_K* qblock = static_cast<const block_q3_K*>(qblock_void);

  const float d_float_raw = fp16_to_fp32(qblock->d);
  const float dmin_float_raw = fp16_to_fp32(qblock->dmin);

  // Non-finite super-scale / offset values are replaced by 0.
  const float d_float = (!std::isfinite(d_float_raw)) ? 0.0f : d_float_raw;
  const float dmin_float =
      (!std::isfinite(dmin_float_raw)) ? 0.0f : dmin_float_raw;

  const uint8_t* hmask_ptr = qblock->hmask;
  const uint8_t* qs_ptr = qblock->qs;
  const uint8_t* scales_ptr = qblock->scales;

  int weight_index = 0;

  // One iteration per 16-weight sub-block.
  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    uint8_t scale_idx;

    // Sub-blocks 0..7 read scales[0..7]; sub-blocks 8..15 read
    // scales[12..19].
    // NOTE(review): confirm block_q3_K::scales holds at least 20 bytes;
    // otherwise the j >= 8 branch reads past the array.
    if (j < 8) {
      scale_idx = scales_ptr[j] & 0x3F;
    } else {
      scale_idx = scales_ptr[j + 4] & 0x3F;
    }

    // The 6-bit index selects a multiplier from the K_SCALE_VALUES table.
    assert(scale_idx < 64 && "Scale index out of bounds for Q3_K lookup");
    const float sub_block_scale_factor = K_SCALE_VALUES[scale_idx];

    const float final_sub_block_scale = d_float * sub_block_scale_factor;
    const float final_sub_block_min = dmin_float;

    // Four quant bytes per sub-block, four 2-bit values per byte.
    for (int i = 0; i < 4; ++i) {
      uint8_t qs_byte = qs_ptr[j * 4 + i];
      uint8_t hmask_byte = hmask_ptr[j];

      for (int bit_pos = 0; bit_pos < 8; bit_pos += 2) {
        uint8_t lower_bits = (qs_byte >> bit_pos) & 0x3;

        // hmask_bit_idx runs 0..15 across the sub-block, but hmask_byte is
        // only 8 bits wide, so for i >= 2 the extracted high bit is always 0.
        // NOTE(review): confirm this matches the intended Q3_K bit layout.
        int hmask_bit_idx = (i * 4) + (bit_pos / 2);

        uint8_t high_bit = (hmask_byte >> hmask_bit_idx) & 0x1;

        // 3-bit quant value: high bit from the mask, low two bits from qs.
        uint8_t q_val = (high_bit << 2) | lower_bits;

        float val = final_sub_block_scale * static_cast<float>(q_val) +
                    final_sub_block_min;

        // Guard against NaN/Inf leaking into the output.
        if (!std::isfinite(val)) {
          val = 0.0f;
        }

        output[weight_index++] = val;
      }
    }
  }

  // The loops above write 16 weights per sub-block, so this branch is only
  // reachable if GGML_QK_K is not a multiple of 16; the remainder is then
  // zero-filled after reporting the mismatch on stdout.
  if (weight_index != GGML_QK_K) {
    std::cout << "ERROR: Processed " << weight_index << " weights instead of "
              << GGML_QK_K << std::endl;

    while (weight_index < GGML_QK_K) {
      output[weight_index++] = 0.0f;
    }
  }
}

References block_q3_K::d, block_q3_K::dmin, fp16_to_fp32(), GGML_QK_K, block_q3_K::hmask, K_SCALE_VALUES, block_q3_K::qs, and block_q3_K::scales.

◆ dequantize_q4_k_m()

void dequantize_q4_k_m	(	const block_q4_K *	qblock,
		float *	output,
		int	num_weights_in_block,
		bool	log_this_block
	)

Definition at line 171 of file quantization.cpp.

                                                                      {
  // Dequantizes one Q4_K block (GGML_QK_K weights) into `output`.
  //
  // qblock:               source block (fp16 d/dmin, packed 6-bit scales and
  //                       mins, 4-bit quants).
  // output:               destination for the dequantized floats.
  // num_weights_in_block: must equal GGML_QK_K. Any other value prints a
  //                       warning to stdout, zero-fills that many floats (when
  //                       the count is positive) and returns.
  // log_this_block:       kept for interface compatibility; not used here.
  (void)log_this_block;

  if (num_weights_in_block != GGML_QK_K) {
    std::cout
        << "Warning: dequantize_q4_k_m called with num_weights != GGML_QK_K ("
        << num_weights_in_block << ")" << std::endl;
    // A negative count would be converted to an enormous size_t and make
    // memset overrun the buffer, so only clear for a positive count.
    if (output != nullptr && num_weights_in_block > 0) {
      std::memset(output, 0,
                  static_cast<size_t>(num_weights_in_block) * sizeof(float));
    }
    return;
  }

  const float d_super_scale = fp16_to_fp32(qblock->d, false);
  const float d_super_min   = fp16_to_fp32(qblock->dmin, false);

  const uint8_t* q_bytes_ptr = qblock->qs;  // 32 bytes are consumed per 64 weights
  float* y_ptr = output;

  // Index of the 32-weight group whose scale/min is fetched next (0, 2, 4, 6).
  int scale_group_idx = 0;

  // Each iteration handles 64 weights: the low nibbles of 32 bytes form the
  // first 32 weights, the high nibbles of the same bytes form the next 32.
  for (int chunk = 0; chunk < GGML_QK_K / 64; ++chunk) {
    uint8_t sc_val1, m_val1;
    uint8_t sc_val2, m_val2;

    get_scale_min_k4(scale_group_idx + 0, qblock->scales, &sc_val1, &m_val1);
    get_scale_min_k4(scale_group_idx + 1, qblock->scales, &sc_val2, &m_val2);

    const float d1 = d_super_scale * static_cast<float>(sc_val1);
    const float m1 = d_super_min   * static_cast<float>(m_val1);
    const float d2 = d_super_scale * static_cast<float>(sc_val2);
    const float m2 = d_super_min   * static_cast<float>(m_val2);

    // First 32 weights: low nibbles.
    for (int l = 0; l < 32; ++l) {
      const uint8_t quant_nibble = q_bytes_ptr[l] & 0x0F;
      *y_ptr++ = d1 * static_cast<float>(quant_nibble) - m1;
    }

    // Next 32 weights: high nibbles of the same bytes.
    for (int l = 0; l < 32; ++l) {
      const uint8_t quant_nibble = q_bytes_ptr[l] >> 4;
      *y_ptr++ = d2 * static_cast<float>(quant_nibble) - m2;
    }

    q_bytes_ptr += 32;
    scale_group_idx += 2;
  }
}

References block_q4_K::d, block_q4_K::dmin, fp16_to_fp32(), get_scale_min_k4(), GGML_QK_K, block_q4_K::qs, and block_q4_K::scales.

Referenced by dequantize_vector_q4k_to_f32(), TinyLlamaModel::initialize_gpu_and_rope(), TinyLlamaModel::lookup_embedding(), and matvec_q4k_f32_vector_cpu().

◆ dequantize_q6_k()

void dequantize_q6_k	(	const block_q6_K *	qblock,
		float *	output,
		int	num_weights_in_block,
		bool	log_this_block
	)

Definition at line 223 of file quantization.cpp.

                                                                    {
  // Dequantizes one Q6_K block (GGML_QK_K weights) into `output`.
  //
  // qblock:               source block (fp16 d, signed 8-bit sub-scales, low
  //                       4 bits in ql, high 2 bits in qh).
  // output:               destination for the dequantized floats.
  // num_weights_in_block: must equal GGML_QK_K. Any other value prints a
  //                       warning to stdout, zero-fills that many floats (when
  //                       the count is positive) and returns.
  // log_this_block:       kept for interface compatibility; not used here.
  (void)log_this_block;

  if (num_weights_in_block != GGML_QK_K) {
    std::cout
        << "Warning: dequantize_q6_k called with num_weights != GGML_QK_K ("
        << num_weights_in_block << ")" << std::endl;
    // A negative count would be converted to an enormous size_t and make
    // memset overrun the buffer, so only clear for a positive count.
    if (output != nullptr && num_weights_in_block > 0) {
      std::memset(output, 0,
                  static_cast<size_t>(num_weights_in_block) * sizeof(float));
    }
    return;
  }

  const float d = fp16_to_fp32(qblock->d, false);

  // Pointers to the start of the whole block's data
  const uint8_t * p_ql = qblock->ql;
  const uint8_t * p_qh = qblock->qh;
  const int8_t  * p_sc = qblock->scales;
  float * p_y  = output;

  // The block is processed as two halves of 128 weights each.
  for (int half_idx = 0; half_idx < 2; ++half_idx) {
      // Per-half views: 64 bytes of ql, 32 bytes of qh, 8 scales, 128 outputs.
      const uint8_t * ql = p_ql + (half_idx * 64);
      const uint8_t * qh = p_qh + (half_idx * 32);
      const int8_t  * sc = p_sc + (half_idx * 8);
      float         * y  = p_y  + (half_idx * 128);

      // Each l produces four weights, 32 positions apart in the output.
      for (int l = 0; l < 32; ++l) {
          int is = l / 16; // 0 for l = 0..15, 1 for l = 16..31

          // Rebuild each 6-bit value (4 low bits from ql, 2 high bits from qh)
          // and re-centre it by subtracting 32.
          const int8_t q1 = (int8_t)(((ql[l +  0] & 0x0F) | (((qh[l] >> 0) & 0x03) << 4))) - 32;
          const int8_t q2 = (int8_t)(((ql[l + 32] & 0x0F) | (((qh[l] >> 2) & 0x03) << 4))) - 32;
          const int8_t q3 = (int8_t)(((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 0x03) << 4))) - 32;
          const int8_t q4 = (int8_t)(((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 0x03) << 4))) - 32;

          y[l +  0] = d * sc[is + 0] * q1;
          y[l + 32] = d * sc[is + 2] * q2;
          y[l + 64] = d * sc[is + 4] * q3;
          y[l + 96] = d * sc[is + 6] * q4;
      }
  }
}

References block_q6_K::d, fp16_to_fp32(), GGML_QK_K, block_q6_K::qh, block_q6_K::ql, and block_q6_K::scales.

Referenced by dequantize_vector_q6k_to_f32(), TinyLlamaModel::initialize_gpu_and_rope(), TinyLlamaModel::lookup_embedding(), and matvec_q6k_f32_vector_cpu().

◆ dequantize_q8_0_block()

void dequantize_q8_0_block	(	const block_q8_0 *	qblock,
		float *	output
	)

Dequantizes a Q8_0 block to float32.

Parameters

qblock	Pointer to Q8_0 block
output	Output float array

Definition at line 1047 of file quantization.cpp.

                                                                    {
  // Expands one Q8_0 block: every stored quant is multiplied by the block
  // scale, which is decoded from fp16 with the GGUF-scale flag set.
  // Writes GGML_QK8_0 floats to `output`.
  const float block_scale = fp16_to_fp32(qblock->d, true);

  int idx = 0;
  while (idx < GGML_QK8_0) {
    const float quant = static_cast<float>(qblock->qs[idx]);
    output[idx] = block_scale * quant;
    ++idx;
  }
}

References block_q8_0::d, fp16_to_fp32(), GGML_QK8_0, and block_q8_0::qs.

Referenced by dequantize_vector_q8_0_to_f32(), TinyLlamaModel::initialize_gpu_and_rope(), TinyLlamaModel::lookup_embedding(), and matvec_q8_0_f32_vector_cpu().

◆ dequantize_q8_k()

void dequantize_q8_k	(	const std::vector< block_q8_K > &	q_data,
		std::vector< float > &	x,
		int	n,
		bool	log_this_block
	)

Definition at line 1009 of file quantization.cpp.

                                                                      {
  // Dequantizes the first n / GGML_QK_K blocks of `q_data` into `x`.
  //
  // q_data:         source Q8_K blocks.
  // x:              destination; grown to hold the dequantized values when it
  //                 is too small (a larger vector is left at its size).
  // n:              number of weights to produce; must be a multiple of
  //                 GGML_QK_K.
  // log_this_block: when true, the decoded scale of up to the first 10 blocks
  //                 (counted across all calls) is written to the debug log.
  //
  // On invalid input an error is printed to stderr and `x` is left untouched.
  if (n % GGML_QK_K != 0) {
    std::cerr
        << "Error: n must be a multiple of GGML_QK_K for Q8_K dequantization."
        << std::endl;
    return;
  }
  const size_t num_blocks = n / GGML_QK_K;
  if (q_data.size() < num_blocks) {
    std::cerr << "Error: Not enough Q8_K blocks provided for dequantization."
              << std::endl;
    return;
  }

  // The writes below index straight into x, so make sure the storage exists;
  // previously an undersized vector was written out of bounds.
  const size_t required_size = num_blocks * static_cast<size_t>(GGML_QK_K);
  if (x.size() < required_size) {
    x.resize(required_size);
  }

  static std::atomic<int> log_count_q8k_dequant_scales = 0;

  for (size_t i = 0; i < num_blocks; ++i) {
    const block_q8_K* qblock = &q_data[i];
    float* x_block = &x[i * GGML_QK_K];

    const float d = fp16_to_fp32(qblock->d, true);

    if (log_this_block && log_count_q8k_dequant_scales < 10) {
      std::stringstream scale_log_ss;
      scale_log_ss << "[Q8K_DEQUANT_SCALES] Block #"
                   << (log_count_q8k_dequant_scales.load()) << " Raw_d_fp16=0x"
                   << std::hex << qblock->d << std::dec << " -> d=" << d;
      Logger::debug(scale_log_ss.str());
      log_count_q8k_dequant_scales++;
    }

    for (int j = 0; j < GGML_QK_K; ++j) {
      x_block[j] = d * static_cast<float>(qblock->qs[j]);
    }
  }
}

References block_q8_K::d, Logger::debug(), fp16_to_fp32(), GGML_QK_K, and block_q8_K::qs.

◆ dequantize_vector_q4k_to_f32()

void dequantize_vector_q4k_to_f32	(	const std::vector< block_q4_K > &	q_weights,
		std::vector< float > &	f32_weights,
		size_t	total_num_elements,
		int	log_first_n_blocks = `0`
	)

Dequantizes a vector of Q4_K blocks to a vector of float32.

Parameters

q_weights	Input vector of Q4_K blocks
f32_weights	Output vector of float32 values (will be resized)
total_num_elements	Total number of float elements expected after dequantization
log_first_n_blocks	Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1109 of file quantization.cpp.

                                                          {
    // Dequantizes a sequence of Q4_K blocks into `f32_weights`.
    //
    // q_weights:          source blocks; an empty input clears the output.
    // f32_weights:        resized to total_num_elements and filled in order.
    // total_num_elements: number of floats to produce.
    // log_first_n_blocks: blocks with index below this value are dequantized
    //                     with the per-block logging flag set.
    //
    // A block-count mismatch is logged as an error but processing continues
    // with the blocks that are available.
    if (q_weights.empty()) {
        Logger::warning("[DEQUANT_VEC_Q4K] Input Q4_K weight vector is empty. Output float vector will be empty.");
        f32_weights.clear();
        return;
    }

    f32_weights.resize(total_num_elements);
    const size_t block_elems = static_cast<size_t>(GGML_QK_K);
    const size_t expected_blocks = (total_num_elements + block_elems - 1) / block_elems;

    if (q_weights.size() != expected_blocks) {
        Logger::error("[DEQUANT_VEC_Q4K] Mismatch in Q4_K block count. Expected: " +
                      std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                      ". Total elements: " + std::to_string(total_num_elements));
    }

    float* current_output_ptr = f32_weights.data();
    size_t elements_processed = 0;
    std::vector<float> partial_block;  // scratch space, only used for a short final block

    for (size_t i = 0; i < q_weights.size(); ++i) {
        // elements_processed never exceeds total_num_elements, so this cannot wrap.
        const size_t remaining = total_num_elements - elements_processed;
        if (remaining == 0) {
            Logger::warning("[DEQUANT_VEC_Q4K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
            break;  // every later block would hit the same condition
        }

        const size_t elements_in_this_block = std::min(remaining, block_elems);
        const bool log_this_specific_block =
            (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

        if (elements_in_this_block == block_elems) {
            dequantize_q4_k_m(&q_weights[i], current_output_ptr, GGML_QK_K, log_this_specific_block);
        } else {
            // The single-block routine only handles full blocks (it zero-fills
            // otherwise), so decode a short final block into scratch space and
            // copy just the part that is needed.
            partial_block.resize(block_elems);
            dequantize_q4_k_m(&q_weights[i], partial_block.data(), GGML_QK_K, log_this_specific_block);
            std::copy_n(partial_block.data(), elements_in_this_block, current_output_ptr);
        }

        current_output_ptr += elements_in_this_block;
        elements_processed += elements_in_this_block;
    }

    if (elements_processed != total_num_elements) {
        Logger::warning("[DEQUANT_VEC_Q4K] Processed " + std::to_string(elements_processed) +
                        " elements, but expected " + std::to_string(total_num_elements) + ".");
    }
}

References dequantize_q4_k_m(), Logger::error(), GGML_QK_K, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().

◆ dequantize_vector_q6k_to_f32()

void dequantize_vector_q6k_to_f32	(	const std::vector< block_q6_K > &	q_weights,
		std::vector< float > &	f32_weights,
		size_t	total_num_elements,
		int	log_first_n_blocks = `0`
	)

Dequantizes a vector of Q6_K blocks to a vector of float32.

Parameters

q_weights	Input vector of Q6_K blocks
f32_weights	Output vector of float32 values (will be resized)
total_num_elements	Total number of float elements expected after dequantization
log_first_n_blocks	Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1054 of file quantization.cpp.

                                                          {
    // Dequantizes a sequence of Q6_K blocks into `f32_weights`.
    //
    // q_weights:          source blocks; an empty input clears the output.
    // f32_weights:        resized to total_num_elements and filled in order.
    // total_num_elements: number of floats to produce.
    // log_first_n_blocks: blocks with index below this value are dequantized
    //                     with the per-block logging flag set.
    //
    // A block-count mismatch is logged as an error but processing continues
    // with the blocks that are available.
    if (q_weights.empty()) {
        Logger::warning("[DEQUANT_VEC_Q6K] Input Q6_K weight vector is empty. Output float vector will be empty.");
        f32_weights.clear();
        return;
    }

    f32_weights.resize(total_num_elements);
    const size_t block_elems = static_cast<size_t>(GGML_QK_K);
    const size_t expected_blocks = (total_num_elements + block_elems - 1) / block_elems;

    if (q_weights.size() != expected_blocks) {
        Logger::error("[DEQUANT_VEC_Q6K] Mismatch in Q6_K block count. Expected: " +
                      std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                      ". Total elements: " + std::to_string(total_num_elements));
    }

    float* current_output_ptr = f32_weights.data();
    size_t elements_processed = 0;
    std::vector<float> partial_block;  // scratch space, only used for a short final block

    for (size_t i = 0; i < q_weights.size(); ++i) {
        // elements_processed never exceeds total_num_elements, so this cannot wrap.
        const size_t remaining = total_num_elements - elements_processed;
        if (remaining == 0) {
            Logger::warning("[DEQUANT_VEC_Q6K] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
            break;  // every later block would hit the same condition
        }

        const size_t elements_in_this_block = std::min(remaining, block_elems);
        const bool log_this_specific_block =
            (log_first_n_blocks > 0 && static_cast<int>(i) < log_first_n_blocks);

        if (elements_in_this_block == block_elems) {
            dequantize_q6_k(&q_weights[i], current_output_ptr, GGML_QK_K, log_this_specific_block);
        } else {
            // The single-block routine only handles full blocks (it zero-fills
            // otherwise), so decode a short final block into scratch space and
            // copy just the part that is needed.
            partial_block.resize(block_elems);
            dequantize_q6_k(&q_weights[i], partial_block.data(), GGML_QK_K, log_this_specific_block);
            std::copy_n(partial_block.data(), elements_in_this_block, current_output_ptr);
        }

        current_output_ptr += elements_in_this_block;
        elements_processed += elements_in_this_block;
    }

    if (elements_processed != total_num_elements) {
        Logger::warning("[DEQUANT_VEC_Q6K] Processed " + std::to_string(elements_processed) +
                        " elements, but expected " + std::to_string(total_num_elements) + ".");
    }
}

References dequantize_q6_k(), Logger::error(), GGML_QK_K, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().

◆ dequantize_vector_q8_0_to_f32()

void dequantize_vector_q8_0_to_f32	(	const std::vector< block_q8_0 > &	q_weights,
		std::vector< float > &	f32_weights,
		size_t	total_num_elements,
		int	log_first_n_blocks = `0`
	)

Dequantizes a vector of Q8_0 blocks to a vector of float32.

Parameters

q_weights	Input vector of Q8_0 blocks
f32_weights	Output vector of float32 values (will be resized)
total_num_elements	Total number of float elements expected after dequantization
log_first_n_blocks	Number of initial blocks to log dequantization details for (0 for no logging)

Definition at line 1165 of file quantization.cpp.

                                                           {
    // Dequantizes a sequence of Q8_0 blocks into `f32_weights`.
    //
    // q_weights:          source blocks; an empty input clears the output.
    // f32_weights:        resized to total_num_elements and filled in order.
    // total_num_elements: number of floats to produce; a short final block is
    //                     decoded into a temporary and truncated.
    // log_first_n_blocks: accepted for symmetry with the Q4_K/Q6_K variants;
    //                     the Q8_0 block routine has no logging flag, so it is
    //                     not used.
    //
    // A block-count mismatch is logged as an error but processing continues
    // with the blocks that are available.
    (void)log_first_n_blocks;

    if (q_weights.empty()) {
        Logger::warning("[DEQUANT_VEC_Q8_0] Input Q8_0 weight vector is empty. Output float vector will be empty.");
        f32_weights.clear();
        return;
    }

    f32_weights.resize(total_num_elements);

    const size_t block_elems = static_cast<size_t>(GGML_QK8_0);
    const size_t expected_blocks = (total_num_elements + block_elems - 1) / block_elems;

    if (q_weights.size() != expected_blocks) {
        Logger::error("[DEQUANT_VEC_Q8_0] Mismatch in Q8_0 block count. Expected: " +
                      std::to_string(expected_blocks) + ", Got: " + std::to_string(q_weights.size()) +
                      ". Total elements: " + std::to_string(total_num_elements));
    }

    float* current_output_ptr = f32_weights.data();
    size_t elements_processed = 0;

    for (size_t i = 0; i < q_weights.size(); ++i) {
        // elements_processed never exceeds total_num_elements, so this cannot wrap.
        const size_t remaining = total_num_elements - elements_processed;
        if (remaining == 0) {
            Logger::warning("[DEQUANT_VEC_Q8_0] Zero or negative elements requested for block " + std::to_string(i) + ". Skipping.");
            break;  // every later block would hit the same condition
        }

        const size_t elements_in_this_block = std::min(remaining, block_elems);

        if (elements_in_this_block == block_elems) {
            dequantize_q8_0_block(&q_weights[i], current_output_ptr);
        } else {
            // Short final block: decode all of it, keep only what is needed.
            float temp_block[GGML_QK8_0];
            dequantize_q8_0_block(&q_weights[i], temp_block);
            std::copy_n(temp_block, elements_in_this_block, current_output_ptr);
        }

        current_output_ptr += elements_in_this_block;
        elements_processed += elements_in_this_block;
    }

    if (elements_processed != total_num_elements) {
        Logger::warning("[DEQUANT_VEC_Q8_0] Processed " + std::to_string(elements_processed) +
                        " elements, but expected " + std::to_string(total_num_elements) + ".");
    }
}

References dequantize_q8_0_block(), Logger::error(), GGML_QK8_0, and Logger::warning().

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), and TinyLlamaModel::initialize_weights().

◆ fp16_to_fp32()

float fp16_to_fp32	(	uint16_t	h,
		bool	is_gguf_scale_field = `false`
	)

Converts a 16-bit floating point number to 32-bit float.

Parameters

h	The 16-bit float value to convert
is_gguf_scale_field	Whether this value is from a GGUF scale field

Returns: The converted 32-bit float value

Definition at line 47 of file quantization.cpp.

                                                         {
  uint16_t h_to_convert = h;
  bool original_sign_bit_was_set = (h & 0x8000);
  uint32_t sign = (h_to_convert >> 15) & 1;
  uint32_t exp_fp16 = (h_to_convert >> 10) & 0x1f;
  uint32_t mant_fp16 = h_to_convert & 0x3ff;
  uint32_t x;
 
  if (exp_fp16 == 0) {
    if (mant_fp16 == 0) {
      x = (sign << 31);
 
    } else {
      exp_fp16 = 1;
      while ((mant_fp16 & 0x400) == 0) {
        mant_fp16 <<= 1;
        exp_fp16--;
      }
      mant_fp16 &= ~0x400;
      uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
      uint32_t mant_fp32 = mant_fp16 << 13;
      x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
    }
  } else if (exp_fp16 == 0x1f) {
    x = (sign << 31) | (0xff << 23) | (mant_fp16 << 13);
  } else {
    uint32_t exp_fp32 = (exp_fp16 - 15 + 127);
    uint32_t mant_fp32 = mant_fp16 << 13;
    x = (sign << 31) | (exp_fp32 << 23) | mant_fp32;
  }
 
  float f;
  std::memcpy(&f, &x, sizeof(float));
 
  if (is_gguf_scale_field) {
    static std::atomic<int> q8_scale_f_log_count{0};
  }
 
  if (is_gguf_scale_field && f < 0.0f && !(std::isnan(f) || std::isinf(f))) {
    f = std::abs(f);
  }
  
  return f;
}

Referenced by dequantize_q2_k(), dequantize_q3_k(), dequantize_q4_k_m(), dequantize_q6_k(), dequantize_q8_0_block(), dequantize_q8_k(), vec_dot_q4_k_q8_k_cpu(), and vec_dot_q6_k_q8_k_cpu().

◆ fp32_to_fp16()

uint16_t fp32_to_fp16 ( float f )

Converts a 32-bit float to 16-bit floating point.

Parameters

f	The 32-bit float value to convert

Returns: The converted 16-bit float value

Definition at line 92 of file quantization.cpp.

                               {
  uint32_t x;
  std::memcpy(&x, &f, sizeof(float));
 
  uint32_t sign = (x >> 31) & 1;
  uint32_t exp_fp32 = (x >> 23) & 0xff;
  uint32_t mant_fp32 = x & 0x7fffff;
 
  uint16_t u;
 
  if (exp_fp32 == 0xff) {
    u = (sign << 15) | 0x7c00 | (mant_fp32 != 0 ? 0x200 : 0);
  } else {
    int exp_fp16 = (int)exp_fp32 - 127 + 15;
 
    if (exp_fp16 >= 0x1f) {
      u = (sign << 15) | 0x7c00;
    } else if (exp_fp16 <= 0) {
      if (exp_fp16 < -10) {
        u = (sign << 15);
      } else {
        mant_fp32 = (mant_fp32 | 0x800000) >> (1 - exp_fp16);
 
        if ((mant_fp32 >> 13) & 1) {
          mant_fp32 += (1 << 13);
        }
        u = (sign << 15) | (mant_fp32 >> 13);
      }
    } else {
      if ((mant_fp32 >> 13) & 1) {
        mant_fp32 += (1 << 13);
        if ((mant_fp32 >> 23) == 1) {
          mant_fp32 = 0;
          exp_fp16++;
          if (exp_fp16 >= 0x1f) {
            u = (sign << 15) | 0x7c00;
            return u;
          }
        }
      }
      u = (sign << 15) | (exp_fp16 << 10) | (mant_fp32 >> 13);
    }
  }
  return u;
}

Referenced by quantize_fp32_to_q8_K(), quantize_q4_k_m(), and quantize_q6_k().

◆ get_scale_min_indices_q4_K()

static void get_scale_min_indices_q4_K	(	int	j,
		const uint8_t *	scales,
		uint8_t *	scale_index,
		uint8_t *	min_index
	)

inline static

Definition at line 157 of file quantization.cpp.

                        {
  // Extracts a 4-bit scale index and a 4-bit min index for sub-block j from
  // the packed `scales` bytes and stores them through the output pointers.
  assert(j >= 0 && j < 16);

  // Scale: byte j % 8; j / 8 is 0 or 1, so the shift of 0 or 4 selects the
  // low nibble for j < 8 and the high nibble for j >= 8.
  *scale_index = scales[j % 8] >> (4 * (j / 8));
  *scale_index &= 0x0F;

  // Min: byte 8 + j % 4, shifted by 4 * (j / 4), i.e. by 0, 4, 8 or 12 bits.
  // The source is a single byte, so for j >= 8 (shift of 8 or 12) the result
  // is always 0.
  // NOTE(review): confirm that a zero min for j >= 8 is intended and not a
  // packing mistake.
  *min_index = scales[j % 4 + 8] >> (4 * (j / 4));
  *min_index &= 0x0F;
}

Referenced by vec_dot_q4_k_q8_k_cpu().

◆ get_scale_min_k4()

static void get_scale_min_k4	(	int	j,
		const uint8_t *	q,
		uint8_t *	d_val,
		uint8_t *	m_val
	)

inline static

Definition at line 145 of file quantization.cpp.

                                                                                                {
    // Unpacks the 6-bit scale (*d_val) and 6-bit min (*m_val) for group j
    // from the packed scale bytes q.
    if (j >= 4) {
        // Upper groups: low 4 bits come from q[j+4], the top 2 bits are
        // borrowed from the two high bits of q[j-4] (scale) and q[j] (min).
        *d_val = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
        *m_val = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
        return;
    }

    // Lower groups: both values are simply the low 6 bits of one byte each.
    *d_val = q[j] & 63;
    *m_val = q[j + 4] & 63;
}

Referenced by dequantize_q4_k_m().

◆ ggml_type_block_size()

size_t ggml_type_block_size ( GGMLType type )

Gets the block size for a GGML type.

Parameters

type	The GGML type

Returns: Block size in elements

Definition at line 688 of file quantization.cpp.

                                           {
  // Returns the number of elements stored in one block of `type`.
  // Unknown types print a warning to stdout and return 0.
  switch (type) {
    // K-quant super-blocks. Q5_K and Q8_K are listed as well so that every
    // K-type with a byte size in ggml_type_size() also has a block size.
    case GGMLType::GGML_TYPE_Q2_K:
    case GGMLType::GGML_TYPE_Q3_K:
    case GGMLType::GGML_TYPE_Q4_K:
    case GGMLType::GGML_TYPE_Q5_K:
    case GGMLType::GGML_TYPE_Q6_K:
    case GGMLType::GGML_TYPE_Q8_K:
      return GGML_QK_K;

    // Legacy quantized formats use 32-element blocks.
    case GGMLType::GGML_TYPE_Q4_0:
    case GGMLType::GGML_TYPE_Q8_0:
    case GGMLType::GGML_TYPE_Q8_1:
      return 32;

    // Plain scalar types: one element per "block".
    case GGMLType::GGML_TYPE_F32:
    case GGMLType::GGML_TYPE_F16:
    case GGMLType::GGML_TYPE_I8:
    case GGMLType::GGML_TYPE_I16:
    case GGMLType::GGML_TYPE_I32:
    case GGMLType::GGML_TYPE_BF16:
      return 1;

    default:
      std::cout << "Warning: Unknown GGMLType in ggml_type_block_size: "
                << static_cast<int>(type) << std::endl;
      return 0;
  }
}

References GGML_QK_K, GGML_TYPE_BF16, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, and GGML_TYPE_Q8_0.

Referenced by load_gguf_meta().

◆ ggml_type_name()

const char * ggml_type_name ( GGMLType type )

Gets the string name of a GGML type.

Parameters

type	The GGML type

Returns: String representation of the type

Definition at line 601 of file quantization.cpp.

                                          {
  // Maps each GGMLType enumerator to its short display name; anything not
  // listed yields "Unknown".
  switch (type) {
    // Floating-point types
    case GGMLType::GGML_TYPE_F32:   return "F32";
    case GGMLType::GGML_TYPE_F16:   return "F16";
    case GGMLType::GGML_TYPE_BF16:  return "BF16";

    // Legacy block-quantized types
    case GGMLType::GGML_TYPE_Q4_0:  return "Q4_0";
    case GGMLType::GGML_TYPE_Q4_1:  return "Q4_1";
    case GGMLType::GGML_TYPE_Q5_0:  return "Q5_0";
    case GGMLType::GGML_TYPE_Q5_1:  return "Q5_1";
    case GGMLType::GGML_TYPE_Q8_0:  return "Q8_0";
    case GGMLType::GGML_TYPE_Q8_1:  return "Q8_1";

    // K-quant types
    case GGMLType::GGML_TYPE_Q2_K:  return "Q2_K";
    case GGMLType::GGML_TYPE_Q3_K:  return "Q3_K";
    case GGMLType::GGML_TYPE_Q4_K:  return "Q4_K";
    case GGMLType::GGML_TYPE_Q5_K:  return "Q5_K";
    case GGMLType::GGML_TYPE_Q6_K:  return "Q6_K";
    case GGMLType::GGML_TYPE_Q8_K:  return "Q8_K";

    // Integer types
    case GGMLType::GGML_TYPE_I8:    return "I8";
    case GGMLType::GGML_TYPE_I16:   return "I16";
    case GGMLType::GGML_TYPE_I32:   return "I32";

    case GGMLType::GGML_TYPE_COUNT: return "COUNT";
    default:                        return "Unknown";
  }
}

References GGML_TYPE_BF16, GGML_TYPE_COUNT, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, GGML_TYPE_Q8_1, and GGML_TYPE_Q8_K.

Referenced by load_gguf_meta().

◆ ggml_type_size()

size_t ggml_type_size ( GGMLType type )

Gets the size in bytes of a GGML type.

Parameters

type	The GGML type

Returns: Size in bytes

Definition at line 646 of file quantization.cpp.

                                     {
  // Returns the storage size in bytes of one element (scalar types) or one
  // block (quantized types) of `type`.
  // Throws std::invalid_argument for GGML_TYPE_COUNT and any unlisted type.
  switch (type) {
    case GGMLType::GGML_TYPE_F32:
      return sizeof(float);
    case GGMLType::GGML_TYPE_F16:
      return sizeof(uint16_t);
    case GGMLType::GGML_TYPE_I8:
      return sizeof(int8_t);
    case GGMLType::GGML_TYPE_Q4_K:
      return sizeof(block_q4_K);
    case GGMLType::GGML_TYPE_Q2_K:
      return sizeof(block_q2_K);
    case GGMLType::GGML_TYPE_Q3_K:
      return sizeof(block_q3_K);
    case GGMLType::GGML_TYPE_Q6_K:
      return sizeof(block_q6_K);
    case GGMLType::GGML_TYPE_Q4_0:
      return 18;

    case GGMLType::GGML_TYPE_Q8_0:
      return 34;
    case GGMLType::GGML_TYPE_Q8_1:
      return 40;
    case GGMLType::GGML_TYPE_Q5_K:
      // ggml block_q5_K for a 256-element super-block:
      // fp16 d (2) + fp16 dmin (2) + scales (12) + qh (256/8 = 32)
      // + qs (256/2 = 128) = 176 bytes.
      return 176;
    case GGMLType::GGML_TYPE_Q8_K:
      return 290;
    case GGMLType::GGML_TYPE_I16:
      return sizeof(int16_t);
    case GGMLType::GGML_TYPE_I32:
      return sizeof(int32_t);
    case GGMLType::GGML_TYPE_BF16:
      return sizeof(uint16_t);
    case GGMLType::GGML_TYPE_COUNT:
    default:
      std::cout << "  UNKNOWN GGML TYPE: " << static_cast<int>(type)
                << std::endl;
      throw std::invalid_argument("Unknown GGML type in ggml_type_size: " +
                                  std::to_string(static_cast<int>(type)));
  }
}

References GGML_TYPE_BF16, GGML_TYPE_COUNT, GGML_TYPE_F16, GGML_TYPE_F32, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_I8, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, GGML_TYPE_Q8_1, and GGML_TYPE_Q8_K.

Referenced by load_gguf_meta().

◆ handle_i8_tensor()

void handle_i8_tensor	(	const void *	i8_data,
		float *	f_data,
		size_t	num_elements
	)

Handles conversion of int8 tensor data to float32.

Parameters

i8_data	Input int8 data
f_data	Output float array
num_elements	Number of elements to convert

Definition at line 268 of file quantization.cpp.

                                           {
  // Treat the opaque input buffer as signed 8-bit integers and widen each of
  // the num_elements values to float, writing them to output_data in order.
  const int8_t* first = static_cast<const int8_t*>(input_data);
  std::transform(first, first + num_elements, output_data,
                 [](int8_t value) { return static_cast<float>(value); });
}

◆ matvec_q4k_q8k_cpu()

void matvec_q4k_q8k_cpu	(	const std::vector< block_q4_K > &	mat_q4k,
		const std::vector< block_q8_K > &	vec_q8k,
		std::vector< float > &	out_f32,
		int	rows,
		int	cols,
		bool	log_calls
	)

Computes matrix-vector product between Q4_K matrix and Q8_K vector on CPU.

Parameters

mat_q4k	Q4_K matrix
vec_q8k	Q8_K vector
out_f32	Output float vector
rows	Number of matrix rows
cols	Number of matrix columns
log_calls	Whether to log computation details

Definition at line 982 of file quantization.cpp.

                                        {
  // Validates the shapes, sizes out_f32 to `rows`, then fills out_f32[r] with
  // the Q4_K x Q8_K dot product of matrix row r and the vector.
  // Throws std::runtime_error when cols is not a multiple of GGML_QK_K or
  // when either container does not hold the expected number of blocks.
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q4k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }

  const size_t row_stride = cols / GGML_QK_K;  // blocks per matrix row
  const size_t expected_matrix_blocks = (size_t)rows * row_stride;
  if (mat_q4k.size() != expected_matrix_blocks) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: mat_q4k size mismatch");
  }
  if (vec_q8k.size() != row_stride) {
    throw std::runtime_error("matvec_q4k_q8k_cpu: vec_q8k size mismatch");
  }

  out_f32.resize(rows);

#pragma omp parallel for
  for (int row = 0; row < rows; ++row) {
    // The dot-product routine takes a vector, so the row's blocks are copied.
    const auto row_first = mat_q4k.begin() + row * row_stride;
    const auto row_last = mat_q4k.begin() + (row + 1) * row_stride;
    const std::vector<block_q4_K> row_blocks(row_first, row_last);

    out_f32[row] = vec_dot_q4_k_q8_k_cpu(cols, row_blocks, vec_q8k, log_calls);
  }
}

References GGML_QK_K, and vec_dot_q4_k_q8_k_cpu().

◆ matvec_q6k_q8k_cpu()

void matvec_q6k_q8k_cpu	(	const std::vector< block_q6_K > &	mat_q6k,
		const std::vector< block_q8_K > &	vec_q8k,
		std::vector< float > &	out_f32,
		int	rows,
		int	cols,
		bool	log_calls
	)

Computes matrix-vector product between Q6_K matrix and Q8_K vector on CPU.

Parameters

mat_q6k	Q6_K matrix
vec_q8k	Q8_K vector
out_f32	Output float vector
rows	Number of matrix rows
cols	Number of matrix columns
log_calls	Whether to log computation details

Definition at line 897 of file quantization.cpp.

                                        {
  // Validates the shapes, sizes out_f32 to `rows`, then fills out_f32[r] with
  // the Q6_K x Q8_K dot product of matrix row r and the vector.
  // Throws std::runtime_error when cols is not a multiple of GGML_QK_K or
  // when either container does not hold the expected number of blocks.
  if (cols % GGML_QK_K != 0) {
    throw std::runtime_error(
        "matvec_q6k_q8k_cpu: cols must be divisible by GGML_QK_K");
  }

  const size_t row_stride = cols / GGML_QK_K;  // blocks per matrix row
  const size_t expected_matrix_blocks = (size_t)rows * row_stride;
  if (mat_q6k.size() != expected_matrix_blocks) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: mat_q6k size mismatch");
  }
  if (vec_q8k.size() != row_stride) {
    throw std::runtime_error("matvec_q6k_q8k_cpu: vec_q8k size mismatch");
  }

  out_f32.resize(rows);
  for (int row = 0; row < rows; ++row) {
    // The dot-product routine takes a vector, so the row's blocks are copied.
    const auto row_first = mat_q6k.begin() + row * row_stride;
    const auto row_last = mat_q6k.begin() + (row + 1) * row_stride;
    const std::vector<block_q6_K> row_blocks(row_first, row_last);

    out_f32[row] = vec_dot_q6_k_q8_k_cpu(cols, row_blocks, vec_q8k, log_calls);
  }
}

References GGML_QK_K, and vec_dot_q6_k_q8_k_cpu().

◆ quantize_fp32_to_q8_K()

std::vector< block_q8_K > quantize_fp32_to_q8_K ( const std::vector< float > & f_data )

Quantizes float32 data to Q8_K format.

Parameters

f_data Input float vector

Returns: Vector of Q8_K blocks

Definition at line 719 of file quantization.cpp.

                                    {
  // Quantizes f_data block by block (GGML_QK_K floats per block) into Q8_K:
  // each block stores an fp16 scale d, GGML_QK_K signed 8-bit values and the
  // sums of every run of 16 quantized values.
  // Throws std::runtime_error if the input length is not a whole number of
  // blocks. The first 10 blocks processed (process-wide) are logged.
  if (f_data.size() % GGML_QK_K != 0) {
    throw std::runtime_error(
        "Input vector size must be a multiple of GGML_QK_K (" +
        std::to_string(GGML_QK_K) + ")");
  }

  const size_t block_count = f_data.size() / GGML_QK_K;
  std::vector<block_q8_K> q_data(block_count);
  const float* const input_base = f_data.data();

  static std::atomic<int> log_count_q8k_quant_scales = 0;

  for (size_t blk = 0; blk < block_count; ++blk) {
    const float* src = input_base + blk * GGML_QK_K;
    block_q8_K& dst = q_data[blk];

    // Largest magnitude in the block determines the scale.
    float amax = 0.0f;
    for (int j = 0; j < GGML_QK_K; ++j) {
      amax = SAFE_MAX(amax, std::abs(src[j]));
    }

    const float d_fp32 = amax / Q8K_SCALE_FACTOR;
    // Inverse scale; an all-zero block quantizes to all zeros.
    const float id = (d_fp32 != 0.f) ? 1.0f / d_fp32 : 0.0f;
    dst.d = fp32_to_fp16(d_fp32);

    if (log_count_q8k_quant_scales < 10) {
      std::stringstream q8k_scale_log_ss;
      q8k_scale_log_ss << "[Q8K_QUANT_SCALES] Block #" << blk
                       << " Input amax=" << amax << " -> d_fp32=" << d_fp32
                       << " -> Stored d_fp16=0x" << std::hex << dst.d
                       << std::dec;
      Logger::debug(q8k_scale_log_ss.str());
      log_count_q8k_quant_scales++;
    }

    // Quantize each value and accumulate the per-16-element sums.
    int16_t block_sum[16] = {0};
    for (int j = 0; j < GGML_QK_K; ++j) {
      const float val_scaled = src[j] * id;

      const int8_t q_val = static_cast<int8_t>(
          SAFE_MAX(-128.0f, SAFE_MIN(127.0f, std::round(val_scaled))));
      dst.qs[j] = q_val;
      block_sum[j / 16] += q_val;
    }

    std::memcpy(dst.bsums, block_sum, sizeof(block_sum));
  }

  return q_data;
}

References block_q8_K::d, Logger::debug(), fp32_to_fp16(), GGML_QK_K, Q8K_SCALE_FACTOR, block_q8_K::qs, SAFE_MAX, and SAFE_MIN.

◆ quantize_q4_k_m()

void quantize_q4_k_m	(	const float *	f_data,
		void *	q_data,
		int	num_elements
	)

Quantizes float32 data to Q4_K format.

Parameters

f_data	Input float array
q_data	Output quantized data
num_elements	Number of elements to quantize

Definition at line 276 of file quantization.cpp.

                                       {
  // Quantizes exactly GGML_QK_K floats from `input` into the single
  // block_q4_K that `output_qblock_void` points to.
  // Throws std::invalid_argument for any other num_elements.
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q4_k_m currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q4_K* output_qblock = static_cast<block_q4_K*>(output_qblock_void);

  // Both arrays are filled with |= below, so they must start out zeroed.
  std::memset(output_qblock->scales, 0, sizeof(output_qblock->scales));
  std::memset(output_qblock->qs, 0, sizeof(output_qblock->qs));

  // Range of the whole block.
  float block_min_val = std::numeric_limits<float>::max();
  float block_max_val = std::numeric_limits<float>::lowest();
  for (int i = 0; i < num_elements; ++i) {
    block_min_val = SAFE_MIN(block_min_val, input[i]);
    block_max_val = SAFE_MAX(block_max_val, input[i]);
  }

  // Avoid a zero-width range for constant blocks.
  if (block_max_val == block_min_val) {
    block_max_val = block_min_val + GGUF_SMALL_VAL;
  }
  // A maximum that is within GGUF_EPSILON of zero resets the range to
  // [0, GGUF_SMALL_VAL].
  if (block_max_val < GGUF_EPSILON && block_max_val > -GGUF_EPSILON) {
    block_max_val = GGUF_SMALL_VAL;
    block_min_val = 0.0f;
  }

  // Block-level ("super") scale and min, stored as fp16 in the block header.
  const float d_super_scale_candidate = (block_max_val - block_min_val) / Q4K_SCALE_FACTOR;
  const float d_super =
      d_super_scale_candidate > GGUF_EPSILON ? d_super_scale_candidate : GGUF_EPSILON;
  const float min_super = block_min_val;

  output_qblock->d = fp32_to_fp16(d_super);
  output_qblock->dmin = fp32_to_fp16(min_super);

  // Process the block as 16-element sub-blocks.
  for (int j = 0; j < GGML_QK_K / 16; ++j) {
    const float* sub_block_input = input + j * 16;

    // Range of this sub-block.
    float sub_min_val = sub_block_input[0];
    float sub_max_val = sub_block_input[0];
    for (int i = 1; i < 16; ++i) {
      sub_min_val = SAFE_MIN(sub_min_val, sub_block_input[i]);
      sub_max_val = SAFE_MAX(sub_max_val, sub_block_input[i]);
    }

    // Scale/min this sub-block would use if they could be stored exactly.
    float ideal_scale = 0.0f;
    if (sub_max_val > sub_min_val + GGUF_EPSILON) {
      ideal_scale = (sub_max_val - sub_min_val) / Q4K_SCALE_FACTOR;
    }
    float ideal_min = sub_min_val;

    // Pick the first of the 16 leading K_SCALE_VALUES entries whose product
    // with d_super is closest to the ideal scale.
    uint8_t best_scale_idx = 0;
    float min_scale_err = std::numeric_limits<float>::max();
    if (d_super > GGUF_EPSILON) {
      for (uint8_t k = 0; k < 16; ++k) {
        float candidate_scale = d_super * K_SCALE_VALUES[k];
        float err = std::abs(candidate_scale - ideal_scale);
        if (err < min_scale_err) {
          min_scale_err = err;
          best_scale_idx = k;
        }
      }
    }

    // Same search over the 16 leading K_MIN_VALUES entries for the min.
    uint8_t best_min_idx = 0;
    float min_min_err = std::numeric_limits<float>::max();

    for (uint8_t l = 0; l < 16; ++l) {
      float candidate_min = min_super * K_MIN_VALUES[l];
      float err = std::abs(candidate_min - ideal_min);
      if (err < min_min_err) {
        min_min_err = err;
        best_min_idx = l;
      }
    }

    // Scale index: one nibble in scales[j % 8], selected by j / 8.
    int scale_byte_idx = j % 8;
    int scale_shift = 4 * (j / 8);
    output_qblock->scales[scale_byte_idx] |= (best_scale_idx << scale_shift);

    // Min index: written to scales[8 + j % 4], shifted by 4 * (j / 4).
    // NOTE(review): for j >= 8 the shift is 8 or 12; if scales[] holds 8-bit
    // elements those bits are dropped by the |=, so the min index of the
    // upper eight sub-blocks would not be stored - confirm against the
    // block_q4_K definition and the matching reader.
    int min_byte_idx = (j % 4) + 8;
    int min_shift = 4 * (j / 4);
    output_qblock->scales[min_byte_idx] |= (best_min_idx << min_shift);

    // Quantize with the scale/min that were actually selected.
    float actual_scale = d_super * K_SCALE_VALUES[best_scale_idx];
    float actual_min = min_super * K_MIN_VALUES[best_min_idx];
    // A (near-)zero scale is treated as "no scale": every value maps to 0.
    float inv_actual_scale = (actual_scale > GGUF_EPSILON || actual_scale < -GGUF_EPSILON)
                                 ? 1.0f / actual_scale
                                 : 0.0f;

    uint8_t packed_qs[8];

    std::memset(packed_qs, 0, sizeof(packed_qs));

    for (int i = 0; i < 16; ++i) {
      float val = sub_block_input[i];

      int quant_val = 0;
      if (inv_actual_scale != 0.0f) {
        quant_val =
            static_cast<int>(std::round((val - actual_min) * inv_actual_scale)) + Q4K_OFFSET;
      }
      // Clamp to the 4-bit range.
      quant_val = SAFE_MAX(0, SAFE_MIN(15, quant_val));

      // Two values per byte: even i in the low nibble, odd i in the high one.
      int byte_idx_qs = i / 2;
      int shift_qs = (i % 2) * 4;
      packed_qs[byte_idx_qs] |= (static_cast<uint8_t>(quant_val) << shift_qs);
    }

    // Copy the 8 packed bytes into the block; splitting and recombining the
    // nibbles here reproduces each byte unchanged.
    uint8_t* qs_target = output_qblock->qs + j * 8;
    for (int i = 0; i < 8; ++i) {
      uint8_t low_nibble_val = packed_qs[i] & 0x0F;
      uint8_t high_nibble_val = (packed_qs[i] >> 4) & 0x0F;
      qs_target[i] = low_nibble_val | (high_nibble_val << 4);
    }
  }
}

References block_q4_K::d, block_q4_K::dmin, fp32_to_fp16(), GGML_QK_K, GGUF_EPSILON, GGUF_SMALL_VAL, K_MIN_VALUES, K_SCALE_VALUES, Q4K_OFFSET, Q4K_SCALE_FACTOR, block_q4_K::qs, SAFE_MAX, SAFE_MIN, and block_q4_K::scales.

◆ quantize_q6_k()

void quantize_q6_k	(	const float *	f_data,
		void *	q_data,
		int	num_elements
	)

Quantizes float32 data to Q6_K format.

Parameters

f_data	Input float array
q_data	Output quantized data
num_elements	Number of elements to quantize

Definition at line 549 of file quantization.cpp.

                                     {
  // Quantizes exactly GGML_QK_K floats from `input` into the single
  // block_q6_K that `output_qblock_void` points to.
  // Throws std::invalid_argument for any other num_elements.
  if (num_elements != GGML_QK_K) {
    throw std::invalid_argument(
        "quantize_q6_k currently only supports block size " +
        std::to_string(GGML_QK_K));
  }

  block_q6_K* output_qblock = static_cast<block_q6_K*>(output_qblock_void);

  uint8_t* ql = output_qblock->ql;
  uint8_t* qh = output_qblock->qh;
  int8_t* scales = output_qblock->scales;
  // ql/qh are filled with |= below, so they must start out zeroed.
  std::memset(ql, 0, GGML_QK_K / 2);
  std::memset(qh, 0, GGML_QK_K / 4);

  // Largest magnitude of the whole block determines the block scale d.
  float amax = 0.0f;
  for (int i = 0; i < num_elements; ++i) {
    amax = SAFE_MAX(amax, std::abs(input[i]));
  }

  // Never store a zero scale: fall back to GGUF_EPSILON for (near-)zero blocks.
  const float d_float = (amax > GGUF_EPSILON) ? (amax / Q6K_SCALE_FACTOR) : GGUF_EPSILON;
  output_qblock->d = fp32_to_fp16(d_float);

  // Process the block as 16-element sub-blocks, one int8 scale each.
  for (int sub = 0; sub < GGML_QK_K / 16; ++sub) {
    const float* sub_in = input + sub * 16;

    float sub_amax = 0.0f;
    for (int i = 0; i < 16; ++i) {
      sub_amax = SAFE_MAX(sub_amax, std::abs(sub_in[i]));
    }

    // Sub-block scale in units of d, forced to be non-zero.
    // NOTE(review): the rounded ratio is converted to int8_t implicitly; this
    // assumes sub_amax / d_float stays within the int8_t range - confirm
    // against the value of Q6K_SCALE_FACTOR.
    int8_t scale = (d_float > 0.0f) ? std::round(sub_amax / d_float) : 1;
    if (scale == 0) scale = 1;
    scales[sub] = scale;

    for (int i = 0; i < 16; ++i) {
      float val = sub_in[i];
      // Quantize, shift by Q6K_OFFSET and clamp to the 6-bit range.
      int q = static_cast<int>(std::round(val / (d_float * scale))) + Q6K_OFFSET;
      q = SAFE_MAX(0, SAFE_MIN(63, q));

      // Low 4 bits: two values per ql byte. High 2 bits: four values per qh
      // byte. Both are indexed linearly by the element position idx.
      // NOTE(review): vec_dot_q6_k_q8_k_cpu reads ql/qh in 128-element groups
      // with a different bit placement - confirm the two layouts are meant to
      // be compatible.
      int idx = sub * 16 + i;
      int ql_idx = idx / 2;
      int ql_shift = (idx % 2) * 4;
      ql[ql_idx] |= (q & 0x0F) << ql_shift;
      int qh_idx = idx / 4;
      int qh_shift = (idx % 4) * 2;
      qh[qh_idx] |= ((q >> 4) & 0x03) << qh_shift;
    }
  }
}

References block_q6_K::d, fp32_to_fp16(), GGML_QK_K, GGUF_EPSILON, Q6K_OFFSET, Q6K_SCALE_FACTOR, block_q6_K::qh, block_q6_K::ql, SAFE_MAX, SAFE_MIN, and block_q6_K::scales.

◆ vec_dot_q4_k_q8_k_cpu()

float vec_dot_q4_k_q8_k_cpu	(	int	n,
		const std::vector< block_q4_K > &	x_vec,
		const std::vector< block_q8_K > &	y_vec,
		bool	log_this_call
	)

Computes dot product between Q4_K and Q8_K vectors on CPU.

Parameters

n	Total number of elements (must be a multiple of GGML_QK_K; the number of blocks is n / GGML_QK_K)
x_vec	Q4_K vector
y_vec	Q8_K vector
log_this_call	Whether to log computation details

Returns: Dot product result

Definition at line 922 of file quantization.cpp.

                                                {
  // Computes the dot product of a Q4_K-quantized vector and a Q8_K-quantized
  // vector of n elements (n / GGML_QK_K blocks each).
  // Throws std::runtime_error if n is not a multiple of GGML_QK_K or if
  // either vector does not contain exactly n / GGML_QK_K blocks.

  // Only the first five calls (process-wide) may emit debug output.
  const int log_count_now = g_vec_dot_q4_k_q8_k_log_count.fetch_add(1);
  if (log_count_now >= 5) log_this_call = false;

  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: n must be multiple of QK_K");
  }
  const size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q4_k_q8_k: vector block count mismatch");
  }

  const block_q4_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  float sumf = 0.0f;
  for (size_t i = 0; i < nb; ++i) {
    // Unpack the 4-bit values: low nibble first, then high nibble.
    int8_t q4_vals[GGML_QK_K];
    const uint8_t* q4 = x[i].qs;
    for (int j = 0; j < GGML_QK_K / 2; ++j) {
      q4_vals[2 * j + 0] = static_cast<int8_t>(q4[j] & 0xF);
      q4_vals[2 * j + 1] = static_cast<int8_t>(q4[j] >> 4);
    }

    const int8_t* q8 = y[i].qs;

    // Per-block fp16 factors, converted once per block.
    const float d_q4 = fp16_to_fp32(x[i].d);
    const float dmin_q4 = fp16_to_fp32(x[i].dmin);
    const float d_q8 = fp16_to_fp32(y[i].d);

    // Sum of (dequantized Q4_K weight) * (raw Q8_K integer) for this block.
    float block_sum = 0.0f;
    for (int sub = 0; sub < 16; ++sub) {
      uint8_t scale_idx, min_idx;
      get_scale_min_indices_q4_K(sub, x[i].scales, &scale_idx, &min_idx);
      const float scale = d_q4 * K_SCALE_VALUES[scale_idx];
      const float minv = dmin_q4 * K_MIN_VALUES[min_idx];
      for (int k = 0; k < 16; ++k) {
        const int idx = sub * 16 + k;
        const float q4_val = static_cast<float>(q4_vals[idx]) - 8.0f;
        const float q8_val = static_cast<float>(q8[idx]);
        block_sum += (scale * q4_val + minv) * q8_val;
      }
    }

    // The Q8_K integers represent value / d_q8, so the block's contribution
    // has to be scaled by d_q8 to obtain the real dot product.
    sumf += d_q8 * block_sum;

    if (i == 0 && log_this_call) {
      std::stringstream ss;
      ss << "[Q4K_Q8K] Block #0: d: " << d_q4
         << ", dmin: " << dmin_q4;
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q8_K input (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q8[k] << " ";
      Logger::debug(ss.str());
      ss.str("");
      ss << "[Q4K_Q8K] Block #0: Q4_K unpacked (first 16): ";
      for (int k = 0; k < 16; ++k) ss << (int)q4_vals[k] << " ";
      Logger::debug(ss.str());
      ss.str("");
    }
  }
  return sumf;
}

References Logger::debug(), fp16_to_fp32(), g_vec_dot_q4_k_q8_k_log_count, get_scale_min_indices_q4_K(), GGML_QK_K, K_MIN_VALUES, K_SCALE_VALUES, block_q4_K::qs, and block_q8_K::qs.

Referenced by matvec_q4k_q8k_cpu().

◆ vec_dot_q6_k_q8_k_cpu()

float vec_dot_q6_k_q8_k_cpu	(	int	n,
		const std::vector< block_q6_K > &	x,
		const std::vector< block_q8_K > &	y,
		bool	log_this_call
	)

Computes dot product between Q6_K and Q8_K vectors on CPU.

Parameters

n	Total number of elements (must be a multiple of GGML_QK_K; the number of blocks is n / GGML_QK_K)
x	Q6_K vector
y	Q8_K vector
log_this_call	Whether to log computation details

Returns: Dot product result

Definition at line 772 of file quantization.cpp.

                                                {
  // Computes the dot product of a Q6_K-quantized vector and a Q8_K-quantized
  // vector of n elements (n / GGML_QK_K blocks each).
  // Throws std::runtime_error if n is not a multiple of GGML_QK_K or if
  // either vector does not contain exactly n / GGML_QK_K blocks.
  if (n % GGML_QK_K != 0) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: n must be multiple of QK_K");
  }
  const size_t nb = n / GGML_QK_K;
  if (x_vec.size() != nb || y_vec.size() != nb) {
    throw std::runtime_error("vec_dot_q6_k_q8_k: vector block count mismatch");
  }

  const block_q6_K* x = x_vec.data();
  const block_q8_K* y = y_vec.data();

  int8_t aux8[GGML_QK_K];  // unpacked, already re-centred 6-bit values
  int16_t aux16[8];        // per-lane products q8 * q6
  float sums[8];           // per-lane float accumulators across blocks
  int32_t aux32[8];        // per-lane integer accumulators within one block
  std::memset(sums, 0, 8 * sizeof(float));

  float sumf = 0.0f;

  // At most five logged calls (process-wide).
  static std::atomic<int> log_count_dot = 0;
  const bool should_log_this_block = log_this_call && log_count_dot < 5;

  for (size_t i = 0; i < nb; ++i) {
    const uint8_t* ql = x[i].ql;
    const uint8_t* qh = x[i].qh;
    const int8_t* q8 = y[i].qs;
    std::memset(aux32, 0, 8 * sizeof(int32_t));

    // Rebuild the 6-bit values (4 low bits from ql, 2 high bits from qh) in
    // groups of 128 and subtract 32 so they are signed: range [-32, 31].
    int8_t* a = aux8;
    for (int j = 0; j < GGML_QK_K; j += 128) {
      for (int l = 0; l < 32; ++l) {
        a[l + 0] = static_cast<int8_t>(
            ((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32);
        a[l + 32] = static_cast<int8_t>(
            ((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32);
        a[l + 64] = static_cast<int8_t>(
            ((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32);
        a[l + 96] = static_cast<int8_t>(
            ((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
      }
      a += 128;
      ql += 64;
      qh += 32;
    }

    // Accumulate scale * q8 * q6 for each 16-element sub-block, processed as
    // two runs of 8 lanes.
    a = aux8;
    int is = 0;
    for (int j = 0; j < GGML_QK_K / 16; ++j) {
      const int scale = x[i].scales[is++];
      for (int half = 0; half < 2; ++half) {
        for (int l = 0; l < 8; ++l)
          aux16[l] = static_cast<int16_t>(q8[l]) * static_cast<int16_t>(a[l]);
        for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
        q8 += 8;
        a += 8;
      }
    }

    const float d_q6 = fp16_to_fp32(x[i].d);
    const float d_q8 = fp16_to_fp32(y[i].d);
    const float d = d_q6 * d_q8;

    // The -32 offset was already applied while unpacking into aux8, so the
    // integer sums only need the combined block scale. Subtracting a second
    // bsums-based offset term here would count the offset twice.
    float block_contribution = 0.0f;
    for (int l = 0; l < 8; ++l) {
      const float term = d * aux32[l];
      sums[l] += term;
      block_contribution += term;
    }

    if (i == 0 && should_log_this_block) {
      std::stringstream ss_log;
      ss_log << "[DOT_Q6K_Q8K] Call #" << (log_count_dot.load() + 1)
             << ", Block #0:";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Scale d_q6: " << d_q6 << " (Raw FP16: 0x" << std::hex
             << x[i].d << std::dec << ")";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q8_K Scale d_q8: " << d_q8;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Combined Scale d: " << d;
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Q6_K Sub-scales (int8): ";
      for (int k = 0; k < 16; ++k) ss_log << (int)x[i].scales[k] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Int32 Sums (aux32): ";
      for (int l = 0; l < 8; ++l) ss_log << aux32[l] << " ";
      Logger::debug(ss_log.str());
      ss_log.str("");
      ss_log << "  Block #0 Contribution to Sums: " << block_contribution;
      Logger::debug(ss_log.str());
    }
  }

  for (int l = 0; l < 8; ++l) {
    sumf += sums[l];
  }

  if (should_log_this_block) {
    log_count_dot++;
  }
  return sumf;
}

References block_q8_K::bsums, block_q6_K::d, Logger::debug(), fp16_to_fp32(), GGML_QK_K, block_q6_K::qh, block_q6_K::ql, block_q8_K::qs, and block_q6_K::scales.

Referenced by matvec_q6k_q8k_cpu().

Variable Documentation

◆ g_vec_dot_q4_k_q8_k_log_count

std::atomic<int> g_vec_dot_q4_k_q8_k_log_count {0}

static

Definition at line 45 of file quantization.cpp.

static std::atomic<int> g_vec_dot_q4_k_q8_k_log_count{0};

Referenced by vec_dot_q4_k_q8_k_cpu().

◆ K_MIN_VALUES

constexpr float K_MIN_VALUES[64]

constexpr

Initial value:

= {
    0.0f,       -0.0078125f, -0.015625f, -0.0234375f, -0.03125f,  -0.0390625f,
    -0.046875f, -0.0546875f, -0.0625f,   -0.0703125f, -0.078125f, -0.0859375f,
    -0.09375f,  -0.1015625f, -0.109375f, -0.1171875f, -0.125f,    -0.140625f,
    -0.15625f,  -0.171875f,  -0.1875f,   -0.203125f,  -0.21875f,  -0.234375f,
    -0.25f,     -0.265625f,  -0.28125f,  -0.296875f,  -0.3125f,   -0.328125f,
    -0.34375f,  -0.359375f,  -0.375f,    -0.40625f,   -0.4375f,   -0.46875f,
    -0.5f,      -0.53125f,   -0.5625f,   -0.59375f,   -0.625f,    -0.65625f,
    -0.6875f,   -0.71875f,   -0.75f,     -0.78125f,   -0.8125f,   -0.84375f,
    -0.875f,    -0.9375f,    -1.0f,      -1.0625f,    -1.125f,    -1.1875f,
    -1.25f,     -1.3125f,    -1.375f,    -1.4375f,    -1.5f,      -1.5625f,
    -1.625f,    -1.6875f,    -1.75f,     -1.8125f}

Definition at line 32 of file quantization.cpp.

                                   {
    0.0f,       -0.0078125f, -0.015625f, -0.0234375f, -0.03125f,  -0.0390625f,
    -0.046875f, -0.0546875f, -0.0625f,   -0.0703125f, -0.078125f, -0.0859375f,
    -0.09375f,  -0.1015625f, -0.109375f, -0.1171875f, -0.125f,    -0.140625f,
    -0.15625f,  -0.171875f,  -0.1875f,   -0.203125f,  -0.21875f,  -0.234375f,
    -0.25f,     -0.265625f,  -0.28125f,  -0.296875f,  -0.3125f,   -0.328125f,
    -0.34375f,  -0.359375f,  -0.375f,    -0.40625f,   -0.4375f,   -0.46875f,
    -0.5f,      -0.53125f,   -0.5625f,   -0.59375f,   -0.625f,    -0.65625f,
    -0.6875f,   -0.71875f,   -0.75f,     -0.78125f,   -0.8125f,   -0.84375f,
    -0.875f,    -0.9375f,    -1.0f,      -1.0625f,    -1.125f,    -1.1875f,
    -1.25f,     -1.3125f,    -1.375f,    -1.4375f,    -1.5f,      -1.5625f,
    -1.625f,    -1.6875f,    -1.75f,     -1.8125f};

Referenced by quantize_q4_k_m(), and vec_dot_q4_k_q8_k_cpu().

◆ K_SCALE_VALUES

constexpr float K_SCALE_VALUES[64]

constexpr

Initial value:

= {
    1.0f,  1.0625f, 1.125f, 1.1875f, 1.25f, 1.3125f, 1.375f, 1.4375f,
    1.5f,  1.5625f, 1.625f, 1.6875f, 1.75f, 1.8125f, 1.875f, 1.9375f,
    2.0f,  2.125f,  2.25f,  2.375f,  2.5f,  2.625f,  2.75f,  2.875f,
    3.0f,  3.125f,  3.25f,  3.375f,  3.5f,  3.625f,  3.75f,  3.875f,
    4.0f,  4.25f,   4.5f,   4.75f,   5.0f,  5.25f,   5.5f,   5.75f,
    6.0f,  6.25f,   6.5f,   6.75f,   7.0f,  7.25f,   7.5f,   7.75f,
    8.0f,  8.5f,    9.0f,   9.5f,    10.0f, 10.5f,   11.0f,  11.5f,
    12.0f, 12.5f,   13.0f,  13.5f,   14.0f, 14.5f,   15.0f,  15.5f}

Definition at line 22 of file quantization.cpp.

                                     {
    1.0f,  1.0625f, 1.125f, 1.1875f, 1.25f, 1.3125f, 1.375f, 1.4375f,
    1.5f,  1.5625f, 1.625f, 1.6875f, 1.75f, 1.8125f, 1.875f, 1.9375f,
    2.0f,  2.125f,  2.25f,  2.375f,  2.5f,  2.625f,  2.75f,  2.875f,
    3.0f,  3.125f,  3.25f,  3.375f,  3.5f,  3.625f,  3.75f,  3.875f,
    4.0f,  4.25f,   4.5f,   4.75f,   5.0f,  5.25f,   5.5f,   5.75f,
    6.0f,  6.25f,   6.5f,   6.75f,   7.0f,  7.25f,   7.5f,   7.75f,
    8.0f,  8.5f,    9.0f,   9.5f,    10.0f, 10.5f,   11.0f,  11.5f,
    12.0f, 12.5f,   13.0f,  13.5f,   14.0f, 14.5f,   15.0f,  15.5f};

Referenced by dequantize_q3_k(), quantize_q4_k_m(), and vec_dot_q4_k_q8_k_cpu().

Functions

Variables

Function Documentation

◆ dequantize_q2_k()

◆ dequantize_q3_k()

◆ dequantize_q4_k_m()

◆ dequantize_q6_k()

◆ dequantize_q8_0_block()

◆ dequantize_q8_k()

◆ dequantize_vector_q4k_to_f32()

◆ dequantize_vector_q6k_to_f32()

◆ dequantize_vector_q8_0_to_f32()

◆ fp16_to_fp32()

◆ fp32_to_fp16()

◆ get_scale_min_indices_q4_K()

◆ get_scale_min_k4()

◆ ggml_type_block_size()

◆ ggml_type_name()

◆ ggml_type_size()

◆ handle_i8_tensor()

◆ matvec_q4k_q8k_cpu()

◆ matvec_q6k_q8k_cpu()

◆ quantize_fp32_to_q8_K()

◆ quantize_q4_k_m()

◆ quantize_q6_k()

◆ vec_dot_q4_k_q8_k_cpu()

◆ vec_dot_q6_k_q8_k_cpu()

Variable Documentation

◆ g_vec_dot_q4_k_q8_k_log_count

◆ K_MIN_VALUES

◆ K_SCALE_VALUES