TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
utils.h File Reference
#include <vector>
#include <string>
#include <cstdint>
#include "quantization.h"
#include "model_constants.h"
Include dependency graph for utils.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

float simd_dot_product (const float *a, const float *b, int n)
 
void simd_scaled_add (float *dst, const float *src, float scale, int n)
 
uint16_t float32_to_bfloat16 (float val)
 
float bfloat16_to_float32 (uint16_t bf16)
 
std::vector< float > bfloat16_vector_to_float32 (const std::vector< uint16_t > &bf16_vec)
 
std::vector< uint16_t > uint8_vector_to_uint16_vector (const std::vector< uint8_t > &bytes, size_t numel)
 
int argmax (const std::vector< float > &v)
 
void matvec_q6k_f32_vector_cpu (const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block=false)
 
void matvec_q4k_f32_vector_cpu (const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block=false)
 
void matvec_q8_0_f32_vector_cpu (const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block=false)
 
void matvec_q8k_f32_vector_cpu (const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block=false)
 
void matvec_f32_f32_vector_cpu (const std::vector< float > &mat_f32, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
 
void matmul_q4k_f32_batch_cpu (const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
 
void matmul_q6k_f32_batch_cpu (const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
 
void matmul_q8_0_f32_batch_cpu (const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
 
void matmul_q8k_f32_batch_cpu (const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
 
void apply_rope_vector (std::vector< float > &x, int num_heads, int head_dim, int current_token_pos, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
 
void apply_rope_batch_cpu (std::vector< float > &q_batch, std::vector< float > &k_batch, int num_tokens, int num_q_heads, int num_kv_heads, int head_dim, int start_pos_in_sequence, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
 
void rmsnorm_batch_cpu (const std::vector< float > &x_batch, const std::vector< float > &weight, std::vector< float > &out_batch, int num_tokens, int hidden_size, float eps=numeric::DEFAULT_EPS)
 
void rmsnorm_vector_cpu (const std::vector< float > &x, const std::vector< float > &weight, std::vector< float > &out, float eps=numeric::DEFAULT_EPS)
 
void softmax_vector_cpu (const std::vector< float > &x, std::vector< float > &out)
 
void silu_cpu (const std::vector< float > &x, std::vector< float > &out)
 
void matmul_f32_f32_batch_cpu (const std::vector< float > &mat_weights, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
 
void matvec_bf16_f32_vector_cpu (const std::vector< uint16_t > &mat_bf16, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
 
void weighted_sum_probs_v (const std::vector< float > &probs, const std::vector< float > &V, std::vector< float > &out, int seq_len, int head_dim)
 
void calculate_attention_scores (const std::vector< float > &Q, const std::vector< float > &K, std::vector< float > &scores, int seq_len, int head_dim, float scale)
 
void log_vector_summary (const std::string &name, const std::vector< float > &v, int head_count)
 
void log_vector_summary_with_tail (const std::string &name, const std::vector< float > &v, int head_count, int tail_count)
 
void log_vector_summary_detailed (const std::string &name, const std::vector< float > &v, int current_pos, int current_layer, int N=5)
 
void log_vec_stats (const std::string &name, const std::vector< float > &v)
 
void log_raw_float_pointer (const std::string &name, const float *ptr, size_t count=5)
 
bool write_vector_to_file (const std::string &filename, const std::vector< float > &vec)
 
std::vector< std::vector< float > > load_rmsnorm_bin (const std::string &filename, int num_tokens, int hidden_size)
 
std::vector< float > bf16vec_to_float_vec (const std::vector< uint16_t > &v_bf16)
 
void dequantize_q8_k (const std::vector< block_q8_K > &q8k_vec, std::vector< float > &out_f32, int n, bool log_this_block)
 

Function Documentation

◆ apply_rope_batch_cpu()

void apply_rope_batch_cpu ( std::vector< float > &  q_batch,
std::vector< float > &  k_batch,
int  num_tokens,
int  num_q_heads,
int  num_kv_heads,
int  head_dim,
int  start_pos_in_sequence,
const std::vector< std::pair< float, float > > &  all_freqs_cis,
int  max_pos_embeddings,
bool  use_adjacent_pairing 
)

Definition at line 491 of file utils.cpp.

502 {
503 if (q_batch.size() != (size_t)num_tokens * num_q_heads * head_dim) {
504 Logger::error("apply_rope_batch_cpu: q_batch size mismatch. Expected " +
505 std::to_string((size_t)num_tokens * num_q_heads * head_dim) + ", got " + std::to_string(q_batch.size()));
506 return;
507 }
508 if (k_batch.size() != (size_t)num_tokens * num_kv_heads * head_dim) {
509 Logger::error("apply_rope_batch_cpu: k_batch size mismatch. Expected " +
510 std::to_string((size_t)num_tokens * num_kv_heads * head_dim) + ", got " + std::to_string(k_batch.size()));
511 return;
512 }
513 if (head_dim % 2 != 0) {
514 Logger::error("apply_rope_batch_cpu: head_dim must be even for RoPE.");
515 return;
516 }
517
518 for (int t = 0; t < num_tokens; ++t) {
519 int current_token_pos = start_pos_in_sequence + t;
520
521 if (current_token_pos < 0 || current_token_pos >= max_pos_embeddings) {
522 Logger::warning("[ROPE_BATCH_CPU] Token " + std::to_string(t) + " (actual_pos: " + std::to_string(current_token_pos) +
523 ") is out of range [0, " + std::to_string(max_pos_embeddings -1) + "]. Skipping RoPE for this token.");
524 continue;
525 }
526
527 for (int h = 0; h < num_q_heads; ++h) {
528 size_t head_start_offset_in_batch = ((size_t)t * num_q_heads + h) * head_dim;
529
530 for (int i = 0; i < head_dim / 2; ++i) {
531 size_t freq_idx = (size_t)current_token_pos * (head_dim / 2) + i;
532
533 if (freq_idx >= all_freqs_cis.size()) {
534 Logger::warning("[ROPE_BATCH_CPU] Q - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
535 ", DimPair " + std::to_string(i) + ": freq_idx (" + std::to_string(freq_idx) +
536 ") out of bounds for all_freqs_cis.size (" + std::to_string(all_freqs_cis.size()) + "). Skipping pair.");
537 continue;
538 }
539
540 float freq_cis_real = all_freqs_cis[freq_idx].first;
541 float freq_cis_imag = all_freqs_cis[freq_idx].second;
542
543 float val0, val1;
544 size_t idx0, idx1;
545
546 if (use_adjacent_pairing) {
547 idx0 = head_start_offset_in_batch + 2 * i;
548 idx1 = head_start_offset_in_batch + 2 * i + 1;
549 } else {
550 idx0 = head_start_offset_in_batch + i;
551 idx1 = head_start_offset_in_batch + i + head_dim / 2;
552 }
553
554 if (idx0 >= q_batch.size() || idx1 >= q_batch.size()) {
555 Logger::warning("[ROPE_BATCH_CPU] Q - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
556 ", DimPair " + std::to_string(i) + ": q_batch index out of bounds. q_batch.size(): " + std::to_string(q_batch.size()) +
557 ", idx0: " + std::to_string(idx0) + ", idx1: " + std::to_string(idx1) + ". Skipping pair.");
558 continue;
559 }
560
561 val0 = q_batch[idx0];
562 val1 = q_batch[idx1];
563
564 q_batch[idx0] = val0 * freq_cis_real - val1 * freq_cis_imag;
565 q_batch[idx1] = val0 * freq_cis_imag + val1 * freq_cis_real;
566 }
567 }
568
569 for (int h = 0; h < num_kv_heads; ++h) {
570 size_t head_start_offset_in_batch = ((size_t)t * num_kv_heads + h) * head_dim;
571
572 for (int i = 0; i < head_dim / 2; ++i) {
573 size_t freq_idx = (size_t)current_token_pos * (head_dim / 2) + i;
574
575 if (freq_idx >= all_freqs_cis.size()) {
576 Logger::warning("[ROPE_BATCH_CPU] K - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
577 ", DimPair " + std::to_string(i) + ": freq_idx (" + std::to_string(freq_idx) +
578 ") out of bounds for all_freqs_cis.size (" + std::to_string(all_freqs_cis.size()) + "). Skipping pair.");
579 continue;
580 }
581
582 float freq_cis_real = all_freqs_cis[freq_idx].first;
583 float freq_cis_imag = all_freqs_cis[freq_idx].second;
584
585 float val0, val1;
586 size_t idx0, idx1;
587
588 if (use_adjacent_pairing) {
589 idx0 = head_start_offset_in_batch + 2 * i;
590 idx1 = head_start_offset_in_batch + 2 * i + 1;
591 } else {
592 idx0 = head_start_offset_in_batch + i;
593 idx1 = head_start_offset_in_batch + i + head_dim / 2;
594 }
595
596 if (idx0 >= k_batch.size() || idx1 >= k_batch.size()) {
597 Logger::warning("[ROPE_BATCH_CPU] K - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
598 ", DimPair " + std::to_string(i) + ": k_batch index out of bounds. k_batch.size(): " + std::to_string(k_batch.size()) +
599 ", idx0: " + std::to_string(idx0) + ", idx1: " + std::to_string(idx1) + ". Skipping pair.");
600 continue;
601 }
602
603 val0 = k_batch[idx0];
604 val1 = k_batch[idx1];
605
606 k_batch[idx0] = val0 * freq_cis_real - val1 * freq_cis_imag;
607 k_batch[idx1] = val0 * freq_cis_imag + val1 * freq_cis_real;
608 }
609 }
610 }
611}
static void warning(const std::string &message)
Definition logger.cpp:139
static void error(const std::string &message)
Definition logger.cpp:143

References Logger::error(), and Logger::warning().

Referenced by CPUBatchProcessor::forward_cpu_batch().
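
A minimal usage sketch, assuming utils.h is on the include path and the project library is linked. The q_batch/k_batch layout ([token][head][dim], flattened) and the frequency index pos * head_dim/2 + i follow the code above; the base of 10000 used to build the table here is an assumption, since the model constructs all_freqs_cis elsewhere:

#include <cmath>
#include <utility>
#include <vector>
#include "utils.h"

int main() {
  const int num_tokens = 2, num_q_heads = 2, num_kv_heads = 1, head_dim = 4;
  const int max_pos = 16;

  // One (cos, sin) pair per (position, dim-pair); base 10000 is an assumption here.
  std::vector<std::pair<float, float>> freqs(max_pos * head_dim / 2);
  for (int pos = 0; pos < max_pos; ++pos)
    for (int i = 0; i < head_dim / 2; ++i) {
      float theta = pos * std::pow(10000.0f, -2.0f * i / head_dim);
      freqs[pos * (head_dim / 2) + i] = {std::cos(theta), std::sin(theta)};
    }

  // Flattened as [token][head][dim]: q carries num_q_heads per token, k carries num_kv_heads.
  std::vector<float> q(num_tokens * num_q_heads * head_dim, 1.0f);
  std::vector<float> k(num_tokens * num_kv_heads * head_dim, 1.0f);

  // Rotate the two tokens as positions 5 and 6 of the sequence, adjacent pairing.
  apply_rope_batch_cpu(q, k, num_tokens, num_q_heads, num_kv_heads, head_dim,
                       /*start_pos_in_sequence=*/5, freqs, max_pos,
                       /*use_adjacent_pairing=*/true);
  return 0;
}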

◆ apply_rope_vector()

void apply_rope_vector ( std::vector< float > &  x,
int  num_heads,
int  head_dim,
int  current_token_pos,
const std::vector< std::pair< float, float > > &  all_freqs_cis,
int  max_pos_embeddings,
bool  use_adjacent_pairing 
)

Definition at line 428 of file utils.cpp.

436 {
437 if (current_token_pos < 0 || current_token_pos >= max_pos_embeddings) {
438 return;
439 }
440 if (head_dim % 2 != 0) {
441 Logger::error("RoPE apply_rope_vector: head_dim must be even. head_dim: " + std::to_string(head_dim));
442 return;
443 }
444
445 const int dim_half = head_dim / 2;
446 size_t pos_offset = static_cast<size_t>(current_token_pos) * static_cast<size_t>(dim_half);
447
448 for (int h = 0; h < num_heads; ++h) {
449 size_t head_offset = static_cast<size_t>(h) * head_dim;
450
451 for (int i = 0; i < dim_half; ++i) {
452 size_t freq_idx = pos_offset + static_cast<size_t>(i);
453
454 if (freq_idx >= all_freqs_cis.size()) {
455 Logger::warning("RoPE apply_rope_vector: freq_idx out of bounds. pos: " +
456 std::to_string(current_token_pos) + ", head_dim/2: " + std::to_string(dim_half) +
457 ", i: " + std::to_string(i) + ", calculated freq_idx: " + std::to_string(freq_idx) +
458 ", all_freqs_cis.size(): " + std::to_string(all_freqs_cis.size()));
459 continue;
460 }
461
462 float cos_theta = all_freqs_cis[freq_idx].first;
463 float sin_theta = all_freqs_cis[freq_idx].second;
464
465 float x0_val, x1_val;
466 size_t x0_idx, x1_idx;
467
468 if (use_adjacent_pairing) {
469 x0_idx = head_offset + (2 * i);
470 x1_idx = head_offset + (2 * i + 1);
471 } else {
472 x0_idx = head_offset + i;
473 x1_idx = head_offset + i + dim_half;
474 }
475
476 if (x0_idx >= x.size() || x1_idx >= x.size()) {
477 Logger::warning("RoPE apply_rope_vector: x index out of bounds. x.size(): " + std::to_string(x.size()) +
478 ", x0_idx: " + std::to_string(x0_idx) + ", x1_idx: " + std::to_string(x1_idx));
479 continue;
480 }
481
482 x0_val = x[x0_idx];
483 x1_val = x[x1_idx];
484
485 x[x0_idx] = x0_val * cos_theta - x1_val * sin_theta;
486 x[x1_idx] = x0_val * sin_theta + x1_val * cos_theta;
487 }
488 }
489}

References Logger::error(), and Logger::warning().

Referenced by TinyLlamaModel::forward(), CPUBatchProcessor::forward_cpu_batch(), and TinyLlamaModel::forward_cpu_batch_generation().
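
A minimal single-vector sketch under the same assumptions (utils.h on the include path, frequency table indexed as pos * head_dim/2 + i). With head_dim = 2 and input (1, 0), adjacent pairing reduces to a plain 2-D rotation, so the output is (cos θ, sin θ):

#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>
#include "utils.h"

int main() {
  const int num_heads = 1, head_dim = 2, max_pos = 4;
  std::vector<std::pair<float, float>> freqs(max_pos * head_dim / 2);
  for (int pos = 0; pos < max_pos; ++pos)
    freqs[pos] = {std::cos((float)pos), std::sin((float)pos)};  // toy angles: theta = pos

  std::vector<float> x = {1.0f, 0.0f};  // one head, one (x0, x1) pair
  apply_rope_vector(x, num_heads, head_dim, /*current_token_pos=*/1, freqs,
                    max_pos, /*use_adjacent_pairing=*/true);

  std::printf("%f %f\n", x[0], x[1]);  // prints cos(1) sin(1)
  return 0;
}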

◆ argmax()

int argmax ( const std::vector< float > &  v)

Definition at line 185 of file utils.cpp.

185 {
186 if (v.empty()) {
187 Logger::error("Cannot perform argmax on empty vector");
188 return -1;
189 }
190 auto max_it = std::max_element(v.begin(), v.end());
191 float max_val = *max_it;
192 int max_idx = std::distance(v.begin(), max_it);
193 Logger::debug("[ARGMAX HELPER] Max value found: " + std::to_string(max_val) +
194 " at index: " + std::to_string(max_idx));
195 return max_idx;
196}
static void debug(const std::string &message)
Definition logger.cpp:131
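
A minimal usage sketch (assumes utils.h is on the include path). The function returns the index of the largest element, or -1 for an empty vector:

#include <cassert>
#include <vector>
#include "utils.h"

int main() {
  std::vector<float> logits = {0.1f, 2.5f, -1.0f, 2.4f};
  assert(argmax(logits) == 1);  // index of the largest value

  std::vector<float> empty;
  assert(argmax(empty) == -1);  // empty input is reported as -1 (and logged as an error)
  return 0;
}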

◆ bf16vec_to_float_vec()

std::vector< float > bf16vec_to_float_vec ( const std::vector< uint16_t > &  v_bf16)

◆ bfloat16_to_float32()

float bfloat16_to_float32 ( uint16_t  bf16)

Definition at line 144 of file utils.cpp.

144 {
145 if (bf16 == bfloat16::ZERO) return 0.0f;
146 if (bf16 == bfloat16::NEG_ZERO) return -0.0f;
147
148 bool is_nan = ((bf16 & bfloat16::EXPONENT_MASK) == bfloat16::EXPONENT_MASK) &&
149 ((bf16 & bfloat16::MANTISSA_MASK) != 0);
150 if (is_nan) return std::numeric_limits<float>::quiet_NaN();
151
 152 if (((bf16 & bfloat16::EXPONENT_MASK) == bfloat16::EXPONENT_MASK) &&
 153 (bf16 & bfloat16::MANTISSA_MASK) == 0) {
154 return (bf16 & bfloat16::SIGN_BIT) ? -std::numeric_limits<float>::infinity()
155 : std::numeric_limits<float>::infinity();
156 }
157
158 uint32_t bits = static_cast<uint32_t>(bf16) << bfloat16::SHIFT_BITS;
159 float result;
160 std::memcpy(&result, &bits, sizeof(float));
161
162 return result;
163}
constexpr uint16_t ZERO
constexpr uint16_t SIGN_BIT
constexpr uint16_t NEG_ZERO
constexpr uint16_t EXPONENT_MASK
constexpr uint16_t MANTISSA_MASK
constexpr int SHIFT_BITS

Referenced by bf16vec_to_float_vec(), bfloat16_vector_to_float32(), and matvec_bf16_f32_vector_cpu().
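
A minimal round-trip sketch with float32_to_bfloat16() and bfloat16_vector_to_float32() (assumes utils.h is on the include path). bfloat16 keeps the sign and the full 8-bit exponent but only 7 mantissa bits, so the round trip preserves magnitude at reduced precision:

#include <cstdint>
#include <cstdio>
#include <vector>
#include "utils.h"

int main() {
  float original = 3.14159f;
  uint16_t bf = float32_to_bfloat16(original);  // round-to-nearest-even truncation to 16 bits
  float back = bfloat16_to_float32(bf);         // widen back to float32
  std::printf("%f -> 0x%04x -> %f\n", original, (unsigned)bf, back);

  std::vector<uint16_t> packed = {bf, float32_to_bfloat16(1.0f)};
  std::vector<float> unpacked = bfloat16_vector_to_float32(packed);  // {~3.14, 1.0}
  return 0;
}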

◆ bfloat16_vector_to_float32()

std::vector< float > bfloat16_vector_to_float32 ( const std::vector< uint16_t > &  bf16_vec)

Definition at line 165 of file utils.cpp.

165 {
166 std::vector<float> f32_vec(bf16_vec.size());
167
168#pragma omp parallel for
169 for (int64_t i = 0; i < static_cast<int64_t>(bf16_vec.size()); ++i) {
170 f32_vec[i] = bfloat16_to_float32(bf16_vec[i]);
171 }
172
173 return f32_vec;
174}

References bfloat16_to_float32().

◆ calculate_attention_scores()

void calculate_attention_scores ( const std::vector< float > &  Q,
const std::vector< float > &  K,
std::vector< float > &  scores,
int  seq_len,
int  head_dim,
float  scale 
)

Definition at line 1091 of file utils.cpp.

1094 {
1095 if (Q.empty() || K.empty()) return;
1096 scores.resize(seq_len);
1097
1098 scale = std::clamp(scale, attention::MIN_SCALE, attention::MAX_SCALE);
1099 float effective_scale = scale * attention::ATTENTION_SCALE_BASE;
1100
1101#pragma omp parallel for collapse(1)
1102 for (int64_t i = 0; i < static_cast<int64_t>(seq_len); ++i) {
1103 double dot_product = 0.0;
1104 double c_kahan = 0.0;
1105 size_t k_offset = static_cast<size_t>(i) * head_dim;
1106
1107 for (int j = 0; j < head_dim; ++j) {
1108 double term = static_cast<double>(Q[j]) * static_cast<double>(K[k_offset + j]);
1109 double y = term - c_kahan;
1110 double t_sum = dot_product + y;
1111 c_kahan = (t_sum - dot_product) - y;
1112 dot_product = t_sum;
1113 }
1114
1115 scores[i] = static_cast<float>(dot_product * effective_scale);
1116 }
1117}
constexpr float MIN_SCALE
constexpr float ATTENTION_SCALE_BASE
constexpr float MAX_SCALE

References attention::ATTENTION_SCALE_BASE, attention::MAX_SCALE, and attention::MIN_SCALE.
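
A minimal usage sketch (assumes utils.h is on the include path). Q holds one query of head_dim values, K holds seq_len stacked keys, and scores is resized to seq_len. Passing 1/sqrt(head_dim) as scale follows the usual attention convention, but note the function additionally clamps the value and multiplies by attention::ATTENTION_SCALE_BASE (constants from model_constants.h, not shown here):

#include <cmath>
#include <vector>
#include "utils.h"

int main() {
  const int seq_len = 3, head_dim = 4;
  std::vector<float> Q(head_dim, 0.5f);             // query for the current position
  std::vector<float> K(seq_len * head_dim, 0.25f);  // keys for every cached position
  std::vector<float> scores;                        // resized to seq_len by the call

  calculate_attention_scores(Q, K, scores, seq_len, head_dim,
                             1.0f / std::sqrt((float)head_dim));
  return 0;
}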

◆ dequantize_q8_k()

void dequantize_q8_k ( const std::vector< block_q8_K > &  q8k_vec,
std::vector< float > &  out_f32,
int  n,
bool  log_this_block 
)

Definition at line 1009 of file quantization.cpp.

1010 {
1011 if (n % GGML_QK_K != 0) {
1012 std::cerr
1013 << "Error: n must be a multiple of GGML_QK_K for Q8_K dequantization."
1014 << std::endl;
1015 return;
1016 }
1017 size_t num_blocks = n / GGML_QK_K;
1018 if (q_data.size() < num_blocks) {
1019 std::cerr << "Error: Not enough Q8_K blocks provided for dequantization."
1020 << std::endl;
1021 return;
1022 }
1023
1024 static std::atomic<int> log_count_q8k_dequant_scales = 0;
1025
1026 for (size_t i = 0; i < num_blocks; ++i) {
1027 const block_q8_K* qblock = &q_data[i];
1028 float* x_block = &x[i * GGML_QK_K];
1029
1030 const float d = fp16_to_fp32(qblock->d, true);
1031
1032 if (log_this_block && log_count_q8k_dequant_scales < 10) {
1033 std::stringstream scale_log_ss;
1034 scale_log_ss << "[Q8K_DEQUANT_SCALES] Block #"
1035 << (log_count_q8k_dequant_scales.load()) << " Raw_d_fp16=0x"
1036 << std::hex << qblock->d << std::dec << " -> d=" << d;
1037 Logger::debug(scale_log_ss.str());
1038 log_count_q8k_dequant_scales++;
1039 }
1040
1041 for (int j = 0; j < GGML_QK_K; ++j) {
1042 x_block[j] = d * static_cast<float>(qblock->qs[j]);
1043 }
1044 }
1045}
constexpr size_t GGML_QK_K
Block size constants for different quantization formats.
Definition gguf_parser.h:42
float fp16_to_fp32(uint16_t h, bool is_gguf_scale_field)
Converts a 16-bit floating point number to 32-bit float.
8-bit K-quantized block structure with block sums
int8_t qs[GGML_QK_K]
uint16_t d

References block_q8_K::d, Logger::debug(), fp16_to_fp32(), GGML_QK_K, and block_q8_K::qs.

Referenced by TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_embed_tokens_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_lm_head_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), TinyLlamaModel::initialize_weights(), and matvec_q8k_f32_vector_cpu().

◆ float32_to_bfloat16()

uint16_t float32_to_bfloat16 ( float  val)

Definition at line 136 of file utils.cpp.

136 {
137 uint32_t bits;
138 std::memcpy(&bits, &val, sizeof(float));
139
140 bits += 0x7FFF + ((bits >> 16) & 1);
141 return static_cast<uint16_t>(bits >> 16);
142}

Referenced by TinyLlamaModel::initialize_gpu_and_rope().

◆ load_rmsnorm_bin()

std::vector< std::vector< float > > load_rmsnorm_bin ( const std::string &  filename,
int  num_tokens,
int  hidden_size 
)

Definition at line 1157 of file utils.cpp.

1157 {
1158 std::ifstream infile(filename, std::ios::binary);
1159 if (!infile) throw std::runtime_error("Failed to open " + filename);
1160 std::vector<float> flat(num_tokens * hidden_size);
1161 infile.read(reinterpret_cast<char*>(flat.data()),
1162 flat.size() * sizeof(float));
1163 if (!infile)
1164 throw std::runtime_error("Failed to read all data from " + filename);
1165 std::vector<std::vector<float>> result(num_tokens,
1166 std::vector<float>(hidden_size));
1167 for (int t = 0; t < num_tokens; ++t) {
1168 for (int h = 0; h < hidden_size; ++h) {
1169 result[t][h] = flat[t * hidden_size + h];
1170 }
1171 }
1172 return result;
1173}

◆ log_raw_float_pointer()

void log_raw_float_pointer ( const std::string &  name,
const float *  ptr,
size_t  count = 5 
)

Definition at line 1175 of file utils.cpp.

1175 {
1176 if (!ptr) {
1177 Logger::info(name + ": NULL POINTER");
1178 return;
1179 }
1180 std::stringstream ss;
1181 ss << name << ": [";
1182 for (size_t i = 0; i < count; ++i) {
1183 if (i > 0) ss << ", ";
1184 ss << std::fixed << std::setprecision(6) << ptr[i];
1185 }
1186 ss << "]";
1187 Logger::info(ss.str());
1188}
static void info(const std::string &message)
Definition logger.cpp:135

References Logger::info().

◆ log_vec_stats()

void log_vec_stats ( const std::string &  name,
const std::vector< float > &  v 
)

Definition at line 1119 of file utils.cpp.

1119 {
1120 if (v.empty()) {
1121 Logger::info(name + ": EMPTY VECTOR");
1122 return;
1123 }
1124 float minv = *std::min_element(v.begin(), v.end());
1125 float maxv = *std::max_element(v.begin(), v.end());
1126 float mean = std::accumulate(v.begin(), v.end(), 0.0f) / v.size();
1127 bool all_finite =
1128 std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
1129 Logger::info(name + ": min=" + std::to_string(minv) + ", max=" +
1130 std::to_string(maxv) + ", mean=" + std::to_string(mean) +
1131 ", all_finite=" + (all_finite ? "yes" : "no"));
1132}

References Logger::info().

◆ log_vector_summary()

void log_vector_summary ( const std::string &  name,
const std::vector< float > &  v,
int  head_count 
)

Definition at line 207 of file utils.cpp.

207 {
208 if (v.empty()) {
209 Logger::info(name + ": EMPTY");
210 return;
211 }
212 std::stringstream ss;
213 size_t actual_head_count = SAFE_MIN(static_cast<size_t>(head_count), v.size());
214
215 ss << name << ": size=" << v.size();
216
217 if (actual_head_count > 0) {
218 ss << ", first " << actual_head_count << ": [";
219 for (size_t i = 0; i < actual_head_count; ++i) {
220 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(4) << v[i];
221 }
222 ss << "]";
223 }
224 float minv = *std::min_element(v.begin(), v.end());
225 float maxv = *std::max_element(v.begin(), v.end());
226 double sum = std::accumulate(v.begin(), v.end(), 0.0);
227 float mean = sum / v.size();
228 bool all_finite = std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
229 ss << ", min=" << minv << ", max=" << maxv << ", mean=" << mean
230 << ", finite=" << (all_finite ? "yes" : "NO");
231 Logger::info(ss.str());
232}
#define SAFE_MIN(a, b)

◆ log_vector_summary_detailed()

void log_vector_summary_detailed ( const std::string &  name,
const std::vector< float > &  v,
int  current_pos,
int  current_layer,
int  N = 5 
)

Definition at line 1190 of file utils.cpp.

1192 {
1193 if (v.empty()) {
1194 Logger::info(name + ": EMPTY");
1195 return;
1196 }
1197
1198 std::stringstream ss;
1199 ss << "[POS=" << current_pos << " LAYER=" << current_layer << "] " << name;
1200 ss << ": size=" << v.size();
1201
1202 size_t actual_N = SAFE_MIN(static_cast<size_t>(N), v.size());
1203 if (actual_N > 0) {
1204 ss << ", first " << actual_N << ": [";
1205 for (size_t i = 0; i < actual_N; ++i) {
1206 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(6) << v[i];
1207 }
1208 ss << "]";
1209 }
1210
1211 float minv = *std::min_element(v.begin(), v.end());
1212 float maxv = *std::max_element(v.begin(), v.end());
1213 double sum = std::accumulate(v.begin(), v.end(), 0.0);
1214 float mean = sum / v.size();
1215 bool all_finite = std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
1216
1217 ss << ", min=" << minv << ", max=" << maxv << ", mean=" << mean
1218 << ", finite=" << (all_finite ? "yes" : "NO");
1219 Logger::info(ss.str());
1220}

References Logger::info(), and SAFE_MIN.

◆ log_vector_summary_with_tail()

void log_vector_summary_with_tail ( const std::string &  name,
const std::vector< float > &  v,
int  head_count,
int  tail_count 
)

Definition at line 234 of file utils.cpp.

235 {
236 if (v.empty()) {
237 Logger::info(name + ": EMPTY");
238 return;
239 }
240 std::stringstream ss;
241
242 size_t actual_head_count = SAFE_MIN(static_cast<size_t>(head_count), v.size());
243 size_t actual_tail_count = SAFE_MIN(static_cast<size_t>(tail_count), v.size());
244 size_t total_shown = actual_head_count + actual_tail_count;
245 bool overlap = total_shown > v.size();
246 if (overlap) {
247 actual_tail_count = v.size() - actual_head_count;
248 if (actual_tail_count > SAFE_MIN(static_cast<size_t>(tail_count), v.size())) {
249 actual_tail_count = SAFE_MIN(static_cast<size_t>(tail_count), v.size());
250 }
251 if (tail_count > 0 && actual_head_count == v.size()) {
252 actual_tail_count = 0;
253 }
254 }
255 size_t tail_start_index = v.size() - actual_tail_count;
256
257 ss << name << ": size=" << v.size();
258
259 if (actual_head_count > 0) {
260 ss << ", first " << actual_head_count << ": [";
261 for (size_t i = 0; i < actual_head_count; ++i) {
262 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(4) << v[i];
263 }
264 ss << "]";
265 }
266
267 if (actual_tail_count > 0 && tail_start_index >= actual_head_count) {
268 ss << ", last " << actual_tail_count << ": [";
269 for (size_t i = 0; i < actual_tail_count; ++i) {
270 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(4)
271 << v[tail_start_index + i];
272 }
273 ss << "]";
274 } else if (overlap && tail_count > 0 && actual_head_count < v.size()) {
275 ss << " (... tail overlaps head ...)";
276 }
277
278 float minv = *std::min_element(v.begin(), v.end());
279 float maxv = *std::max_element(v.begin(), v.end());
280 double sum = std::accumulate(v.begin(), v.end(), 0.0);
281 float mean = sum / v.size();
282 bool all_finite = std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
283 ss << ", min=" << minv << ", max=" << maxv << ", mean=" << mean
284 << ", finite=" << (all_finite ? "yes" : "NO");
285 Logger::info(ss.str());
286}

References Logger::info(), and SAFE_MIN.

◆ matmul_f32_f32_batch_cpu()

void matmul_f32_f32_batch_cpu ( const std::vector< float > &  mat_weights,
const std::vector< float > &  batch_input_activations,
std::vector< float > &  batch_output_activations,
int  num_tokens,
int  output_dim,
int  input_dim 
)

Definition at line 709 of file utils.cpp.

716 {
717 if (mat_weights.empty() || batch_input_activations.empty()) {
718 Logger::error("[MATMUL_F32_BATCH_CPU] Input matrix or batch_input_activations is empty.");
719 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
720 return;
721 }
722 if (mat_weights.size() != (size_t)output_dim * input_dim) {
723 Logger::error("[MATMUL_F32_BATCH_CPU] Matrix dimensions mismatch. Expected " +
724 std::to_string((size_t)output_dim * input_dim) + ", got " +
725 std::to_string(mat_weights.size()));
726 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
727 return;
728 }
729 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
731 "[MATMUL_F32_BATCH_CPU] Batch input activations dimension mismatch. Expected " +
732 std::to_string((size_t)num_tokens * input_dim) + ", got " +
733 std::to_string(batch_input_activations.size()));
734 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
735 return;
736 }
737
738 batch_output_activations.resize((size_t)num_tokens * output_dim);
739
740#pragma omp parallel for schedule(static)
741 for (int t = 0; t < num_tokens; ++t) {
742 size_t input_token_offset = (size_t)t * input_dim;
743 size_t output_token_offset = (size_t)t * output_dim;
744
745 for (int o = 0; o < output_dim; ++o) {
746 double k_sum = 0.0;
747 double k_c = 0.0;
748 size_t weight_row_offset = (size_t)o * input_dim;
749
750 for (int i = 0; i < input_dim; ++i) {
751 double term = static_cast<double>(mat_weights[weight_row_offset + i]) *
752 static_cast<double>(batch_input_activations[input_token_offset + i]);
753 double y = term - k_c;
754 double t_sum = k_sum + y;
755 k_c = (t_sum - k_sum) - y;
756 k_sum = t_sum;
757 }
758 batch_output_activations[output_token_offset + o] = static_cast<float>(k_sum);
759 }
760 }
761}

References Logger::error().

Referenced by CPUBatchProcessor::forward_cpu_batch(), TinyLlamaModel::forward_cpu_batch_generation(), and TinyLlamaModel::forward_cpu_logits_batch().
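
A minimal usage sketch (assumes utils.h is on the include path). Weights are row-major with output_dim rows of input_dim columns, activations are flattened token-major, and every token is multiplied by the same matrix:

#include <vector>
#include "utils.h"

int main() {
  const int num_tokens = 2, input_dim = 3, output_dim = 2;

  std::vector<float> W = {1, 0, 0,   // row 0 picks the first input feature
                          0, 1, 0};  // row 1 picks the second
  std::vector<float> x = {1, 2, 3,   // token 0
                          4, 5, 6};  // token 1
  std::vector<float> y;              // filled with num_tokens * output_dim values

  matmul_f32_f32_batch_cpu(W, x, y, num_tokens, output_dim, input_dim);
  // y == {1, 2,  4, 5}
  return 0;
}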

◆ matmul_q4k_f32_batch_cpu()

void matmul_q4k_f32_batch_cpu ( const std::vector< block_q4_K > &  mat_q4k,
const std::vector< float > &  batch_input_activations,
std::vector< float > &  batch_output_activations,
int  num_tokens,
int  output_dim,
int  input_dim 
)

Definition at line 988 of file utils.cpp.

995 {
996 if (mat_q4k.empty() || batch_input_activations.empty()) {
997 Logger::error("[MATMUL_Q4K_BATCH_CPU] Input matrix or batch_input_activations is empty.");
998 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
999 return;
1000 }
1001 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
1002 Logger::error("[MATMUL_Q4K_BATCH_CPU] batch_input_activations size mismatch. Expected " +
1003 std::to_string((size_t)num_tokens * input_dim) + ", got " +
1004 std::to_string(batch_input_activations.size()));
1005 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
1006 return;
1007 }
1008
1009 batch_output_activations.resize((size_t)num_tokens * output_dim);
1010
1011#pragma omp parallel for
1012 for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
1013 std::vector<float> current_token_input(input_dim);
1014 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
1015 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
1016
1017 std::vector<float> current_token_output(output_dim);
1018 matvec_q4k_f32_vector_cpu(mat_q4k, current_token_input, current_token_output, output_dim, input_dim, false);
1019
1020 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
1021 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
1022 }
1023}
void matvec_q4k_f32_vector_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:816

References Logger::error(), and matvec_q4k_f32_vector_cpu().

Referenced by CPUBatchProcessor::forward_cpu_batch(), TinyLlamaModel::forward_cpu_batch_generation(), and TinyLlamaModel::forward_cpu_logits_batch().

◆ matmul_q6k_f32_batch_cpu()

void matmul_q6k_f32_batch_cpu ( const std::vector< block_q6_K > &  mat_q6k,
const std::vector< float > &  batch_input_activations,
std::vector< float > &  batch_output_activations,
int  num_tokens,
int  output_dim,
int  input_dim 
)

Definition at line 950 of file utils.cpp.

957 {
958 if (mat_q6k.empty() || batch_input_activations.empty()) {
959 Logger::error("[MATMUL_Q6K_BATCH_CPU] Input matrix or batch_input_activations is empty.");
960 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
961 return;
962 }
963
964 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
965 Logger::error("[MATMUL_Q6K_BATCH_CPU] batch_input_activations size mismatch. Expected " +
966 std::to_string((size_t)num_tokens * input_dim) + ", got " +
967 std::to_string(batch_input_activations.size()));
968 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
969 return;
970 }
971
972 batch_output_activations.resize((size_t)num_tokens * output_dim);
973
974#pragma omp parallel for
975 for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
976 std::vector<float> current_token_input(input_dim);
977 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
978 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
979
980 std::vector<float> current_token_output(output_dim);
981 matvec_q6k_f32_vector_cpu(mat_q6k, current_token_input, current_token_output, output_dim, input_dim, false);
982
983 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
984 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
985 }
986}
void matvec_q6k_f32_vector_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:763

References Logger::error(), and matvec_q6k_f32_vector_cpu().

Referenced by CPUBatchProcessor::forward_cpu_batch(), TinyLlamaModel::forward_cpu_batch_generation(), and TinyLlamaModel::forward_cpu_logits_batch().

◆ matmul_q8_0_f32_batch_cpu()

void matmul_q8_0_f32_batch_cpu ( const std::vector< block_q8_0 > &  mat_q8_0,
const std::vector< float > &  batch_input_activations,
std::vector< float > &  batch_output_activations,
int  num_tokens,
int  output_dim,
int  input_dim 
)

Definition at line 869 of file utils.cpp.

876 {
877 if (mat_q8_0.empty() || batch_input_activations.empty()) {
878 Logger::error("[MATMUL_Q8_0_BATCH_CPU] Input matrix or batch_input_activations is empty.");
879 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
880 return;
881 }
882
883 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
884 Logger::error("[MATMUL_Q8_0_BATCH_CPU] batch_input_activations size mismatch. Expected " +
885 std::to_string((size_t)num_tokens * input_dim) + ", got " +
886 std::to_string(batch_input_activations.size()));
887 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
888 return;
889 }
890
891 batch_output_activations.resize((size_t)num_tokens * output_dim);
892
893#pragma omp parallel for
894 for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
895 std::vector<float> current_token_input(input_dim);
896 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
897 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
898
899 std::vector<float> current_token_output(output_dim);
900 matvec_q8_0_f32_vector_cpu(mat_q8_0, current_token_input, current_token_output, output_dim, input_dim, false);
901
902 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
903 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
904 }
905}
void matvec_q8_0_f32_vector_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:293

References Logger::error(), and matvec_q8_0_f32_vector_cpu().

Referenced by CPUBatchProcessor::forward_cpu_batch(), TinyLlamaModel::forward_cpu_batch_generation(), and TinyLlamaModel::forward_cpu_logits_batch().

◆ matmul_q8k_f32_batch_cpu()

void matmul_q8k_f32_batch_cpu ( const std::vector< block_q8_K > &  mat_q8k,
const std::vector< float > &  batch_input_activations,
std::vector< float > &  batch_output_activations,
int  num_tokens,
int  output_dim,
int  input_dim 
)

Definition at line 907 of file utils.cpp.

914 {
915 if (input_dim % GGML_QK_K != 0) {
916 throw std::runtime_error("matmul_q8k_f32_batch_cpu: input_dim (" + std::to_string(input_dim) +
917 ") must be divisible by GGML_QK_K (" + std::to_string(GGML_QK_K) + ")");
918 }
919
920 size_t expected_input_size = (size_t)num_tokens * input_dim;
921 if (batch_input_activations.size() != expected_input_size) {
922 throw std::runtime_error("matmul_q8k_f32_batch_cpu: batch_input_activations size mismatch. Expected " +
923 std::to_string(expected_input_size) + ", got " + std::to_string(batch_input_activations.size()));
924 }
925
926 size_t num_blocks_per_row = input_dim / GGML_QK_K;
927 size_t total_blocks_expected = (size_t)output_dim * num_blocks_per_row;
928 if (mat_q8k.size() != total_blocks_expected) {
929 throw std::runtime_error("matmul_q8k_f32_batch_cpu: mat_q8k size mismatch. Expected " +
930 std::to_string(total_blocks_expected) + " blocks, got " + std::to_string(mat_q8k.size()));
931 }
932
933 batch_output_activations.resize((size_t)num_tokens * output_dim);
934
935 for (int t = 0; t < num_tokens; ++t) {
936 std::vector<float> current_token_input(input_dim);
937 for (int i = 0; i < input_dim; ++i) {
938 current_token_input[i] = batch_input_activations[t * input_dim + i];
939 }
940
941 std::vector<float> current_token_output(output_dim);
942 matvec_q8k_f32_vector_cpu(mat_q8k, current_token_input, current_token_output, output_dim, input_dim, false);
943
944 for (int i = 0; i < output_dim; ++i) {
945 batch_output_activations[t * output_dim + i] = current_token_output[i];
946 }
947 }
948}
void matvec_q8k_f32_vector_cpu(const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:399

References GGML_QK_K, and matvec_q8k_f32_vector_cpu().

◆ matvec_bf16_f32_vector_cpu()

void matvec_bf16_f32_vector_cpu ( const std::vector< uint16_t > &  mat_bf16,
const std::vector< float > &  vec_f32,
std::vector< float > &  out_f32,
int  rows,
int  cols 
)

Definition at line 1025 of file utils.cpp.

1027 {
1028 if (mat_bf16.size() != (size_t)rows * cols ||
1029 vec_f32.size() != (size_t)cols) {
1030 Logger::error("matvec_bf16_f32_vector_cpu: Size mismatch. Mat: " +
1031 std::to_string(mat_bf16.size()) + " (Expected " +
1032 std::to_string(rows * cols) +
1033 "), Vec: " + std::to_string(vec_f32.size()) + " (Expected " +
1034 std::to_string(cols) + ")");
1035 out_f32.assign(rows, 0.0f);
1036 return;
1037 }
1038 out_f32.resize(rows);
1039
1040#pragma omp parallel for
1041 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
1042 double sum = 0.0;
1043 double c = 0.0;
1044 size_t row_offset = r * cols;
1045
1046 for (int c_idx = 0; c_idx < cols; ++c_idx) {
1047 float weight = bfloat16_to_float32(mat_bf16[row_offset + c_idx]);
1048 double term =
1049 static_cast<double>(weight) * static_cast<double>(vec_f32[c_idx]);
1050
1051 double y = term - c;
1052 double t = sum + y;
1053 c = (t - sum) - y;
1054 sum = t;
1055 }
1056 out_f32[r] = static_cast<float>(sum);
1057 }
1058}

References bfloat16_to_float32(), and Logger::error().

Referenced by TinyLlamaModel::forward().

◆ matvec_f32_f32_vector_cpu()

void matvec_f32_f32_vector_cpu ( const std::vector< float > &  mat_f32,
const std::vector< float > &  vec_f32,
std::vector< float > &  out_f32,
int  rows,
int  cols 
)

Definition at line 349 of file utils.cpp.

352 {
353 if (mat_f32.empty() || vec_f32.empty()) {
355 "matvec_f32_f32_vector_cpu: Input matrix or vector is empty.");
356 out_f32.assign(rows, 0.0f);
357 return;
358 }
359 if (mat_f32.size() != (size_t)rows * cols) {
361 "matvec_f32_f32_vector_cpu: Matrix dimensions mismatch. Expected " +
362 std::to_string((size_t)rows * cols) + ", got " +
363 std::to_string(mat_f32.size()));
364 out_f32.assign(rows, 0.0f);
365 return;
366 }
367 if (vec_f32.size() != (size_t)cols) {
369 "matvec_f32_f32_vector_cpu: Vector dimension mismatch. Expected " +
370 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
371 out_f32.assign(rows, 0.0f);
372 return;
373 }
374
375 out_f32.resize(rows);
376
377#pragma omp parallel for schedule(static)
378 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
379 float sum = 0.0f;
380 size_t row_offset = static_cast<size_t>(r) * cols;
381
382 const float* mat_row_ptr = mat_f32.data() + row_offset;
383 const float* vec_ptr = vec_f32.data();
384
385 double k_sum = 0.0;
386 double k_c = 0.0;
387
388 for (int c = 0; c < cols; ++c) {
389 double term = static_cast<double>(mat_row_ptr[c]) * static_cast<double>(vec_ptr[c]);
390 double y = term - k_c;
391 double t_sum = k_sum + y;
392 k_c = (t_sum - k_sum) - y;
393 k_sum = t_sum;
394 }
395 out_f32[r] = static_cast<float>(k_sum);
396 }
397}

References Logger::error().

Referenced by TinyLlamaModel::forward(), and matvec_q8k_f32_vector_cpu().
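
A minimal usage sketch (assumes utils.h is on the include path). The matrix is row-major, so out[r] is the Kahan-compensated dot product of row r with the input vector:

#include <vector>
#include "utils.h"

int main() {
  const int rows = 2, cols = 3;
  std::vector<float> mat = {1, 2, 3,
                            4, 5, 6};
  std::vector<float> vec = {1, 1, 1};
  std::vector<float> out;  // resized to rows by the call

  matvec_f32_f32_vector_cpu(mat, vec, out, rows, cols);
  // out == {6, 15}
  return 0;
}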

◆ matvec_q4k_f32_vector_cpu()

void matvec_q4k_f32_vector_cpu ( const std::vector< block_q4_K > &  mat_q4k,
const std::vector< float > &  vec_f32,
std::vector< float > &  out_f32,
int  rows,
int  cols,
bool  log_first_block = false 
)

Definition at line 816 of file utils.cpp.

819 {
820 if (cols % GGML_QK_K != 0) {
821 throw std::runtime_error(
822 "matvec_q4k_f32_vector_cpu: cols (" + std::to_string(cols) +
823 ") must be divisible by GGML_QK_K (" + std::to_string(GGML_QK_K) + ")");
824 }
825 if (vec_f32.size() != cols) {
826 throw std::runtime_error(
827 "matvec_q4k_f32_vector_cpu: vec_f32 size mismatch. Expected " +
828 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
829 }
830 size_t num_blocks_per_row = cols / GGML_QK_K;
831 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
832 if (mat_q4k.size() != total_blocks_expected) {
833 throw std::runtime_error(
834 "matvec_q4k_f32_vector_cpu: mat_q4k size mismatch. Expected " +
835 std::to_string(total_blocks_expected) + " blocks, got " +
836 std::to_string(mat_q4k.size()));
837 }
838
839 out_f32.resize(rows);
840 float dequantized_block[GGML_QK_K];
841
842#pragma omp parallel for private(dequantized_block)
843 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
844 double row_sum = 0.0;
845 double kahan_c = 0.0;
846
847 size_t block_row_offset = r * num_blocks_per_row;
848
849 for (size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
850 const block_q4_K* qblock = &mat_q4k[block_row_offset + block_col_idx];
851 bool enable_dequant_log = log_first_block && (r == 0 && block_col_idx == 0);
852 dequantize_q4_k_m(qblock, dequantized_block, GGML_QK_K, enable_dequant_log);
853
854 size_t vec_offset = block_col_idx * GGML_QK_K;
855 for (int i = 0; i < GGML_QK_K; ++i) {
856 double term = static_cast<double>(dequantized_block[i]) *
857 static_cast<double>(vec_f32[vec_offset + i]);
858
859 double y = term - kahan_c;
860 double t = row_sum + y;
861 kahan_c = (t - row_sum) - y;
862 row_sum = t;
863 }
864 }
865 out_f32[r] = static_cast<float>(row_sum);
866 }
867}
void dequantize_q4_k_m(const block_q4_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
4-bit K-quantized block structure

References dequantize_q4_k_m(), and GGML_QK_K.

Referenced by TinyLlamaModel::forward(), and matmul_q4k_f32_batch_cpu().

◆ matvec_q6k_f32_vector_cpu()

void matvec_q6k_f32_vector_cpu ( const std::vector< block_q6_K > &  mat_q6k,
const std::vector< float > &  vec_f32,
std::vector< float > &  out_f32,
int  rows,
int  cols,
bool  log_first_block = false 
)

Definition at line 763 of file utils.cpp.

766 {
767 if (cols % GGML_QK_K != 0) {
768 throw std::runtime_error(
769 "matvec_q6k_f32_vector_cpu: cols (" + std::to_string(cols) +
770 ") must be divisible by GGML_QK_K (" + std::to_string(GGML_QK_K) + ")");
771 }
772 if (vec_f32.size() != cols) {
773 throw std::runtime_error(
774 "matvec_q6k_f32_vector_cpu: vec_f32 size mismatch. Expected " +
775 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
776 }
777 size_t num_blocks_per_row = cols / GGML_QK_K;
778 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
779 if (mat_q6k.size() != total_blocks_expected) {
780 throw std::runtime_error(
781 "matvec_q6k_f32_vector_cpu: mat_q6k size mismatch. Expected " +
782 std::to_string(total_blocks_expected) + " blocks, got " +
783 std::to_string(mat_q6k.size()));
784 }
785
786 out_f32.resize(rows);
787 float dequantized_block[GGML_QK_K];
788
789#pragma omp parallel for private(dequantized_block)
790 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
791 double row_sum = 0.0;
792 double kahan_c = 0.0;
793
794 size_t block_row_offset = r * num_blocks_per_row;
795
796 for (size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
797 const block_q6_K* qblock = &mat_q6k[block_row_offset + block_col_idx];
798 bool enable_dequant_log = log_first_block && (r == 0 && block_col_idx == 0);
799 dequantize_q6_k(qblock, dequantized_block, GGML_QK_K);
800
801 size_t vec_offset = block_col_idx * GGML_QK_K;
802 for (int i = 0; i < GGML_QK_K; ++i) {
803 double term = static_cast<double>(dequantized_block[i]) *
804 static_cast<double>(vec_f32[vec_offset + i]);
805
806 double y = term - kahan_c;
807 double t = row_sum + y;
808 kahan_c = (t - row_sum) - y;
809 row_sum = t;
810 }
811 }
812 out_f32[r] = static_cast<float>(row_sum);
813 }
814}
void dequantize_q6_k(const block_q6_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
6-bit K-quantized block structure

References dequantize_q6_k(), and GGML_QK_K.

Referenced by TinyLlamaModel::forward(), and matmul_q6k_f32_batch_cpu().

◆ matvec_q8_0_f32_vector_cpu()

void matvec_q8_0_f32_vector_cpu ( const std::vector< block_q8_0 > &  mat_q8_0,
const std::vector< float > &  vec_f32,
std::vector< float > &  out_f32,
int  rows,
int  cols,
bool  log_first_block = false 
)

Definition at line 293 of file utils.cpp.

296 {
297 if (cols % GGML_QK8_0 != 0) {
298 throw std::runtime_error(
299 "matvec_q8_0_f32_vector_cpu: cols (" + std::to_string(cols) +
300 ") must be divisible by GGML_QK8_0 (" + std::to_string(GGML_QK8_0) + ")");
301 }
302 if (vec_f32.size() != static_cast<size_t>(cols)) {
303 throw std::runtime_error(
304 "matvec_q8_0_f32_vector_cpu: vec_f32 size mismatch. Expected " +
305 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
306 }
307 size_t num_blocks_per_row = cols / GGML_QK8_0;
308 size_t total_blocks_expected = static_cast<size_t>(rows) * num_blocks_per_row;
309 if (mat_q8_0.size() != total_blocks_expected) {
310 throw std::runtime_error(
311 "matvec_q8_0_f32_vector_cpu: mat_q8_0 size mismatch. Expected " +
312 std::to_string(total_blocks_expected) + " blocks, got " +
313 std::to_string(mat_q8_0.size()));
314 }
315
316 out_f32.resize(rows);
317 float dequantized_block[GGML_QK8_0];
318
319
320#pragma omp parallel for private(dequantized_block)
321 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
322 double row_sum = 0.0;
323 double kahan_c = 0.0;
324
325 size_t block_row_offset = static_cast<size_t>(r) * num_blocks_per_row;
326
327 for (size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
328 const block_q8_0* qblock = &mat_q8_0[block_row_offset + block_col_idx];
329 dequantize_q8_0_block(qblock, dequantized_block);
330
331 size_t vec_offset = block_col_idx * GGML_QK8_0;
332
333
334 for (int i = 0; i < GGML_QK8_0; ++i) {
335 double term = static_cast<double>(dequantized_block[i]) *
336 static_cast<double>(vec_f32[vec_offset + i]);
337
338 double y = term - kahan_c;
339 double t = row_sum + y;
340 kahan_c = (t - row_sum) - y;
341 row_sum = t;
342 }
343 }
344 out_f32[r] = static_cast<float>(row_sum);
345
346 }
347}
constexpr size_t GGML_QK8_0
Definition gguf_parser.h:43
void dequantize_q8_0_block(const block_q8_0 *qblock, float *output)
Dequantizes a Q8_0 block to float32.
Simple 8-bit quantized block structure.

References dequantize_q8_0_block(), and GGML_QK8_0.

Referenced by TinyLlamaModel::forward(), and matmul_q8_0_f32_batch_cpu().

◆ matvec_q8k_f32_vector_cpu()

void matvec_q8k_f32_vector_cpu ( const std::vector< block_q8_K > &  mat_q8k,
const std::vector< float > &  vec_f32,
std::vector< float > &  out_f32,
int  rows,
int  cols,
bool  log_first_block = false 
)

Definition at line 399 of file utils.cpp.

402 {
403 if (cols % GGML_QK_K != 0) {
404 throw std::runtime_error("matvec_q8k_f32_vector_cpu: cols must be divisible by GGML_QK_K");
405 }
406
407 size_t num_blocks_per_row = cols / GGML_QK_K;
408 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
409 if (mat_q8k.size() != total_blocks_expected) {
410 throw std::runtime_error("matvec_q8k_f32_vector_cpu: mat_q8k size mismatch");
411 }
412 if (vec_f32.size() != (size_t)cols) {
413 throw std::runtime_error("matvec_q8k_f32_vector_cpu: vec_f32 size mismatch");
414 }
415
416 out_f32.resize(rows);
417
418 std::vector<float> mat_f32;
419 dequantize_q8_k(mat_q8k, mat_f32, rows * cols, log_first_block);
420
421 matvec_f32_f32_vector_cpu(mat_f32, vec_f32, out_f32, rows, cols);
422
423 if (log_first_block && rows > 0) {
424 Logger::info("[Q8K_MATVEC_DEBUG] First output: " + std::to_string(out_f32[0]));
425 }
426}
void dequantize_q8_k(const std::vector< block_q8_K > &q_data, std::vector< float > &x, int n, bool log_this_block)
void matvec_f32_f32_vector_cpu(const std::vector< float > &mat_f32, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
Definition utils.cpp:349

References dequantize_q8_k(), GGML_QK_K, Logger::info(), and matvec_f32_f32_vector_cpu().

Referenced by TinyLlamaModel::forward(), and matmul_q8k_f32_batch_cpu().

◆ rmsnorm_batch_cpu()

void rmsnorm_batch_cpu ( const std::vector< float > &  x_batch,
const std::vector< float > &  weight,
std::vector< float > &  out_batch,
int  num_tokens,
int  hidden_size,
float  eps = numeric::DEFAULT_EPS 
)

Definition at line 613 of file utils.cpp.

618 {
619 if (x_batch.empty() || x_batch.size() != (size_t)num_tokens * hidden_size || weight.size() != (size_t)hidden_size) {
620 Logger::error("[RMSNORM_BATCH_CPU] RMSNorm batch size mismatch or empty input. x_batch.size(): " + std::to_string(x_batch.size()) +
621 ", expected x_batch: " + std::to_string((size_t)num_tokens * hidden_size) +
622 ", weight.size(): " + std::to_string(weight.size()) +
623 ", expected weight: " + std::to_string((size_t)hidden_size));
624 out_batch.assign((size_t)num_tokens * hidden_size, 0.0f);
625 return;
626 }
627 out_batch.resize((size_t)num_tokens * hidden_size);
628
629#pragma omp parallel for
630 for (int t = 0; t < num_tokens; ++t) {
631 double ssq = 0.0;
632 size_t token_offset = (size_t)t * hidden_size;
633
634 for (int i = 0; i < hidden_size; ++i) {
635 ssq += static_cast<double>(x_batch[token_offset + i]) * static_cast<double>(x_batch[token_offset + i]);
636 }
637
638 double ssq_mean = ssq / hidden_size;
639 float norm_factor_input_sqrt = static_cast<float>(ssq_mean);
640 float norm_factor = 1.0f / SAFE_SQRT(norm_factor_input_sqrt + eps);
641
642 for (int i = 0; i < hidden_size; ++i) {
643 out_batch[token_offset + i] = x_batch[token_offset + i] * norm_factor * weight[i];
644 }
645 }
646}
#define SAFE_SQRT(x)

References Logger::error(), and SAFE_SQRT.

Referenced by CPUBatchProcessor::forward_cpu_batch(), TinyLlamaModel::forward_cpu_batch_generation(), and TinyLlamaModel::forward_cpu_logits_batch().

◆ rmsnorm_vector_cpu()

void rmsnorm_vector_cpu ( const std::vector< float > &  x,
const std::vector< float > &  weight,
std::vector< float > &  out,
float  eps = numeric::DEFAULT_EPS 
)

Definition at line 648 of file utils.cpp.

650 {
651 if (x.empty() || x.size() != weight.size()) {
652 Logger::error("RMSNorm vector size mismatch or empty input.");
653 out.assign(x.size(), 0.0f);
654 return;
655 }
656 out.resize(x.size());
657 size_t n = x.size();
658
659 double ssq = 0.0;
660#pragma omp parallel for reduction(+ : ssq)
661 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
662 ssq += static_cast<double>(x[i]) * static_cast<double>(x[i]);
663 }
664 ssq /= n;
665
 666 float norm_factor = 1.0f / SAFE_SQRT(static_cast<float>(ssq) +
 667 SAFE_MAX(eps, numeric::MIN_NORM_EPS));
 668
669#pragma omp parallel for
670 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
671 out[i] = x[i] * norm_factor * weight[i];
672 }
673}
#define SAFE_MAX(a, b)
constexpr float MIN_NORM_EPS

References Logger::error(), numeric::MIN_NORM_EPS, SAFE_MAX, and SAFE_SQRT.

Referenced by TinyLlamaModel::forward().
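
A minimal usage sketch (assumes utils.h is on the include path). The function computes out[i] = x[i] * weight[i] / sqrt(mean(x^2) + eps), with eps defaulting to numeric::DEFAULT_EPS:

#include <vector>
#include "utils.h"

int main() {
  std::vector<float> x = {1.0f, -2.0f, 3.0f, -4.0f};
  std::vector<float> gain(x.size(), 1.0f);  // learned RMSNorm weight; all ones here
  std::vector<float> out;

  rmsnorm_vector_cpu(x, gain, out);  // uses the default eps
  return 0;
}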

◆ silu_cpu()

void silu_cpu ( const std::vector< float > &  x,
std::vector< float > &  out 
)

Definition at line 700 of file utils.cpp.

700 {
701 if (x.size() != out.size()) out.resize(x.size());
702#pragma omp parallel for
703 for (int64_t i = 0; i < static_cast<int64_t>(x.size()); ++i) {
704 float sigmoid_x = 1.0f / (1.0f + std::exp(-x[i]));
705 out[i] = x[i] * sigmoid_x;
706 }
707}

Referenced by TinyLlamaModel::forward().
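
A minimal usage sketch (assumes utils.h is on the include path). SiLU is x * sigmoid(x), so the output is about -0.238 at -2, exactly 0 at 0, and about 1.762 at 2:

#include <vector>
#include "utils.h"

int main() {
  std::vector<float> x = {-2.0f, 0.0f, 2.0f};
  std::vector<float> y;  // resized by the call
  silu_cpu(x, y);        // y[i] = x[i] / (1 + exp(-x[i]))
  return 0;
}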

◆ simd_dot_product()

float simd_dot_product ( const float *  a,
const float *  b,
int  n 
)

Definition at line 35 of file utils.cpp.

35 {
36#if defined(__AVX2__)
37 __m256 sum = _mm256_setzero_ps();
38 int i = 0;
39 for (; i <= n - 8; i += 8) {
40 __m256 va = _mm256_loadu_ps(&a[i]);
41 __m256 vb = _mm256_loadu_ps(&b[i]);
42 sum = _mm256_fmadd_ps(va, vb, sum);
43 }
44 float result[8];
45 _mm256_storeu_ps(result, sum);
46 float final_sum = result[0] + result[1] + result[2] + result[3] +
47 result[4] + result[5] + result[6] + result[7];
48 for (; i < n; ++i) {
49 final_sum += a[i] * b[i];
50 }
51 return final_sum;
52#elif defined(__SSE2__)
53 __m128 sum = _mm_setzero_ps();
54 int i = 0;
55 for (; i <= n - 4; i += 4) {
56 __m128 va = _mm_loadu_ps(&a[i]);
57 __m128 vb = _mm_loadu_ps(&b[i]);
58 sum = _mm_add_ps(sum, _mm_mul_ps(va, vb));
59 }
60 float result[4];
61 _mm_storeu_ps(result, sum);
62 float final_sum = result[0] + result[1] + result[2] + result[3];
63 for (; i < n; ++i) {
64 final_sum += a[i] * b[i];
65 }
66 return final_sum;
67#elif defined(__ARM_NEON)
68 float32x4_t sum = vdupq_n_f32(0.0f);
69 int i = 0;
70 for (; i <= n - 4; i += 4) {
71 float32x4_t va = vld1q_f32(&a[i]);
72 float32x4_t vb = vld1q_f32(&b[i]);
73 sum = vmlaq_f32(sum, va, vb);
74 }
75 float result[4];
76 vst1q_f32(result, sum);
77 float final_sum = result[0] + result[1] + result[2] + result[3];
78 for (; i < n; ++i) {
79 final_sum += a[i] * b[i];
80 }
81 return final_sum;
82#else
83 float sum = 0.0f;
84 for (int i = 0; i < n; ++i) {
85 sum += a[i] * b[i];
86 }
87 return sum;
88#endif
89}

Referenced by TinyLlamaModel::forward_cpu_batch_generation().
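
A minimal usage sketch (assumes utils.h is on the include path). The vectorized path consumes full 8-wide (AVX2) or 4-wide (SSE2/NEON) chunks and the scalar tail loop picks up the remainder, so any length is accepted:

#include <cstdio>
#include <vector>
#include "utils.h"

int main() {
  std::vector<float> a = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  std::vector<float> b = {9, 8, 7, 6, 5, 4, 3, 2, 1};

  float d = simd_dot_product(a.data(), b.data(), (int)a.size());
  std::printf("dot = %f\n", d);  // 165
  return 0;
}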

◆ simd_scaled_add()

void simd_scaled_add ( float *  dst,
const float *  src,
float  scale,
int  n 
)

Definition at line 92 of file utils.cpp.

92 {
93#if defined(__AVX2__)
94 __m256 vscale = _mm256_set1_ps(scale);
95 int i = 0;
96 for (; i <= n - 8; i += 8) {
97 __m256 vdst = _mm256_loadu_ps(&dst[i]);
98 __m256 vsrc = _mm256_loadu_ps(&src[i]);
99 __m256 result = _mm256_fmadd_ps(vsrc, vscale, vdst);
100 _mm256_storeu_ps(&dst[i], result);
101 }
102 for (; i < n; ++i) {
103 dst[i] += src[i] * scale;
104 }
105#elif defined(__SSE2__)
106 __m128 vscale = _mm_set1_ps(scale);
107 int i = 0;
108 for (; i <= n - 4; i += 4) {
109 __m128 vdst = _mm_loadu_ps(&dst[i]);
110 __m128 vsrc = _mm_loadu_ps(&src[i]);
111 __m128 result = _mm_add_ps(vdst, _mm_mul_ps(vsrc, vscale));
112 _mm_storeu_ps(&dst[i], result);
113 }
114 for (; i < n; ++i) {
115 dst[i] += src[i] * scale;
116 }
117#elif defined(__ARM_NEON)
118 float32x4_t vscale = vdupq_n_f32(scale);
119 int i = 0;
120 for (; i <= n - 4; i += 4) {
121 float32x4_t vdst = vld1q_f32(&dst[i]);
122 float32x4_t vsrc = vld1q_f32(&src[i]);
123 float32x4_t result = vmlaq_f32(vdst, vsrc, vscale);
124 vst1q_f32(&dst[i], result);
125 }
126 for (; i < n; ++i) {
127 dst[i] += src[i] * scale;
128 }
129#else
130 for (int i = 0; i < n; ++i) {
131 dst[i] += src[i] * scale;
132 }
133#endif
134}

Referenced by TinyLlamaModel::forward_cpu_batch_generation().
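
A minimal usage sketch (assumes utils.h is on the include path). This is an in-place axpy: dst[i] += scale * src[i]:

#include <vector>
#include "utils.h"

int main() {
  std::vector<float> dst = {1, 1, 1, 1, 1};
  std::vector<float> src = {1, 2, 3, 4, 5};

  simd_scaled_add(dst.data(), src.data(), 0.5f, (int)dst.size());
  // dst == {1.5, 2.0, 2.5, 3.0, 3.5}
  return 0;
}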

◆ softmax_vector_cpu()

void softmax_vector_cpu ( const std::vector< float > &  x,
std::vector< float > &  out 
)

Definition at line 675 of file utils.cpp.

676 {
677 if (x.empty()) return;
678 out.resize(x.size());
679 size_t n = x.size();
680
681 float max_val = x[0];
682 for (size_t i = 1; i < n; ++i) {
683 if (x[i] > max_val) max_val = x[i];
684 }
685
686 float exp_sum = 0.0f;
687 for (size_t i = 0; i < n; ++i) {
688 out[i] = std::exp(x[i] - max_val);
689 exp_sum += out[i];
690 }
691
692 float inv_sum = 1.0f / (exp_sum + 1e-9f);
693
694#pragma omp parallel for
695 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
696 out[i] *= inv_sum;
697 }
698}

Referenced by attention_batch_cpu(), attention_batch_cpu_sequence_aware(), TinyLlamaModel::forward(), and TinyLlamaModel::forward_cpu_batch_generation().
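
A minimal usage sketch (assumes utils.h is on the include path). The maximum is subtracted before exponentiation for numerical stability, and the outputs sum to approximately one:

#include <vector>
#include "utils.h"

int main() {
  std::vector<float> logits = {1.0f, 2.0f, 3.0f};
  std::vector<float> probs;  // resized by the call

  softmax_vector_cpu(logits, probs);
  // probs ~= {0.090, 0.245, 0.665}
  return 0;
}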

◆ uint8_vector_to_uint16_vector()

std::vector< uint16_t > uint8_vector_to_uint16_vector ( const std::vector< uint8_t > &  bytes,
size_t  numel 
)

Definition at line 176 of file utils.cpp.

176 {
177 if (bytes.size() != numel * 2) {
178 throw std::runtime_error("Byte vector size mismatch for uint16_t conversion");
179 }
180 std::vector<uint16_t> out(numel);
181 std::memcpy(out.data(), bytes.data(), bytes.size());
182 return out;
183}

◆ weighted_sum_probs_v()

void weighted_sum_probs_v ( const std::vector< float > &  probs,
const std::vector< float > &  V,
std::vector< float > &  out,
int  seq_len,
int  head_dim 
)

Definition at line 1060 of file utils.cpp.

1062 {
1063 if (probs.size() != seq_len || V.size() != (size_t)seq_len * head_dim) {
1064 Logger::error("weighted_sum_probs_v: Size mismatch. Probs: " +
1065 std::to_string(probs.size()) + " (Expected " +
1066 std::to_string(seq_len) +
1067 "), V: " + std::to_string(V.size()) + " (Expected " +
1068 std::to_string(seq_len * head_dim) + ")");
1069 out.assign(head_dim, 0.0f);
1070 return;
1071 }
1072 out.resize(head_dim);
1073
1074#pragma omp parallel for
1075 for (int64_t j = 0; j < static_cast<int64_t>(head_dim); ++j) {
1076 double sum = 0.0;
1077 double c_kahan = 0.0;
1078 for (int i = 0; i < seq_len; ++i) {
1079 double term = static_cast<double>(probs[i]) *
1080 static_cast<double>(V[i * head_dim + j]);
1081
1082 double y = term - c_kahan;
1083 double t = sum + y;
1084 c_kahan = (t - sum) - y;
1085 sum = t;
1086 }
1087 out[j] = static_cast<float>(sum);
1088 }
1089}

References Logger::error().
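
A minimal usage sketch (assumes utils.h is on the include path). V is row-major with one head_dim row per sequence position, and the output is the probability-weighted sum of those rows (the attention context vector):

#include <vector>
#include "utils.h"

int main() {
  const int seq_len = 2, head_dim = 3;
  std::vector<float> probs = {0.25f, 0.75f};  // attention weights over the sequence
  std::vector<float> V = {1, 1, 1,            // value row for position 0
                          3, 3, 3};           // value row for position 1
  std::vector<float> ctx;                     // resized to head_dim by the call

  weighted_sum_probs_v(probs, V, ctx, seq_len, head_dim);
  // ctx == {2.5, 2.5, 2.5}
  return 0;
}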

◆ write_vector_to_file()

bool write_vector_to_file ( const std::string &  filename,
const std::vector< float > &  vec 
)

Definition at line 1134 of file utils.cpp.

1134 {
1135 std::string vec_writer_vals;
1136 int N_log_writer = (std::min)(10, (int)vec.size());
1137 for (int i = 0; i < N_log_writer; ++i)
1138 vec_writer_vals += (i ? " " : "") + std::to_string(vec[i]);
1139 Logger::info("write_vector_to_file Enter: Address of vec.data() on entry: " +
1140 std::to_string(reinterpret_cast<uintptr_t>(vec.data())));
1141
1142 std::ofstream outfile(filename, std::ios::binary);
1143 if (!outfile) {
1144 Logger::error("Failed to open file for writing: " + filename);
1145 return false;
1146 }
1147 outfile.write(reinterpret_cast<const char*>(vec.data()),
1148 vec.size() * sizeof(float));
1149 if (!outfile) {
1150 Logger::error("Failed to write data to file: " + filename);
1151 return false;
1152 }
1153 Logger::info("Successfully wrote vector to " + filename);
1154 return true;
1155}

References Logger::error(), and Logger::info().
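
A minimal round-trip sketch pairing write_vector_to_file() with load_rmsnorm_bin() (assumes utils.h is on the include path; the file name is illustrative). The writer stores the raw float32 bytes, and the loader reads them back as a [num_tokens][hidden_size] matrix, throwing std::runtime_error if the file is missing or too short:

#include <vector>
#include "utils.h"

int main() {
  const int num_tokens = 2, hidden_size = 4;
  std::vector<float> flat(num_tokens * hidden_size);
  for (size_t i = 0; i < flat.size(); ++i) flat[i] = (float)i;

  if (!write_vector_to_file("activations.bin", flat)) return 1;

  std::vector<std::vector<float>> rows =
      load_rmsnorm_bin("activations.bin", num_tokens, hidden_size);
  // rows[1][0] == 4.0f
  return 0;
}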