TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
utils.cpp
1#include "utils.h"
2
3#include "logger.h"
4#include "quantization.h"
5#include "model_constants.h"
6#include "model_macros.h"
7
8#include <algorithm>
9#include <cmath>
10#include <cstring>
11#include <fstream>
12#include <iomanip>
13#include <limits>
14#include <memory>
15#include <sstream>
16#include <stdexcept>
17#include <cassert>
18#include <cstdint>
19#include <iostream>
20#include <numeric>
21
22// SIMD intrinsics headers for optimized computation
23#if defined(__AVX2__)
24#include <immintrin.h>
25#define SIMD_WIDTH 8
26#elif defined(__SSE2__)
27#include <emmintrin.h>
28#define SIMD_WIDTH 4
29#elif defined(__ARM_NEON)
30#include <arm_neon.h>
31#define SIMD_WIDTH 4
32#endif
33
34// SIMD optimized dot product functions
35float simd_dot_product(const float* a, const float* b, int n) {
36#if defined(__AVX2__)
37 __m256 sum = _mm256_setzero_ps();
38 int i = 0;
39 for (; i <= n - 8; i += 8) {
40 __m256 va = _mm256_loadu_ps(&a[i]);
41 __m256 vb = _mm256_loadu_ps(&b[i]);
42 sum = _mm256_fmadd_ps(va, vb, sum);
43 }
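  // Reduce the eight partial lane sums to a scalar, then finish any remaining elements in a scalar tail loop.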
44 float result[8];
45 _mm256_storeu_ps(result, sum);
46 float final_sum = result[0] + result[1] + result[2] + result[3] +
47 result[4] + result[5] + result[6] + result[7];
48 for (; i < n; ++i) {
49 final_sum += a[i] * b[i];
50 }
51 return final_sum;
52#elif defined(__SSE2__)
53 __m128 sum = _mm_setzero_ps();
54 int i = 0;
55 for (; i <= n - 4; i += 4) {
56 __m128 va = _mm_loadu_ps(&a[i]);
57 __m128 vb = _mm_loadu_ps(&b[i]);
58 sum = _mm_add_ps(sum, _mm_mul_ps(va, vb));
59 }
60 float result[4];
61 _mm_storeu_ps(result, sum);
62 float final_sum = result[0] + result[1] + result[2] + result[3];
63 for (; i < n; ++i) {
64 final_sum += a[i] * b[i];
65 }
66 return final_sum;
67#elif defined(__ARM_NEON)
68 float32x4_t sum = vdupq_n_f32(0.0f);
69 int i = 0;
70 for (; i <= n - 4; i += 4) {
71 float32x4_t va = vld1q_f32(&a[i]);
72 float32x4_t vb = vld1q_f32(&b[i]);
73 sum = vmlaq_f32(sum, va, vb);
74 }
75 float result[4];
76 vst1q_f32(result, sum);
77 float final_sum = result[0] + result[1] + result[2] + result[3];
78 for (; i < n; ++i) {
79 final_sum += a[i] * b[i];
80 }
81 return final_sum;
82#else
83 float sum = 0.0f;
84 for (int i = 0; i < n; ++i) {
85 sum += a[i] * b[i];
86 }
87 return sum;
88#endif
89}
90
91// SIMD optimized scaled vector addition
92void simd_scaled_add(float* dst, const float* src, float scale, int n) {
93#if defined(__AVX2__)
94 __m256 vscale = _mm256_set1_ps(scale);
95 int i = 0;
96 for (; i <= n - 8; i += 8) {
97 __m256 vdst = _mm256_loadu_ps(&dst[i]);
98 __m256 vsrc = _mm256_loadu_ps(&src[i]);
99 __m256 result = _mm256_fmadd_ps(vsrc, vscale, vdst);
100 _mm256_storeu_ps(&dst[i], result);
101 }
102 for (; i < n; ++i) {
103 dst[i] += src[i] * scale;
104 }
105#elif defined(__SSE2__)
106 __m128 vscale = _mm_set1_ps(scale);
107 int i = 0;
108 for (; i <= n - 4; i += 4) {
109 __m128 vdst = _mm_loadu_ps(&dst[i]);
110 __m128 vsrc = _mm_loadu_ps(&src[i]);
111 __m128 result = _mm_add_ps(vdst, _mm_mul_ps(vsrc, vscale));
112 _mm_storeu_ps(&dst[i], result);
113 }
114 for (; i < n; ++i) {
115 dst[i] += src[i] * scale;
116 }
117#elif defined(__ARM_NEON)
118 float32x4_t vscale = vdupq_n_f32(scale);
119 int i = 0;
120 for (; i <= n - 4; i += 4) {
121 float32x4_t vdst = vld1q_f32(&dst[i]);
122 float32x4_t vsrc = vld1q_f32(&src[i]);
123 float32x4_t result = vmlaq_f32(vdst, vsrc, vscale);
124 vst1q_f32(&dst[i], result);
125 }
126 for (; i < n; ++i) {
127 dst[i] += src[i] * scale;
128 }
129#else
130 for (int i = 0; i < n; ++i) {
131 dst[i] += src[i] * scale;
132 }
133#endif
134}
135
136uint16_t float32_to_bfloat16(float val) {
137 uint32_t bits;
138 std::memcpy(&bits, &val, sizeof(float));
139
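  // Round to nearest-even: add 0x7FFF plus the low bit of the kept half, then truncate to the upper 16 bits.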
140 bits += 0x7FFF + ((bits >> 16) & 1);
141 return static_cast<uint16_t>(bits >> 16);
142}
143
144float bfloat16_to_float32(uint16_t bf16) {
145 if (bf16 == bfloat16::ZERO) return 0.0f;
146 if (bf16 == bfloat16::NEG_ZERO) return -0.0f;
147
148 bool is_nan = ((bf16 & bfloat16::EXPONENT_MASK) == bfloat16::EXPONENT_MASK) &&
149 ((bf16 & bfloat16::MANTISSA_MASK) != 0);
150 if (is_nan) return std::numeric_limits<float>::quiet_NaN();
151
152 if ((bf16 & bfloat16::EXPONENT_MASK) == bfloat16::EXPONENT_MASK &&
153     (bf16 & bfloat16::MANTISSA_MASK) == 0) {
154 return (bf16 & bfloat16::SIGN_BIT) ? -std::numeric_limits<float>::infinity()
155 : std::numeric_limits<float>::infinity();
156 }
157
158 uint32_t bits = static_cast<uint32_t>(bf16) << bfloat16::SHIFT_BITS;
159 float result;
160 std::memcpy(&result, &bits, sizeof(float));
161
162 return result;
163}
164
165std::vector<float> bfloat16_vector_to_float32(const std::vector<uint16_t>& bf16_vec) {
166 std::vector<float> f32_vec(bf16_vec.size());
167
168#pragma omp parallel for
169 for (int64_t i = 0; i < static_cast<int64_t>(bf16_vec.size()); ++i) {
170 f32_vec[i] = bfloat16_to_float32(bf16_vec[i]);
171 }
172
173 return f32_vec;
174}
175
176std::vector<uint16_t> uint8_vector_to_uint16_vector(const std::vector<uint8_t>& bytes, size_t numel) {
177 if (bytes.size() != numel * 2) {
178 throw std::runtime_error("Byte vector size mismatch for uint16_t conversion");
179 }
180 std::vector<uint16_t> out(numel);
181 std::memcpy(out.data(), bytes.data(), bytes.size());
182 return out;
183}
184
185int argmax(const std::vector<float>& v) {
186 if (v.empty()) {
187 Logger::error("Cannot perform argmax on empty vector");
188 return -1;
189 }
190 auto max_it = std::max_element(v.begin(), v.end());
191 float max_val = *max_it;
192 int max_idx = std::distance(v.begin(), max_it);
193 Logger::debug("[ARGMAX HELPER] Max value found: " + std::to_string(max_val) +
194 " at index: " + std::to_string(max_idx));
195 return max_idx;
196}
197
198std::vector<float> bf16vec_to_float_vec(const std::vector<uint16_t>& v_bf16) {
199 std::vector<float> v_f32(v_bf16.size());
200#pragma omp parallel for
201 for (int64_t i = 0; i < static_cast<int64_t>(v_bf16.size()); ++i) {
202 v_f32[i] = bfloat16_to_float32(v_bf16[i]);
203 }
204 return v_f32;
205}
206
207void log_vector_summary(const std::string& name, const std::vector<float>& v, int head_count) {
208 if (v.empty()) {
209 Logger::info(name + ": EMPTY");
210 return;
211 }
212 std::stringstream ss;
213 size_t actual_head_count = SAFE_MIN(static_cast<size_t>(head_count), v.size());
214
215 ss << name << ": size=" << v.size();
216
217 if (actual_head_count > 0) {
218 ss << ", first " << actual_head_count << ": [";
219 for (size_t i = 0; i < actual_head_count; ++i) {
220 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(4) << v[i];
221 }
222 ss << "]";
223 }
224 float minv = *std::min_element(v.begin(), v.end());
225 float maxv = *std::max_element(v.begin(), v.end());
226 double sum = std::accumulate(v.begin(), v.end(), 0.0);
227 float mean = sum / v.size();
228 bool all_finite = std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
229 ss << ", min=" << minv << ", max=" << maxv << ", mean=" << mean
230 << ", finite=" << (all_finite ? "yes" : "NO");
231 Logger::info(ss.str());
232}
233
234void log_vector_summary_with_tail(const std::string& name, const std::vector<float>& v,
235 int head_count, int tail_count) {
236 if (v.empty()) {
237 Logger::info(name + ": EMPTY");
238 return;
239 }
240 std::stringstream ss;
241
242 size_t actual_head_count = SAFE_MIN(static_cast<size_t>(head_count), v.size());
243 size_t actual_tail_count = SAFE_MIN(static_cast<size_t>(tail_count), v.size());
244 size_t total_shown = actual_head_count + actual_tail_count;
245 bool overlap = total_shown > v.size();
246 if (overlap) {
247 actual_tail_count = v.size() - actual_head_count;
248 if (actual_tail_count > SAFE_MIN(static_cast<size_t>(tail_count), v.size())) {
249 actual_tail_count = SAFE_MIN(static_cast<size_t>(tail_count), v.size());
250 }
251 if (tail_count > 0 && actual_head_count == v.size()) {
252 actual_tail_count = 0;
253 }
254 }
255 size_t tail_start_index = v.size() - actual_tail_count;
256
257 ss << name << ": size=" << v.size();
258
259 if (actual_head_count > 0) {
260 ss << ", first " << actual_head_count << ": [";
261 for (size_t i = 0; i < actual_head_count; ++i) {
262 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(4) << v[i];
263 }
264 ss << "]";
265 }
266
267 if (actual_tail_count > 0 && tail_start_index >= actual_head_count) {
268 ss << ", last " << actual_tail_count << ": [";
269 for (size_t i = 0; i < actual_tail_count; ++i) {
270 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(4)
271 << v[tail_start_index + i];
272 }
273 ss << "]";
274 } else if (overlap && tail_count > 0 && actual_head_count < v.size()) {
275 ss << " (... tail overlaps head ...)";
276 }
277
278 float minv = *std::min_element(v.begin(), v.end());
279 float maxv = *std::max_element(v.begin(), v.end());
280 double sum = std::accumulate(v.begin(), v.end(), 0.0);
281 float mean = sum / v.size();
282 bool all_finite = std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
283 ss << ", min=" << minv << ", max=" << maxv << ", mean=" << mean
284 << ", finite=" << (all_finite ? "yes" : "NO");
285 Logger::info(ss.str());
286}
287
288
289
290
291
292// Matrix-vector multiplication functions
293void matvec_q8_0_f32_vector_cpu(const std::vector<block_q8_0>& mat_q8_0,
294 const std::vector<float>& vec_f32,
295 std::vector<float>& out_f32, int rows,
296 int cols, bool log_first_block) {
297 if (cols % GGML_QK8_0 != 0) {
298 throw std::runtime_error(
299 "matvec_q8_0_f32_vector_cpu: cols (" + std::to_string(cols) +
300 ") must be divisible by GGML_QK8_0 (" + std::to_string(GGML_QK8_0) + ")");
301 }
302 if (vec_f32.size() != static_cast<size_t>(cols)) {
303 throw std::runtime_error(
304 "matvec_q8_0_f32_vector_cpu: vec_f32 size mismatch. Expected " +
305 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
306 }
307 size_t num_blocks_per_row = cols / GGML_QK8_0;
308 size_t total_blocks_expected = static_cast<size_t>(rows) * num_blocks_per_row;
309 if (mat_q8_0.size() != total_blocks_expected) {
310 throw std::runtime_error(
311 "matvec_q8_0_f32_vector_cpu: mat_q8_0 size mismatch. Expected " +
312 std::to_string(total_blocks_expected) + " blocks, got " +
313 std::to_string(mat_q8_0.size()));
314 }
315
316 out_f32.resize(rows);
317 float dequantized_block[GGML_QK8_0];
318
319
320#pragma omp parallel for private(dequantized_block)
321 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
322 double row_sum = 0.0;
323 double kahan_c = 0.0;
324
325 size_t block_row_offset = static_cast<size_t>(r) * num_blocks_per_row;
326
327 for (size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
328 const block_q8_0* qblock = &mat_q8_0[block_row_offset + block_col_idx];
329 dequantize_q8_0_block(qblock, dequantized_block);
330
331 size_t vec_offset = block_col_idx * GGML_QK8_0;
332
333
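      // Kahan-compensated accumulation in double keeps the long dequantized dot product numerically stable.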
334 for (int i = 0; i < GGML_QK8_0; ++i) {
335 double term = static_cast<double>(dequantized_block[i]) *
336 static_cast<double>(vec_f32[vec_offset + i]);
337
338 double y = term - kahan_c;
339 double t = row_sum + y;
340 kahan_c = (t - row_sum) - y;
341 row_sum = t;
342 }
343 }
344 out_f32[r] = static_cast<float>(row_sum);
345
346 }
347}
348
349void matvec_f32_f32_vector_cpu(const std::vector<float>& mat_f32,
350 const std::vector<float>& vec_f32,
351 std::vector<float>& out_f32, int rows,
352 int cols) {
353 if (mat_f32.empty() || vec_f32.empty()) {
354    Logger::error(
355        "matvec_f32_f32_vector_cpu: Input matrix or vector is empty.");
356 out_f32.assign(rows, 0.0f);
357 return;
358 }
359 if (mat_f32.size() != (size_t)rows * cols) {
360    Logger::error(
361        "matvec_f32_f32_vector_cpu: Matrix dimensions mismatch. Expected " +
362 std::to_string((size_t)rows * cols) + ", got " +
363 std::to_string(mat_f32.size()));
364 out_f32.assign(rows, 0.0f);
365 return;
366 }
367 if (vec_f32.size() != (size_t)cols) {
368    Logger::error(
369        "matvec_f32_f32_vector_cpu: Vector dimension mismatch. Expected " +
370 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
371 out_f32.assign(rows, 0.0f);
372 return;
373 }
374
375 out_f32.resize(rows);
376
377#pragma omp parallel for schedule(static)
378 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
379 float sum = 0.0f;
380 size_t row_offset = static_cast<size_t>(r) * cols;
381
382 const float* mat_row_ptr = mat_f32.data() + row_offset;
383 const float* vec_ptr = vec_f32.data();
384
385 double k_sum = 0.0;
386 double k_c = 0.0;
387
388 for (int c = 0; c < cols; ++c) {
389 double term = static_cast<double>(mat_row_ptr[c]) * static_cast<double>(vec_ptr[c]);
390 double y = term - k_c;
391 double t_sum = k_sum + y;
392 k_c = (t_sum - k_sum) - y;
393 k_sum = t_sum;
394 }
395 out_f32[r] = static_cast<float>(k_sum);
396 }
397}
398
399void matvec_q8k_f32_vector_cpu(const std::vector<block_q8_K>& mat_q8k,
400 const std::vector<float>& vec_f32,
401 std::vector<float>& out_f32, int rows,
402 int cols, bool log_first_block) {
403 if (cols % GGML_QK_K != 0) {
404 throw std::runtime_error("matvec_q8k_f32_vector_cpu: cols must be divisible by GGML_QK_K");
405 }
406
407 size_t num_blocks_per_row = cols / GGML_QK_K;
408 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
409 if (mat_q8k.size() != total_blocks_expected) {
410 throw std::runtime_error("matvec_q8k_f32_vector_cpu: mat_q8k size mismatch");
411 }
412 if (vec_f32.size() != (size_t)cols) {
413 throw std::runtime_error("matvec_q8k_f32_vector_cpu: vec_f32 size mismatch");
414 }
415
416 out_f32.resize(rows);
417
418 std::vector<float> mat_f32;
419 dequantize_q8_k(mat_q8k, mat_f32, rows * cols, log_first_block);
420
421 matvec_f32_f32_vector_cpu(mat_f32, vec_f32, out_f32, rows, cols);
422
423 if (log_first_block && rows > 0) {
424 Logger::info("[Q8K_MATVEC_DEBUG] First output: " + std::to_string(out_f32[0]));
425 }
426}
427
428void apply_rope_vector(
429    std::vector<float>& x,
430 int num_heads,
431 int head_dim,
432 int current_token_pos,
433 const std::vector<std::pair<float, float>>& all_freqs_cis,
434 int max_pos_embeddings,
435 bool use_adjacent_pairing
436) {
437 if (current_token_pos < 0 || current_token_pos >= max_pos_embeddings) {
438 return;
439 }
440 if (head_dim % 2 != 0) {
441 Logger::error("RoPE apply_rope_vector: head_dim must be even. head_dim: " + std::to_string(head_dim));
442 return;
443 }
444
445 const int dim_half = head_dim / 2;
446 size_t pos_offset = static_cast<size_t>(current_token_pos) * static_cast<size_t>(dim_half);
447
448 for (int h = 0; h < num_heads; ++h) {
449 size_t head_offset = static_cast<size_t>(h) * head_dim;
450
451 for (int i = 0; i < dim_half; ++i) {
452 size_t freq_idx = pos_offset + static_cast<size_t>(i);
453
454 if (freq_idx >= all_freqs_cis.size()) {
455 Logger::warning("RoPE apply_rope_vector: freq_idx out of bounds. pos: " +
456 std::to_string(current_token_pos) + ", head_dim/2: " + std::to_string(dim_half) +
457 ", i: " + std::to_string(i) + ", calculated freq_idx: " + std::to_string(freq_idx) +
458 ", all_freqs_cis.size(): " + std::to_string(all_freqs_cis.size()));
459 continue;
460 }
461
462 float cos_theta = all_freqs_cis[freq_idx].first;
463 float sin_theta = all_freqs_cis[freq_idx].second;
464
465 float x0_val, x1_val;
466 size_t x0_idx, x1_idx;
467
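      // Adjacent pairing rotates (x[2i], x[2i+1]); the split layout instead pairs x[i] with x[i + dim_half].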
468 if (use_adjacent_pairing) {
469 x0_idx = head_offset + (2 * i);
470 x1_idx = head_offset + (2 * i + 1);
471 } else {
472 x0_idx = head_offset + i;
473 x1_idx = head_offset + i + dim_half;
474 }
475
476 if (x0_idx >= x.size() || x1_idx >= x.size()) {
477 Logger::warning("RoPE apply_rope_vector: x index out of bounds. x.size(): " + std::to_string(x.size()) +
478 ", x0_idx: " + std::to_string(x0_idx) + ", x1_idx: " + std::to_string(x1_idx));
479 continue;
480 }
481
482 x0_val = x[x0_idx];
483 x1_val = x[x1_idx];
484
485 x[x0_idx] = x0_val * cos_theta - x1_val * sin_theta;
486 x[x1_idx] = x0_val * sin_theta + x1_val * cos_theta;
487 }
488 }
489}
490
491void apply_rope_batch_cpu(
492    std::vector<float>& q_batch,
493 std::vector<float>& k_batch,
494 int num_tokens,
495 int num_q_heads,
496 int num_kv_heads,
497 int head_dim,
498 int start_pos_in_sequence,
499 const std::vector<std::pair<float, float>>& all_freqs_cis,
500 int max_pos_embeddings,
501 bool use_adjacent_pairing
502) {
503 if (q_batch.size() != (size_t)num_tokens * num_q_heads * head_dim) {
504 Logger::error("apply_rope_batch_cpu: q_batch size mismatch. Expected " +
505 std::to_string((size_t)num_tokens * num_q_heads * head_dim) + ", got " + std::to_string(q_batch.size()));
506 return;
507 }
508 if (k_batch.size() != (size_t)num_tokens * num_kv_heads * head_dim) {
509 Logger::error("apply_rope_batch_cpu: k_batch size mismatch. Expected " +
510 std::to_string((size_t)num_tokens * num_kv_heads * head_dim) + ", got " + std::to_string(k_batch.size()));
511 return;
512 }
513 if (head_dim % 2 != 0) {
514 Logger::error("apply_rope_batch_cpu: head_dim must be even for RoPE.");
515 return;
516 }
517
518 for (int t = 0; t < num_tokens; ++t) {
519 int current_token_pos = start_pos_in_sequence + t;
520
521 if (current_token_pos < 0 || current_token_pos >= max_pos_embeddings) {
522 Logger::warning("[ROPE_BATCH_CPU] Token " + std::to_string(t) + " (actual_pos: " + std::to_string(current_token_pos) +
523 ") is out of range [0, " + std::to_string(max_pos_embeddings -1) + "]. Skipping RoPE for this token.");
524 continue;
525 }
526
527 for (int h = 0; h < num_q_heads; ++h) {
528 size_t head_start_offset_in_batch = ((size_t)t * num_q_heads + h) * head_dim;
529
530 for (int i = 0; i < head_dim / 2; ++i) {
531 size_t freq_idx = (size_t)current_token_pos * (head_dim / 2) + i;
532
533 if (freq_idx >= all_freqs_cis.size()) {
534 Logger::warning("[ROPE_BATCH_CPU] Q - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
535 ", DimPair " + std::to_string(i) + ": freq_idx (" + std::to_string(freq_idx) +
536 ") out of bounds for all_freqs_cis.size (" + std::to_string(all_freqs_cis.size()) + "). Skipping pair.");
537 continue;
538 }
539
540 float freq_cis_real = all_freqs_cis[freq_idx].first;
541 float freq_cis_imag = all_freqs_cis[freq_idx].second;
542
543 float val0, val1;
544 size_t idx0, idx1;
545
546 if (use_adjacent_pairing) {
547 idx0 = head_start_offset_in_batch + 2 * i;
548 idx1 = head_start_offset_in_batch + 2 * i + 1;
549 } else {
550 idx0 = head_start_offset_in_batch + i;
551 idx1 = head_start_offset_in_batch + i + head_dim / 2;
552 }
553
554 if (idx0 >= q_batch.size() || idx1 >= q_batch.size()) {
555 Logger::warning("[ROPE_BATCH_CPU] Q - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
556 ", DimPair " + std::to_string(i) + ": q_batch index out of bounds. q_batch.size(): " + std::to_string(q_batch.size()) +
557 ", idx0: " + std::to_string(idx0) + ", idx1: " + std::to_string(idx1) + ". Skipping pair.");
558 continue;
559 }
560
561 val0 = q_batch[idx0];
562 val1 = q_batch[idx1];
563
564 q_batch[idx0] = val0 * freq_cis_real - val1 * freq_cis_imag;
565 q_batch[idx1] = val0 * freq_cis_imag + val1 * freq_cis_real;
566 }
567 }
568
569 for (int h = 0; h < num_kv_heads; ++h) {
570 size_t head_start_offset_in_batch = ((size_t)t * num_kv_heads + h) * head_dim;
571
572 for (int i = 0; i < head_dim / 2; ++i) {
573 size_t freq_idx = (size_t)current_token_pos * (head_dim / 2) + i;
574
575 if (freq_idx >= all_freqs_cis.size()) {
576 Logger::warning("[ROPE_BATCH_CPU] K - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
577 ", DimPair " + std::to_string(i) + ": freq_idx (" + std::to_string(freq_idx) +
578 ") out of bounds for all_freqs_cis.size (" + std::to_string(all_freqs_cis.size()) + "). Skipping pair.");
579 continue;
580 }
581
582 float freq_cis_real = all_freqs_cis[freq_idx].first;
583 float freq_cis_imag = all_freqs_cis[freq_idx].second;
584
585 float val0, val1;
586 size_t idx0, idx1;
587
588 if (use_adjacent_pairing) {
589 idx0 = head_start_offset_in_batch + 2 * i;
590 idx1 = head_start_offset_in_batch + 2 * i + 1;
591 } else {
592 idx0 = head_start_offset_in_batch + i;
593 idx1 = head_start_offset_in_batch + i + head_dim / 2;
594 }
595
596 if (idx0 >= k_batch.size() || idx1 >= k_batch.size()) {
597 Logger::warning("[ROPE_BATCH_CPU] K - Token " + std::to_string(t) + ", Head " + std::to_string(h) +
598 ", DimPair " + std::to_string(i) + ": k_batch index out of bounds. k_batch.size(): " + std::to_string(k_batch.size()) +
599 ", idx0: " + std::to_string(idx0) + ", idx1: " + std::to_string(idx1) + ". Skipping pair.");
600 continue;
601 }
602
603 val0 = k_batch[idx0];
604 val1 = k_batch[idx1];
605
606 k_batch[idx0] = val0 * freq_cis_real - val1 * freq_cis_imag;
607 k_batch[idx1] = val0 * freq_cis_imag + val1 * freq_cis_real;
608 }
609 }
610 }
611}
612
613void rmsnorm_batch_cpu(const std::vector<float>& x_batch,
614 const std::vector<float>& weight,
615 std::vector<float>& out_batch,
616 int num_tokens,
617 int hidden_size,
618 float eps) {
619 if (x_batch.empty() || x_batch.size() != (size_t)num_tokens * hidden_size || weight.size() != (size_t)hidden_size) {
620 Logger::error("[RMSNORM_BATCH_CPU] RMSNorm batch size mismatch or empty input. x_batch.size(): " + std::to_string(x_batch.size()) +
621 ", expected x_batch: " + std::to_string((size_t)num_tokens * hidden_size) +
622 ", weight.size(): " + std::to_string(weight.size()) +
623 ", expected weight: " + std::to_string((size_t)hidden_size));
624 out_batch.assign((size_t)num_tokens * hidden_size, 0.0f);
625 return;
626 }
627 out_batch.resize((size_t)num_tokens * hidden_size);
628
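  // Per token: normalize by 1 / sqrt(mean(x^2) + eps), then apply the learned scale weight.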
629#pragma omp parallel for
630 for (int t = 0; t < num_tokens; ++t) {
631 double ssq = 0.0;
632 size_t token_offset = (size_t)t * hidden_size;
633
634 for (int i = 0; i < hidden_size; ++i) {
635 ssq += static_cast<double>(x_batch[token_offset + i]) * static_cast<double>(x_batch[token_offset + i]);
636 }
637
638 double ssq_mean = ssq / hidden_size;
639 float norm_factor_input_sqrt = static_cast<float>(ssq_mean);
640 float norm_factor = 1.0f / SAFE_SQRT(norm_factor_input_sqrt + eps);
641
642 for (int i = 0; i < hidden_size; ++i) {
643 out_batch[token_offset + i] = x_batch[token_offset + i] * norm_factor * weight[i];
644 }
645 }
646}
647
648void rmsnorm_vector_cpu(const std::vector<float>& x,
649 const std::vector<float>& weight,
650 std::vector<float>& out, float eps) {
651 if (x.empty() || x.size() != weight.size()) {
652 Logger::error("RMSNorm vector size mismatch or empty input.");
653 out.assign(x.size(), 0.0f);
654 return;
655 }
656 out.resize(x.size());
657 size_t n = x.size();
658
659 double ssq = 0.0;
660#pragma omp parallel for reduction(+ : ssq)
661 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
662 ssq += static_cast<double>(x[i]) * static_cast<double>(x[i]);
663 }
664 ssq /= n;
665
666 float norm_factor = 1.0f / SAFE_SQRT(static_cast<float>(ssq) +
667                                      SAFE_MAX(eps, MIN_NORM_EPS));
668
669#pragma omp parallel for
670 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
671 out[i] = x[i] * norm_factor * weight[i];
672 }
673}
674
675void softmax_vector_cpu(const std::vector<float>& x,
676 std::vector<float>& out) {
677 if (x.empty()) return;
678 out.resize(x.size());
679 size_t n = x.size();
680
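  // Subtract the maximum value before exponentiating so the exponentials cannot overflow.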
681 float max_val = x[0];
682 for (size_t i = 1; i < n; ++i) {
683 if (x[i] > max_val) max_val = x[i];
684 }
685
686 float exp_sum = 0.0f;
687 for (size_t i = 0; i < n; ++i) {
688 out[i] = std::exp(x[i] - max_val);
689 exp_sum += out[i];
690 }
691
692 float inv_sum = 1.0f / (exp_sum + 1e-9f);
693
694#pragma omp parallel for
695 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
696 out[i] *= inv_sum;
697 }
698}
699
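// SiLU activation: out[i] = x[i] * sigmoid(x[i]).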
700void silu_cpu(const std::vector<float>& x, std::vector<float>& out) {
701 if (x.size() != out.size()) out.resize(x.size());
702#pragma omp parallel for
703 for (int64_t i = 0; i < static_cast<int64_t>(x.size()); ++i) {
704 float sigmoid_x = 1.0f / (1.0f + std::exp(-x[i]));
705 out[i] = x[i] * sigmoid_x;
706 }
707}
708
709void matmul_f32_f32_batch_cpu(
710    const std::vector<float>& mat_weights,
711 const std::vector<float>& batch_input_activations,
712 std::vector<float>& batch_output_activations,
713 int num_tokens,
714 int output_dim,
715 int input_dim
716) {
717 if (mat_weights.empty() || batch_input_activations.empty()) {
718 Logger::error("[MATMUL_F32_BATCH_CPU] Input matrix or batch_input_activations is empty.");
719 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
720 return;
721 }
722 if (mat_weights.size() != (size_t)output_dim * input_dim) {
723 Logger::error("[MATMUL_F32_BATCH_CPU] Matrix dimensions mismatch. Expected " +
724 std::to_string((size_t)output_dim * input_dim) + ", got " +
725 std::to_string(mat_weights.size()));
726 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
727 return;
728 }
729 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
730    Logger::error(
731        "[MATMUL_F32_BATCH_CPU] Batch input activations dimension mismatch. Expected " +
732 std::to_string((size_t)num_tokens * input_dim) + ", got " +
733 std::to_string(batch_input_activations.size()));
734 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
735 return;
736 }
737
738 batch_output_activations.resize((size_t)num_tokens * output_dim);
739
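  // For every token t and output row o: Kahan-accumulated dot product of weight row o with token t's activations.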
740#pragma omp parallel for schedule(static)
741 for (int t = 0; t < num_tokens; ++t) {
742 size_t input_token_offset = (size_t)t * input_dim;
743 size_t output_token_offset = (size_t)t * output_dim;
744
745 for (int o = 0; o < output_dim; ++o) {
746 double k_sum = 0.0;
747 double k_c = 0.0;
748 size_t weight_row_offset = (size_t)o * input_dim;
749
750 for (int i = 0; i < input_dim; ++i) {
751 double term = static_cast<double>(mat_weights[weight_row_offset + i]) *
752 static_cast<double>(batch_input_activations[input_token_offset + i]);
753 double y = term - k_c;
754 double t_sum = k_sum + y;
755 k_c = (t_sum - k_sum) - y;
756 k_sum = t_sum;
757 }
758 batch_output_activations[output_token_offset + o] = static_cast<float>(k_sum);
759 }
760 }
761}
762
763void matvec_q6k_f32_vector_cpu(const std::vector<block_q6_K>& mat_q6k,
764 const std::vector<float>& vec_f32,
765 std::vector<float>& out_f32, int rows,
766 int cols, bool log_first_block) {
767 if (cols % GGML_QK_K != 0) {
768 throw std::runtime_error(
769 "matvec_q6k_f32_vector_cpu: cols (" + std::to_string(cols) +
770 ") must be divisible by GGML_QK_K (" + std::to_string(GGML_QK_K) + ")");
771 }
772 if (vec_f32.size() != (size_t)cols) {
773 throw std::runtime_error(
774 "matvec_q6k_f32_vector_cpu: vec_f32 size mismatch. Expected " +
775 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
776 }
777 size_t num_blocks_per_row = cols / GGML_QK_K;
778 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
779 if (mat_q6k.size() != total_blocks_expected) {
780 throw std::runtime_error(
781 "matvec_q6k_f32_vector_cpu: mat_q6k size mismatch. Expected " +
782 std::to_string(total_blocks_expected) + " blocks, got " +
783 std::to_string(mat_q6k.size()));
784 }
785
786 out_f32.resize(rows);
787 float dequantized_block[GGML_QK_K];
788
789#pragma omp parallel for private(dequantized_block)
790 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
791 double row_sum = 0.0;
792 double kahan_c = 0.0;
793
794 size_t block_row_offset = r * num_blocks_per_row;
795
796 for (size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
797 const block_q6_K* qblock = &mat_q6k[block_row_offset + block_col_idx];
798 bool enable_dequant_log = log_first_block && (r == 0 && block_col_idx == 0);
799      dequantize_q6_k(qblock, dequantized_block, GGML_QK_K, enable_dequant_log);
800
801 size_t vec_offset = block_col_idx * GGML_QK_K;
802 for (int i = 0; i < GGML_QK_K; ++i) {
803 double term = static_cast<double>(dequantized_block[i]) *
804 static_cast<double>(vec_f32[vec_offset + i]);
805
806 double y = term - kahan_c;
807 double t = row_sum + y;
808 kahan_c = (t - row_sum) - y;
809 row_sum = t;
810 }
811 }
812 out_f32[r] = static_cast<float>(row_sum);
813 }
814}
815
816void matvec_q4k_f32_vector_cpu(const std::vector<block_q4_K>& mat_q4k,
817 const std::vector<float>& vec_f32,
818 std::vector<float>& out_f32, int rows,
819 int cols, bool log_first_block) {
820 if (cols % GGML_QK_K != 0) {
821 throw std::runtime_error(
822 "matvec_q4k_f32_vector_cpu: cols (" + std::to_string(cols) +
823 ") must be divisible by GGML_QK_K (" + std::to_string(GGML_QK_K) + ")");
824 }
825 if (vec_f32.size() != (size_t)cols) {
826 throw std::runtime_error(
827 "matvec_q4k_f32_vector_cpu: vec_f32 size mismatch. Expected " +
828 std::to_string(cols) + ", got " + std::to_string(vec_f32.size()));
829 }
830 size_t num_blocks_per_row = cols / GGML_QK_K;
831 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
832 if (mat_q4k.size() != total_blocks_expected) {
833 throw std::runtime_error(
834 "matvec_q4k_f32_vector_cpu: mat_q4k size mismatch. Expected " +
835 std::to_string(total_blocks_expected) + " blocks, got " +
836 std::to_string(mat_q4k.size()));
837 }
838
839 out_f32.resize(rows);
840 float dequantized_block[GGML_QK_K];
841
842#pragma omp parallel for private(dequantized_block)
843 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
844 double row_sum = 0.0;
845 double kahan_c = 0.0;
846
847 size_t block_row_offset = r * num_blocks_per_row;
848
849 for (size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
850 const block_q4_K* qblock = &mat_q4k[block_row_offset + block_col_idx];
851 bool enable_dequant_log = log_first_block && (r == 0 && block_col_idx == 0);
852 dequantize_q4_k_m(qblock, dequantized_block, GGML_QK_K, enable_dequant_log);
853
854 size_t vec_offset = block_col_idx * GGML_QK_K;
855 for (int i = 0; i < GGML_QK_K; ++i) {
856 double term = static_cast<double>(dequantized_block[i]) *
857 static_cast<double>(vec_f32[vec_offset + i]);
858
859 double y = term - kahan_c;
860 double t = row_sum + y;
861 kahan_c = (t - row_sum) - y;
862 row_sum = t;
863 }
864 }
865 out_f32[r] = static_cast<float>(row_sum);
866 }
867}
868
869void matmul_q8_0_f32_batch_cpu(
870    const std::vector<block_q8_0>& mat_q8_0,
871 const std::vector<float>& batch_input_activations,
872 std::vector<float>& batch_output_activations,
873 int num_tokens,
874 int output_dim,
875 int input_dim
876) {
877 if (mat_q8_0.empty() || batch_input_activations.empty()) {
878 Logger::error("[MATMUL_Q8_0_BATCH_CPU] Input matrix or batch_input_activations is empty.");
879 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
880 return;
881 }
882
883 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
884 Logger::error("[MATMUL_Q8_0_BATCH_CPU] batch_input_activations size mismatch. Expected " +
885 std::to_string((size_t)num_tokens * input_dim) + ", got " +
886 std::to_string(batch_input_activations.size()));
887 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
888 return;
889 }
890
891 batch_output_activations.resize((size_t)num_tokens * output_dim);
892
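  // Each token is handled independently: its activation slice is copied and passed through the single-token matvec kernel.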
893#pragma omp parallel for
894 for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
895 std::vector<float> current_token_input(input_dim);
896 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
897 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
898
899 std::vector<float> current_token_output(output_dim);
900 matvec_q8_0_f32_vector_cpu(mat_q8_0, current_token_input, current_token_output, output_dim, input_dim, false);
901
902 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
903 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
904 }
905}
906
907void matmul_q8k_f32_batch_cpu(
908    const std::vector<block_q8_K>& mat_q8k,
909 const std::vector<float>& batch_input_activations,
910 std::vector<float>& batch_output_activations,
911 int num_tokens,
912 int output_dim,
913 int input_dim
914) {
915 if (input_dim % GGML_QK_K != 0) {
916 throw std::runtime_error("matmul_q8k_f32_batch_cpu: input_dim (" + std::to_string(input_dim) +
917 ") must be divisible by GGML_QK_K (" + std::to_string(GGML_QK_K) + ")");
918 }
919
920 size_t expected_input_size = (size_t)num_tokens * input_dim;
921 if (batch_input_activations.size() != expected_input_size) {
922 throw std::runtime_error("matmul_q8k_f32_batch_cpu: batch_input_activations size mismatch. Expected " +
923 std::to_string(expected_input_size) + ", got " + std::to_string(batch_input_activations.size()));
924 }
925
926 size_t num_blocks_per_row = input_dim / GGML_QK_K;
927 size_t total_blocks_expected = (size_t)output_dim * num_blocks_per_row;
928 if (mat_q8k.size() != total_blocks_expected) {
929 throw std::runtime_error("matmul_q8k_f32_batch_cpu: mat_q8k size mismatch. Expected " +
930 std::to_string(total_blocks_expected) + " blocks, got " + std::to_string(mat_q8k.size()));
931 }
932
933 batch_output_activations.resize((size_t)num_tokens * output_dim);
934
935 for (int t = 0; t < num_tokens; ++t) {
936 std::vector<float> current_token_input(input_dim);
937 for (int i = 0; i < input_dim; ++i) {
938 current_token_input[i] = batch_input_activations[t * input_dim + i];
939 }
940
941 std::vector<float> current_token_output(output_dim);
942 matvec_q8k_f32_vector_cpu(mat_q8k, current_token_input, current_token_output, output_dim, input_dim, false);
943
944 for (int i = 0; i < output_dim; ++i) {
945 batch_output_activations[t * output_dim + i] = current_token_output[i];
946 }
947 }
948}
949
950void matmul_q6k_f32_batch_cpu(
951    const std::vector<block_q6_K>& mat_q6k,
952 const std::vector<float>& batch_input_activations,
953 std::vector<float>& batch_output_activations,
954 int num_tokens,
955 int output_dim,
956 int input_dim
957) {
958 if (mat_q6k.empty() || batch_input_activations.empty()) {
959 Logger::error("[MATMUL_Q6K_BATCH_CPU] Input matrix or batch_input_activations is empty.");
960 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
961 return;
962 }
963
964 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
965 Logger::error("[MATMUL_Q6K_BATCH_CPU] batch_input_activations size mismatch. Expected " +
966 std::to_string((size_t)num_tokens * input_dim) + ", got " +
967 std::to_string(batch_input_activations.size()));
968 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
969 return;
970 }
971
972 batch_output_activations.resize((size_t)num_tokens * output_dim);
973
974#pragma omp parallel for
975 for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
976 std::vector<float> current_token_input(input_dim);
977 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
978 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
979
980 std::vector<float> current_token_output(output_dim);
981 matvec_q6k_f32_vector_cpu(mat_q6k, current_token_input, current_token_output, output_dim, input_dim, false);
982
983 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
984 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
985 }
986}
987
988void matmul_q4k_f32_batch_cpu(
989    const std::vector<block_q4_K>& mat_q4k,
990 const std::vector<float>& batch_input_activations,
991 std::vector<float>& batch_output_activations,
992 int num_tokens,
993 int output_dim,
994 int input_dim
995) {
996 if (mat_q4k.empty() || batch_input_activations.empty()) {
997 Logger::error("[MATMUL_Q4K_BATCH_CPU] Input matrix or batch_input_activations is empty.");
998 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
999 return;
1000 }
1001 if (batch_input_activations.size() != (size_t)num_tokens * input_dim) {
1002 Logger::error("[MATMUL_Q4K_BATCH_CPU] batch_input_activations size mismatch. Expected " +
1003 std::to_string((size_t)num_tokens * input_dim) + ", got " +
1004 std::to_string(batch_input_activations.size()));
1005 batch_output_activations.assign((size_t)num_tokens * output_dim, 0.0f);
1006 return;
1007 }
1008
1009 batch_output_activations.resize((size_t)num_tokens * output_dim);
1010
1011#pragma omp parallel for
1012 for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
1013 std::vector<float> current_token_input(input_dim);
1014 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
1015 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
1016
1017 std::vector<float> current_token_output(output_dim);
1018 matvec_q4k_f32_vector_cpu(mat_q4k, current_token_input, current_token_output, output_dim, input_dim, false);
1019
1020 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
1021 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
1022 }
1023}
1024
1025void matvec_bf16_f32_vector_cpu(const std::vector<uint16_t>& mat_bf16,
1026 const std::vector<float>& vec_f32,
1027 std::vector<float>& out_f32, int rows, int cols) {
1028 if (mat_bf16.size() != (size_t)rows * cols ||
1029 vec_f32.size() != (size_t)cols) {
1030 Logger::error("matvec_bf16_f32_vector_cpu: Size mismatch. Mat: " +
1031 std::to_string(mat_bf16.size()) + " (Expected " +
1032 std::to_string(rows * cols) +
1033 "), Vec: " + std::to_string(vec_f32.size()) + " (Expected " +
1034 std::to_string(cols) + ")");
1035 out_f32.assign(rows, 0.0f);
1036 return;
1037 }
1038 out_f32.resize(rows);
1039
1040#pragma omp parallel for
1041 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
1042 double sum = 0.0;
1043 double c = 0.0;
1044 size_t row_offset = r * cols;
1045
1046 for (int c_idx = 0; c_idx < cols; ++c_idx) {
1047 float weight = bfloat16_to_float32(mat_bf16[row_offset + c_idx]);
1048 double term =
1049 static_cast<double>(weight) * static_cast<double>(vec_f32[c_idx]);
1050
1051 double y = term - c;
1052 double t = sum + y;
1053 c = (t - sum) - y;
1054 sum = t;
1055 }
1056 out_f32[r] = static_cast<float>(sum);
1057 }
1058}
1059
1060void weighted_sum_probs_v(const std::vector<float>& probs,
1061 const std::vector<float>& V,
1062 std::vector<float>& out, int seq_len, int head_dim) {
1063 if (probs.size() != (size_t)seq_len || V.size() != (size_t)seq_len * head_dim) {
1064 Logger::error("weighted_sum_probs_v: Size mismatch. Probs: " +
1065 std::to_string(probs.size()) + " (Expected " +
1066 std::to_string(seq_len) +
1067 "), V: " + std::to_string(V.size()) + " (Expected " +
1068 std::to_string(seq_len * head_dim) + ")");
1069 out.assign(head_dim, 0.0f);
1070 return;
1071 }
1072 out.resize(head_dim);
1073
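  // out[j] = sum over i of probs[i] * V[i * head_dim + j], accumulated in double with Kahan compensation.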
1074#pragma omp parallel for
1075 for (int64_t j = 0; j < static_cast<int64_t>(head_dim); ++j) {
1076 double sum = 0.0;
1077 double c_kahan = 0.0;
1078 for (int i = 0; i < seq_len; ++i) {
1079 double term = static_cast<double>(probs[i]) *
1080 static_cast<double>(V[i * head_dim + j]);
1081
1082 double y = term - c_kahan;
1083 double t = sum + y;
1084 c_kahan = (t - sum) - y;
1085 sum = t;
1086 }
1087 out[j] = static_cast<float>(sum);
1088 }
1089}
1090
1091void calculate_attention_scores(const std::vector<float>& Q,
1092 const std::vector<float>& K,
1093 std::vector<float>& scores, int seq_len,
1094 int head_dim, float scale) {
1095 if (Q.empty() || K.empty()) return;
1096 scores.resize(seq_len);
1097
1098 scale = std::clamp(scale, attention::MIN_SCALE, attention::MAX_SCALE);
1099 float effective_scale = scale * attention::ATTENTION_SCALE_BASE;
1100
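  // score[i] = effective_scale * dot(Q, K_i), with the dot product accumulated in double using Kahan compensation.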
1101#pragma omp parallel for collapse(1)
1102 for (int64_t i = 0; i < static_cast<int64_t>(seq_len); ++i) {
1103 double dot_product = 0.0;
1104 double c_kahan = 0.0;
1105 size_t k_offset = static_cast<size_t>(i) * head_dim;
1106
1107 for (int j = 0; j < head_dim; ++j) {
1108 double term = static_cast<double>(Q[j]) * static_cast<double>(K[k_offset + j]);
1109 double y = term - c_kahan;
1110 double t_sum = dot_product + y;
1111 c_kahan = (t_sum - dot_product) - y;
1112 dot_product = t_sum;
1113 }
1114
1115 scores[i] = static_cast<float>(dot_product * effective_scale);
1116 }
1117}
1118
1119void log_vec_stats(const std::string& name, const std::vector<float>& v) {
1120 if (v.empty()) {
1121 Logger::info(name + ": EMPTY VECTOR");
1122 return;
1123 }
1124 float minv = *std::min_element(v.begin(), v.end());
1125 float maxv = *std::max_element(v.begin(), v.end());
1126 float mean = std::accumulate(v.begin(), v.end(), 0.0f) / v.size();
1127 bool all_finite =
1128 std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
1129 Logger::info(name + ": min=" + std::to_string(minv) + ", max=" +
1130 std::to_string(maxv) + ", mean=" + std::to_string(mean) +
1131 ", all_finite=" + (all_finite ? "yes" : "no"));
1132}
1133
1134bool write_vector_to_file(const std::string& filename, const std::vector<float>& vec) {
1135 std::string vec_writer_vals;
1136 int N_log_writer = (std::min)(10, (int)vec.size());
1137 for (int i = 0; i < N_log_writer; ++i)
1138 vec_writer_vals += (i ? " " : "") + std::to_string(vec[i]);
1139  Logger::info("write_vector_to_file Enter: first values: [" + vec_writer_vals +
1140               "], vec.data() address: " + std::to_string(reinterpret_cast<uintptr_t>(vec.data())));
1141
1142 std::ofstream outfile(filename, std::ios::binary);
1143 if (!outfile) {
1144 Logger::error("Failed to open file for writing: " + filename);
1145 return false;
1146 }
1147 outfile.write(reinterpret_cast<const char*>(vec.data()),
1148 vec.size() * sizeof(float));
1149 if (!outfile) {
1150 Logger::error("Failed to write data to file: " + filename);
1151 return false;
1152 }
1153 Logger::info("Successfully wrote vector to " + filename);
1154 return true;
1155}
1156
1157std::vector<std::vector<float>> load_rmsnorm_bin(const std::string& filename, int num_tokens, int hidden_size) {
1158 std::ifstream infile(filename, std::ios::binary);
1159 if (!infile) throw std::runtime_error("Failed to open " + filename);
1160 std::vector<float> flat(num_tokens * hidden_size);
1161 infile.read(reinterpret_cast<char*>(flat.data()),
1162 flat.size() * sizeof(float));
1163 if (!infile)
1164 throw std::runtime_error("Failed to read all data from " + filename);
1165 std::vector<std::vector<float>> result(num_tokens,
1166 std::vector<float>(hidden_size));
1167 for (int t = 0; t < num_tokens; ++t) {
1168 for (int h = 0; h < hidden_size; ++h) {
1169 result[t][h] = flat[t * hidden_size + h];
1170 }
1171 }
1172 return result;
1173}
1174
1175void log_raw_float_pointer(const std::string& name, const float* ptr, size_t count) {
1176 if (!ptr) {
1177 Logger::info(name + ": NULL POINTER");
1178 return;
1179 }
1180 std::stringstream ss;
1181 ss << name << ": [";
1182 for (size_t i = 0; i < count; ++i) {
1183 if (i > 0) ss << ", ";
1184 ss << std::fixed << std::setprecision(6) << ptr[i];
1185 }
1186 ss << "]";
1187 Logger::info(ss.str());
1188}
1189
1190void log_vector_summary_detailed(const std::string& name,
1191 const std::vector<float>& v,
1192 int current_pos, int current_layer, int N) {
1193 if (v.empty()) {
1194 Logger::info(name + ": EMPTY");
1195 return;
1196 }
1197
1198 std::stringstream ss;
1199 ss << "[POS=" << current_pos << " LAYER=" << current_layer << "] " << name;
1200 ss << ": size=" << v.size();
1201
1202 size_t actual_N = SAFE_MIN(static_cast<size_t>(N), v.size());
1203 if (actual_N > 0) {
1204 ss << ", first " << actual_N << ": [";
1205 for (size_t i = 0; i < actual_N; ++i) {
1206 ss << (i > 0 ? " " : "") << std::fixed << std::setprecision(6) << v[i];
1207 }
1208 ss << "]";
1209 }
1210
1211 float minv = *std::min_element(v.begin(), v.end());
1212 float maxv = *std::max_element(v.begin(), v.end());
1213 double sum = std::accumulate(v.begin(), v.end(), 0.0);
1214 float mean = sum / v.size();
1215 bool all_finite = std::all_of(v.begin(), v.end(), [](float x) { return std::isfinite(x); });
1216
1217 ss << ", min=" << minv << ", max=" << maxv << ", mean=" << mean
1218 << ", finite=" << (all_finite ? "yes" : "NO");
1219 Logger::info(ss.str());
1220}
1221
1222
#define SAFE_SQRT(x)
static void debug(const std::string &message)
Definition logger.cpp:131
static void warning(const std::string &message)
Definition logger.cpp:139
static void info(const std::string &message)
Definition logger.cpp:135
static void error(const std::string &message)
Definition logger.cpp:143
constexpr size_t GGML_QK8_0
Definition gguf_parser.h:43
constexpr size_t GGML_QK_K
Block size constants for different quantization formats.
Definition gguf_parser.h:42
Logging utilities for the TinyLlama implementation.
Constants used throughout the TinyLlama model implementation.
#define SAFE_MAX(a, b)
#define SAFE_MIN(a, b)
constexpr float MIN_SCALE
constexpr float ATTENTION_SCALE_BASE
constexpr float MAX_SCALE
constexpr uint16_t ZERO
constexpr uint16_t SIGN_BIT
constexpr uint16_t NEG_ZERO
constexpr uint16_t EXPONENT_MASK
constexpr uint16_t MANTISSA_MASK
constexpr int SHIFT_BITS
constexpr float MIN_NORM_EPS
void dequantize_q4_k_m(const block_q4_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
void dequantize_q8_0_block(const block_q8_0 *qblock, float *output)
Dequantizes a Q8_0 block to float32.
void dequantize_q8_k(const std::vector< block_q8_K > &q_data, std::vector< float > &x, int n, bool log_this_block)
void dequantize_q6_k(const block_q6_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
Weight quantization structures and functions for model compression.
4-bit K-quantized block structure
6-bit K-quantized block structure
Simple 8-bit quantized block structure.
void log_raw_float_pointer(const std::string &name, const float *ptr, size_t count)
Definition utils.cpp:1175
void apply_rope_vector(std::vector< float > &x, int num_heads, int head_dim, int current_token_pos, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
Definition utils.cpp:428
float bfloat16_to_float32(uint16_t bf16)
Definition utils.cpp:144
void log_vector_summary(const std::string &name, const std::vector< float > &v, int head_count)
Definition utils.cpp:207
std::vector< uint16_t > uint8_vector_to_uint16_vector(const std::vector< uint8_t > &bytes, size_t numel)
Definition utils.cpp:176
std::vector< float > bfloat16_vector_to_float32(const std::vector< uint16_t > &bf16_vec)
Definition utils.cpp:165
void matvec_q8k_f32_vector_cpu(const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:399
void matvec_q6k_f32_vector_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:763
void log_vector_summary_with_tail(const std::string &name, const std::vector< float > &v, int head_count, int tail_count)
Definition utils.cpp:234
void matvec_bf16_f32_vector_cpu(const std::vector< uint16_t > &mat_bf16, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
Definition utils.cpp:1025
void matvec_f32_f32_vector_cpu(const std::vector< float > &mat_f32, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
Definition utils.cpp:349
void simd_scaled_add(float *dst, const float *src, float scale, int n)
Definition utils.cpp:92
void log_vector_summary_detailed(const std::string &name, const std::vector< float > &v, int current_pos, int current_layer, int N)
Definition utils.cpp:1190
void matmul_q4k_f32_batch_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:988
std::vector< float > bf16vec_to_float_vec(const std::vector< uint16_t > &v_bf16)
Definition utils.cpp:198
bool write_vector_to_file(const std::string &filename, const std::vector< float > &vec)
Definition utils.cpp:1134
void softmax_vector_cpu(const std::vector< float > &x, std::vector< float > &out)
Definition utils.cpp:675
int argmax(const std::vector< float > &v)
Definition utils.cpp:185
float simd_dot_product(const float *a, const float *b, int n)
Definition utils.cpp:35
void matvec_q4k_f32_vector_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:816
void matmul_q8_0_f32_batch_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:869
void apply_rope_batch_cpu(std::vector< float > &q_batch, std::vector< float > &k_batch, int num_tokens, int num_q_heads, int num_kv_heads, int head_dim, int start_pos_in_sequence, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
Definition utils.cpp:491
void calculate_attention_scores(const std::vector< float > &Q, const std::vector< float > &K, std::vector< float > &scores, int seq_len, int head_dim, float scale)
Definition utils.cpp:1091
std::vector< std::vector< float > > load_rmsnorm_bin(const std::string &filename, int num_tokens, int hidden_size)
Definition utils.cpp:1157
void weighted_sum_probs_v(const std::vector< float > &probs, const std::vector< float > &V, std::vector< float > &out, int seq_len, int head_dim)
Definition utils.cpp:1060
void matmul_f32_f32_batch_cpu(const std::vector< float > &mat_weights, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:709
void silu_cpu(const std::vector< float > &x, std::vector< float > &out)
Definition utils.cpp:700
void matmul_q8k_f32_batch_cpu(const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:907
void matmul_q6k_f32_batch_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:950
void rmsnorm_batch_cpu(const std::vector< float > &x_batch, const std::vector< float > &weight, std::vector< float > &out_batch, int num_tokens, int hidden_size, float eps)
Definition utils.cpp:613
void log_vec_stats(const std::string &name, const std::vector< float > &v)
Definition utils.cpp:1119
void rmsnorm_vector_cpu(const std::vector< float > &x, const std::vector< float > &weight, std::vector< float > &out, float eps)
Definition utils.cpp:648
void matvec_q8_0_f32_vector_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:293
uint16_t float32_to_bfloat16(float val)
Definition utils.cpp:136