26#elif defined(__SSE2__)
29#elif defined(__ARM_NEON)
37 __m256 sum = _mm256_setzero_ps();
39 for (; i <= n - 8; i += 8) {
40 __m256 va = _mm256_loadu_ps(&a[i]);
41 __m256 vb = _mm256_loadu_ps(&b[i]);
42 sum = _mm256_fmadd_ps(va, vb, sum);
45 _mm256_storeu_ps(result, sum);
46 float final_sum = result[0] + result[1] + result[2] + result[3] +
47 result[4] + result[5] + result[6] + result[7];
49 final_sum += a[i] * b[i];
52#elif defined(__SSE2__)
53 __m128 sum = _mm_setzero_ps();
55 for (; i <= n - 4; i += 4) {
56 __m128 va = _mm_loadu_ps(&a[i]);
57 __m128 vb = _mm_loadu_ps(&b[i]);
58 sum = _mm_add_ps(sum, _mm_mul_ps(va, vb));
61 _mm_storeu_ps(result, sum);
62 float final_sum = result[0] + result[1] + result[2] + result[3];
64 final_sum += a[i] * b[i];
67#elif defined(__ARM_NEON)
68 float32x4_t sum = vdupq_n_f32(0.0f);
70 for (; i <= n - 4; i += 4) {
71 float32x4_t va = vld1q_f32(&a[i]);
72 float32x4_t vb = vld1q_f32(&b[i]);
73 sum = vmlaq_f32(sum, va, vb);
76 vst1q_f32(result, sum);
77 float final_sum = result[0] + result[1] + result[2] + result[3];
79 final_sum += a[i] * b[i];
84 for (
int i = 0; i < n; ++i) {
94 __m256 vscale = _mm256_set1_ps(scale);
96 for (; i <= n - 8; i += 8) {
97 __m256 vdst = _mm256_loadu_ps(&dst[i]);
98 __m256 vsrc = _mm256_loadu_ps(&src[i]);
99 __m256 result = _mm256_fmadd_ps(vsrc, vscale, vdst);
100 _mm256_storeu_ps(&dst[i], result);
103 dst[i] += src[i] * scale;
105#elif defined(__SSE2__)
106 __m128 vscale = _mm_set1_ps(scale);
108 for (; i <= n - 4; i += 4) {
109 __m128 vdst = _mm_loadu_ps(&dst[i]);
110 __m128 vsrc = _mm_loadu_ps(&src[i]);
111 __m128 result = _mm_add_ps(vdst, _mm_mul_ps(vsrc, vscale));
112 _mm_storeu_ps(&dst[i], result);
115 dst[i] += src[i] * scale;
117#elif defined(__ARM_NEON)
118 float32x4_t vscale = vdupq_n_f32(scale);
120 for (; i <= n - 4; i += 4) {
121 float32x4_t vdst = vld1q_f32(&dst[i]);
122 float32x4_t vsrc = vld1q_f32(&src[i]);
123 float32x4_t result = vmlaq_f32(vdst, vsrc, vscale);
124 vst1q_f32(&dst[i], result);
127 dst[i] += src[i] * scale;
130 for (
int i = 0; i < n; ++i) {
131 dst[i] += src[i] * scale;
138 std::memcpy(&bits, &val,
sizeof(
float));
140 bits += 0x7FFF + ((bits >> 16) & 1);
141 return static_cast<uint16_t
>(bits >> 16);
150 if (is_nan)
return std::numeric_limits<float>::quiet_NaN();
155 : std::numeric_limits<float>::infinity();
160 std::memcpy(&result, &bits,
sizeof(
float));
166 std::vector<float> f32_vec(bf16_vec.size());
168#pragma omp parallel for
169 for (int64_t i = 0; i < static_cast<int64_t>(bf16_vec.size()); ++i) {
177 if (bytes.size() != numel * 2) {
178 throw std::runtime_error(
"Byte vector size mismatch for uint16_t conversion");
180 std::vector<uint16_t> out(numel);
181 std::memcpy(out.data(), bytes.data(), bytes.size());
190 auto max_it = std::max_element(v.begin(), v.end());
191 float max_val = *max_it;
192 int max_idx = std::distance(v.begin(), max_it);
193 Logger::debug(
"[ARGMAX HELPER] Max value found: " + std::to_string(max_val) +
194 " at index: " + std::to_string(max_idx));
199 std::vector<float> v_f32(v_bf16.size());
200#pragma omp parallel for
201 for (int64_t i = 0; i < static_cast<int64_t>(v_bf16.size()); ++i) {
212 std::stringstream ss;
213 size_t actual_head_count =
SAFE_MIN(
static_cast<size_t>(head_count), v.size());
215 ss << name <<
": size=" << v.size();
217 if (actual_head_count > 0) {
218 ss <<
", first " << actual_head_count <<
": [";
219 for (
size_t i = 0; i < actual_head_count; ++i) {
220 ss << (i > 0 ?
" " :
"") << std::fixed << std::setprecision(4) << v[i];
224 float minv = *std::min_element(v.begin(), v.end());
225 float maxv = *std::max_element(v.begin(), v.end());
226 double sum = std::accumulate(v.begin(), v.end(), 0.0);
227 float mean = sum / v.size();
228 bool all_finite = std::all_of(v.begin(), v.end(), [](
float x) { return std::isfinite(x); });
229 ss <<
", min=" << minv <<
", max=" << maxv <<
", mean=" << mean
230 <<
", finite=" << (all_finite ?
"yes" :
"NO");
235 int head_count,
int tail_count) {
240 std::stringstream ss;
242 size_t actual_head_count =
SAFE_MIN(
static_cast<size_t>(head_count), v.size());
243 size_t actual_tail_count =
SAFE_MIN(
static_cast<size_t>(tail_count), v.size());
244 size_t total_shown = actual_head_count + actual_tail_count;
245 bool overlap = total_shown > v.size();
247 actual_tail_count = v.size() - actual_head_count;
248 if (actual_tail_count >
SAFE_MIN(
static_cast<size_t>(tail_count), v.size())) {
249 actual_tail_count =
SAFE_MIN(
static_cast<size_t>(tail_count), v.size());
251 if (tail_count > 0 && actual_head_count == v.size()) {
252 actual_tail_count = 0;
255 size_t tail_start_index = v.size() - actual_tail_count;
257 ss << name <<
": size=" << v.size();
259 if (actual_head_count > 0) {
260 ss <<
", first " << actual_head_count <<
": [";
261 for (
size_t i = 0; i < actual_head_count; ++i) {
262 ss << (i > 0 ?
" " :
"") << std::fixed << std::setprecision(4) << v[i];
267 if (actual_tail_count > 0 && tail_start_index >= actual_head_count) {
268 ss <<
", last " << actual_tail_count <<
": [";
269 for (
size_t i = 0; i < actual_tail_count; ++i) {
270 ss << (i > 0 ?
" " :
"") << std::fixed << std::setprecision(4)
271 << v[tail_start_index + i];
274 }
else if (overlap && tail_count > 0 && actual_head_count < v.size()) {
275 ss <<
" (... tail overlaps head ...)";
278 float minv = *std::min_element(v.begin(), v.end());
279 float maxv = *std::max_element(v.begin(), v.end());
280 double sum = std::accumulate(v.begin(), v.end(), 0.0);
281 float mean = sum / v.size();
282 bool all_finite = std::all_of(v.begin(), v.end(), [](
float x) { return std::isfinite(x); });
283 ss <<
", min=" << minv <<
", max=" << maxv <<
", mean=" << mean
284 <<
", finite=" << (all_finite ?
"yes" :
"NO");
294 const std::vector<float>& vec_f32,
295 std::vector<float>& out_f32,
int rows,
296 int cols,
bool log_first_block) {
298 throw std::runtime_error(
299 "matvec_q8_0_f32_vector_cpu: cols (" + std::to_string(cols) +
300 ") must be divisible by GGML_QK8_0 (" + std::to_string(
GGML_QK8_0) +
")");
302 if (vec_f32.size() !=
static_cast<size_t>(cols)) {
303 throw std::runtime_error(
304 "matvec_q8_0_f32_vector_cpu: vec_f32 size mismatch. Expected " +
305 std::to_string(cols) +
", got " + std::to_string(vec_f32.size()));
307 size_t num_blocks_per_row = cols /
GGML_QK8_0;
308 size_t total_blocks_expected =
static_cast<size_t>(rows) * num_blocks_per_row;
309 if (mat_q8_0.size() != total_blocks_expected) {
310 throw std::runtime_error(
311 "matvec_q8_0_f32_vector_cpu: mat_q8_0 size mismatch. Expected " +
312 std::to_string(total_blocks_expected) +
" blocks, got " +
313 std::to_string(mat_q8_0.size()));
316 out_f32.resize(rows);
320#pragma omp parallel for private(dequantized_block)
321 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
322 double row_sum = 0.0;
323 double kahan_c = 0.0;
325 size_t block_row_offset =
static_cast<size_t>(r) * num_blocks_per_row;
327 for (
size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
328 const block_q8_0* qblock = &mat_q8_0[block_row_offset + block_col_idx];
331 size_t vec_offset = block_col_idx *
GGML_QK8_0;
335 double term =
static_cast<double>(dequantized_block[i]) *
336 static_cast<double>(vec_f32[vec_offset + i]);
338 double y = term - kahan_c;
339 double t = row_sum + y;
340 kahan_c = (t - row_sum) - y;
344 out_f32[r] =
static_cast<float>(row_sum);
350 const std::vector<float>& vec_f32,
351 std::vector<float>& out_f32,
int rows,
353 if (mat_f32.empty() || vec_f32.empty()) {
355 "matvec_f32_f32_vector_cpu: Input matrix or vector is empty.");
356 out_f32.assign(rows, 0.0f);
359 if (mat_f32.size() != (
size_t)rows * cols) {
361 "matvec_f32_f32_vector_cpu: Matrix dimensions mismatch. Expected " +
362 std::to_string((
size_t)rows * cols) +
", got " +
363 std::to_string(mat_f32.size()));
364 out_f32.assign(rows, 0.0f);
367 if (vec_f32.size() != (
size_t)cols) {
369 "matvec_f32_f32_vector_cpu: Vector dimension mismatch. Expected " +
370 std::to_string(cols) +
", got " + std::to_string(vec_f32.size()));
371 out_f32.assign(rows, 0.0f);
375 out_f32.resize(rows);
377#pragma omp parallel for schedule(static)
378 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
380 size_t row_offset =
static_cast<size_t>(r) * cols;
382 const float* mat_row_ptr = mat_f32.data() + row_offset;
383 const float* vec_ptr = vec_f32.data();
388 for (
int c = 0; c < cols; ++c) {
389 double term =
static_cast<double>(mat_row_ptr[c]) *
static_cast<double>(vec_ptr[c]);
390 double y = term - k_c;
391 double t_sum = k_sum + y;
392 k_c = (t_sum - k_sum) - y;
395 out_f32[r] =
static_cast<float>(k_sum);
400 const std::vector<float>& vec_f32,
401 std::vector<float>& out_f32,
int rows,
402 int cols,
bool log_first_block) {
404 throw std::runtime_error(
"matvec_q8k_f32_vector_cpu: cols must be divisible by GGML_QK_K");
407 size_t num_blocks_per_row = cols /
GGML_QK_K;
408 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
409 if (mat_q8k.size() != total_blocks_expected) {
410 throw std::runtime_error(
"matvec_q8k_f32_vector_cpu: mat_q8k size mismatch");
412 if (vec_f32.size() != (
size_t)cols) {
413 throw std::runtime_error(
"matvec_q8k_f32_vector_cpu: vec_f32 size mismatch");
416 out_f32.resize(rows);
418 std::vector<float> mat_f32;
423 if (log_first_block && rows > 0) {
424 Logger::info(
"[Q8K_MATVEC_DEBUG] First output: " + std::to_string(out_f32[0]));
429 std::vector<float>& x,
432 int current_token_pos,
433 const std::vector<std::pair<float, float>>& all_freqs_cis,
434 int max_pos_embeddings,
435 bool use_adjacent_pairing
437 if (current_token_pos < 0 || current_token_pos >= max_pos_embeddings) {
440 if (head_dim % 2 != 0) {
441 Logger::error(
"RoPE apply_rope_vector: head_dim must be even. head_dim: " + std::to_string(head_dim));
445 const int dim_half = head_dim / 2;
446 size_t pos_offset =
static_cast<size_t>(current_token_pos) *
static_cast<size_t>(dim_half);
448 for (
int h = 0; h < num_heads; ++h) {
449 size_t head_offset =
static_cast<size_t>(h) * head_dim;
451 for (
int i = 0; i < dim_half; ++i) {
452 size_t freq_idx = pos_offset +
static_cast<size_t>(i);
454 if (freq_idx >= all_freqs_cis.size()) {
455 Logger::warning(
"RoPE apply_rope_vector: freq_idx out of bounds. pos: " +
456 std::to_string(current_token_pos) +
", head_dim/2: " + std::to_string(dim_half) +
457 ", i: " + std::to_string(i) +
", calculated freq_idx: " + std::to_string(freq_idx) +
458 ", all_freqs_cis.size(): " + std::to_string(all_freqs_cis.size()));
462 float cos_theta = all_freqs_cis[freq_idx].first;
463 float sin_theta = all_freqs_cis[freq_idx].second;
465 float x0_val, x1_val;
466 size_t x0_idx, x1_idx;
468 if (use_adjacent_pairing) {
469 x0_idx = head_offset + (2 * i);
470 x1_idx = head_offset + (2 * i + 1);
472 x0_idx = head_offset + i;
473 x1_idx = head_offset + i + dim_half;
476 if (x0_idx >= x.size() || x1_idx >= x.size()) {
477 Logger::warning(
"RoPE apply_rope_vector: x index out of bounds. x.size(): " + std::to_string(x.size()) +
478 ", x0_idx: " + std::to_string(x0_idx) +
", x1_idx: " + std::to_string(x1_idx));
485 x[x0_idx] = x0_val * cos_theta - x1_val * sin_theta;
486 x[x1_idx] = x0_val * sin_theta + x1_val * cos_theta;
492 std::vector<float>& q_batch,
493 std::vector<float>& k_batch,
498 int start_pos_in_sequence,
499 const std::vector<std::pair<float, float>>& all_freqs_cis,
500 int max_pos_embeddings,
501 bool use_adjacent_pairing
503 if (q_batch.size() != (
size_t)num_tokens * num_q_heads * head_dim) {
504 Logger::error(
"apply_rope_batch_cpu: q_batch size mismatch. Expected " +
505 std::to_string((
size_t)num_tokens * num_q_heads * head_dim) +
", got " + std::to_string(q_batch.size()));
508 if (k_batch.size() != (
size_t)num_tokens * num_kv_heads * head_dim) {
509 Logger::error(
"apply_rope_batch_cpu: k_batch size mismatch. Expected " +
510 std::to_string((
size_t)num_tokens * num_kv_heads * head_dim) +
", got " + std::to_string(k_batch.size()));
513 if (head_dim % 2 != 0) {
514 Logger::error(
"apply_rope_batch_cpu: head_dim must be even for RoPE.");
518 for (
int t = 0; t < num_tokens; ++t) {
519 int current_token_pos = start_pos_in_sequence + t;
521 if (current_token_pos < 0 || current_token_pos >= max_pos_embeddings) {
522 Logger::warning(
"[ROPE_BATCH_CPU] Token " + std::to_string(t) +
" (actual_pos: " + std::to_string(current_token_pos) +
523 ") is out of range [0, " + std::to_string(max_pos_embeddings -1) +
"]. Skipping RoPE for this token.");
527 for (
int h = 0; h < num_q_heads; ++h) {
528 size_t head_start_offset_in_batch = ((size_t)t * num_q_heads + h) * head_dim;
530 for (
int i = 0; i < head_dim / 2; ++i) {
531 size_t freq_idx = (size_t)current_token_pos * (head_dim / 2) + i;
533 if (freq_idx >= all_freqs_cis.size()) {
534 Logger::warning(
"[ROPE_BATCH_CPU] Q - Token " + std::to_string(t) +
", Head " + std::to_string(h) +
535 ", DimPair " + std::to_string(i) +
": freq_idx (" + std::to_string(freq_idx) +
536 ") out of bounds for all_freqs_cis.size (" + std::to_string(all_freqs_cis.size()) +
"). Skipping pair.");
540 float freq_cis_real = all_freqs_cis[freq_idx].first;
541 float freq_cis_imag = all_freqs_cis[freq_idx].second;
546 if (use_adjacent_pairing) {
547 idx0 = head_start_offset_in_batch + 2 * i;
548 idx1 = head_start_offset_in_batch + 2 * i + 1;
550 idx0 = head_start_offset_in_batch + i;
551 idx1 = head_start_offset_in_batch + i + head_dim / 2;
554 if (idx0 >= q_batch.size() || idx1 >= q_batch.size()) {
555 Logger::warning(
"[ROPE_BATCH_CPU] Q - Token " + std::to_string(t) +
", Head " + std::to_string(h) +
556 ", DimPair " + std::to_string(i) +
": q_batch index out of bounds. q_batch.size(): " + std::to_string(q_batch.size()) +
557 ", idx0: " + std::to_string(idx0) +
", idx1: " + std::to_string(idx1) +
". Skipping pair.");
561 val0 = q_batch[idx0];
562 val1 = q_batch[idx1];
564 q_batch[idx0] = val0 * freq_cis_real - val1 * freq_cis_imag;
565 q_batch[idx1] = val0 * freq_cis_imag + val1 * freq_cis_real;
569 for (
int h = 0; h < num_kv_heads; ++h) {
570 size_t head_start_offset_in_batch = ((size_t)t * num_kv_heads + h) * head_dim;
572 for (
int i = 0; i < head_dim / 2; ++i) {
573 size_t freq_idx = (size_t)current_token_pos * (head_dim / 2) + i;
575 if (freq_idx >= all_freqs_cis.size()) {
576 Logger::warning(
"[ROPE_BATCH_CPU] K - Token " + std::to_string(t) +
", Head " + std::to_string(h) +
577 ", DimPair " + std::to_string(i) +
": freq_idx (" + std::to_string(freq_idx) +
578 ") out of bounds for all_freqs_cis.size (" + std::to_string(all_freqs_cis.size()) +
"). Skipping pair.");
582 float freq_cis_real = all_freqs_cis[freq_idx].first;
583 float freq_cis_imag = all_freqs_cis[freq_idx].second;
588 if (use_adjacent_pairing) {
589 idx0 = head_start_offset_in_batch + 2 * i;
590 idx1 = head_start_offset_in_batch + 2 * i + 1;
592 idx0 = head_start_offset_in_batch + i;
593 idx1 = head_start_offset_in_batch + i + head_dim / 2;
596 if (idx0 >= k_batch.size() || idx1 >= k_batch.size()) {
597 Logger::warning(
"[ROPE_BATCH_CPU] K - Token " + std::to_string(t) +
", Head " + std::to_string(h) +
598 ", DimPair " + std::to_string(i) +
": k_batch index out of bounds. k_batch.size(): " + std::to_string(k_batch.size()) +
599 ", idx0: " + std::to_string(idx0) +
", idx1: " + std::to_string(idx1) +
". Skipping pair.");
603 val0 = k_batch[idx0];
604 val1 = k_batch[idx1];
606 k_batch[idx0] = val0 * freq_cis_real - val1 * freq_cis_imag;
607 k_batch[idx1] = val0 * freq_cis_imag + val1 * freq_cis_real;
614 const std::vector<float>& weight,
615 std::vector<float>& out_batch,
619 if (x_batch.empty() || x_batch.size() != (
size_t)num_tokens * hidden_size || weight.size() != (
size_t)hidden_size) {
620 Logger::error(
"[RMSNORM_BATCH_CPU] RMSNorm batch size mismatch or empty input. x_batch.size(): " + std::to_string(x_batch.size()) +
621 ", expected x_batch: " + std::to_string((
size_t)num_tokens * hidden_size) +
622 ", weight.size(): " + std::to_string(weight.size()) +
623 ", expected weight: " + std::to_string((
size_t)hidden_size));
624 out_batch.assign((
size_t)num_tokens * hidden_size, 0.0f);
627 out_batch.resize((
size_t)num_tokens * hidden_size);
629#pragma omp parallel for
630 for (
int t = 0; t < num_tokens; ++t) {
632 size_t token_offset = (size_t)t * hidden_size;
634 for (
int i = 0; i < hidden_size; ++i) {
635 ssq +=
static_cast<double>(x_batch[token_offset + i]) *
static_cast<double>(x_batch[token_offset + i]);
638 double ssq_mean = ssq / hidden_size;
639 float norm_factor_input_sqrt =
static_cast<float>(ssq_mean);
640 float norm_factor = 1.0f /
SAFE_SQRT(norm_factor_input_sqrt + eps);
642 for (
int i = 0; i < hidden_size; ++i) {
643 out_batch[token_offset + i] = x_batch[token_offset + i] * norm_factor * weight[i];
649 const std::vector<float>& weight,
650 std::vector<float>& out,
float eps) {
651 if (x.empty() || x.size() != weight.size()) {
652 Logger::error(
"RMSNorm vector size mismatch or empty input.");
653 out.assign(x.size(), 0.0f);
656 out.resize(x.size());
660#pragma omp parallel for reduction(+ : ssq)
661 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
662 ssq +=
static_cast<double>(x[i]) *
static_cast<double>(x[i]);
666 float norm_factor = 1.0f /
SAFE_SQRT(
static_cast<float>(ssq) +
669#pragma omp parallel for
670 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
671 out[i] = x[i] * norm_factor * weight[i];
676 std::vector<float>& out) {
677 if (x.empty())
return;
678 out.resize(x.size());
681 float max_val = x[0];
682 for (
size_t i = 1; i < n; ++i) {
683 if (x[i] > max_val) max_val = x[i];
686 float exp_sum = 0.0f;
687 for (
size_t i = 0; i < n; ++i) {
688 out[i] = std::exp(x[i] - max_val);
692 float inv_sum = 1.0f / (exp_sum + 1e-9f);
694#pragma omp parallel for
695 for (int64_t i = 0; i < static_cast<int64_t>(n); ++i) {
/**
 * @brief Applies SiLU, x * sigmoid(x), to every element of @p x.
 *
 * @param x   Input values.
 * @param out Receives one result per input element. It is resized to
 *            x.size() when its current size differs.
 */
void silu_cpu(const std::vector<float>& x, std::vector<float>& out) {
  if (x.size() != out.size()) out.resize(x.size());

  // Signed 64-bit trip count, evaluated once rather than on every iteration.
  const int64_t n = static_cast<int64_t>(x.size());
#pragma omp parallel for
  for (int64_t i = 0; i < n; ++i) {
    const float sigmoid_x = 1.0f / (1.0f + std::exp(-x[i]));
    out[i] = x[i] * sigmoid_x;
  }
}
710 const std::vector<float>& mat_weights,
711 const std::vector<float>& batch_input_activations,
712 std::vector<float>& batch_output_activations,
717 if (mat_weights.empty() || batch_input_activations.empty()) {
718 Logger::error(
"[MATMUL_F32_BATCH_CPU] Input matrix or batch_input_activations is empty.");
719 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
722 if (mat_weights.size() != (
size_t)output_dim * input_dim) {
723 Logger::error(
"[MATMUL_F32_BATCH_CPU] Matrix dimensions mismatch. Expected " +
724 std::to_string((
size_t)output_dim * input_dim) +
", got " +
725 std::to_string(mat_weights.size()));
726 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
729 if (batch_input_activations.size() != (
size_t)num_tokens * input_dim) {
731 "[MATMUL_F32_BATCH_CPU] Batch input activations dimension mismatch. Expected " +
732 std::to_string((
size_t)num_tokens * input_dim) +
", got " +
733 std::to_string(batch_input_activations.size()));
734 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
738 batch_output_activations.resize((
size_t)num_tokens * output_dim);
740#pragma omp parallel for schedule(static)
741 for (
int t = 0; t < num_tokens; ++t) {
742 size_t input_token_offset = (size_t)t * input_dim;
743 size_t output_token_offset = (size_t)t * output_dim;
745 for (
int o = 0; o < output_dim; ++o) {
748 size_t weight_row_offset = (size_t)o * input_dim;
750 for (
int i = 0; i < input_dim; ++i) {
751 double term =
static_cast<double>(mat_weights[weight_row_offset + i]) *
752 static_cast<double>(batch_input_activations[input_token_offset + i]);
753 double y = term - k_c;
754 double t_sum = k_sum + y;
755 k_c = (t_sum - k_sum) - y;
758 batch_output_activations[output_token_offset + o] =
static_cast<float>(k_sum);
764 const std::vector<float>& vec_f32,
765 std::vector<float>& out_f32,
int rows,
766 int cols,
bool log_first_block) {
768 throw std::runtime_error(
769 "matvec_q6k_f32_vector_cpu: cols (" + std::to_string(cols) +
770 ") must be divisible by GGML_QK_K (" + std::to_string(
GGML_QK_K) +
")");
772 if (vec_f32.size() != cols) {
773 throw std::runtime_error(
774 "matvec_q6k_f32_vector_cpu: vec_f32 size mismatch. Expected " +
775 std::to_string(cols) +
", got " + std::to_string(vec_f32.size()));
777 size_t num_blocks_per_row = cols /
GGML_QK_K;
778 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
779 if (mat_q6k.size() != total_blocks_expected) {
780 throw std::runtime_error(
781 "matvec_q6k_f32_vector_cpu: mat_q6k size mismatch. Expected " +
782 std::to_string(total_blocks_expected) +
" blocks, got " +
783 std::to_string(mat_q6k.size()));
786 out_f32.resize(rows);
789#pragma omp parallel for private(dequantized_block)
790 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
791 double row_sum = 0.0;
792 double kahan_c = 0.0;
794 size_t block_row_offset = r * num_blocks_per_row;
796 for (
size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
797 const block_q6_K* qblock = &mat_q6k[block_row_offset + block_col_idx];
798 bool enable_dequant_log = log_first_block && (r == 0 && block_col_idx == 0);
801 size_t vec_offset = block_col_idx *
GGML_QK_K;
803 double term =
static_cast<double>(dequantized_block[i]) *
804 static_cast<double>(vec_f32[vec_offset + i]);
806 double y = term - kahan_c;
807 double t = row_sum + y;
808 kahan_c = (t - row_sum) - y;
812 out_f32[r] =
static_cast<float>(row_sum);
817 const std::vector<float>& vec_f32,
818 std::vector<float>& out_f32,
int rows,
819 int cols,
bool log_first_block) {
821 throw std::runtime_error(
822 "matvec_q4k_f32_vector_cpu: cols (" + std::to_string(cols) +
823 ") must be divisible by GGML_QK_K (" + std::to_string(
GGML_QK_K) +
")");
825 if (vec_f32.size() != cols) {
826 throw std::runtime_error(
827 "matvec_q4k_f32_vector_cpu: vec_f32 size mismatch. Expected " +
828 std::to_string(cols) +
", got " + std::to_string(vec_f32.size()));
830 size_t num_blocks_per_row = cols /
GGML_QK_K;
831 size_t total_blocks_expected = (size_t)rows * num_blocks_per_row;
832 if (mat_q4k.size() != total_blocks_expected) {
833 throw std::runtime_error(
834 "matvec_q4k_f32_vector_cpu: mat_q4k size mismatch. Expected " +
835 std::to_string(total_blocks_expected) +
" blocks, got " +
836 std::to_string(mat_q4k.size()));
839 out_f32.resize(rows);
842#pragma omp parallel for private(dequantized_block)
843 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
844 double row_sum = 0.0;
845 double kahan_c = 0.0;
847 size_t block_row_offset = r * num_blocks_per_row;
849 for (
size_t block_col_idx = 0; block_col_idx < num_blocks_per_row; ++block_col_idx) {
850 const block_q4_K* qblock = &mat_q4k[block_row_offset + block_col_idx];
851 bool enable_dequant_log = log_first_block && (r == 0 && block_col_idx == 0);
854 size_t vec_offset = block_col_idx *
GGML_QK_K;
856 double term =
static_cast<double>(dequantized_block[i]) *
857 static_cast<double>(vec_f32[vec_offset + i]);
859 double y = term - kahan_c;
860 double t = row_sum + y;
861 kahan_c = (t - row_sum) - y;
865 out_f32[r] =
static_cast<float>(row_sum);
870 const std::vector<block_q8_0>& mat_q8_0,
871 const std::vector<float>& batch_input_activations,
872 std::vector<float>& batch_output_activations,
877 if (mat_q8_0.empty() || batch_input_activations.empty()) {
878 Logger::error(
"[MATMUL_Q8_0_BATCH_CPU] Input matrix or batch_input_activations is empty.");
879 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
883 if (batch_input_activations.size() != (
size_t)num_tokens * input_dim) {
884 Logger::error(
"[MATMUL_Q8_0_BATCH_CPU] batch_input_activations size mismatch. Expected " +
885 std::to_string((
size_t)num_tokens * input_dim) +
", got " +
886 std::to_string(batch_input_activations.size()));
887 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
891 batch_output_activations.resize((
size_t)num_tokens * output_dim);
893#pragma omp parallel for
894 for (
int token_idx = 0; token_idx < num_tokens; ++token_idx) {
895 std::vector<float> current_token_input(input_dim);
896 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
897 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
899 std::vector<float> current_token_output(output_dim);
902 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
903 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
908 const std::vector<block_q8_K>& mat_q8k,
909 const std::vector<float>& batch_input_activations,
910 std::vector<float>& batch_output_activations,
916 throw std::runtime_error(
"matmul_q8k_f32_batch_cpu: input_dim (" + std::to_string(input_dim) +
917 ") must be divisible by GGML_QK_K (" + std::to_string(
GGML_QK_K) +
")");
920 size_t expected_input_size = (size_t)num_tokens * input_dim;
921 if (batch_input_activations.size() != expected_input_size) {
922 throw std::runtime_error(
"matmul_q8k_f32_batch_cpu: batch_input_activations size mismatch. Expected " +
923 std::to_string(expected_input_size) +
", got " + std::to_string(batch_input_activations.size()));
926 size_t num_blocks_per_row = input_dim /
GGML_QK_K;
927 size_t total_blocks_expected = (size_t)output_dim * num_blocks_per_row;
928 if (mat_q8k.size() != total_blocks_expected) {
929 throw std::runtime_error(
"matmul_q8k_f32_batch_cpu: mat_q8k size mismatch. Expected " +
930 std::to_string(total_blocks_expected) +
" blocks, got " + std::to_string(mat_q8k.size()));
933 batch_output_activations.resize((
size_t)num_tokens * output_dim);
935 for (
int t = 0; t < num_tokens; ++t) {
936 std::vector<float> current_token_input(input_dim);
937 for (
int i = 0; i < input_dim; ++i) {
938 current_token_input[i] = batch_input_activations[t * input_dim + i];
941 std::vector<float> current_token_output(output_dim);
944 for (
int i = 0; i < output_dim; ++i) {
945 batch_output_activations[t * output_dim + i] = current_token_output[i];
951 const std::vector<block_q6_K>& mat_q6k,
952 const std::vector<float>& batch_input_activations,
953 std::vector<float>& batch_output_activations,
958 if (mat_q6k.empty() || batch_input_activations.empty()) {
959 Logger::error(
"[MATMUL_Q6K_BATCH_CPU] Input matrix or batch_input_activations is empty.");
960 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
964 if (batch_input_activations.size() != (
size_t)num_tokens * input_dim) {
965 Logger::error(
"[MATMUL_Q6K_BATCH_CPU] batch_input_activations size mismatch. Expected " +
966 std::to_string((
size_t)num_tokens * input_dim) +
", got " +
967 std::to_string(batch_input_activations.size()));
968 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
972 batch_output_activations.resize((
size_t)num_tokens * output_dim);
974#pragma omp parallel for
975 for (
int token_idx = 0; token_idx < num_tokens; ++token_idx) {
976 std::vector<float> current_token_input(input_dim);
977 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
978 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
980 std::vector<float> current_token_output(output_dim);
983 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
984 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
989 const std::vector<block_q4_K>& mat_q4k,
990 const std::vector<float>& batch_input_activations,
991 std::vector<float>& batch_output_activations,
996 if (mat_q4k.empty() || batch_input_activations.empty()) {
997 Logger::error(
"[MATMUL_Q4K_BATCH_CPU] Input matrix or batch_input_activations is empty.");
998 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
1001 if (batch_input_activations.size() != (
size_t)num_tokens * input_dim) {
1002 Logger::error(
"[MATMUL_Q4K_BATCH_CPU] batch_input_activations size mismatch. Expected " +
1003 std::to_string((
size_t)num_tokens * input_dim) +
", got " +
1004 std::to_string(batch_input_activations.size()));
1005 batch_output_activations.assign((
size_t)num_tokens * output_dim, 0.0f);
1009 batch_output_activations.resize((
size_t)num_tokens * output_dim);
1011#pragma omp parallel for
1012 for (
int token_idx = 0; token_idx < num_tokens; ++token_idx) {
1013 std::vector<float> current_token_input(input_dim);
1014 const float* input_slice_start = batch_input_activations.data() + (size_t)token_idx * input_dim;
1015 std::copy(input_slice_start, input_slice_start + input_dim, current_token_input.begin());
1017 std::vector<float> current_token_output(output_dim);
1020 float* output_slice_start = batch_output_activations.data() + (size_t)token_idx * output_dim;
1021 std::copy(current_token_output.begin(), current_token_output.end(), output_slice_start);
1026 const std::vector<float>& vec_f32,
1027 std::vector<float>& out_f32,
int rows,
int cols) {
1028 if (mat_bf16.size() != (
size_t)rows * cols ||
1029 vec_f32.size() != (
size_t)cols) {
1030 Logger::error(
"matvec_bf16_f32_vector_cpu: Size mismatch. Mat: " +
1031 std::to_string(mat_bf16.size()) +
" (Expected " +
1032 std::to_string(rows * cols) +
1033 "), Vec: " + std::to_string(vec_f32.size()) +
" (Expected " +
1034 std::to_string(cols) +
")");
1035 out_f32.assign(rows, 0.0f);
1038 out_f32.resize(rows);
1040#pragma omp parallel for
1041 for (int64_t r = 0; r < static_cast<int64_t>(rows); ++r) {
1044 size_t row_offset = r * cols;
1046 for (
int c_idx = 0; c_idx < cols; ++c_idx) {
1049 static_cast<double>(weight) *
static_cast<double>(vec_f32[c_idx]);
1051 double y = term - c;
1056 out_f32[r] =
static_cast<float>(sum);
1061 const std::vector<float>& V,
1062 std::vector<float>& out,
int seq_len,
int head_dim) {
1063 if (probs.size() != seq_len || V.size() != (
size_t)seq_len * head_dim) {
1064 Logger::error(
"weighted_sum_probs_v: Size mismatch. Probs: " +
1065 std::to_string(probs.size()) +
" (Expected " +
1066 std::to_string(seq_len) +
1067 "), V: " + std::to_string(V.size()) +
" (Expected " +
1068 std::to_string(seq_len * head_dim) +
")");
1069 out.assign(head_dim, 0.0f);
1072 out.resize(head_dim);
1074#pragma omp parallel for
1075 for (int64_t j = 0; j < static_cast<int64_t>(head_dim); ++j) {
1077 double c_kahan = 0.0;
1078 for (
int i = 0; i < seq_len; ++i) {
1079 double term =
static_cast<double>(probs[i]) *
1080 static_cast<double>(V[i * head_dim + j]);
1082 double y = term - c_kahan;
1084 c_kahan = (t - sum) - y;
1087 out[j] =
static_cast<float>(sum);
1092 const std::vector<float>& K,
1093 std::vector<float>& scores,
int seq_len,
1094 int head_dim,
float scale) {
1095 if (Q.empty() || K.empty())
return;
1096 scores.resize(seq_len);
1101#pragma omp parallel for collapse(1)
1102 for (int64_t i = 0; i < static_cast<int64_t>(seq_len); ++i) {
1103 double dot_product = 0.0;
1104 double c_kahan = 0.0;
1105 size_t k_offset =
static_cast<size_t>(i) * head_dim;
1107 for (
int j = 0; j < head_dim; ++j) {
1108 double term =
static_cast<double>(Q[j]) *
static_cast<double>(K[k_offset + j]);
1109 double y = term - c_kahan;
1110 double t_sum = dot_product + y;
1111 c_kahan = (t_sum - dot_product) - y;
1112 dot_product = t_sum;
1115 scores[i] =
static_cast<float>(dot_product * effective_scale);
1124 float minv = *std::min_element(v.begin(), v.end());
1125 float maxv = *std::max_element(v.begin(), v.end());
1126 float mean = std::accumulate(v.begin(), v.end(), 0.0f) / v.size();
1128 std::all_of(v.begin(), v.end(), [](
float x) { return std::isfinite(x); });
1129 Logger::info(name +
": min=" + std::to_string(minv) +
", max=" +
1130 std::to_string(maxv) +
", mean=" + std::to_string(mean) +
1131 ", all_finite=" + (all_finite ?
"yes" :
"no"));
// Body of a routine that dumps `vec` to `filename` as raw bytes
// (vec.size() * sizeof(float), no header), logging progress through Logger.
// NOTE(review): the function header, the failure conditions and the return
// statements are missing from this listing; presumably this is
// write_vector_to_file -- confirm against the full source.
// Space-separated preview of at most the first 10 elements.
// NOTE(review): no use of vec_writer_vals is visible in the lines shown here.
1135 std::string vec_writer_vals;
// (std::min) is parenthesised so a function-like `min` macro cannot expand here.
1136 int N_log_writer = (std::min)(10, (
int)vec.size());
1137 for (
int i = 0; i < N_log_writer; ++i)
1138 vec_writer_vals += (i ?
" " :
"") + std::to_string(vec[i]);
// Logs the buffer address as a decimal integer.
1139 Logger::info(
"write_vector_to_file Enter: Address of vec.data() on entry: " +
1140 std::to_string(
reinterpret_cast<uintptr_t
>(vec.data())));
// Binary mode: bytes are written without text-mode newline translation.
1142 std::ofstream outfile(filename, std::ios::binary);
// NOTE(review): the condition guarding this error is not shown; presumably `!outfile`.
1144 Logger::error(
"Failed to open file for writing: " + filename);
// Writes the floats exactly as they sit in memory (no byte-order conversion).
1147 outfile.write(
reinterpret_cast<const char*
>(vec.data()),
1148 vec.size() *
sizeof(
float));
1153 Logger::info(
"Successfully wrote vector to " + filename);
/**
 * @brief Loads a row-major float matrix from a raw binary file.
 *
 * The file is read as num_tokens * hidden_size consecutive floats in the
 * machine's native in-memory layout (no header, no byte-order conversion).
 * Trailing bytes beyond that amount are ignored.
 *
 * @param filename    Path of the binary file to read.
 * @param num_tokens  Number of rows to read; must be non-negative.
 * @param hidden_size Number of floats per row; must be non-negative.
 * @return num_tokens rows of hidden_size floats each.
 * @throws std::invalid_argument if either dimension is negative.
 * @throws std::runtime_error if the file cannot be opened or holds fewer
 *         than num_tokens * hidden_size floats.
 */
std::vector<std::vector<float>> load_rmsnorm_bin(const std::string& filename,
                                                 int num_tokens,
                                                 int hidden_size) {
  // Reject negative sizes up front: converted to an unsigned size they would
  // wrap around and request an enormous allocation.
  if (num_tokens < 0 || hidden_size < 0) {
    throw std::invalid_argument(
        "load_rmsnorm_bin: negative dimensions requested for " + filename);
  }

  std::ifstream infile(filename, std::ios::binary);
  if (!infile) throw std::runtime_error("Failed to open " + filename);

  // Sizes are handled as std::size_t so no product is ever evaluated in int.
  const std::size_t row_count = static_cast<std::size_t>(num_tokens);
  const std::size_t row_width = static_cast<std::size_t>(hidden_size);

  std::vector<std::vector<float>> result(row_count,
                                         std::vector<float>(row_width));
  if (row_width == 0) return result;  // nothing to read for empty rows

  // Read straight into each row; this avoids a second full-size staging
  // buffer and the element-by-element copy out of it.
  const std::streamsize row_bytes =
      static_cast<std::streamsize>(row_width * sizeof(float));
  for (std::vector<float>& row : result) {
    infile.read(reinterpret_cast<char*>(row.data()), row_bytes);
    if (!infile) {
      throw std::runtime_error("Failed to read all data from " + filename);
    }
  }
  return result;
}
// Body of a helper that renders `count` floats starting at `ptr` as
// "<name>: [a, b, c" into a string stream, each value in fixed notation with
// six digits after the decimal point.
// NOTE(review): the function header, the closing "]" and the code that emits
// `ss` are missing from this listing; presumably this is
// log_raw_float_pointer -- confirm against the full source.
1180 std::stringstream ss;
1181 ss << name <<
": [";
1182 for (
size_t i = 0; i < count; ++i) {
// Separator goes before every element except the first.
1183 if (i > 0) ss <<
", ";
1184 ss << std::fixed << std::setprecision(6) << ptr[i];
// Parameter tail and body of a detailed vector logger. It builds one line of
// the form "[POS=p LAYER=l] name: size=S, first N: [v0 v1 ...", followed by
// ", min=.., max=.., mean=.., finite=yes|NO".
// NOTE(review): the header line, several guards and the code that emits `ss`
// are missing from this listing; presumably this is
// log_vector_summary_detailed -- confirm against the full source.
1191 const std::vector<float>& v,
1192 int current_pos,
int current_layer,
int N) {
1198 std::stringstream ss;
1199 ss <<
"[POS=" << current_pos <<
" LAYER=" << current_layer <<
"] " << name;
1200 ss <<
": size=" << v.size();
// Clamp the preview length to the vector size. A negative N becomes a huge
// size_t through the cast, so the result is v.size() in that case.
1202 size_t actual_N =
SAFE_MIN(
static_cast<size_t>(N), v.size());
1204 ss <<
", first " << actual_N <<
": [";
1205 for (
size_t i = 0; i < actual_N; ++i) {
// std::fixed / std::setprecision stay set on the stream, so once applied
// here they also govern the numbers streamed after the loop.
1206 ss << (i > 0 ?
" " :
"") << std::fixed << std::setprecision(6) << v[i];
// NOTE(review): dereferencing min_element/max_element on an empty range is
// undefined; any guard for an empty `v` would be on lines not shown here.
1211 float minv = *std::min_element(v.begin(), v.end());
1212 float maxv = *std::max_element(v.begin(), v.end());
// The 0.0 (double) seed makes std::accumulate sum in double precision.
1213 double sum = std::accumulate(v.begin(), v.end(), 0.0);
1214 float mean = sum / v.size();
1215 bool all_finite = std::all_of(v.begin(), v.end(), [](
float x) { return std::isfinite(x); });
1217 ss <<
", min=" << minv <<
", max=" << maxv <<
", mean=" << mean
1218 <<
", finite=" << (all_finite ?
"yes" :
"NO");
static void debug(const std::string &message)
static void warning(const std::string &message)
static void info(const std::string &message)
static void error(const std::string &message)
constexpr size_t GGML_QK8_0
constexpr size_t GGML_QK_K
Block size constants for different quantization formats.
Logging utilities for the TinyLlama implementation.
Constants used throughout the TinyLlama model implementation.
constexpr float MIN_SCALE
constexpr float ATTENTION_SCALE_BASE
constexpr float MAX_SCALE
constexpr uint16_t SIGN_BIT
constexpr uint16_t NEG_ZERO
constexpr uint16_t EXPONENT_MASK
constexpr uint16_t MANTISSA_MASK
constexpr float MIN_NORM_EPS
void dequantize_q4_k_m(const block_q4_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
void dequantize_q8_0_block(const block_q8_0 *qblock, float *output)
Dequantizes a Q8_0 block to float32.
void dequantize_q8_k(const std::vector< block_q8_K > &q_data, std::vector< float > &x, int n, bool log_this_block)
void dequantize_q6_k(const block_q6_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
Weight quantization structures and functions for model compression.
4-bit K-quantized block structure
6-bit K-quantized block structure
Simple 8-bit quantized block structure.
void log_raw_float_pointer(const std::string &name, const float *ptr, size_t count)
void apply_rope_vector(std::vector< float > &x, int num_heads, int head_dim, int current_token_pos, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
float bfloat16_to_float32(uint16_t bf16)
void log_vector_summary(const std::string &name, const std::vector< float > &v, int head_count)
std::vector< uint16_t > uint8_vector_to_uint16_vector(const std::vector< uint8_t > &bytes, size_t numel)
std::vector< float > bfloat16_vector_to_float32(const std::vector< uint16_t > &bf16_vec)
void matvec_q8k_f32_vector_cpu(const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
void matvec_q6k_f32_vector_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
void log_vector_summary_with_tail(const std::string &name, const std::vector< float > &v, int head_count, int tail_count)
void matvec_bf16_f32_vector_cpu(const std::vector< uint16_t > &mat_bf16, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
void matvec_f32_f32_vector_cpu(const std::vector< float > &mat_f32, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
void simd_scaled_add(float *dst, const float *src, float scale, int n)
void log_vector_summary_detailed(const std::string &name, const std::vector< float > &v, int current_pos, int current_layer, int N)
void matmul_q4k_f32_batch_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
std::vector< float > bf16vec_to_float_vec(const std::vector< uint16_t > &v_bf16)
bool write_vector_to_file(const std::string &filename, const std::vector< float > &vec)
void softmax_vector_cpu(const std::vector< float > &x, std::vector< float > &out)
int argmax(const std::vector< float > &v)
float simd_dot_product(const float *a, const float *b, int n)
void matvec_q4k_f32_vector_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
void matmul_q8_0_f32_batch_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
void apply_rope_batch_cpu(std::vector< float > &q_batch, std::vector< float > &k_batch, int num_tokens, int num_q_heads, int num_kv_heads, int head_dim, int start_pos_in_sequence, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
void calculate_attention_scores(const std::vector< float > &Q, const std::vector< float > &K, std::vector< float > &scores, int seq_len, int head_dim, float scale)
std::vector< std::vector< float > > load_rmsnorm_bin(const std::string &filename, int num_tokens, int hidden_size)
void weighted_sum_probs_v(const std::vector< float > &probs, const std::vector< float > &V, std::vector< float > &out, int seq_len, int head_dim)
void matmul_f32_f32_batch_cpu(const std::vector< float > &mat_weights, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
void silu_cpu(const std::vector< float > &x, std::vector< float > &out)
void matmul_q8k_f32_batch_cpu(const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
void matmul_q6k_f32_batch_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
void rmsnorm_batch_cpu(const std::vector< float > &x_batch, const std::vector< float > &weight, std::vector< float > &out_batch, int num_tokens, int hidden_size, float eps)
void log_vec_stats(const std::string &name, const std::vector< float > &v)
void rmsnorm_vector_cpu(const std::vector< float > &x, const std::vector< float > &weight, std::vector< float > &out, float eps)
void matvec_q8_0_f32_vector_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
uint16_t float32_to_bfloat16(float val)