TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
utils.h
#pragma once

#include <vector>
#include <string>
#include <cstdint>

#include "quantization.h"
#include "model_constants.h"

// SIMD optimized functions
float simd_dot_product(const float* a, const float* b, int n);
void simd_scaled_add(float* dst, const float* src, float scale, int n);
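
// Illustrative usage sketch (assumes simd_dot_product returns the sum of
// element-wise products and simd_scaled_add computes dst[i] += scale * src[i]):
//
//   std::vector<float> a(n, 1.0f), b(n, 2.0f);
//   float dot = simd_dot_product(a.data(), b.data(), n);  // -> 2.0f * n
//   simd_scaled_add(a.data(), b.data(), 0.5f, n);         // a[i] becomes 2.0f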

// BFloat16 conversion functions
uint16_t float32_to_bfloat16(float val);
float bfloat16_to_float32(uint16_t bf16);
std::vector<float> bfloat16_vector_to_float32(const std::vector<uint16_t>& bf16_vec);
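
// Background sketch: bfloat16 keeps the upper 16 bits of an IEEE-754 float32,
// so a plain truncating round trip looks like the following (the real
// conversion may additionally round to nearest even):
//
//   uint32_t bits;
//   std::memcpy(&bits, &val, sizeof(bits));
//   uint16_t bf16 = static_cast<uint16_t>(bits >> 16);     // float32 -> bfloat16
//   uint32_t widened = static_cast<uint32_t>(bf16) << 16;  // bfloat16 -> float32
//   float restored;
//   std::memcpy(&restored, &widened, sizeof(widened));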

// Vector utility functions
std::vector<uint16_t> uint8_vector_to_uint16_vector(const std::vector<uint8_t>& bytes, size_t numel);
int argmax(const std::vector<float>& v);
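
// Typical use: greedy decoding, where argmax returns the index of the largest
// logit as the next token id (illustrative only; run_forward_pass is a
// hypothetical helper):
//
//   std::vector<float> logits = run_forward_pass(prompt_tokens);
//   int next_token_id = argmax(logits);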

// Matrix-vector multiplication functions (CPU)
void matvec_q6k_f32_vector_cpu(const std::vector<block_q6_K>& mat_q6k,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32, int rows,
                               int cols, bool log_first_block = false);

void matvec_q4k_f32_vector_cpu(const std::vector<block_q4_K>& mat_q4k,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32, int rows,
                               int cols, bool log_first_block = false);

void matvec_q8_0_f32_vector_cpu(const std::vector<block_q8_0>& mat_q8_0,
                                const std::vector<float>& vec_f32,
                                std::vector<float>& out_f32, int rows,
                                int cols, bool log_first_block = false);

void matvec_q8k_f32_vector_cpu(const std::vector<block_q8_K>& mat_q8k,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32, int rows,
                               int cols, bool log_first_block = false);

void matvec_f32_f32_vector_cpu(const std::vector<float>& mat_f32,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32, int rows,
                               int cols);

// Batch matrix multiplication functions (CPU)
void matmul_q4k_f32_batch_cpu(const std::vector<block_q4_K>& mat_q4k,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);

void matmul_q6k_f32_batch_cpu(const std::vector<block_q6_K>& mat_q6k,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);

void matmul_q8_0_f32_batch_cpu(const std::vector<block_q8_0>& mat_q8_0,
                               const std::vector<float>& batch_input_activations,
                               std::vector<float>& batch_output_activations,
                               int num_tokens, int output_dim, int input_dim);

void matmul_q8k_f32_batch_cpu(const std::vector<block_q8_K>& mat_q8k,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);

// Neural network operations (CPU) - these are implemented as static functions in model.cpp

// RoPE (Rotary Position Embedding) functions
void apply_rope_vector(std::vector<float>& x, int num_heads, int head_dim,
                       int current_token_pos,
                       const std::vector<std::pair<float, float>>& all_freqs_cis,
                       int max_pos_embeddings, bool use_adjacent_pairing);

void apply_rope_batch_cpu(std::vector<float>& q_batch, std::vector<float>& k_batch,
                          int num_tokens, int num_q_heads, int num_kv_heads,
                          int head_dim, int start_pos_in_sequence,
                          const std::vector<std::pair<float, float>>& all_freqs_cis,
                          int max_pos_embeddings, bool use_adjacent_pairing);
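
// Core rotation sketch for one (cos, sin) pair taken from all_freqs_cis; with
// adjacent pairing, element 2*i of a head is rotated against element 2*i + 1
// (the exact indexing into all_freqs_cis is an assumption here):
//
//   const auto& cs = all_freqs_cis[current_token_pos * (head_dim / 2) + i];
//   float x0 = x[2 * i], x1 = x[2 * i + 1];
//   x[2 * i]     = x0 * cs.first  - x1 * cs.second;  // first = cos, second = sin
//   x[2 * i + 1] = x0 * cs.second + x1 * cs.first;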

// Neural network operations
void rmsnorm_batch_cpu(const std::vector<float>& x_batch,
                       const std::vector<float>& weight,
                       std::vector<float>& out_batch,
                       int num_tokens, int hidden_size,
                       float eps = numeric::DEFAULT_EPS);

void rmsnorm_vector_cpu(const std::vector<float>& x,
                        const std::vector<float>& weight,
                        std::vector<float>& out,
                        float eps = numeric::DEFAULT_EPS);
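
// RMSNorm recap: every element is divided by the root-mean-square of the
// vector (with eps added inside the square root for stability) and scaled by
// the learned weight:
//
//   float ss = 0.0f;
//   for (float xi : x) ss += xi * xi;
//   float inv_rms = 1.0f / std::sqrt(ss / x.size() + eps);
//   for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] * inv_rms * weight[i];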

void softmax_vector_cpu(const std::vector<float>& x, std::vector<float>& out);
void silu_cpu(const std::vector<float>& x, std::vector<float>& out);
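
// Element-wise definitions (softmax is typically computed with the maximum
// subtracted for numerical stability):
//
//   softmax: out[i] = std::exp(x[i] - max_x) / sum_j std::exp(x[j] - max_x)
//   silu:    out[i] = x[i] / (1.0f + std::exp(-x[i]))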

// Batch matrix multiplication
void matmul_f32_f32_batch_cpu(const std::vector<float>& mat_weights,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);

// BFloat16 matrix-vector operations
void matvec_bf16_f32_vector_cpu(const std::vector<uint16_t>& mat_bf16,
                                const std::vector<float>& vec_f32,
                                std::vector<float>& out_f32, int rows, int cols);

// Attention computation functions
void weighted_sum_probs_v(const std::vector<float>& probs,
                          const std::vector<float>& V,
                          std::vector<float>& out, int seq_len, int head_dim);

void calculate_attention_scores(const std::vector<float>& Q,
                                const std::vector<float>& K,
                                std::vector<float>& scores, int seq_len,
                                int head_dim, float scale);
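
// Single-head sketch tying these together, assuming Q holds one query of
// length head_dim, K and V hold seq_len rows of head_dim each, the output
// vectors are resized by the callees, and
// scale = 1.0f / std::sqrt(static_cast<float>(head_dim)):
//
//   std::vector<float> scores, probs, context;
//   calculate_attention_scores(Q, K, scores, seq_len, head_dim, scale);
//   softmax_vector_cpu(scores, probs);
//   weighted_sum_probs_v(probs, V, context, seq_len, head_dim);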

// Logging and debugging functions
void log_vector_summary(const std::string& name, const std::vector<float>& v, int head_count);
void log_vector_summary_with_tail(const std::string& name, const std::vector<float>& v,
                                  int head_count, int tail_count);
void log_vector_summary_detailed(const std::string& name, const std::vector<float>& v,
                                 int current_pos, int current_layer, int N = 5);
void log_vec_stats(const std::string& name, const std::vector<float>& v);
void log_raw_float_pointer(const std::string& name, const float* ptr, size_t count = 5);

// File I/O utility functions
bool write_vector_to_file(const std::string& filename, const std::vector<float>& vec);
std::vector<std::vector<float>> load_rmsnorm_bin(const std::string& filename,
                                                 int num_tokens, int hidden_size);

// Helper conversion functions
std::vector<float> bf16vec_to_float_vec(const std::vector<uint16_t>& v_bf16);

// Quantization utility
void dequantize_q8_k(const std::vector<block_q8_K>& q8k_vec,
                     std::vector<float>& out_f32, int n, bool log_this_block);