int argmax(const std::vector<float>& v);
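For example, greedy decoding reduces to a single call (a minimal sketch; the wrapper function is hypothetical):

// Sketch: greedy decoding picks the argmax of the final logits,
// one score per vocabulary token.
int pick_next_token(const std::vector<float>& logits) {
    return argmax(logits); // index of the largest element
}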
// Quantized matrix-vector products.
void matvec_q4k_f32_vector_cpu(const std::vector<block_q4_K>& mat_q4k,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32,
                               int rows, int cols,
                               bool log_first_block = false);
void matvec_q8k_f32_vector_cpu(const std::vector<block_q8_K>& mat_q8k,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32,
                               int rows, int cols,
                               bool log_first_block = false);
void matvec_q6k_f32_vector_cpu(const std::vector<block_q6_K>& mat_q6k,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32,
                               int rows, int cols,
                               bool log_first_block = false);
void matvec_q8_0_f32_vector_cpu(const std::vector<block_q8_0>& mat_q8_0,
                                const std::vector<float>& vec_f32,
                                std::vector<float>& out_f32,
                                int rows, int cols,
                                bool log_first_block = false);
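A usage sketch for these routines, assuming the common convention that the weight matrix is stored row-major with rows == output_dim and cols == input_dim, so out_f32 receives one dot product per weight row (inferred from the parameter names; the wrapper is hypothetical):

// Sketch: y = W * x with a Q4_K-quantized weight matrix of shape rows x cols.
void project_vector(const std::vector<block_q4_K>& W_q4k,
                    const std::vector<float>& x, // size cols (input_dim)
                    int rows, int cols,
                    std::vector<float>& y) {
    y.assign(rows, 0.0f); // one dot product per weight row
    matvec_q4k_f32_vector_cpu(W_q4k, x, y, rows, cols, /*log_first_block=*/false);
}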
// bfloat16 matrix-vector product.
void matvec_bf16_f32_vector_cpu(const std::vector<uint16_t>& mat_bf16,
                                const std::vector<float>& vec_f32,
                                std::vector<float>& out_f32,
                                int rows, int cols);
// Batched matrix multiplies over num_tokens rows of activations.
void matmul_q4k_f32_batch_cpu(const std::vector<block_q4_K>& mat_q4k,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);
void matmul_q8k_f32_batch_cpu(const std::vector<block_q8_K>& mat_q8k,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);
void matmul_q6k_f32_batch_cpu(const std::vector<block_q6_K>& mat_q6k,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);
void matmul_q8_0_f32_batch_cpu(const std::vector<block_q8_0>& mat_q8_0,
                               const std::vector<float>& batch_input_activations,
                               std::vector<float>& batch_output_activations,
                               int num_tokens, int output_dim, int input_dim);
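The batched variants appear to take num_tokens activation rows packed contiguously; a sketch under that assumption (the wrapper is hypothetical):

// Sketch: prefill-style projection of num_tokens packed activation rows.
void project_batch(const std::vector<block_q4_K>& W_q4k,
                   const std::vector<float>& in, // num_tokens * input_dim, token-major
                   int num_tokens, int output_dim, int input_dim,
                   std::vector<float>& out) {
    out.assign(static_cast<size_t>(num_tokens) * output_dim, 0.0f); // one row per token
    matmul_q4k_f32_batch_cpu(W_q4k, in, out, num_tokens, output_dim, input_dim);
}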
// Rotary position embeddings (RoPE).
void apply_rope_vector(std::vector<float>& x, int num_heads, int head_dim,
                       int current_token_pos,
                       const std::vector<std::pair<float, float>>& all_freqs_cis,
                       int max_pos_embeddings, bool use_adjacent_pairing);
void apply_rope_batch_cpu(std::vector<float>& q_batch, std::vector<float>& k_batch,
                          int num_tokens, int num_q_heads, int num_kv_heads,
                          int head_dim, int start_pos_in_sequence,
                          const std::vector<std::pair<float, float>>& all_freqs_cis,
                          int max_pos_embeddings, bool use_adjacent_pairing);
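A sketch of applying RoPE to a prefill chunk, assuming all_freqs_cis holds precomputed (cos, sin) pairs indexed by position and dimension pair, and that use_adjacent_pairing selects whether adjacent elements (2i, 2i+1) or split halves of each head are rotated together (both readings are inferred from the parameter names):

// Sketch: rotate Q and K in place for num_tokens tokens starting at start_pos.
void rope_prefill(std::vector<float>& q_batch, std::vector<float>& k_batch,
                  int num_tokens, int num_q_heads, int num_kv_heads, int head_dim,
                  int start_pos,
                  const std::vector<std::pair<float, float>>& all_freqs_cis,
                  int max_pos_embeddings) {
    apply_rope_batch_cpu(q_batch, k_batch, num_tokens, num_q_heads, num_kv_heads,
                         head_dim, start_pos, all_freqs_cis, max_pos_embeddings,
                         /*use_adjacent_pairing=*/false);
}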
// RMS normalization.
void rmsnorm_batch_cpu(const std::vector<float>& x_batch,
                       const std::vector<float>& weight,
                       std::vector<float>& out_batch,
                       int num_tokens, int hidden_size,
                       float eps = numeric::DEFAULT_EPS);
void rmsnorm_vector_cpu(const std::vector<float>& x,
                        const std::vector<float>& weight,
                        std::vector<float>& out,
                        float eps = numeric::DEFAULT_EPS);
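A usage sketch relying on the default epsilon; the comment states the standard RMSNorm formula, which this implementation is assumed to follow:

// Sketch: normalize one hidden-state vector with the learned scale "weight".
// Standard RMSNorm (assumed here): out[i] = x[i] / sqrt(mean(x*x) + eps) * weight[i].
void normalize_hidden(const std::vector<float>& x, const std::vector<float>& weight,
                      std::vector<float>& out) {
    out.resize(x.size());
    rmsnorm_vector_cpu(x, weight, out); // eps defaults to numeric::DEFAULT_EPS
}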
// Activations.
void silu_cpu(const std::vector<float>& x, std::vector<float>& out);
void softmax_vector_cpu(const std::vector<float>& x, std::vector<float>& out);
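silu_cpu computes x * sigmoid(x); in Llama-family models it typically gates the MLP up-projection. A sketch of that combination (an assumption about how callers use these primitives, not something this header states):

// Sketch: SwiGLU-style gating of the MLP up-projection.
void swiglu(const std::vector<float>& gate_proj, // W_gate * h
            const std::vector<float>& up_proj,   // W_up * h
            std::vector<float>& act) {
    silu_cpu(gate_proj, act);                    // act[i] = g[i] * sigmoid(g[i])
    for (std::size_t i = 0; i < act.size(); ++i)
        act[i] *= up_proj[i];                    // elementwise gate
}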
void matmul_f32_f32_batch_cpu(const std::vector<float>& mat_weights,
                              const std::vector<float>& batch_input_activations,
                              std::vector<float>& batch_output_activations,
                              int num_tokens, int output_dim, int input_dim);
void matvec_f32_f32_vector_cpu(const std::vector<float>& mat_f32,
                               const std::vector<float>& vec_f32,
                               std::vector<float>& out_f32,
                               int rows, int cols);
// Attention primitives.
void weighted_sum_probs_v(const std::vector<float>& probs,
                          const std::vector<float>& V,
                          std::vector<float>& out,
                          int seq_len, int head_dim);
void calculate_attention_scores(const std::vector<float>& Q,
                                const std::vector<float>& K,
                                std::vector<float>& scores,
                                int seq_len, int head_dim,
                                float scale);
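The attention primitives chain naturally; a single-head sketch assuming Q holds one head_dim query, K and V each hold seq_len * head_dim cached floats, and the conventional scale 1/sqrt(head_dim) (the header leaves scale to the caller):

#include <cmath>

// Sketch: single-head attention for one query over seq_len cached keys/values.
void attend_one_head(const std::vector<float>& Q, const std::vector<float>& K,
                     const std::vector<float>& V, int seq_len, int head_dim,
                     std::vector<float>& ctx) {
    std::vector<float> scores(seq_len), probs(seq_len);
    const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
    calculate_attention_scores(Q, K, scores, seq_len, head_dim, scale);
    softmax_vector_cpu(scores, probs);             // scores -> attention weights
    ctx.assign(head_dim, 0.0f);
    weighted_sum_probs_v(probs, V, ctx, seq_len, head_dim); // ctx = sum_i probs[i] * V_i
}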
// Logging and debugging helpers.
void log_vector_summary(const std::string& name, const std::vector<float>& v,
                        int head_count);
void log_vector_summary_with_tail(const std::string& name, const std::vector<float>& v,
                                  int head_count, int tail_count);
void log_vector_summary_detailed(const std::string& name, const std::vector<float>& v,
                                 int current_pos, int current_layer, int N = 5);
void log_vec_stats(const std::string& name, const std::vector<float>& v);
void log_raw_float_pointer(const std::string& name, const float* ptr, size_t count = 5);
// Debug I/O.
std::vector<std::vector<float>> load_rmsnorm_bin(const std::string& filename,
                                                 int num_tokens, int hidden_size);
bool write_vector_to_file(const std::string& filename, const std::vector<float>& vec);
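A debugging sketch combining the dump and logging helpers (the return-value handling assumes write_vector_to_file reports failure with false, as its bool result suggests; the file name is illustrative):

// Sketch: dump an activation for offline comparison and log its statistics.
void dump_activation(const std::string& tag, const std::vector<float>& v) {
    log_vec_stats(tag, v);                          // summary statistics to the log
    if (!write_vector_to_file(tag + ".bin", v)) {
        log_vector_summary(tag + " (write failed)", v, /*head_count=*/5);
    }
}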
// Dequantization and numeric conversion utilities.
void dequantize_q8_k(const std::vector<block_q8_K>& q8k_vec,
                     std::vector<float>& out_f32,
                     int n, bool log_this_block);
float bfloat16_to_float32(uint16_t bf16);
uint16_t float32_to_bfloat16(float val);
std::vector<float> bfloat16_vector_to_float32(const std::vector<uint16_t>& bf16_vec);
std::vector<float> bf16vec_to_float_vec(const std::vector<uint16_t>& v_bf16);
std::vector<uint16_t> uint8_vector_to_uint16_vector(const std::vector<uint8_t>& bytes,
                                                    size_t numel);

// SIMD helpers.
float simd_dot_product(const float* a, const float* b, int n);
void simd_scaled_add(float* dst, const float* src, float scale, int n);
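A round-trip sketch of the bfloat16 helpers plus the SIMD primitives; the simd_scaled_add semantics (dst += scale * src) are inferred from its name:

#include <cstdint>

// Sketch: bfloat16 keeps float32's sign and exponent but only 7 mantissa bits,
// so a round trip loses low-order precision.
float bf16_round_trip(float x) {
    uint16_t h = float32_to_bfloat16(x); // e.g. pi -> 0x4049
    return bfloat16_to_float32(h);       // pi comes back as 3.140625f
}

float tiny_simd_demo() {
    float a[4] = {1.f, 2.f, 3.f, 4.f};
    float b[4] = {4.f, 3.f, 2.f, 1.f};
    simd_scaled_add(a, b, 0.5f, 4);   // presumably a[i] += 0.5f * b[i]
    return simd_dot_product(a, b, 4); // sum of a[i] * b[i]
}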
Related definitions: model-wide constants for the TinyLlama implementation (including constexpr float numeric::DEFAULT_EPS, the default epsilon used above), and the weight quantization structures and functions for model compression (the block_q4_K, block_q6_K, block_q8_K, and block_q8_0 types referenced above).