TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
Loading...
Searching...
No Matches
Public Member Functions | Private Attributes | List of all members
CPUBatchProcessor Class Reference

#include <cpu_batch_processor.h>

Collaboration diagram for CPUBatchProcessor:
Collaboration graph

Public Member Functions

 CPUBatchProcessor (TinyLlamaModel *model)
 
std::vector< float > forward_cpu_batch (const std::vector< float > &batch_input_activations, int num_tokens_in_batch, int num_cpu_layers_to_process, int start_pos_in_sequence, KVCache *kv_cache, const std::vector< int > &prompt_lengths)
 

Private Attributes

TinyLlamaModel * model_
 

Detailed Description

Definition at line 7 of file cpu_batch_processor.h.

Constructor & Destructor Documentation

◆ CPUBatchProcessor()

CPUBatchProcessor::CPUBatchProcessor ( TinyLlamaModel * model)
explicit

Definition at line 9 of file cpu_batch_processor.cpp.

9: model_(model) {}
TinyLlamaModel * model_

Member Function Documentation

◆ forward_cpu_batch()

std::vector< float > CPUBatchProcessor::forward_cpu_batch ( const std::vector< float > &  batch_input_activations,
int  num_tokens_in_batch,
int  num_cpu_layers_to_process,
int  start_pos_in_sequence,
KVCache * kv_cache,
const std::vector< int > &  prompt_lengths 
)

Definition at line 11 of file cpu_batch_processor.cpp.

// NOTE(review): This is the Doxygen-rendered listing of
// CPUBatchProcessor::forward_cpu_batch from cpu_batch_processor.cpp; the
// numeric prefixes are the original source line numbers emitted by Doxygen.
// The listing omits source lines 27 and 66-72 — presumably the declaration of
// `is` (intermediate_size, used from line 222 onward) and the per-layer
// ensure_*_dequantized(l) calls named in the References section. Confirm
// against the actual .cpp before relying on this listing.
//
// Purpose: run `num_cpu_layers_to_process` decoder layers on the CPU over a
// flat batch of token activations (row-major, one hidden_size row per token),
// updating the KV cache as it goes. Returns the post-layer activations, or an
// empty vector on any error (every error path logs via Logger::error).
17 {
18
// Reject a buffer whose size disagrees with num_tokens * hidden_size.
19 if (batch_input_activations.size() != (size_t)num_tokens_in_batch * model_->config_.hidden_size) {
20 Logger::error("[CPU_BATCH_FWD] Input size mismatch. Expected: " +
21 std::to_string((size_t)num_tokens_in_batch * model_->config_.hidden_size) + " Got: " +
22 std::to_string(batch_input_activations.size()));
23 return {};
24 }
25
// Unpack model configuration. (Source line 27 is not shown in this listing;
// it presumably declares `is` = intermediate_size — TODO confirm.)
26 int hs = model_->config_.hidden_size;
28 int n_heads = model_->config_.num_attention_heads;
29 int n_kv_heads = model_->config_.num_key_value_heads;
// Guard the division below against a zero head count.
30 if (n_heads == 0) {
31 Logger::error("[CPU_BATCH_FWD] Error: num_attention_heads is zero.");
32 return {};
33 }
34 int head_dim = hs / n_heads;
35 float eps = model_->config_.rms_norm_eps;
36 int max_pos_embeddings = model_->config_.max_position_embeddings;
// GGUF-loaded models use the adjacent-pair RoPE layout (line 37).
37 bool use_rope_adjacent_pairing = model_->config_.is_gguf_file_loaded;
// Standard scaled dot-product attention factor: 1/sqrt(head_dim).
38 float attention_scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));
39
// Working copy of the activations; updated in place after each layer.
40 std::vector<float> current_batch_activations = batch_input_activations;
41
// Map each flat batch index to (sequence id, position within that sequence).
42 std::vector<int> sequence_indices(num_tokens_in_batch);
43 std::vector<int> position_in_sequence(num_tokens_in_batch);
44
// Multi-sequence batch: tokens are laid out sequence-by-sequence, with
// prompt_lengths[i] tokens for sequence i, each starting at position 0.
45 if (!prompt_lengths.empty()) {
46 int token_offset = 0;
47 for (size_t seq_idx = 0; seq_idx < prompt_lengths.size(); ++seq_idx) {
48 for (int pos = 0; pos < prompt_lengths[seq_idx]; ++pos) {
// Sum of prompt_lengths must not exceed the batch token count.
49 if (token_offset >= num_tokens_in_batch) {
50 Logger::error("[CPU_BATCH_FWD] Token offset exceeded num_tokens_in_batch");
51 return {};
52 }
53 sequence_indices[token_offset] = seq_idx;
54 position_in_sequence[token_offset] = pos;
55 token_offset++;
56 }
57 }
// Single-sequence batch: consecutive positions from start_pos_in_sequence.
58 } else {
59 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
60 sequence_indices[token_idx] = 0;
61 position_in_sequence[token_idx] = start_pos_in_sequence + token_idx;
62 }
63 }
64
// Per-layer loop. (Source lines 66-72 are omitted from this listing;
// presumably the ensure_q/k/v/o/gate/up/down_proj_dequantized(l) calls listed
// under "References" — verify in the .cpp.)
65 for (int l = 0; l < num_cpu_layers_to_process; ++l) {
73
74 const auto& lw = model_->layers[l];
75
// Pre-attention RMSNorm. Norm weights may be stored as bf16; convert to f32
// on the fly if the f32 copy is absent.
76 std::vector<float> batch_x_norm1(current_batch_activations.size());
77 const std::vector<float>& w_input_norm_vec =
78 lw.input_layernorm_f32.empty()
79 ? bf16vec_to_float_vec(lw.input_layernorm)
80 : lw.input_layernorm_f32;
81 rmsnorm_batch_cpu(current_batch_activations, w_input_norm_vec, batch_x_norm1, num_tokens_in_batch, hs, eps);
82
// Save residual for the post-attention skip connection.
83 std::vector<float> residual_batch_component_attn = current_batch_activations;
84
// Q has n_heads*head_dim (= hs) columns; K/V use the (possibly smaller)
// n_kv_heads*head_dim width for grouped-query attention.
85 std::vector<float> q_batch((size_t)num_tokens_in_batch * hs);
86 std::vector<float> k_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
87 std::vector<float> v_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
88
// Q projection: dispatch on whichever weight format the layer holds
// (f32, Q8_0, Q6_K, Q4_K), in that priority order.
89 if (!lw.q_proj_f32.empty()) {
90 matmul_f32_f32_batch_cpu(lw.q_proj_f32, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
91 } else if (!lw.q_proj_q8_0.empty()) {
92 matmul_q8_0_f32_batch_cpu(lw.q_proj_q8_0, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
93 } else if (!lw.q_proj_q6k.empty()) {
94 matmul_q6k_f32_batch_cpu(lw.q_proj_q6k, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
95 } else if (!lw.q_proj_q4k.empty()) {
96 matmul_q4k_f32_batch_cpu(lw.q_proj_q4k, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
97 } else {
98 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) + ": No Q proj weights found for CPU");
99 return {};
100 }
101
// K projection: same format dispatch as Q.
102 if (!lw.k_proj_f32.empty()) {
103 matmul_f32_f32_batch_cpu(lw.k_proj_f32, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
104 } else if (!lw.k_proj_q8_0.empty()) {
105 matmul_q8_0_f32_batch_cpu(lw.k_proj_q8_0, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
106 } else if (!lw.k_proj_q6k.empty()) {
107 matmul_q6k_f32_batch_cpu(lw.k_proj_q6k, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
108 } else if (!lw.k_proj_q4k.empty()) {
109 matmul_q4k_f32_batch_cpu(lw.k_proj_q4k, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
110 } else {
111 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) + ": No K proj weights found for CPU");
112 return {};
113 }
114
// V projection: same format dispatch as Q/K.
115 if (!lw.v_proj_f32.empty()) {
116 matmul_f32_f32_batch_cpu(lw.v_proj_f32, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
117 } else if (!lw.v_proj_q8_0.empty()) {
118 matmul_q8_0_f32_batch_cpu(lw.v_proj_q8_0, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
119 } else if (!lw.v_proj_q6k.empty()) {
120 matmul_q6k_f32_batch_cpu(lw.v_proj_q6k, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
121 } else if (!lw.v_proj_q4k.empty()) {
122 matmul_q4k_f32_batch_cpu(lw.v_proj_q4k, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
123 } else {
124 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) + ": No V proj weights found for CPU");
125 return {};
126 }
127
// RoPE. Multi-sequence path: each token's rotation uses its own per-sequence
// position, so Q/K are copied out per token, rotated, and copied back.
128 if (!prompt_lengths.empty()) {
129 for (int t = 0; t < num_tokens_in_batch; ++t) {
130 int current_token_pos = position_in_sequence[t];
131 int seq_idx = sequence_indices[t];
132
// Out-of-range positions are skipped (warned), not treated as fatal.
133 if (current_token_pos < 0 || current_token_pos >= max_pos_embeddings) {
134 Logger::warning("[CPU_BATCH_FWD] Token " + std::to_string(t) + " (seq=" + std::to_string(seq_idx) +
135 ", pos=" + std::to_string(current_token_pos) + ") is out of range. Skipping RoPE.");
136 continue;
137 }
138
139 std::vector<float> q_token(hs);
140 std::vector<float> k_token(n_kv_heads * head_dim);
141
142 std::copy(q_batch.begin() + (size_t)t * hs,
143 q_batch.begin() + (size_t)(t + 1) * hs,
144 q_token.begin());
145 std::copy(k_batch.begin() + (size_t)t * n_kv_heads * head_dim,
146 k_batch.begin() + (size_t)(t + 1) * n_kv_heads * head_dim,
147 k_token.begin());
148
149 apply_rope_vector(q_token, n_heads, head_dim, current_token_pos, model_->precomputed_freqs_cis_, max_pos_embeddings, use_rope_adjacent_pairing);
150 apply_rope_vector(k_token, n_kv_heads, head_dim, current_token_pos, model_->precomputed_freqs_cis_, max_pos_embeddings, use_rope_adjacent_pairing);
151
152 std::copy(q_token.begin(), q_token.end(), q_batch.begin() + (size_t)t * hs);
153 std::copy(k_token.begin(), k_token.end(), k_batch.begin() + (size_t)t * n_kv_heads * head_dim);
154 }
// Single-sequence path: positions are contiguous, so the batched RoPE
// routine handles the whole buffer at once.
155 } else {
156 apply_rope_batch_cpu(q_batch, k_batch, num_tokens_in_batch, n_heads, n_kv_heads, head_dim,
157 start_pos_in_sequence, model_->precomputed_freqs_cis_, max_pos_embeddings, use_rope_adjacent_pairing);
158 }
159
// Append the rotated K and the V rows for this layer into the KV cache,
// using the sequence-aware writer when the batch holds multiple sequences.
160 if (kv_cache) {
161 if (!prompt_lengths.empty()) {
162 update_kv_cache_batch_cpu_sequence_aware(kv_cache, l, k_batch, v_batch, num_tokens_in_batch,
163 sequence_indices, position_in_sequence, n_kv_heads, head_dim);
164 } else {
165 update_kv_cache_batch_cpu(kv_cache, l, k_batch, v_batch, num_tokens_in_batch,
166 start_pos_in_sequence, n_kv_heads, head_dim);
167 }
168 }
169
170 std::vector<float> batch_attn_output((size_t)num_tokens_in_batch * hs);
171
// Attention over the cached K/V for this layer (sequence-aware variant keeps
// each sequence's tokens from attending across sequence boundaries —
// presumably; confirm in attention_batch_cpu_sequence_aware).
172 if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
173 if (!prompt_lengths.empty()) {
174 attention_batch_cpu_sequence_aware(q_batch, kv_cache->layers[l], batch_attn_output,
175 num_tokens_in_batch, sequence_indices, position_in_sequence,
176 n_heads, n_kv_heads, head_dim, attention_scale,
177 kv_cache->max_seq_len_config_);
178 } else {
179 attention_batch_cpu(q_batch, kv_cache->layers[l], batch_attn_output,
180 num_tokens_in_batch, start_pos_in_sequence,
181 n_heads, n_kv_heads, head_dim, attention_scale);
182 }
// A missing cache layer or a null cache is logged and attention output is
// zero-filled rather than aborting the whole forward pass.
183 } else if (kv_cache) {
184 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) +
185 " is out of bounds for KV Cache access during attention. KVCache layers size: " +
186 std::to_string(kv_cache->layers.size()) +
187 ". Filling attention output with zeros.");
188 std::fill(batch_attn_output.begin(), batch_attn_output.end(), 0.0f);
189 } else {
190 Logger::error("[CPU_BATCH_FWD] KV Cache is null, cannot perform attention for layer " + std::to_string(l) +
191 ". Filling attention output with zeros.");
192 std::fill(batch_attn_output.begin(), batch_attn_output.end(), 0.0f);
193 }
194
// Output (O) projection: same weight-format dispatch as Q/K/V.
195 std::vector<float> batch_attn_proj_out((size_t)num_tokens_in_batch * hs);
196 if(!lw.o_proj_f32.empty()) {
197 matmul_f32_f32_batch_cpu(lw.o_proj_f32, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
198 } else if (!lw.o_proj_q8_0.empty()) {
199 matmul_q8_0_f32_batch_cpu(lw.o_proj_q8_0, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
200 } else if (!lw.o_proj_q6k.empty()) {
201 matmul_q6k_f32_batch_cpu(lw.o_proj_q6k, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
202 } else if (!lw.o_proj_q4k.empty()) {
203 matmul_q4k_f32_batch_cpu(lw.o_proj_q4k, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
204 } else {
205 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) + ": No O proj weights found for CPU");
206 return {};
207 }
208
// First residual connection: x = residual + attention_output.
209 for(size_t i=0; i < current_batch_activations.size(); ++i) {
210 current_batch_activations[i] = residual_batch_component_attn[i] + batch_attn_proj_out[i];
211 }
212
// MLP sub-block: save residual, then post-attention RMSNorm (bf16 fallback
// as above).
213 std::vector<float> residual_batch_component_mlp = current_batch_activations;
214 std::vector<float> batch_x_norm2(current_batch_activations.size());
215 const std::vector<float>& w_post_attn_norm_vec =
216 lw.post_attention_layernorm_f32.empty()
217 ? bf16vec_to_float_vec(lw.post_attention_layernorm)
218 : lw.post_attention_layernorm_f32;
219
220 rmsnorm_batch_cpu(current_batch_activations, w_post_attn_norm_vec, batch_x_norm2, num_tokens_in_batch, hs, eps);
221
// `is` is the MLP intermediate width (declared at the omitted source line 27
// — presumably model_->config_.intermediate_size; TODO confirm).
222 std::vector<float> batch_gate_proj_out((size_t)num_tokens_in_batch * is);
223 std::vector<float> batch_up_proj_out((size_t)num_tokens_in_batch * is);
224
// Gate projection (hs -> is), same weight-format dispatch.
225 if (!lw.gate_proj_f32.empty()) {
226 matmul_f32_f32_batch_cpu(lw.gate_proj_f32, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
227 } else if (!lw.gate_proj_q8_0.empty()) {
228 matmul_q8_0_f32_batch_cpu(lw.gate_proj_q8_0, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
229 } else if (!lw.gate_proj_q6k.empty()) {
230 matmul_q6k_f32_batch_cpu(lw.gate_proj_q6k, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
231 } else if (!lw.gate_proj_q4k.empty()) {
232 matmul_q4k_f32_batch_cpu(lw.gate_proj_q4k, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
233 } else {
234 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) + ": No gate_proj weights found for CPU");
235 return {};
236 }
237
// Up projection (hs -> is), same weight-format dispatch.
238 if (!lw.up_proj_f32.empty()) {
239 matmul_f32_f32_batch_cpu(lw.up_proj_f32, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
240 } else if (!lw.up_proj_q8_0.empty()) {
241 matmul_q8_0_f32_batch_cpu(lw.up_proj_q8_0, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
242 } else if (!lw.up_proj_q6k.empty()) {
243 matmul_q6k_f32_batch_cpu(lw.up_proj_q6k, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
244 } else if (!lw.up_proj_q4k.empty()) {
245 matmul_q4k_f32_batch_cpu(lw.up_proj_q4k, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
246 } else {
247 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) + ": No up_proj weights found for CPU");
248 return {};
249 }
250
// SwiGLU activation: silu(gate) * up, where silu(x) = x * sigmoid(x).
251 std::vector<float> batch_swiglu_out((size_t)num_tokens_in_batch * is);
252 for (size_t i = 0; i < batch_gate_proj_out.size(); ++i) {
253 float gate_val = batch_gate_proj_out[i];
254 float silu_gate_val = gate_val / (1.0f + std::exp(-gate_val));
255 batch_swiglu_out[i] = silu_gate_val * batch_up_proj_out[i];
256 }
257
// Down projection (is -> hs), same weight-format dispatch.
258 std::vector<float> batch_mlp_down_proj_out((size_t)num_tokens_in_batch * hs);
259 if (!lw.down_proj_f32.empty()) {
260 matmul_f32_f32_batch_cpu(lw.down_proj_f32, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
261 } else if (!lw.down_proj_q8_0.empty()) {
262 matmul_q8_0_f32_batch_cpu(lw.down_proj_q8_0, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
263 } else if (!lw.down_proj_q6k.empty()) {
264 matmul_q6k_f32_batch_cpu(lw.down_proj_q6k, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
265 } else if (!lw.down_proj_q4k.empty()) {
266 matmul_q4k_f32_batch_cpu(lw.down_proj_q4k, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
267 } else {
268 Logger::error("[CPU_BATCH_FWD] Layer " + std::to_string(l) + ": No down_proj weights found for CPU");
269 return {};
270 }
271
// Second residual connection: x = residual + mlp_output.
272 for(size_t i = 0; i < current_batch_activations.size(); ++i) {
273 current_batch_activations[i] = residual_batch_component_mlp[i] + batch_mlp_down_proj_out[i];
274 }
275 }
276
// Advance the cache's logical length past this batch.
// NOTE(review): in the multi-sequence (prompt_lengths) path this still adds
// the TOTAL batch token count to start_pos_in_sequence, even though each
// sequence was written at its own positions — verify this is intended for
// the sequence-aware cache layout.
277 if (kv_cache && num_tokens_in_batch > 0) {
278 kv_cache->seq_len = start_pos_in_sequence + num_tokens_in_batch;
279 }
280 return current_batch_activations;
281}
#define SAFE_SQRT(x)
static void warning(const std::string &message)
Definition logger.cpp:139
static void error(const std::string &message)
Definition logger.cpp:143
void ensure_up_proj_dequantized(int layer_idx)
void ensure_v_proj_dequantized(int layer_idx)
std::vector< std::pair< float, float > > precomputed_freqs_cis_
Definition model.h:554
std::vector< LayerWeights > layers
Definition model.h:491
ModelConfig config_
Definition model.h:480
void ensure_o_proj_dequantized(int layer_idx)
void ensure_k_proj_dequantized(int layer_idx)
void ensure_q_proj_dequantized(int layer_idx)
void ensure_down_proj_dequantized(int layer_idx)
void ensure_gate_proj_dequantized(int layer_idx)
void attention_batch_cpu_sequence_aware(const std::vector< float > &q_batch_roped, KVCacheLayer &current_layer_kv_cache, std::vector< float > &batch_attn_output, int num_tokens_in_batch, const std::vector< int > &sequence_indices, const std::vector< int > &position_in_sequence, int num_q_heads, int num_kv_heads, int head_dim, float attention_scale, int max_seq_len_per_sequence)
void attention_batch_cpu(const std::vector< float > &q_batch_roped, KVCacheLayer &current_layer_kv_cache, std::vector< float > &batch_attn_output, int num_tokens_in_batch, int start_pos_in_sequence, int num_q_heads, int num_kv_heads, int head_dim, float attention_scale)
void update_kv_cache_batch_cpu_sequence_aware(KVCache *kv_cache, int layer_idx, const std::vector< float > &k_batch_for_layer, const std::vector< float > &v_batch_for_layer, int num_tokens_in_batch, const std::vector< int > &sequence_indices, const std::vector< int > &position_in_sequence, int num_kv_heads, int head_dim)
void update_kv_cache_batch_cpu(KVCache *kv_cache, int layer_idx, const std::vector< float > &k_batch_for_layer, const std::vector< float > &v_batch_for_layer, int num_tokens_in_batch, int start_pos_in_sequence, int num_kv_heads, int head_dim)
int max_seq_len_config_
Definition model.h:163
std::vector< KVCacheLayer > layers
Definition model.h:152
int seq_len
Definition model.h:155
int hidden_size
Definition model.h:81
float rms_norm_eps
Definition model.h:88
int num_attention_heads
Definition model.h:83
int intermediate_size
Definition model.h:82
bool is_gguf_file_loaded
Definition model.h:101
int num_key_value_heads
Definition model.h:84
int max_position_embeddings
Definition model.h:87
void apply_rope_vector(std::vector< float > &x, int num_heads, int head_dim, int current_token_pos, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
Definition utils.cpp:428
void matmul_q4k_f32_batch_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:988
std::vector< float > bf16vec_to_float_vec(const std::vector< uint16_t > &v_bf16)
Definition utils.cpp:198
void matmul_q8_0_f32_batch_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:869
void apply_rope_batch_cpu(std::vector< float > &q_batch, std::vector< float > &k_batch, int num_tokens, int num_q_heads, int num_kv_heads, int head_dim, int start_pos_in_sequence, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
Definition utils.cpp:491
void matmul_f32_f32_batch_cpu(const std::vector< float > &mat_weights, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:709
void matmul_q6k_f32_batch_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:950
void rmsnorm_batch_cpu(const std::vector< float > &x_batch, const std::vector< float > &weight, std::vector< float > &out_batch, int num_tokens, int hidden_size, float eps)
Definition utils.cpp:613

References apply_rope_batch_cpu(), apply_rope_vector(), attention_batch_cpu(), attention_batch_cpu_sequence_aware(), bf16vec_to_float_vec(), TinyLlamaModel::config_, TinyLlamaModel::ensure_down_proj_dequantized(), TinyLlamaModel::ensure_gate_proj_dequantized(), TinyLlamaModel::ensure_k_proj_dequantized(), TinyLlamaModel::ensure_o_proj_dequantized(), TinyLlamaModel::ensure_q_proj_dequantized(), TinyLlamaModel::ensure_up_proj_dequantized(), TinyLlamaModel::ensure_v_proj_dequantized(), Logger::error(), ModelConfig::hidden_size, ModelConfig::intermediate_size, ModelConfig::is_gguf_file_loaded, KVCache::layers, TinyLlamaModel::layers, matmul_f32_f32_batch_cpu(), matmul_q4k_f32_batch_cpu(), matmul_q6k_f32_batch_cpu(), matmul_q8_0_f32_batch_cpu(), ModelConfig::max_position_embeddings, KVCache::max_seq_len_config_, model_, ModelConfig::num_attention_heads, ModelConfig::num_key_value_heads, TinyLlamaModel::precomputed_freqs_cis_, ModelConfig::rms_norm_eps, rmsnorm_batch_cpu(), SAFE_SQRT, KVCache::seq_len, update_kv_cache_batch_cpu(), update_kv_cache_batch_cpu_sequence_aware(), and Logger::warning().

Member Data Documentation

◆ model_

TinyLlamaModel* CPUBatchProcessor::model_
private

Definition at line 20 of file cpu_batch_processor.h.

Referenced by forward_cpu_batch().


The documentation for this class was generated from the following files: