TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
model.cpp
1#include "model.h"
2
3#ifdef HAS_CUDA
4#include "cuda_kernels.h"
5#endif
6#include <algorithm>
7#include <cmath>
8#include <cstring>
9#include <fstream>
10#include <iomanip>
11#include <limits>
12#include <memory>
13#include <sstream>
14#include <stdexcept>
15#ifdef _WIN32
16#include <windows.h>
17#endif
18#include <cassert>
19#include <cstdint>
20#include <iostream>
21#include <numeric>
22#include <variant>
23
24#include "cpu_attention.h"
25#include "cpu_batch_processor.h"
26#include "gguf_parser.h"
27#include "gpu_initialization.h"
28#include "kv_cache.h"
29#include "logger.h"
30#include "model_config.h"
31#include "model_constants.h"
32#include "model_macros.h"
33#include "model_utils.h"
34#include "quantization.h"
35#include "safetensors_loader.h"
36#include "utils.h"
37#include "weight_management.h"
38void TinyLlamaModel::initialize_weights(const SafeTensorsLoader* loader,
39 const GGUFData* gguf) {
40 Logger::info("Initializing model weights...");
41 int hs = config_.hidden_size;
43 int nhl = config_.num_hidden_layers;
44 int vs = config_.vocab_size;
45 layers.resize(nhl);
46
47 if (gguf) {
48 Logger::info("Processing weights from GGUF data source...");
49
50 if (gguf && (gguf->mapped_tensor_data || !gguf->tensor_data.empty())) {
51 map_gguf_weights(*gguf, *this);
52 Logger::info("[INIT_WEIGHTS_GGUF] map_gguf_weights(*gguf, *this) CALLED (using function parameter).");
53 } else if (gguf_data_ && (gguf_data_->mapped_tensor_data || !gguf_data_->tensor_data.empty())) {
 54 map_gguf_weights(*gguf_data_, *this);
 55 Logger::info("[INIT_WEIGHTS_GGUF] map_gguf_weights(*gguf_data_, *this) CALLED (using member gguf_data_).");
56 } else {
57 Logger::error("[INIT_WEIGHTS_GGUF] map_gguf_weights failed - tensor data not available. No GGUF weights mapped.");
58 }
59
60 // LAZY DEQUANTIZATION: Only dequantize what's immediately needed
61 Logger::info("[INIT_WEIGHTS_GGUF] Using lazy dequantization to prevent OOM");
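// The trade-off behind the lazy strategy: only tensors every forward pass needs
// immediately (token embeddings, final norm, per-layer norm weights) are
// materialized as F32 here; projection weights and lm_head stay in their
// quantized form and are dequantized on first use via the ensure_*_dequantized()
// calls in the forward paths, which keeps peak memory bounded for large models.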
62
63 // Only dequantize embed_tokens and final_norm immediately (small and always needed)
64 if (this->embed_tokens_f32.empty()) {
65 size_t total_elements_embed = static_cast<size_t>(config_.vocab_size) * config_.hidden_size;
66 if (!this->embed_tokens_q6k.empty()) {
67 dequantize_vector_q6k_to_f32(this->embed_tokens_q6k, this->embed_tokens_f32, total_elements_embed, 1);
68 } else if (!this->embed_tokens_q4k.empty()) {
69 dequantize_vector_q4k_to_f32(this->embed_tokens_q4k, this->embed_tokens_f32, total_elements_embed, 1);
70 } else if (!this->embed_tokens_q8k.empty()) {
71 dequantize_q8_k(this->embed_tokens_q8k, this->embed_tokens_f32, total_elements_embed, true);
72 } else if (!this->embed_tokens_q8_0.empty()) {
73 dequantize_vector_q8_0_to_f32(this->embed_tokens_q8_0, this->embed_tokens_f32, total_elements_embed, 1);
74 } else if (!this->embed_tokens.empty()) {
 75 this->embed_tokens_f32 = bf16vec_to_float_vec(this->embed_tokens);
 76 }
77 if (!this->embed_tokens_f32.empty()) {
78 Logger::info("[INIT_WEIGHTS_GGUF_DEQUANT] embed_tokens_f32 populated. Size: " + std::to_string(this->embed_tokens_f32.size()));
79 }
80 }
81
82 if (this->final_norm_f32.empty()) {
83 if (!this->final_norm.empty()) {
 84 this->final_norm_f32 = bf16vec_to_float_vec(this->final_norm);
 85 if (!this->final_norm_f32.empty()) {
86 Logger::info("[INIT_WEIGHTS_GGUF_DEQUANT] Successfully converted final_norm (BF16) to final_norm_f32. Size: " + std::to_string(this->final_norm_f32.size()));
87 }
88 }
89 }
90
91 // DEFER lm_head dequantization until actually needed (it's huge)
92 Logger::info("[INIT_WEIGHTS_GGUF] Deferring lm_head dequantization until needed to save memory");
93
94 // DEFER all layer weight dequantization until the layer is actually used
95 Logger::info("[INIT_WEIGHTS_GGUF] Deferring all layer weight dequantization until layers are used");
96
97 // Only populate layer norms immediately (small and needed for validation)
98 for (int l = 0; l < nhl; ++l) {
99 auto& lw = layers[l];
100
101 if (lw.input_layernorm_f32.empty() && !lw.input_layernorm.empty()) {
102 lw.input_layernorm_f32 = bf16vec_to_float_vec(lw.input_layernorm);
103 if (!lw.input_layernorm_f32.empty()) Logger::info(" L" + std::to_string(l) + " input_layernorm_f32 populated from BF16. Size: " + std::to_string(lw.input_layernorm_f32.size()));
104 }
105 if (lw.post_attention_layernorm_f32.empty() && !lw.post_attention_layernorm.empty()) {
106 lw.post_attention_layernorm_f32 = bf16vec_to_float_vec(lw.post_attention_layernorm);
107 if (!lw.post_attention_layernorm_f32.empty()) Logger::info(" L" + std::to_string(l) + " post_attention_layernorm_f32 populated from BF16. Size: " + std::to_string(lw.post_attention_layernorm_f32.size()));
108 }
109 }
110
111 // Validation checks for layer norms
112 for (int l = 0; l < nhl; ++l) {
113 const auto& lw = layers[l];
114 if (lw.input_layernorm_f32.empty()) {
115 Logger::error("[INIT_WEIGHTS_GGUF_CHECK] Layer " + std::to_string(l) +
116 ": input_layernorm_f32 is EMPTY post-GGUF. This WILL cause GPU init errors if this layer is on GPU.");
117 }
118 if (lw.post_attention_layernorm_f32.empty()) {
119 Logger::error("[INIT_WEIGHTS_GGUF_CHECK] Layer " + std::to_string(l) +
120 ": post_attention_layernorm_f32 is EMPTY post-GGUF. This WILL cause GPU init errors if this layer is on GPU.");
121 }
122 }
123 Logger::info("[INIT_WEIGHTS_GGUF] Finished per-layer NORM F32 vector checks post-GGUF.");
124
125 } else {
126 Logger::fatal("TinyLlamaModel::initialize_weights called with neither GGUF nor SafeTensors loader. Cannot initialize weights.");
127 throw std::runtime_error("Model weights source (GGUF or SafeTensors) not provided to initialize_weights.");
128 }
129
130 Logger::info("Finished initializing model weights logic block.");
131
132 if (this->final_norm_f32.empty()) {
133 Logger::error("[INIT_WEIGHTS_FINAL_CHECK] final_norm_f32 is EMPTY. This WILL cause errors if final normalization is needed in F32.");
134 } else {
135 Logger::info("[INIT_WEIGHTS_FINAL_CHECK] final_norm_f32 is POPULATED. Size: " + std::to_string(this->final_norm_f32.size()));
136 }
137
138 if (this->embed_tokens_f32.empty()) {
139 Logger::error("[INIT_WEIGHTS_FINAL_CHECK] embed_tokens_f32 is EMPTY. This WILL cause errors if token embeddings are needed in F32.");
140 } else {
141 Logger::info("[INIT_WEIGHTS_FINAL_CHECK] embed_tokens_f32 is POPULATED. Size: " + std::to_string(this->embed_tokens_f32.size()));
142 }
143}
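// Call sites: the SafeTensors constructor passes (&loader, nullptr) while the
// GGUF constructors pass the GGUFData pointer; only the `if (gguf)` branch above
// populates weights here, so a call with a null `gguf` reaches the fatal-error
// branch.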
144TinyLlamaModel::TinyLlamaModel(const ModelConfig& config,
145 const SafeTensorsLoader& loader)
146 : config_(config) { // Copies the potentially faulty config first
147 config_.is_gguf_file_loaded = false; // Explicitly set to false for SafeTensors path
148 Logger::info("Constructing TinyLlamaModel from SafeTensorsLoader (is_gguf_file_loaded set to false).");
149 initialize_weights(&loader, nullptr);
151 Logger::info("TinyLlamaModel construction from SafeTensorsLoader complete.");
152}
153
154TinyLlamaModel::TinyLlamaModel(const ModelConfig& initial_config,
155 const std::string& model_path)
156 : model_path_(model_path)
157#ifdef HAS_CUDA
158 , cublas_handle_(nullptr), token_embedding_table_dev_(nullptr), lm_head_dev_(nullptr), final_norm_dev(nullptr), w_q_dev_(nullptr), w_k_dev_(nullptr), w_v_dev_(nullptr), w_o_dev_(nullptr), w_gate_dev_(nullptr), w_up_dev_(nullptr), w_down_dev_(nullptr), all_freqs_cis_dev(nullptr), x_dev_(nullptr), x_norm_dev_(nullptr), x_resid1_dev_(nullptr), x_resid2_dev_(nullptr), q_dev_(nullptr), k_dev_(nullptr), v_dev_(nullptr), attn_out_dev_(nullptr), attn_proj_dev_(nullptr), gate_vec_dev_(nullptr), up_vec_dev_(nullptr), swiglu_vec_dev_(nullptr), mlp_down_dev_(nullptr), logits_dev_(nullptr), token_embedding_table_f32_dev_(nullptr), lm_head_f32_dev_(nullptr), w_q_f32_dev_(nullptr), w_k_f32_dev_(nullptr), w_v_f32_dev_(nullptr), w_o_f32_dev_(nullptr), w_gate_f32_dev_(nullptr), w_up_f32_dev_(nullptr), w_down_f32_dev_(nullptr)
159#endif
160{
161 Logger::info("TinyLlamaModel constructor entered. Model path (from string): " + model_path);
162 int cli_gpu_layer_request = initial_config.num_cpu_offload_layers;
163 bool cli_mmap_preference = initial_config.use_mmap_for_gguf;
164 this->config_ = initial_config;
165 if (this->model_path_.empty() && !model_path.empty()) {
166 this->model_path_ = model_path;
167 }
168 std::unique_ptr<SafeTensorsLoader> loader = nullptr;
169 if (!this->model_path_.empty() && this->model_path_.size() > 5 &&
170 this->model_path_.substr(this->model_path_.size() - 5) == ".gguf") {
171 Logger::info("GGUF file detected by path in Model Constructor: " + this->model_path_);
172 try {
173 bool force_mmap_for_gguf_load = cli_mmap_preference;
174 Logger::info("TinyLlamaModel GGUF path: Using mmap setting " + std::string(force_mmap_for_gguf_load ? "true" : "false") +
175 " for gguf_meta/weight loading based on CLI mmap preference: " +
176 std::string(cli_mmap_preference ? "true" : "false"));
177
178 this->gguf_data_ = std::make_unique<GGUFData>(load_gguf_meta(this->model_path_, force_mmap_for_gguf_load));
179
180 ModelConfig config_from_gguf = parse_model_config_from_gguf(*(this->gguf_data_));
181 this->config_ = config_from_gguf;
182 this->config_.use_mmap_for_gguf = cli_mmap_preference;
183 this->config_.is_gguf_file_loaded = true;
184 if (cli_gpu_layer_request < 0) {
 185 this->config_.num_cpu_offload_layers = 0;
 186 Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI hint < 0 (all GPU). num_cpu_offload_layers set to 0.");
187 } else if (cli_gpu_layer_request == 0) {
 188 this->config_.num_cpu_offload_layers = this->config_.num_hidden_layers;
 189 Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI hint == 0 (all CPU). num_cpu_offload_layers set to num_hidden_layers (" + std::to_string(this->config_.num_cpu_offload_layers) + ").");
190 } else { // CLI hint > 0, meaning cli_gpu_layer_request is the number of desired GPU layers
191 if (this->config_.num_hidden_layers > 0) {
192 if (cli_gpu_layer_request >= this->config_.num_hidden_layers) {
193 this->config_.num_cpu_offload_layers = 0; // More GPU layers requested than available -> all on GPU
194 Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI GPU layer request ("+ std::to_string(cli_gpu_layer_request) +") >= total layers. num_cpu_offload_layers set to 0.");
195 } else {
196 this->config_.num_cpu_offload_layers = this->config_.num_hidden_layers - cli_gpu_layer_request;
197 Logger::info("TinyLlamaModel GGUF Ctor CALC: Partial GPU. CLI GPU req: " + std::to_string(cli_gpu_layer_request) + ". num_cpu_offload_layers set to " + std::to_string(this->config_.num_cpu_offload_layers));
198 }
199 } else { // num_hidden_layers is 0 or negative, something is wrong with GGUF. Default to all CPU.
 200 this->config_.num_cpu_offload_layers = 0;
 201 Logger::warning("TinyLlamaModel GGUF Ctor CALC: num_hidden_layers from GGUF is <= 0. Defaulting num_cpu_offload_layers to 0. CLI GPU req: " + std::to_string(cli_gpu_layer_request));
202 }
203 }
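// Worked example (illustrative numbers): with num_hidden_layers = 22 and a CLI
// request of 16 GPU layers, num_cpu_offload_layers becomes 22 - 16 = 6, so
// layers 0..5 take the CPU path in forward() and layers 6..21 the GPU path in
// forward_device().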
204 Logger::info("TinyLlamaModel GGUF Ctor: POST-CALC (within GGUF block) final num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
205 Logger::info("[CTOR_GGUF_DEBUG_L1860] After CLI hint logic: this->config_.num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers) +
206 ", this->config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
207 } catch (const std::exception& e) {
208 Logger::error("Failed to load or parse GGUF file: " + std::string(e.what()));
209 throw;
210 }
211 } else if (model_path.size() > 12 &&
212 model_path.substr(model_path.size() - 12) == ".safetensors") {
213 Logger::info("SafeTensors file detected: " + model_path);
214 ModelConfig config_from_json;
215 bool json_loaded_successfully = SafeTensorsLoader::load_model_config_from_json(model_path, config_from_json);
216
217 // For SafeTensors, start with JSON config, then layer CLI preferences.
218 if (json_loaded_successfully) {
219 Logger::info("Successfully loaded and parsed config.json for SafeTensors model.");
220 this->config_ = config_from_json; // Base is from JSON
221 } else {
222 Logger::warning("Failed to load config.json or it was not found for SafeTensors model. Proceeding with initial_config defaults and CLI overrides.");
223 }
224 this->config_.is_gguf_file_loaded = false;
225 this->config_.use_mmap_for_gguf = cli_mmap_preference; // This field is GGUF specific, but store CLI pref anyway.
226
227 if (cli_gpu_layer_request < 0) {
 228 this->config_.num_cpu_offload_layers = 0;
 229 } else if (cli_gpu_layer_request == 0) {
 230 this->config_.num_cpu_offload_layers = this->config_.num_hidden_layers;
 231 } else {
232 if (this->config_.num_hidden_layers > 0) {
233 if (cli_gpu_layer_request >= this->config_.num_hidden_layers) {
 234 this->config_.num_cpu_offload_layers = 0;
 235 } else {
236 this->config_.num_cpu_offload_layers = this->config_.num_hidden_layers - cli_gpu_layer_request;
237 }
238 } else {
239 this->config_.num_cpu_offload_layers = 0; // Fallback if num_hidden_layers not known
240 Logger::warning("SafeTensors path: num_hidden_layers is 0 from JSON/default. Defaulting num_cpu_offload_layers to 0 despite CLI GPU request: " + std::to_string(cli_gpu_layer_request));
241 }
242 }
243 Logger::info("SafeTensors path: Calculated num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
244
245 try {
246 loader = std::make_unique<SafeTensorsLoader>(model_path);
247 Logger::info("SafeTensorsLoader initialized for: " + model_path);
248 } catch (const std::exception& e) {
249 Logger::error("Failed to initialize SafeTensorsLoader: " + std::string(e.what()));
250 throw;
251 }
252 } else {
253 throw std::runtime_error(
254 "Unsupported model file type. Please use .gguf or .safetensors");
255 }
256
257 Logger::info("TinyLlamaModel constructor: After specific loader block. Current config_.num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers) +
258 ", config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
259 Logger::info("TinyLlamaModel constructor: Current config_.use_mmap_for_gguf = " + std::string(this->config_.use_mmap_for_gguf ? "true" : "false"));
260
261 if (this->config_.num_cpu_offload_layers < 0) { // Should not happen if logic above is correct for -1 CLI hint
 262 this->config_.num_cpu_offload_layers = 0;
 263 Logger::warning("Clamping num_cpu_offload_layers: was < 0, set to 0.");
264 }
265 if (this->config_.num_hidden_layers > 0 && this->config_.num_cpu_offload_layers > this->config_.num_hidden_layers) {
266 Logger::warning("Clamping num_cpu_offload_layers: Requested CPU offload layers (" + std::to_string(this->config_.num_cpu_offload_layers) +
267 ") exceeds total hidden layers (" + std::to_string(this->config_.num_hidden_layers) +
268 "). Clamping to " + std::to_string(this->config_.num_hidden_layers) + " (all CPU).");
 269 this->config_.num_cpu_offload_layers = this->config_.num_hidden_layers;
 270 }
271 Logger::info("TinyLlamaModel constructor: Final clamped num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
272 Logger::info("[CTOR_DEBUG_L1921] End of Model Ctor (before initialize_weights/rope call): this->config_.num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers) +
273 ", this->config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
274 Logger::info("Final ModelConfig (before initialize_weights/rope):");
275 Logger::info(" hidden_size: " + std::to_string(config_.hidden_size));
276 Logger::info(" intermediate_size: " + std::to_string(config_.intermediate_size));
277 Logger::info(" num_attention_heads: " + std::to_string(config_.num_attention_heads));
278 Logger::info(" num_key_value_heads: " + std::to_string(config_.num_key_value_heads));
279 Logger::info(" num_hidden_layers: " + std::to_string(config_.num_hidden_layers));
280 Logger::info(" vocab_size: " + std::to_string(config_.vocab_size));
281 Logger::info(" max_position_embeddings: " + std::to_string(config_.max_position_embeddings));
282 Logger::info(" architecture: " + config_.architecture);
283 Logger::info(" is_gguf_file_loaded: " + std::string(config_.is_gguf_file_loaded ? "true" : "false"));
284 Logger::info(" use_mmap_for_gguf: " + std::string(config_.use_mmap_for_gguf ? "true" : "false"));
285 // --- BEGIN GGUFData Integrity Check ---
286 if (this->config_.is_gguf_file_loaded && this->gguf_data_) {
287 if (this->gguf_data_->tensor_infos_map.empty()) {
288 Logger::error("[CTOR_GGUF_PRE_INIT_W] CRITICAL: gguf_data_->tensor_infos_map is EMPTY. Weights will not be loaded by map_gguf_weights.");
289 }
290 } else if (this->config_.is_gguf_file_loaded && !this->gguf_data_) {
291 Logger::error("[CTOR_GGUF_PRE_INIT_W] CRITICAL: config_.is_gguf_file_loaded is TRUE, but gguf_data_ pointer IS NULL. Weights cannot be loaded.");
292 } else if (!this->config_.is_gguf_file_loaded) {
293 Logger::info("[CTOR_GGUF_PRE_INIT_W] Not a GGUF file load context (e.g., SafeTensors). Skipping gguf_data_ check here.");
294 }
295 // --- END GGUFData Integrity Check ---
296 initialize_weights(loader.get(), this->gguf_data_.get());
298
299 Logger::info("TinyLlamaModel (from path string) constructed and initialized successfully.");
300}
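// Usage sketch (illustrative values, path is a placeholder):
//
//   ModelConfig cfg;                   // CLI-derived defaults
//   cfg.num_cpu_offload_layers = 16;   // interpreted here as the requested GPU-layer count
//   cfg.use_mmap_for_gguf = true;      // mmap GGUF tensor data instead of copying it
//   TinyLlamaModel model(cfg, "path/to/model.gguf");
//
// The constructor re-parses the real ModelConfig from the GGUF metadata and then
// reapplies the CLI layer split and mmap preference on top of it.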
301TinyLlamaModel::TinyLlamaModel(const ModelConfig& config_from_session,
302 std::unique_ptr<GGUFData> gguf_data_from_session)
303 : config_(config_from_session),
304 gguf_data_(std::move(gguf_data_from_session)),
305 model_path_("loaded_from_gguf_data_memory")
306#ifdef HAS_CUDA
307 // Initialize all CUDA pointers to nullptr as in the other constructor
308 , cublas_handle_(nullptr), token_embedding_table_dev_(nullptr), lm_head_dev_(nullptr), final_norm_dev(nullptr), w_q_dev_(nullptr), w_k_dev_(nullptr), w_v_dev_(nullptr), w_o_dev_(nullptr), w_gate_dev_(nullptr), w_up_dev_(nullptr), w_down_dev_(nullptr), all_freqs_cis_dev(nullptr), x_dev_(nullptr), x_norm_dev_(nullptr), x_resid1_dev_(nullptr), x_resid2_dev_(nullptr), q_dev_(nullptr), k_dev_(nullptr), v_dev_(nullptr), attn_out_dev_(nullptr), attn_proj_dev_(nullptr), gate_vec_dev_(nullptr), up_vec_dev_(nullptr), swiglu_vec_dev_(nullptr), mlp_down_dev_(nullptr), logits_dev_(nullptr), token_embedding_table_f32_dev_(nullptr), lm_head_f32_dev_(nullptr), w_q_f32_dev_(nullptr), w_k_f32_dev_(nullptr), w_v_f32_dev_(nullptr), w_o_f32_dev_(nullptr), w_gate_f32_dev_(nullptr), w_up_f32_dev_(nullptr), w_down_f32_dev_(nullptr)
309#endif
310{
311 Logger::info("TinyLlamaModel constructor entered (with pre-loaded GGUFData). Model path placeholder: " + model_path_);
312 this->config_.is_gguf_file_loaded = true; // Ensure this is set
313
314 if (this->config_.num_cpu_offload_layers < 0) {
 315 this->config_.num_cpu_offload_layers = 0;
 316 }
317 if (this->config_.num_hidden_layers > 0 && this->config_.num_cpu_offload_layers > this->config_.num_hidden_layers) {
318 Logger::warning("Requested CPU offload layers (" + std::to_string(this->config_.num_cpu_offload_layers) +
319 ") exceeds total hidden layers (" + std::to_string(this->config_.num_hidden_layers) +
320 "). Clamping to " + std::to_string(this->config_.num_hidden_layers) + " layers on CPU (all CPU).");
 321 this->config_.num_cpu_offload_layers = this->config_.num_hidden_layers;
 322 }
323 Logger::info("TinyLlamaModel (pre-loaded GGUF): Final clamped num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
324
325 initialize_weights(nullptr, gguf_data_.get()); // Pass raw GGUFData pointer
327 Logger::info("TinyLlamaModel (with pre-loaded GGUFData) constructed and initialized successfully.");
328}
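// This overload is for callers that have already parsed the GGUF file (note the
// gguf_data_from_session parameter): ownership of the GGUFData moves into the
// model and model_path_ is set to a placeholder string.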
329
330TinyLlamaModel::~TinyLlamaModel() {
331#ifdef HAS_CUDA
332 // Only perform GPU cleanup if GPU layers were actually used
333 int active_num_gpu_layers = config_.num_hidden_layers - config_.num_cpu_offload_layers;
334 if (active_num_gpu_layers > 0) {
335 Logger::info("Freeing TinyLlamaModel CUDA resources...");
336 if (cublas_handle_) {
337 cublasStatus_t cublas_status = cublasDestroy(cublas_handle_);
338 if (cublas_status != CUBLAS_STATUS_SUCCESS) {
339 Logger::error("cuBLAS handle destruction failed with error code: " +
340 std::to_string(cublas_status));
341 }
342 cublas_handle_ = nullptr;
343 Logger::info("cuBLAS handle destroyed.");
344 }
345 } else {
346 // CPU-only mode: just clean up cuBLAS handle if it exists
347 if (cublas_handle_) {
348 cublasStatus_t cublas_status = cublasDestroy(cublas_handle_);
349 if (cublas_status != CUBLAS_STATUS_SUCCESS) {
350 Logger::error("cuBLAS handle destruction failed with error code: " +
351 std::to_string(cublas_status));
352 }
353 cublas_handle_ = nullptr;
354 }
355 }
356 // Continue GPU cleanup only if GPU layers were active
357 if (active_num_gpu_layers > 0) {
358 if (final_norm_dev) {
359 gpuErrchk(cudaFree(final_norm_dev));
360 final_norm_dev = nullptr;
361 }
362
363 for (auto& layer : layers) {
364 if (layer.input_layernorm_dev) {
365 gpuErrchk(cudaFree(layer.input_layernorm_dev));
366 layer.input_layernorm_dev = nullptr;
367 }
368 if (layer.post_attention_layernorm_dev) {
369 gpuErrchk(cudaFree(layer.post_attention_layernorm_dev));
370 layer.post_attention_layernorm_dev = nullptr;
371 }
372 }
373
374 if (all_freqs_cis_dev) {
375 gpuErrchk(cudaFree(all_freqs_cis_dev));
376 all_freqs_cis_dev = nullptr;
377 }
378 if (token_embedding_table_dev_) {
379 gpuErrchk(cudaFree(token_embedding_table_dev_));
380 token_embedding_table_dev_ = nullptr;
381 }
382 if (lm_head_dev_) {
383 gpuErrchk(cudaFree(lm_head_dev_));
384 lm_head_dev_ = nullptr;
385 }
386 if (w_q_dev_) {
387 gpuErrchk(cudaFree(w_q_dev_));
388 w_q_dev_ = nullptr;
389 }
390 if (w_k_dev_) {
391 gpuErrchk(cudaFree(w_k_dev_));
392 w_k_dev_ = nullptr;
393 }
394 if (w_v_dev_) {
395 gpuErrchk(cudaFree(w_v_dev_));
396 w_v_dev_ = nullptr;
397 }
398 if (w_o_dev_) {
399 gpuErrchk(cudaFree(w_o_dev_));
400 w_o_dev_ = nullptr;
401 }
402 if (w_gate_dev_) {
403 gpuErrchk(cudaFree(w_gate_dev_));
404 w_gate_dev_ = nullptr;
405 }
406 if (w_up_dev_) {
407 gpuErrchk(cudaFree(w_up_dev_));
408 w_up_dev_ = nullptr;
409 }
410 if (w_down_dev_) {
411 gpuErrchk(cudaFree(w_down_dev_));
412 w_down_dev_ = nullptr;
413 }
414 if (token_embedding_table_f32_dev_) {
415 gpuErrchk(cudaFree(token_embedding_table_f32_dev_));
416 token_embedding_table_f32_dev_ = nullptr;
417 }
418 if (lm_head_f32_dev_) {
419 gpuErrchk(cudaFree(lm_head_f32_dev_));
420 lm_head_f32_dev_ = nullptr;
421 }
422 if (w_q_f32_dev_) {
423 gpuErrchk(cudaFree(w_q_f32_dev_));
424 w_q_f32_dev_ = nullptr;
425 }
426 if (w_k_f32_dev_) {
427 gpuErrchk(cudaFree(w_k_f32_dev_));
428 w_k_f32_dev_ = nullptr;
429 }
430 if (w_v_f32_dev_) {
431 gpuErrchk(cudaFree(w_v_f32_dev_));
432 w_v_f32_dev_ = nullptr;
433 }
434 if (w_o_f32_dev_) {
435 gpuErrchk(cudaFree(w_o_f32_dev_));
436 w_o_f32_dev_ = nullptr;
437 }
438 if (w_gate_f32_dev_) {
439 gpuErrchk(cudaFree(w_gate_f32_dev_));
440 w_gate_f32_dev_ = nullptr;
441 }
442 if (w_up_f32_dev_) {
443 gpuErrchk(cudaFree(w_up_f32_dev_));
444 w_up_f32_dev_ = nullptr;
445 }
446 if (w_down_f32_dev_) {
447 gpuErrchk(cudaFree(w_down_f32_dev_));
448 w_down_f32_dev_ = nullptr;
449 }
450
451 if (x_dev_) {
452 gpuErrchk(cudaFree(x_dev_));
453 x_dev_ = nullptr;
454 }
455 if (x_norm_dev_) {
456 gpuErrchk(cudaFree(x_norm_dev_));
457 x_norm_dev_ = nullptr;
458 }
459 if (x_resid1_dev_) {
460 gpuErrchk(cudaFree(x_resid1_dev_));
461 x_resid1_dev_ = nullptr;
462 }
463 if (x_resid2_dev_) {
464 gpuErrchk(cudaFree(x_resid2_dev_));
465 x_resid2_dev_ = nullptr;
466 }
467 if (q_dev_) {
468 gpuErrchk(cudaFree(q_dev_));
469 q_dev_ = nullptr;
470 }
471 if (k_dev_) {
472 gpuErrchk(cudaFree(k_dev_));
473 k_dev_ = nullptr;
474 }
475 if (v_dev_) {
476 gpuErrchk(cudaFree(v_dev_));
477 v_dev_ = nullptr;
478 }
479 if (attn_out_dev_) {
480 gpuErrchk(cudaFree(attn_out_dev_));
481 attn_out_dev_ = nullptr;
482 }
483 if (attn_proj_dev_) {
484 gpuErrchk(cudaFree(attn_proj_dev_));
485 attn_proj_dev_ = nullptr;
486 }
487 if (gate_vec_dev_) {
488 gpuErrchk(cudaFree(gate_vec_dev_));
489 gate_vec_dev_ = nullptr;
490 }
491 if (up_vec_dev_) {
492 gpuErrchk(cudaFree(up_vec_dev_));
493 up_vec_dev_ = nullptr;
494 }
495 if (swiglu_vec_dev_) {
496 gpuErrchk(cudaFree(swiglu_vec_dev_));
497 swiglu_vec_dev_ = nullptr;
498 }
499 if (mlp_down_dev_) {
500 gpuErrchk(cudaFree(mlp_down_dev_));
501 mlp_down_dev_ = nullptr;
502 }
503 if (logits_dev_) {
504 gpuErrchk(cudaFree(logits_dev_));
505 logits_dev_ = nullptr;
506 }
507 // Free KVCache dequantization buffers
508 if (dequant_k_cache_buffer_dev_) {
509 gpuErrchk(cudaFree(dequant_k_cache_buffer_dev_));
510 dequant_k_cache_buffer_dev_ = nullptr;
511 }
512 if (dequant_v_cache_buffer_dev_) {
513 gpuErrchk(cudaFree(dequant_v_cache_buffer_dev_));
514 dequant_v_cache_buffer_dev_ = nullptr;
515 }
516 // Free selective KVCache dequantization buffers
517 if (selective_k_dequant_buffer_dev_) {
518 gpuErrchk(cudaFree(selective_k_dequant_buffer_dev_));
519 selective_k_dequant_buffer_dev_ = nullptr;
520 }
521 if (selective_v_dequant_buffer_dev_) {
522 gpuErrchk(cudaFree(selective_v_dequant_buffer_dev_));
523 selective_v_dequant_buffer_dev_ = nullptr;
524 }
525
526 // Free persistent batch processing buffers
527 free_persistent_batch_buffers();
528
529 Logger::info("Freed persistent GPU workspace buffers.");
530 Logger::info("Finished freeing TinyLlamaModel CUDA weight memory.");
531 } else {
532 Logger::info("CPU-only mode: No GPU resources to free.");
533 }
534#endif
535}
536std::vector<float> TinyLlamaModel::forward(
537 std::vector<float>& input,
538 int n_tokens, KVCache* kv_cache,
539 const std::vector<int>* attention_mask) {
540 Logger::info("[CPU_FWD] Entered. Processing up to layer " + std::to_string(config_.num_cpu_offload_layers -1) + ". Input n_tokens: " + std::to_string(n_tokens));
541
542 int hs = config_.hidden_size;
543 int vs = config_.vocab_size;
544 int is = config_.intermediate_size;
545 int n_heads = config_.num_attention_heads;
546 int n_kv_heads = config_.num_key_value_heads;
547 int head_dim = hs / n_heads;
548 float eps = config_.rms_norm_eps;
549 int max_pos_embeddings = config_.max_position_embeddings;
550
551 bool log_first_gen_step = (n_tokens == 0);
552 bool log_this_step = log_first_gen_step || (n_tokens == 12) || (n_tokens == 13);
553
554 // Layer processing loop - ONLY for CPU-offloaded layers
555 for (int l = 0; l < config_.num_cpu_offload_layers; ++l) {
556 Logger::info("[CPU_FWD_MEM] Starting layer " + std::to_string(l) + " processing");
557
558 bool log_this_layer = log_this_step && (l == 0); // Log details only for layer 0 on specific steps
559 if (log_this_layer) {
560 Logger::info("[CPU_FWD] ------ START Layer " + std::to_string(l) +
561 " (pos=" + std::to_string(n_tokens) + ") ------");
562 log_vector_summary("Layer " + std::to_string(l) + " Input (input)", input);
563 }
564
565 const auto& lw = layers[l];
566 std::vector<float> x_norm_vec1(hs);
567 const std::vector<float>& w_input_norm_vec =
568 lw.input_layernorm_f32.empty()
569 ? bf16vec_to_float_vec(lw.input_layernorm)
570 : lw.input_layernorm_f32;
571 rmsnorm_vector_cpu(input, w_input_norm_vec, x_norm_vec1, eps);
572 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": Allocating QKV vectors");
573 std::vector<float> q_vec(hs), k_vec(n_kv_heads * head_dim), v_vec(n_kv_heads * head_dim);
574 bool enable_debug_logging = (l == 0);
575 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_q_proj_dequantized");
 576 ensure_q_proj_dequantized(l);
 577 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_q_proj_dequantized completed");
578 if (!lw.q_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.q_proj_f32, x_norm_vec1, q_vec, hs, hs);
579 else if (!lw.q_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.q_proj_q8k, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
580 else if (!lw.q_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.q_proj_q8_0, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
581 else if (!lw.q_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.q_proj_q4k, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
582 else if (!lw.q_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.q_proj_q6k, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
583 else if (!lw.q_proj.empty()) matvec_bf16_f32_vector_cpu(lw.q_proj, x_norm_vec1, q_vec, hs, hs); // BF16 from SafeTensors
584 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Q proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
585
586 // ... K, V projections ...
587 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_k_proj_dequantized");
 588 ensure_k_proj_dequantized(l);
 589 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_k_proj_dequantized completed");
590 if (!lw.k_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.k_proj_f32, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs);
591 else if (!lw.k_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.k_proj_q8k, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
592 else if (!lw.k_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.k_proj_q8_0, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
593 else if (!lw.k_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.k_proj_q4k, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
594 else if (!lw.k_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.k_proj_q6k, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
595 else if (!lw.k_proj.empty()) matvec_bf16_f32_vector_cpu(lw.k_proj, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs);
596 else throw std::runtime_error("Layer " + std::to_string(l) + ": No K proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
597
598 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_v_proj_dequantized");
 599 ensure_v_proj_dequantized(l);
 600 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_v_proj_dequantized completed");
601 if (!lw.v_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.v_proj_f32, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs);
602 else if (!lw.v_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.v_proj_q8k, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
603 else if (!lw.v_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.v_proj_q8_0, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
604 else if (!lw.v_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.v_proj_q4k, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
605 else if (!lw.v_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.v_proj_q6k, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs, enable_debug_logging);
606 else if (!lw.v_proj.empty()) matvec_bf16_f32_vector_cpu(lw.v_proj, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs);
607 else throw std::runtime_error("Layer " + std::to_string(l) + ": No V proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
608
609 apply_rope_vector(q_vec, n_heads, head_dim, n_tokens, precomputed_freqs_cis_, max_pos_embeddings, config_.is_gguf_file_loaded);
610 apply_rope_vector(k_vec, n_kv_heads, head_dim, n_tokens, precomputed_freqs_cis_, max_pos_embeddings, config_.is_gguf_file_loaded);
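 // RoPE is applied in place to the query and key vectors for position n_tokens.
 // The trailing is_gguf_file_loaded flag selects the rotation pairing; the batched
 // path below names the same flag use_rope_adjacent_pairing, i.e. adjacent-pair
 // rotation for GGUF weight layouts.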
611 if (kv_cache) {
612 if (static_cast<size_t>(l) < kv_cache->layers.size()) {
613 KVCacheLayer& kv_layer = kv_cache->layers[l];
614 size_t layer_max_seq_len = static_cast<size_t>(kv_cache->max_seq_len_config_);
615 if (static_cast<size_t>(n_tokens) >= layer_max_seq_len && layer_max_seq_len > 0) {
616 Logger::error("KV Cache access out of bounds in CPU forward. Layer " + std::to_string(l) +
617 ", n_tokens: " + std::to_string(n_tokens) +
618 ", configured layer_max_seq_len: " + std::to_string(layer_max_seq_len) + ". Skipping KV update.");
619 } else if (layer_max_seq_len == 0 && n_tokens > 0) {
620 Logger::error("KV Cache layer_max_seq_len is 0, but n_tokens > 0. Layer " + std::to_string(l) + ". Skipping KV update.");
621 } else {
622 for(int h=0; h < n_kv_heads; ++h) {
623 std::copy(k_vec.begin() + h * head_dim, k_vec.begin() + (h+1) * head_dim, kv_layer.k.begin() + n_tokens * (n_kv_heads * head_dim) + h * head_dim);
624 std::copy(v_vec.begin() + h * head_dim, v_vec.begin() + (h+1) * head_dim, kv_layer.v.begin() + n_tokens * (n_kv_heads * head_dim) + h * head_dim);
625 }
626 }
627 } else {
628 Logger::error("KV Cache layer index " + std::to_string(l) + " out of bounds for kv_cache->layers.size() = " + std::to_string(kv_cache->layers.size()));
629 }
630 }
631
632 std::vector<float> attn_out_vec(hs);
633 std::vector<float> x_resid1_vec = input; // Store residual
634 float att_scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
635 std::fill(attn_out_vec.begin(), attn_out_vec.end(), 0.0f);
636 for (int h = 0; h < n_heads; ++h) {
637 std::vector<float> q_head(head_dim);
638 std::copy(q_vec.begin() + h * head_dim, q_vec.begin() + (h + 1) * head_dim, q_head.begin());
639 std::vector<float> current_multihead_attn_out(head_dim, 0.0f);
640 int kv_cache_num_kv_heads = n_kv_heads; // from KVCache struct if available
641 int kv_group = n_heads / kv_cache_num_kv_heads;
642 int kv_head_idx = h / kv_group;
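 // GQA mapping example (illustrative head counts): with n_heads = 32 and
 // n_kv_heads = 4, kv_group = 8, so query heads 0..7 read KV head 0,
 // heads 8..15 read KV head 1, and so on.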
643
644 if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
645 const KVCacheLayer& kv_layer = kv_cache->layers[l];
646 int current_seq_len = n_tokens + 1;
647 std::vector<float> scores(current_seq_len);
648 for (int t = 0; t < current_seq_len; ++t) {
649 float score = 0.0f;
650 for (int d = 0; d < head_dim; ++d) {
651 score += q_head[d] * kv_layer.k[t * (n_kv_heads * head_dim) + kv_head_idx * head_dim + d];
652 }
653 scores[t] = score * att_scale;
654 }
655 softmax_vector_cpu(scores, scores); // In-place softmax
656 for (int t = 0; t < current_seq_len; ++t) {
657 for (int d = 0; d < head_dim; ++d) {
658 current_multihead_attn_out[d] += scores[t] * kv_layer.v[t * (n_kv_heads * head_dim) + kv_head_idx * head_dim + d];
659 }
660 }
661 }
662 std::copy(current_multihead_attn_out.begin(), current_multihead_attn_out.end(), attn_out_vec.begin() + h * head_dim);
663 }
664
665
666 std::vector<float> attn_proj_vec(hs);
667 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_o_proj_dequantized");
 668 ensure_o_proj_dequantized(l);
 669 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_o_proj_dequantized completed");
670 if(!lw.o_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.o_proj_f32, attn_out_vec, attn_proj_vec, hs, hs);
671 else if (!lw.o_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.o_proj_q8k, attn_out_vec, attn_proj_vec, hs, hs, enable_debug_logging);
672 else if (!lw.o_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.o_proj_q8_0, attn_out_vec, attn_proj_vec, hs, hs, enable_debug_logging);
673 else if (!lw.o_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.o_proj_q4k, attn_out_vec, attn_proj_vec, hs, hs, enable_debug_logging);
674 else if (!lw.o_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.o_proj_q6k, attn_out_vec, attn_proj_vec, hs, hs, enable_debug_logging);
675 else if(!lw.o_proj.empty()) matvec_bf16_f32_vector_cpu(lw.o_proj, attn_out_vec, attn_proj_vec, hs, hs);
676 else throw std::runtime_error("Layer " + std::to_string(l) + ": No O proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
677
678 for(size_t i=0; i<input.size(); ++i) input[i] = x_resid1_vec[i] + attn_proj_vec[i]; // Update input by reference
679
680 // MLP part
681 std::vector<float> x_norm_vec2(hs);
682 std::vector<float> x_resid2_vec = input; // Store residual for MLP
683 const std::vector<float>& w_post_attn_norm_vec =
684 lw.post_attention_layernorm_f32.empty()
685 ? bf16vec_to_float_vec(lw.post_attention_layernorm)
686 : lw.post_attention_layernorm_f32;
687 rmsnorm_vector_cpu(input, w_post_attn_norm_vec, x_norm_vec2, eps);
688
689 std::vector<float> gate_vec(is), up_vec(is);
690 // Gate-projection
691 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_gate_proj_dequantized");
 692 ensure_gate_proj_dequantized(l);
 693 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_gate_proj_dequantized completed");
694 if(!lw.gate_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.gate_proj_f32, x_norm_vec2, gate_vec, is, hs);
695 else if (!lw.gate_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.gate_proj_q8k, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
696 else if (!lw.gate_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.gate_proj_q8_0, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
697 else if (!lw.gate_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.gate_proj_q4k, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
698 else if (!lw.gate_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.gate_proj_q6k, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
699 else if(!lw.gate_proj.empty()) matvec_bf16_f32_vector_cpu(lw.gate_proj, x_norm_vec2, gate_vec, is, hs);
700 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Gate proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
701
702 // Up-projection
703 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_up_proj_dequantized");
 704 ensure_up_proj_dequantized(l);
 705 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_up_proj_dequantized completed");
706 if(!lw.up_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.up_proj_f32, x_norm_vec2, up_vec, is, hs);
707 else if (!lw.up_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.up_proj_q8k, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
708 else if (!lw.up_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.up_proj_q8_0, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
709 else if (!lw.up_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.up_proj_q4k, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
710 else if (!lw.up_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.up_proj_q6k, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
711 else if(!lw.up_proj.empty()) matvec_bf16_f32_vector_cpu(lw.up_proj, x_norm_vec2, up_vec, is, hs);
712 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Up proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
713
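 // SwiGLU: the MLP output is SiLU(gate) * up applied element-wise, where
 // SiLU(x) = x * sigmoid(x); the product is then mapped back to hidden size
 // by the down projection.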
714 std::vector<float> silu_out_vec(is);
715 silu_cpu(gate_vec, silu_out_vec);
716
717 std::vector<float> swiglu_result_vec(is);
718 for(size_t i=0; i<is; ++i) swiglu_result_vec[i] = silu_out_vec[i] * up_vec[i];
719
720 std::vector<float> mlp_out_vec(hs);
721 // Down-projection
722 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_down_proj_dequantized");
 723 ensure_down_proj_dequantized(l);
 724 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_down_proj_dequantized completed");
725 if(!lw.down_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.down_proj_f32, swiglu_result_vec, mlp_out_vec, hs, is);
726 else if (!lw.down_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.down_proj_q8k, swiglu_result_vec, mlp_out_vec, hs, is, enable_debug_logging);
727 else if (!lw.down_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.down_proj_q8_0, swiglu_result_vec, mlp_out_vec, hs, is, enable_debug_logging);
728 else if (!lw.down_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.down_proj_q4k, swiglu_result_vec, mlp_out_vec, hs, is, enable_debug_logging);
729 else if (!lw.down_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.down_proj_q6k, swiglu_result_vec, mlp_out_vec, hs, is, enable_debug_logging);
730 else if(!lw.down_proj.empty()) matvec_bf16_f32_vector_cpu(lw.down_proj, swiglu_result_vec, mlp_out_vec, hs, is);
731 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Down proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
732
733 for(size_t i=0; i<input.size(); ++i) input[i] = x_resid2_vec[i] + mlp_out_vec[i]; // Update input by reference
734
735
736 if (log_this_layer) {
737 Logger::info("[CPU_FWD] ------ END Layer " + std::to_string(l) +
738 " (pos=" + std::to_string(n_tokens) + ") ------");
739 }
 740 if (l >= 2) {
 741 int layer_to_clear = l - 2;
742 int first_gpu_layer = config_.num_cpu_offload_layers;
743 if (layer_to_clear < first_gpu_layer) {
744 clear_layer_dequantized_weights(layer_to_clear);
745 }
746 }
747 }
748
 749 if (config_.num_cpu_offload_layers >= config_.num_hidden_layers) {
 750 Logger::info("[CPU_FWD] All layers processed on CPU. Performing final RMSNorm and Logits.");
 751 const std::vector<float>& w_final_norm_vec =
 752 final_norm_f32.empty() ? bf16vec_to_float_vec(final_norm)
 753 : final_norm_f32;
 754 std::vector<float> x_final_norm_vec(hs);
755 rmsnorm_vector_cpu(input, w_final_norm_vec, x_final_norm_vec, eps);
756
757 std::vector<float> logits(vs);
759 bool enable_lm_head_debug_logging = true; // Always log LM head for debugging
760 if (!lm_head_f32.empty()) matvec_f32_f32_vector_cpu(lm_head_f32, x_final_norm_vec, logits, vs, hs);
761 else if (!lm_head_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lm_head_q8k, x_final_norm_vec, logits, vs, hs, enable_lm_head_debug_logging);
762 else if (!lm_head_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lm_head_q8_0, x_final_norm_vec, logits, vs, hs, enable_lm_head_debug_logging);
763 else if (!lm_head_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lm_head_q4k, x_final_norm_vec, logits, vs, hs, enable_lm_head_debug_logging);
764 else if (!lm_head_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lm_head_q6k, x_final_norm_vec, logits, vs, hs, enable_lm_head_debug_logging);
765 else if (!lm_head.empty()) matvec_bf16_f32_vector_cpu(lm_head, x_final_norm_vec, logits, vs, hs); // Fallback for BF16 SafeTensors
766 else throw std::runtime_error("No valid LM Head weights (f32, q8k, q8, q4k, q6k, bf16) found for CPU final stage.");
767
768 if (log_this_step || log_first_gen_step) {
769 log_vector_summary("[CPU_FWD] Final Logits (all CPU, pos=" + std::to_string(n_tokens) + ")", logits, 15);
770 }
771 return logits; // Return final logits if all layers were CPU
772 }
773
774 Logger::info("[CPU_FWD] Finished processing " + std::to_string(config_.num_cpu_offload_layers) + " CPU layers. Output is intermediate activation.");
775 return input; // Return the intermediate activations if not all layers were processed here.
776}
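// Usage sketch (illustrative; `pos` and the embedding lookup are assumed to be
// prepared by the caller):
//
//   std::vector<float> x = /* embedding row for the current token, size hidden_size */;
//   std::vector<float> out = model.forward(x, pos, &kv_cache, nullptr);
//   // If every layer is CPU-offloaded, `out` holds the logits; otherwise it is
//   // the intermediate activation to hand to forward_device().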
777
778#ifdef HAS_CUDA
779std::vector<float> TinyLlamaModel::forward_device(
780 float* x_input_dev,
781 int pos, KVCache* kv_cache,
782 const std::vector<int>* attention_mask, cudaStream_t stream) {
783
784 int hs = config_.hidden_size;
785 int vs = config_.vocab_size;
786 int n_heads = config_.num_attention_heads;
787 int n_kv_heads = config_.num_key_value_heads;
788 if (n_heads == 0) {
789 Logger::fatal("Number of attention heads is zero during forward_device.");
790 throw std::runtime_error("Division by zero: n_heads is zero.");
791 }
792 int head_dim = hs / n_heads;
793 int total_model_layers = config_.num_hidden_layers;
794 int num_cpu_layers = config_.num_cpu_offload_layers;
795 int num_gpu_layers = total_model_layers - num_cpu_layers;
796
797 if (num_gpu_layers <= 0) {
798 Logger::warning("forward_device called with no GPU layers to process (num_gpu_layers = " + std::to_string(num_gpu_layers) + "). Returning empty.");
799 return {};
800 }
801 if (!x_input_dev) {
802 Logger::error("forward_device called with null x_input_dev. This should be model_->x_dev_.");
803 return {};
804 }
805 if (!kv_cache) {
806 Logger::error("forward_device called with null KVCache.");
807 return {};
808 }
809
810 int is = config_.intermediate_size;
811 float eps = config_.rms_norm_eps;
812 std::vector<float> h_x_input_dev(config_.hidden_size);
813
814 cublasStatus_t stream_status = cublasSetStream(cublas_handle_, stream);
815 gpuErrchk(cudaMemcpyAsync(h_x_input_dev.data(), x_input_dev, config_.hidden_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
816 gpuErrchk(cudaStreamSynchronize(stream));
817
818 if (stream_status != CUBLAS_STATUS_SUCCESS) {
819 Logger::error("cublasSetStream failed in forward_device");
820 return {};
821 }
822 float* current_x_dev = x_input_dev;
823 for (int l_gpu_idx = 0; l_gpu_idx < num_gpu_layers; ++l_gpu_idx) {
824 int l_model_idx = num_cpu_layers + l_gpu_idx;
825
826 // Layer-specific norm weights are indexed by the model layer index (l_model_idx)
827 const float* lw_in_norm_dev = layers[l_model_idx].input_layernorm_dev;
828 const float* lw_post_norm_dev = layers[l_model_idx].post_attention_layernorm_dev;
829
830 gpuErrchk(cudaMemcpyAsync(x_resid1_dev_, x_dev_, hs * sizeof(float),
831 cudaMemcpyDeviceToDevice, stream));
832
833 if (!lw_in_norm_dev) {
834 throw std::runtime_error("[TM::fw_dev pos=" + std::to_string(pos) + " L" + std::to_string(l_model_idx) + "] Error: input_layernorm_dev is nullptr. GPU layer cannot proceed.");
835 }
836
837 // Use optimized kernels if enabled, fallback to standard if needed
839 rmsnorm_vector_cuda_optimized(x_dev_, lw_in_norm_dev, x_norm_dev_, hs, eps, stream);
840 } else {
841 rmsnorm_vector_cuda(x_dev_, lw_in_norm_dev, x_norm_dev_, hs, eps, stream);
842 }
843 // Use concatenated weights for optimal performance
845
846 if (w_q_f32_dev_ && w_k_f32_dev_ && w_v_f32_dev_) {
847 const float* w_q_layer_ptr = w_q_f32_dev_ + (size_t)l_gpu_idx * hs * hs;
848 const float* w_k_layer_ptr = w_k_f32_dev_ + (size_t)l_gpu_idx * n_kv_heads * head_dim * hs;
849 const float* w_v_layer_ptr = w_v_f32_dev_ + (size_t)l_gpu_idx * n_kv_heads * head_dim * hs;
850
851 matvec_f32_f32_cuda(cublas_handle_, w_q_layer_ptr, x_norm_dev_,
852 q_dev_, hs, hs, stream);
853 matvec_f32_f32_cuda(cublas_handle_, w_k_layer_ptr, x_norm_dev_,
854 k_dev_, n_kv_heads * head_dim, hs, stream);
855 matvec_f32_f32_cuda(cublas_handle_, w_v_layer_ptr, x_norm_dev_,
856 v_dev_, n_kv_heads * head_dim, hs, stream);
857 } else {
858 Logger::error("GPU L" + std::to_string(l_model_idx) + " (gpu_idx " + std::to_string(l_gpu_idx) + "): No valid concatenated QKV weights."); return {};
859 }
860 rope_cuda(q_dev_, n_heads, head_dim, all_freqs_cis_dev, pos, config_.is_gguf_file_loaded, stream);
861 rope_cuda(k_dev_, n_kv_heads, head_dim, all_freqs_cis_dev, pos, config_.is_gguf_file_loaded, stream);
862
863 // K/V Cache Update Logic
864 if (static_cast<size_t>(l_model_idx) < kv_cache->layers.size()) {
865 KVCacheLayer& current_kv_layer = kv_cache->layers[l_model_idx];
867 for (int kvh = 0; kvh < n_kv_heads; ++kvh) {
868 const float* current_k_head_ptr_fp32 = k_dev_ + kvh * head_dim;
869 const float* current_v_head_ptr_fp32 = v_dev_ + kvh * head_dim;
870
871 size_t token_head_offset_quant = (static_cast<size_t>(pos) * n_kv_heads + kvh) * head_dim;
872 int8_t* k_quant_target_ptr = current_kv_layer.k_dev_quantized + token_head_offset_quant;
873 int8_t* v_quant_target_ptr = current_kv_layer.v_dev_quantized + token_head_offset_quant;
874
875 size_t scale_offset = static_cast<size_t>(pos) * n_kv_heads + kvh;
876 float* k_scale_target_ptr = current_kv_layer.k_dev_scales + scale_offset;
877 float* v_scale_target_ptr = current_kv_layer.v_dev_scales + scale_offset;
878
879 quantize_fp32_to_int8_symmetric_per_tensor_cuda(
880 current_k_head_ptr_fp32, k_quant_target_ptr, k_scale_target_ptr, head_dim, stream);
881 quantize_fp32_to_int8_symmetric_per_tensor_cuda(
882 current_v_head_ptr_fp32, v_quant_target_ptr, v_scale_target_ptr, head_dim, stream);
883 }
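 // Quantized KV path: each K/V head written at this position is stored as INT8
 // with one symmetric per-head scale (typically max|x| / 127 for a symmetric
 // scheme), so attention can later dequantize only the positions it needs via
 // the selective buffers.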
884 } else {
885 for (int kvh = 0; kvh < n_kv_heads; ++kvh) {
886 const float* current_k_head_ptr = k_dev_ + kvh * head_dim;
887 const float* current_v_head_ptr = v_dev_ + kvh * head_dim;
888
889 update_kv_cache_cuda(current_kv_layer.k_dev_fp32, current_k_head_ptr, pos,
890 kvh, kv_cache->allocated_max_seq_len,
891 kv_cache->allocated_num_kv_heads,
892 kv_cache->allocated_head_dim, stream);
893
894 update_kv_cache_cuda(current_kv_layer.v_dev_fp32, current_v_head_ptr, pos,
895 kvh, kv_cache->allocated_max_seq_len,
896 kv_cache->allocated_num_kv_heads,
897 kv_cache->allocated_head_dim, stream);
898 }
899 }
900
901 } else {
902 Logger::error("KVCache layer index " + std::to_string(l_model_idx) + " out of bounds for kv_cache->layers access in forward_device.");
903 return {};
904 }
905
906 float scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));
907 const float* attention_k_cache_ptr_dev = nullptr;
908 const float* attention_v_cache_ptr_dev = nullptr;
909 KVCacheLayer& attention_kv_layer = kv_cache->layers[l_model_idx];
910
912 Logger::info("[GPU L" + std::to_string(l_model_idx) + "] Using SELECTIVE KVCache dequantization");
913 } else {
914 attention_k_cache_ptr_dev = attention_kv_layer.k_dev_fp32;
915 attention_v_cache_ptr_dev = attention_kv_layer.v_dev_fp32;
916 }
917
918 float current_attention_scale = 1.0f / sqrtf((float)head_dim);
919
921 selective_k_dequant_buffer_dev_ && selective_v_dequant_buffer_dev_) {
922 attention_cuda_selective_dequant(
923 q_dev_,
924 attention_kv_layer.k_dev_quantized,
925 attention_kv_layer.v_dev_quantized,
926 attention_kv_layer.k_dev_scales,
927 attention_kv_layer.v_dev_scales,
928 selective_k_dequant_buffer_dev_,
929 selective_v_dequant_buffer_dev_,
930 attn_out_dev_,
932 pos + 1,
933 head_dim,
934 current_attention_scale,
935 kv_cache->allocated_max_seq_len,
937 stream
938 );
939 } else {
940 // Use optimized kernels if enabled, fallback to standard if needed
942 attention_cuda_optimized(
943 q_dev_,
944 attention_k_cache_ptr_dev,
945 attention_v_cache_ptr_dev,
946 attn_out_dev_,
948 pos + 1,
949 head_dim,
950 current_attention_scale,
951 kv_cache->allocated_max_seq_len,
953 stream
954 );
955 } else {
956 attention_cuda(
957 q_dev_,
958 attention_k_cache_ptr_dev,
959 attention_v_cache_ptr_dev,
960 attn_out_dev_,
962 pos + 1,
963 head_dim,
964 current_attention_scale,
965 kv_cache->allocated_max_seq_len,
967 stream
968 );
969 }
970 }
971
972 if (w_o_f32_dev_) {
973 const float* lw_o_proj_f32_dev = w_o_f32_dev_ + (size_t)l_gpu_idx * hs * hs;
974 matvec_f32_f32_cuda(cublas_handle_, lw_o_proj_f32_dev, attn_out_dev_, attn_proj_dev_, hs, hs, stream);
975 } else {
976 Logger::error("GPU L" + std::to_string(l_model_idx) + " (gpu_idx " + std::to_string(l_gpu_idx) + "): No valid O proj weights (FP32/BF16)."); return {};
977 }
978
979 add_residual_cuda(attn_proj_dev_, x_resid1_dev_, current_x_dev, hs, stream);
980
981 gpuErrchk(cudaMemcpyAsync(x_resid2_dev_, current_x_dev, hs * sizeof(float), cudaMemcpyDeviceToDevice, stream));
982
983 if (!lw_post_norm_dev) { Logger::error("Missing post_attention_layernorm_dev for GPU layer model_idx=" + std::to_string(l_model_idx)); return {}; }
984
985 // Use optimized kernels if enabled, fallback to standard if needed
987 rmsnorm_vector_cuda_optimized(current_x_dev, lw_post_norm_dev, x_norm_dev_, hs, eps, stream);
988 } else {
989 rmsnorm_vector_cuda(current_x_dev, lw_post_norm_dev, x_norm_dev_, hs, eps, stream);
990 }
991
1012
1013 if (w_gate_f32_dev_ && w_up_f32_dev_) {
1014 const float* w_gate_layer_ptr = w_gate_f32_dev_ + (size_t)l_gpu_idx * is * hs;
1015 const float* w_up_layer_ptr = w_up_f32_dev_ + (size_t)l_gpu_idx * is * hs;
1016
1017 matvec_f32_f32_cuda(cublas_handle_, w_gate_layer_ptr, x_norm_dev_,
1018 gate_vec_dev_, is, hs, stream);
1019 matvec_f32_f32_cuda(cublas_handle_, w_up_layer_ptr, x_norm_dev_,
1020 up_vec_dev_, is, hs, stream);
1021 } else {
1022 Logger::error("GPU L" + std::to_string(l_model_idx) + ": No valid Gate/Up projection weights.");
1023 return {};
1024 }
1025
1026 swiglu_cuda(gate_vec_dev_, up_vec_dev_, swiglu_vec_dev_, is, stream);
1027
1028 if (w_down_f32_dev_) {
1029 const float* w_down_layer_ptr = w_down_f32_dev_ + (size_t)l_gpu_idx * hs * is;
1030 matvec_f32_f32_cuda(cublas_handle_, w_down_layer_ptr, swiglu_vec_dev_,
1031 mlp_down_dev_, hs, is, stream);
1032 } else {
1033 Logger::error("GPU L" + std::to_string(l_model_idx) + ": No valid Down projection weights.");
1034 return {};
1035 }
1036 add_residual_cuda(mlp_down_dev_, x_resid2_dev_, current_x_dev, hs, stream);
1037
1038 }
1039 // Use optimized kernels if enabled, fallback to standard if needed
1041 rmsnorm_vector_cuda_optimized(x_dev_, final_norm_dev, x_norm_dev_, hs, eps, stream);
1042 } else {
1043 rmsnorm_vector_cuda(x_dev_, final_norm_dev, x_norm_dev_, hs, eps, stream);
1044 }
1046 if (lm_head_dev_) {
1047 matvec_bf16_f32_cuda(cublas_handle_, lm_head_dev_, x_norm_dev_, logits_dev_,
1048 vs, hs, this->use_bf16_tensor_cores_, stream);
1049 } else {
1050 Logger::error("LM head (lm_head_dev_ for BF16) is null. Cannot calculate logits on GPU.");
1051 return {};
1052 }
1053
1054 gpuErrchk(cudaStreamSynchronize(stream));
1055 std::vector<float> logits(vs);
1056 gpuErrchk(cudaMemcpy(logits.data(), logits_dev_, vs * sizeof(float),
1057 cudaMemcpyDeviceToHost));
1058 return logits;
1059}
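// Indexing note: the concatenated weight pointers (w_q_f32_dev_, w_o_f32_dev_, ...)
// hold only the GPU-resident layers and are offset by l_gpu_idx, while the
// KV-cache layers and per-layer norm pointers are indexed by the absolute layer
// index l_model_idx = num_cpu_layers + l_gpu_idx.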
1060
1061#endif // HAS_CUDA
1062
1064 const std::vector<float>& final_batch_activations, // [num_tokens, hidden_size]
1065 int num_tokens_in_batch) {
1066
1067 if (final_batch_activations.size() != (size_t)num_tokens_in_batch * config_.hidden_size) {
1068 Logger::error("[CPU_LOGITS_BATCH] final_batch_activations size mismatch. Expected: " +
1069 std::to_string((size_t)num_tokens_in_batch * config_.hidden_size) + " Got: " +
1070 std::to_string(final_batch_activations.size()));
1071 return {};
1072 }
1073
1074 int hs = config_.hidden_size;
1075 int vs = config_.vocab_size;
1076 float eps = config_.rms_norm_eps;
1077
1078 // 1. Final RMSNorm
1079 std::vector<float> final_batch_norm_out(num_tokens_in_batch * hs);
1080 const std::vector<float>& w_final_norm_vec =
 1081 final_norm_f32.empty() ? bf16vec_to_float_vec(final_norm)
 1082 : final_norm_f32;
 1083 if (w_final_norm_vec.empty()) {
1084 Logger::error("[CPU_LOGITS_BATCH] Final RMSNorm weights are empty (neither f32 nor bf16 available).");
1085 return {};
1086 }
1087
1088 rmsnorm_batch_cpu(final_batch_activations, w_final_norm_vec, final_batch_norm_out,
1089 num_tokens_in_batch, hs, eps);
1090
1091 // 2. Batched LM Head multiplication
1092 std::vector<float> batch_logits_out(num_tokens_in_batch * vs);
1093
1094 if (!lm_head_f32.empty()) {
1095 Logger::info("[CPU_LOGITS_BATCH] Using F32 LM Head weights.");
1096 matmul_f32_f32_batch_cpu(lm_head_f32, final_batch_norm_out, batch_logits_out,
1097 num_tokens_in_batch, vs, hs);
1098 } else if (!lm_head_q8_0.empty() && config_.is_gguf_file_loaded) {
1099 Logger::info("[CPU_LOGITS_BATCH] Using Q8_0 LM Head weights.");
1100 matmul_q8_0_f32_batch_cpu(lm_head_q8_0, final_batch_norm_out, batch_logits_out,
1101 num_tokens_in_batch, vs, hs);
1102 } else if (!lm_head_q6k.empty() && config_.is_gguf_file_loaded) {
1103 Logger::info("[CPU_LOGITS_BATCH] Using Q6_K LM Head weights.");
1104 matmul_q6k_f32_batch_cpu(lm_head_q6k, final_batch_norm_out, batch_logits_out,
1105 num_tokens_in_batch, vs, hs);
1106 } else if (!lm_head_q4k.empty() && config_.is_gguf_file_loaded) {
1107 Logger::info("[CPU_LOGITS_BATCH] Using Q4_K LM Head weights.");
1108 matmul_q4k_f32_batch_cpu(lm_head_q4k, final_batch_norm_out, batch_logits_out,
1109 num_tokens_in_batch, vs, hs);
1110 } else if (!lm_head.empty()) { // BF16 SafeTensors weights
1111 Logger::info("[CPU_LOGITS_BATCH] Using BF16 LM Head weights (converting to F32 for matmul).");
1112 std::vector<float> lm_head_f32_temp = bf16vec_to_float_vec(lm_head);
1113 if (lm_head_f32_temp.empty()) {
1114 Logger::error("[CPU_LOGITS_BATCH] Failed to convert BF16 LM Head to F32.");
1115 return {};
1116 }
1117 matmul_f32_f32_batch_cpu(lm_head_f32_temp, final_batch_norm_out, batch_logits_out,
1118 num_tokens_in_batch, vs, hs);
1119 } else {
1120 Logger::error("[CPU_LOGITS_BATCH] No valid LM Head weights found (F32, Q8_0, Q6_K, Q4_K, BF16).");
1121 return {};
1122 }
1123
1124 return batch_logits_out;
1125}
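// Like the single-token path, the batched helpers pick the first available weight
// representation in a fixed order: F32, then the GGUF-quantized forms
// (Q8_0 / Q6_K / Q4_K), then BF16 converted to F32 for the LM head.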
1126
1127std::vector<std::vector<float>> TinyLlamaModel::forward_cpu_batch_generation(
1128 const std::vector<float>& batch_input_activations, // [num_tokens, hidden_size]
1129 const std::vector<int>& token_positions, // Position of each token in its respective sequence
1130 const std::vector<int>& original_sequence_indices, // Original sequence index for each token
1131 int num_tokens_in_batch,
1132 KVCache* kv_cache) {
1133
1134 Logger::info("[CPU_BATCH_GEN] Entry: num_tokens=" + std::to_string(num_tokens_in_batch));
1135 std::string pos_str = "token_positions=[";
1136 for (int i = 0; i < std::min(num_tokens_in_batch, 3); ++i) {
1137 pos_str += std::to_string(token_positions[i]) + " ";
1138 }
1139 pos_str += "]";
1140 std::string seq_str = "original_sequence_indices=[";
1141 for (int i = 0; i < std::min(num_tokens_in_batch, 3); ++i) {
1142 seq_str += std::to_string(original_sequence_indices[i]) + " ";
1143 }
1144 seq_str += "]";
1145 Logger::info("[CPU_BATCH_GEN] " + pos_str + ", " + seq_str);
1146 if (batch_input_activations.size() != (size_t)num_tokens_in_batch * config_.hidden_size) {
1147 Logger::error("[CPU_BATCH_GENERATION] batch_input_activations size mismatch. Expected: " +
1148 std::to_string((size_t)num_tokens_in_batch * config_.hidden_size) + " Got: " +
1149 std::to_string(batch_input_activations.size()));
1150 return {};
1151 }
1152
1153 if (token_positions.size() != static_cast<size_t>(num_tokens_in_batch)) {
1154 Logger::error("[CPU_BATCH_GENERATION] token_positions size mismatch. Expected: " +
1155 std::to_string(num_tokens_in_batch) + " Got: " + std::to_string(token_positions.size()));
1156 return {};
1157 }
1158
1159 int hs = config_.hidden_size;
1160 int is = config_.intermediate_size;
1161 int n_heads = config_.num_attention_heads;
1162 int n_kv_heads = config_.num_key_value_heads;
1163 if (n_heads == 0) {
1164 Logger::error("[CPU_BATCH_GENERATION] Error: num_attention_heads is zero.");
1165 return {};
1166 }
1167 int head_dim = hs / n_heads;
1168 float eps = config_.rms_norm_eps;
1169 int max_pos_embeddings = config_.max_position_embeddings;
1170 bool use_rope_adjacent_pairing = config_.is_gguf_file_loaded;
1171 float attention_scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));
1172 int vs = config_.vocab_size;
1173 int kv_group = n_heads / n_kv_heads; // Pre-calculate GQA grouping
1174
1175 std::vector<float> current_batch_activations = batch_input_activations;
1176 for (int l = 0; l < config_.num_cpu_offload_layers; ++l) {
1177 const auto& lw = layers[l];
1178
1179 // Batch RMSNorm for attention
1180 std::vector<float> batch_x_norm1(current_batch_activations.size());
1181 const std::vector<float>& w_input_norm_vec =
1182 lw.input_layernorm_f32.empty()
1183 ? bf16vec_to_float_vec(lw.input_layernorm)
1184 : lw.input_layernorm_f32;
1185 rmsnorm_batch_cpu(current_batch_activations, w_input_norm_vec, batch_x_norm1, num_tokens_in_batch, hs, eps);
1186
1187 std::vector<float> residual_batch_component_attn = current_batch_activations;
1188
1189 // Batch Q, K, V projections
1190 std::vector<float> q_batch((size_t)num_tokens_in_batch * hs);
1191 std::vector<float> k_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
1192 std::vector<float> v_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
1193
1194 // Q Projection (batched)
1195 if (!lw.q_proj_f32.empty()) {
1196 matmul_f32_f32_batch_cpu(lw.q_proj_f32, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
1197 } else if (!lw.q_proj_q8_0.empty()) {
1198 matmul_q8_0_f32_batch_cpu(lw.q_proj_q8_0, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
1199 } else if (!lw.q_proj_q6k.empty()) {
1200 matmul_q6k_f32_batch_cpu(lw.q_proj_q6k, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
1201 } else if (!lw.q_proj_q4k.empty()) {
1202 matmul_q4k_f32_batch_cpu(lw.q_proj_q4k, batch_x_norm1, q_batch, num_tokens_in_batch, hs, hs);
1203 } else {
1204 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No Q proj weights found for CPU (batched)");
1205 return {};
1206 }
1207
1208 // K Projection (batched)
1209 if (!lw.k_proj_f32.empty()) {
1210 matmul_f32_f32_batch_cpu(lw.k_proj_f32, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1211 } else if (!lw.k_proj_q8_0.empty()) {
1212 matmul_q8_0_f32_batch_cpu(lw.k_proj_q8_0, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1213 } else if (!lw.k_proj_q6k.empty()) {
1214 matmul_q6k_f32_batch_cpu(lw.k_proj_q6k, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1215 } else if (!lw.k_proj_q4k.empty()) {
1216 matmul_q4k_f32_batch_cpu(lw.k_proj_q4k, batch_x_norm1, k_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1217 } else {
1218 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No K proj weights found for CPU (batched)");
1219 return {};
1220 }
1221
1222 // V Projection (batched)
1223 if (!lw.v_proj_f32.empty()) {
1224 matmul_f32_f32_batch_cpu(lw.v_proj_f32, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1225 } else if (!lw.v_proj_q8_0.empty()) {
1226 matmul_q8_0_f32_batch_cpu(lw.v_proj_q8_0, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1227 } else if (!lw.v_proj_q6k.empty()) {
1228 matmul_q6k_f32_batch_cpu(lw.v_proj_q6k, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1229 } else if (!lw.v_proj_q4k.empty()) {
1230 matmul_q4k_f32_batch_cpu(lw.v_proj_q4k, batch_x_norm1, v_batch, num_tokens_in_batch, n_kv_heads * head_dim, hs);
1231 } else {
1232 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No V proj weights found for CPU (batched)");
1233 return {};
1234 }
1235
1236 // Optimized RoPE, KV cache update, and attention with OpenMP and SIMD
1237 std::vector<float> batch_attn_output((size_t)num_tokens_in_batch * hs);
1238
1239 // Ensure weights are dequantized before parallel processing
1247
1248 #pragma omp parallel if(num_tokens_in_batch > 1)
1249 {
1250 // Thread-local buffers to avoid allocations in loop
1251 std::vector<float> q_token(hs);
1252 std::vector<float> k_token(n_kv_heads * head_dim);
1253 std::vector<float> v_token(n_kv_heads * head_dim);
1254 std::vector<float> scores_buffer;
1255
1256 #pragma omp for
1257 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
1258 int pos = token_positions[token_idx];
1259
1260 // Extract Q, K, V for this token (reuse thread-local buffers)
1261 std::copy(q_batch.begin() + (size_t)token_idx * hs,
1262 q_batch.begin() + (size_t)(token_idx + 1) * hs,
1263 q_token.begin());
1264 std::copy(k_batch.begin() + (size_t)token_idx * n_kv_heads * head_dim,
1265 k_batch.begin() + (size_t)(token_idx + 1) * n_kv_heads * head_dim,
1266 k_token.begin());
1267 std::copy(v_batch.begin() + (size_t)token_idx * n_kv_heads * head_dim,
1268 v_batch.begin() + (size_t)(token_idx + 1) * n_kv_heads * head_dim,
1269 v_token.begin());
1270
1271 // Apply RoPE individually
1272 apply_rope_vector(q_token, n_heads, head_dim, pos, precomputed_freqs_cis_, max_pos_embeddings, use_rope_adjacent_pairing);
1273 apply_rope_vector(k_token, n_kv_heads, head_dim, pos, precomputed_freqs_cis_, max_pos_embeddings, use_rope_adjacent_pairing);
1274
1275 // Update KV cache at specific position - Sequence-Major Layout
1276 if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
1277 auto& layer_cache = kv_cache->layers[l];
1278 // Use sequence-major layout to match prefill behavior
1279 int seq_idx = original_sequence_indices[token_idx];
1280 int sequence_base_offset = seq_idx * kv_cache->max_seq_len_config_;
1281 int kv_offset = (sequence_base_offset + pos) * n_kv_heads * head_dim;
1282 #pragma omp critical
1283 {
1284 if (kv_offset + n_kv_heads * head_dim <= static_cast<int>(layer_cache.k.size())) {
1285 std::copy(k_token.begin(), k_token.end(), layer_cache.k.begin() + kv_offset);
1286 std::copy(v_token.begin(), v_token.end(), layer_cache.v.begin() + kv_offset);
1287 }
1288 }
1289 }
1290
1291 // Copy back RoPE'd Q values to batch
1292 std::copy(q_token.begin(), q_token.end(), q_batch.begin() + (size_t)token_idx * hs);
1293
1294 // SIMD-optimized attention computation for this token
1295 int seq_idx = original_sequence_indices[token_idx];
1296 int history_len = (kv_cache && seq_idx < kv_cache->current_batch_size) ? kv_cache->batch_seq_lens[seq_idx] : pos + 1;
1297 scores_buffer.resize(history_len);
1298
1299 const float* q_token_ptr = q_batch.data() + (size_t)token_idx * hs;
1300 float* attn_output_ptr = batch_attn_output.data() + (size_t)token_idx * hs;
1301
1302 if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
1303 const auto& layer_cache = kv_cache->layers[l];
1304
1305 // Process all heads for this token efficiently with SIMD
1306 for (int h = 0; h < n_heads; ++h) {
1307 int kv_head_idx = h / kv_group; // Use pre-calculated kv_group
1308 const float* q_head_ptr = q_token_ptr + h * head_dim;
1309 float* head_output_ptr = attn_output_ptr + h * head_dim;
1310
1311 // SIMD-optimized attention score computation
1312 for (int t = 0; t < history_len; ++t) {
1313 // sequence-major layout: each sequence has contiguous region
1314 int seq_idx = original_sequence_indices[token_idx];
1315 int sequence_base_offset = seq_idx * kv_cache->max_seq_len_config_;
1316 const float* k_ptr = layer_cache.k.data() + (sequence_base_offset + t) * n_kv_heads * head_dim + kv_head_idx * head_dim;
1317
1318 // Use SIMD dot product for Q·K computation
1319#if defined(__AVX2__) || defined(__SSE2__) || defined(__ARM_NEON)
1320 float score = simd_dot_product(q_head_ptr, k_ptr, head_dim);
1321#else
1322 float score = 0.0f;
1323 for (int d = 0; d < head_dim; ++d) {
1324 score += q_head_ptr[d] * k_ptr[d];
1325 }
1326#endif
1327 scores_buffer[t] = score * attention_scale;
1328 }
1329
1330 // Softmax
1331 softmax_vector_cpu(scores_buffer, scores_buffer);
1332
1333 // SIMD-optimized weighted sum with V
1334 std::fill(head_output_ptr, head_output_ptr + head_dim, 0.0f);
1335 for (int t = 0; t < history_len; ++t) {
1336 // Sequence-major layout: each sequence has contiguous region
1337 int seq_idx = original_sequence_indices[token_idx];
1338 int sequence_base_offset = seq_idx * kv_cache->max_seq_len_config_;
1339 const float* v_ptr = layer_cache.v.data() + (sequence_base_offset + t) * n_kv_heads * head_dim + kv_head_idx * head_dim;
1340 float score = scores_buffer[t];
1341
1342 // Use SIMD scaled vector addition for score * V accumulation
1343#if defined(__AVX2__) || defined(__SSE2__) || defined(__ARM_NEON)
1344 simd_scaled_add(head_output_ptr, v_ptr, score, head_dim);
1345#else
1346 for (int d = 0; d < head_dim; ++d) {
1347 head_output_ptr[d] += score * v_ptr[d];
1348 }
1349#endif
1350 }
1351 }
1352 } else {
1353 std::fill(attn_output_ptr, attn_output_ptr + hs, 0.0f);
1354 }
1355 }
1356 }
1357 // O-Projection (batched)
1358 std::vector<float> batch_attn_proj_out((size_t)num_tokens_in_batch * hs);
1359 if(!lw.o_proj_f32.empty()) {
1360 matmul_f32_f32_batch_cpu(lw.o_proj_f32, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
1361 } else if (!lw.o_proj_q8_0.empty()) {
1362 matmul_q8_0_f32_batch_cpu(lw.o_proj_q8_0, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
1363 } else if (!lw.o_proj_q6k.empty()) {
1364 matmul_q6k_f32_batch_cpu(lw.o_proj_q6k, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
1365 } else if (!lw.o_proj_q4k.empty()) {
1366 matmul_q4k_f32_batch_cpu(lw.o_proj_q4k, batch_attn_output, batch_attn_proj_out, num_tokens_in_batch, hs, hs);
1367 } else {
1368 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No O proj weights found for CPU");
1369 return {};
1370 }
1371
1372 // First Residual Connection (batched)
1373 for(size_t i=0; i < current_batch_activations.size(); ++i) {
1374 current_batch_activations[i] = residual_batch_component_attn[i] + batch_attn_proj_out[i];
1375 }
1376
1377 // MLP processing (batched where possible)
1378 std::vector<float> residual_batch_component_mlp = current_batch_activations;
1379 std::vector<float> batch_x_norm2(current_batch_activations.size());
1380
1381 const std::vector<float>& w_post_attn_norm_vec =
1382 lw.post_attention_layernorm_f32.empty()
1383 ? bf16vec_to_float_vec(lw.post_attention_layernorm)
1384 : lw.post_attention_layernorm_f32;
1385 rmsnorm_batch_cpu(current_batch_activations, w_post_attn_norm_vec, batch_x_norm2, num_tokens_in_batch, hs, eps);
1386
1387 std::vector<float> batch_gate_proj_out((size_t)num_tokens_in_batch * is);
1388 std::vector<float> batch_up_proj_out((size_t)num_tokens_in_batch * is);
1389
1390 // Gate and Up projections (batched)
1391 if (!lw.gate_proj_f32.empty()) {
1392 matmul_f32_f32_batch_cpu(lw.gate_proj_f32, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
1393 } else if (!lw.gate_proj_q8_0.empty()) {
1394 matmul_q8_0_f32_batch_cpu(lw.gate_proj_q8_0, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
1395 } else if (!lw.gate_proj_q6k.empty()) {
1396 matmul_q6k_f32_batch_cpu(lw.gate_proj_q6k, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
1397 } else if (!lw.gate_proj_q4k.empty()) {
1398 matmul_q4k_f32_batch_cpu(lw.gate_proj_q4k, batch_x_norm2, batch_gate_proj_out, num_tokens_in_batch, is, hs);
1399 } else {
1400 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No gate_proj weights found for CPU");
1401 return {};
1402 }
1403
1404 if (!lw.up_proj_f32.empty()) {
1405 matmul_f32_f32_batch_cpu(lw.up_proj_f32, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
1406 } else if (!lw.up_proj_q8_0.empty()) {
1407 matmul_q8_0_f32_batch_cpu(lw.up_proj_q8_0, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
1408 } else if (!lw.up_proj_q6k.empty()) {
1409 matmul_q6k_f32_batch_cpu(lw.up_proj_q6k, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
1410 } else if (!lw.up_proj_q4k.empty()) {
1411 matmul_q4k_f32_batch_cpu(lw.up_proj_q4k, batch_x_norm2, batch_up_proj_out, num_tokens_in_batch, is, hs);
1412 } else {
1413 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No up_proj weights found for CPU");
1414 return {};
1415 }
1416
1417 // SwiGLU (batched)
1418 std::vector<float> batch_swiglu_out((size_t)num_tokens_in_batch * is);
1419 for (size_t i = 0; i < batch_gate_proj_out.size(); ++i) {
1420 float gate_val = batch_gate_proj_out[i];
1421 float silu_gate_val = gate_val / (1.0f + std::exp(-gate_val));
1422 batch_swiglu_out[i] = silu_gate_val * batch_up_proj_out[i];
1423 }
1424
1425 // Down Projection (batched)
1426 std::vector<float> batch_mlp_down_proj_out((size_t)num_tokens_in_batch * hs);
1427 if (!lw.down_proj_f32.empty()) {
1428 matmul_f32_f32_batch_cpu(lw.down_proj_f32, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
1429 } else if (!lw.down_proj_q8_0.empty()) {
1430 matmul_q8_0_f32_batch_cpu(lw.down_proj_q8_0, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
1431 } else if (!lw.down_proj_q6k.empty()) {
1432 matmul_q6k_f32_batch_cpu(lw.down_proj_q6k, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
1433 } else if (!lw.down_proj_q4k.empty()) {
1434 matmul_q4k_f32_batch_cpu(lw.down_proj_q4k, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
1435 } else {
1436 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No down_proj weights found for CPU");
1437 return {};
1438 }
1439
1440 // Second Residual Connection (batched)
1441 for(size_t i = 0; i < current_batch_activations.size(); ++i) {
1442 current_batch_activations[i] = residual_batch_component_mlp[i] + batch_mlp_down_proj_out[i];
1443 }
1444 }
1445
1446 // Update KV cache sequence length
1447 if (kv_cache && num_tokens_in_batch > 0) {
1448 // For batch mode, track positions per sequence
1449 if (kv_cache->current_batch_size > 0) {
1450 // Update batch_seq_lens based on the highest position seen for each sequence
1451 std::vector<int> max_positions_per_seq(kv_cache->current_batch_size, -1);
1452
1453 for (int i = 0; i < num_tokens_in_batch; ++i) {
1454 int seq_idx = original_sequence_indices[i];
1455 int pos = token_positions[i];
1456
1457 if (seq_idx >= 0 && seq_idx < kv_cache->current_batch_size) {
1458 max_positions_per_seq[seq_idx] = std::max(max_positions_per_seq[seq_idx], pos);
1459 }
1460 }
1461
1462 // Update batch_seq_lens to reflect new positions (pos + 1 since pos is 0-indexed)
1463 for (int seq_idx = 0; seq_idx < kv_cache->current_batch_size; ++seq_idx) {
1464 if (max_positions_per_seq[seq_idx] >= 0) {
1465 Logger::info("[CPU_BATCH_GEN] KV Length Max Update: seq_idx=" + std::to_string(seq_idx) +
1466 ", old_batch_seq_len=" + std::to_string(kv_cache->batch_seq_lens[seq_idx]) +
1467 ", new_batch_seq_len=" + std::to_string(max_positions_per_seq[seq_idx] + 1));
1468 kv_cache->batch_seq_lens[seq_idx] = max_positions_per_seq[seq_idx] + 1;
1469 }
1470 }
1471 // For single-sequence compatibility, update seq_len to the max
1472 kv_cache->seq_len = *std::max_element(kv_cache->batch_seq_lens.begin(),
1473 kv_cache->batch_seq_lens.begin() + kv_cache->current_batch_size);
1474 } else {
1475 // Fallback for single sequence mode
1476 int max_pos = *std::max_element(token_positions.begin(), token_positions.end());
1477 kv_cache->seq_len = std::max(kv_cache->seq_len, max_pos + 1);
1478 }
1479 }
1480
1481 // Final normalization and logits calculation for ALL tokens
1482 std::vector<float> batch_logits = forward_cpu_logits_batch(current_batch_activations, num_tokens_in_batch);
1483
1484 // Convert flat logits to per-token vectors
1485 std::vector<std::vector<float>> all_logits(num_tokens_in_batch, std::vector<float>(vs));
1486 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
1487 std::copy(batch_logits.begin() + (size_t)token_idx * vs,
1488 batch_logits.begin() + (size_t)(token_idx + 1) * vs,
1489 all_logits[token_idx].begin());
1490 }
1491 return all_logits;
1492}
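
The per-token loop above addresses the KV cache in a sequence-major layout and maps query heads to shared KV heads (grouped-query attention). Below is an illustrative sketch of exactly that indexing, assuming the cache is laid out as [max_batch, max_seq_len, n_kv_heads, head_dim]; the helper names are hypothetical and not part of the project API.

    #include <cstddef>

    // Flat element offset of (sequence, position, kv_head) in the cache.
    // Matches: (seq_idx * max_seq_len + pos) * n_kv_heads * head_dim + kv_head * head_dim.
    static size_t kv_index_sketch(int seq_idx, int pos, int kv_head,
                                  int max_seq_len, int n_kv_heads, int head_dim) {
      size_t sequence_base = (size_t)seq_idx * max_seq_len;          // contiguous region per sequence
      return (sequence_base + pos) * (size_t)n_kv_heads * head_dim   // token row inside that region
             + (size_t)kv_head * head_dim;                           // slice for one KV head
    }

    // With GQA, several query heads share one KV head:
    // kv_head = q_head / (n_heads / n_kv_heads), as in the kv_group logic above.
    static int kv_head_for_query_sketch(int q_head, int n_heads, int n_kv_heads) {
      int kv_group = n_heads / n_kv_heads;
      return q_head / kv_group;
    }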
1493
1494#ifdef HAS_CUDA
1495std::vector<float> TinyLlamaModel::forward_device_batch_prefill(
1496 float* d_batch_input_embeddings, // This is now assumed to be activations *after* CPU layers if any
1497 int num_tokens_in_batch,
1498 int current_model_pos, // This should be the starting position *within the KV cache* for the batch
1499 KVCache* kv_cache,
1500 cudaStream_t stream) {
1501
1502 Logger::info("[FWD_DEV_BATCH_PREFILL_ENTRY] num_tokens_in_batch: " + std::to_string(num_tokens_in_batch) +
1503 ", current_model_pos: " + std::to_string(current_model_pos) +
1504 ", d_batch_input_embeddings: " + Logger::ptrToString(d_batch_input_embeddings));
1505
1506 const int hidden_size = config_.hidden_size;
1507 const int head_dim = config_.hidden_size / config_.num_attention_heads;
1508 const int ffn_intermediate_dim = config_.intermediate_size;
1509 const int n_kv_dim = config_.num_key_value_heads * head_dim;
1510 const int vocab_size = config_.vocab_size;
1511
1512 Logger::debug("[FWD_DEV_BATCH_PREFILL_PARAMS] hidden_size: " + std::to_string(hidden_size) +
1513 ", head_dim: " + std::to_string(head_dim) +
1514 ", ffn_intermediate_dim: " + std::to_string(ffn_intermediate_dim) +
1515 ", n_kv_dim: " + std::to_string(n_kv_dim) +
1516 ", vocab_size: " + std::to_string(vocab_size) +
1517 ", num_attention_heads: " + std::to_string(config_.num_attention_heads) +
1518 ", num_key_value_heads: " + std::to_string(config_.num_key_value_heads));
1519
1520 float* d_batch_x_ptr = d_batch_input_embeddings; // Input to the first GPU layer
1521 float* d_batch_x_norm_out_attn;
1522 float* d_batch_q_proj_out;
1523 float* d_batch_k_proj_out;
1524 float* d_batch_v_proj_out;
1525 float* d_batch_attn_heads_concat_out;
1526 float* d_batch_attn_final_proj_out;
1527 float* d_batch_residual_attn_in;
1528 float* d_batch_residual_ffn_in;
1529 float* d_batch_x_norm_out_ffn;
1530 float* d_batch_ffn_gate_proj_out;
1531 float* d_batch_ffn_up_proj_out;
1532 float* d_batch_ffn_swiglu_out;
1533 float* d_batch_ffn_down_proj_out;
1534 float* d_batch_layer_output = nullptr;
1535 size_t batch_hidden_size_elems = (size_t)num_tokens_in_batch * hidden_size;
1536 size_t batch_kv_proj_size_elems = (size_t)num_tokens_in_batch * n_kv_dim;
1537 size_t batch_ffn_intermediate_elems = (size_t)num_tokens_in_batch * ffn_intermediate_dim;
1538 size_t batch_hidden_size_bytes = batch_hidden_size_elems * sizeof(float);
1539 size_t batch_kv_proj_size_bytes = batch_kv_proj_size_elems * sizeof(float);
1540 size_t batch_ffn_intermediate_bytes = batch_ffn_intermediate_elems * sizeof(float);
1541 resize_persistent_batch_buffers_if_needed(num_tokens_in_batch);
1542
1543 // Assign persistent buffers instead of allocating per forward pass
1544 d_batch_x_norm_out_attn = d_persistent_batch_norm_out_;
1545 d_batch_q_proj_out = d_persistent_q_batch_;
1546 d_batch_k_proj_out = d_persistent_k_batch_;
1547 d_batch_v_proj_out = d_persistent_v_batch_;
1548 d_batch_attn_heads_concat_out = d_persistent_attn_output_;
1549 d_batch_attn_final_proj_out = d_persistent_attn_proj_out_;
1550 d_batch_residual_attn_in = d_persistent_batch_residual_;
1551 d_batch_residual_ffn_in = d_persistent_batch_residual_ + num_tokens_in_batch * hidden_size; // Offset for second residual
1552 d_batch_x_norm_out_ffn = d_persistent_batch_norm_out_; // Can reuse norm buffer
1553 d_batch_ffn_gate_proj_out = d_persistent_gate_proj_out_;
1554 d_batch_ffn_up_proj_out = d_persistent_up_proj_out_;
1555 d_batch_ffn_swiglu_out = d_persistent_swiglu_out_;
1556 d_batch_ffn_down_proj_out = d_persistent_mlp_down_out_;
1557
1559 d_batch_layer_output = d_persistent_batch_input_; // Can reuse input buffer for layer output
1560 }
1561 const float alpha = 1.0f;
1562 const float beta = 0.0f;
1563
1564 cublasStatus_t stream_status = cublasSetStream(cublas_handle_, stream);
1565 if (stream_status != CUBLAS_STATUS_SUCCESS) {
1566 Logger::fatal("cublasSetStream failed in forward_device_batch_prefill");
1567 throw std::runtime_error("cublasSetStream failed");
1568 }
1569
1570 Logger::info("[FWD_DEV_BATCH_PREFILL_MAIN_LOOP_ENTRY] num_cpu_offload_layers: " + std::to_string(config_.num_cpu_offload_layers) +
1571 ", total_hidden_layers: " + std::to_string(config_.num_hidden_layers));
1572
1573 for (int l_model_idx = config_.num_cpu_offload_layers; l_model_idx < config_.num_hidden_layers; ++l_model_idx) {
1574 int l_gpu_idx = l_model_idx - config_.num_cpu_offload_layers;
1575 Logger::info("[FWD_DEV_BATCH_PREFILL_LAYER_START] Processing Layer: model_idx=" + std::to_string(l_model_idx) + ", gpu_idx=" + std::to_string(l_gpu_idx) +
1576 ". Current d_batch_x_ptr: " + Logger::ptrToString(d_batch_x_ptr));
1577 // Weight dequantization handled by ensure_f32_concatenated_weights_loaded() below
1578 gpuErrchk(cudaMemcpyAsync(d_batch_residual_attn_in, d_batch_x_ptr, batch_hidden_size_bytes, cudaMemcpyDeviceToDevice, stream));
1579
1580 rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
1581 layers[l_model_idx].input_layernorm_dev,
1582 num_tokens_in_batch, hidden_size, config_.rms_norm_eps, stream);
1583
1585
1586 const float* w_q_layer_ptr = w_q_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
1587 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, hidden_size, hidden_size, &alpha,
1588 d_batch_x_norm_out_attn, hidden_size, w_q_layer_ptr, hidden_size, &beta,
1589 d_batch_q_proj_out, hidden_size, stream, "Q_PROJ_GEN");
1590 Logger::info("[GPU_Q_PROJ] Layer=" + std::to_string(l_model_idx) +
1591 ", input_ptr=" + Logger::ptrToString(d_batch_x_norm_out_attn) +
1592 ", weight_ptr=" + Logger::ptrToString(w_q_layer_ptr) +
1593 ", output_ptr=" + Logger::ptrToString(d_batch_q_proj_out));
1594 // Q_PROJ_OUT LOGGING (Unchanged)
1595 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 0 && hidden_size > 0) { // Existing logging condition
1596 int log_elements_common = std::min(static_cast<int>(head_dim), 3);
1597 if (log_elements_common <= 0 && hidden_size > 0) log_elements_common = std::min(static_cast<int>(hidden_size), 3);
1598 if (log_elements_common > 0) {
1599 std::vector<float> h_sample_t0(log_elements_common);
1600 gpuErrchk(cudaMemcpyAsync(h_sample_t0.data(), d_batch_q_proj_out, log_elements_common * sizeof(float), cudaMemcpyDeviceToHost, stream));
1601 if (num_tokens_in_batch <= 1) gpuErrchk(cudaStreamSynchronize(stream));
1602 std::string str_t0 = ""; for(float val : h_sample_t0) { str_t0 += std::to_string(val) + " "; }
1603 Logger::debug("[FWD_DEV_BATCH_PREFILL_LAYER_L" + std::to_string(l_model_idx) + "] Q_PROJ_OUT (T0, H0, first " + std::to_string(log_elements_common) + "): " + str_t0);
1604 if (num_tokens_in_batch > 1) {
1605 std::vector<float> h_sample_t1(log_elements_common);
1606 gpuErrchk(cudaMemcpyAsync(h_sample_t1.data(), d_batch_q_proj_out + hidden_size, log_elements_common * sizeof(float), cudaMemcpyDeviceToHost, stream));
1607 gpuErrchk(cudaStreamSynchronize(stream));
1608 std::string str_t1 = ""; for(float val : h_sample_t1) { str_t1 += std::to_string(val) + " "; }
1609 Logger::debug("[FWD_DEV_BATCH_PREFILL_LAYER_L" + std::to_string(l_model_idx) + "] Q_PROJ_OUT (T1, H0, first " + std::to_string(log_elements_common) + "): " + str_t1);
1610 }
1611 }
1612 }
1613
1614 const float* w_k_layer_ptr = w_k_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
1615 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, n_kv_dim, hidden_size, &alpha,
1616 d_batch_x_norm_out_attn, hidden_size, w_k_layer_ptr, n_kv_dim, &beta,
1617 d_batch_k_proj_out, n_kv_dim, stream, "K_PROJ_GEN");
1618
1619 const float* w_v_layer_ptr = w_v_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
1620 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, n_kv_dim, hidden_size, &alpha,
1621 d_batch_x_norm_out_attn, hidden_size, w_v_layer_ptr, n_kv_dim, &beta,
1622 d_batch_v_proj_out, n_kv_dim, stream, "V_PROJ_GEN");
1623
1624 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 0 && head_dim > 0) { // Existing logging condition
1625 int log_elements_rope = std::min(3, head_dim);
1626 for (int token_to_log_idx_rope = 0; token_to_log_idx_rope < std::min(num_tokens_in_batch, 2); ++token_to_log_idx_rope) {
1627 if (d_batch_q_proj_out && config_.num_attention_heads > 0) {
1628 std::vector<float> h_q_pre_rope(log_elements_rope);
1629 size_t q_log_offset = (size_t)token_to_log_idx_rope * config_.num_attention_heads * head_dim;
1630 gpuErrchk(cudaMemcpyAsync(h_q_pre_rope.data(), d_batch_q_proj_out + q_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost, stream));
1631 gpuErrchk(cudaStreamSynchronize(stream));
1632 std::string str_q_pre_rope = ""; for(float val : h_q_pre_rope) { str_q_pre_rope += std::to_string(val) + " "; }
1633 }
1634 if (d_batch_k_proj_out && config_.num_key_value_heads > 0) {
1635 std::vector<float> h_k_pre_rope(log_elements_rope);
1636 size_t k_log_offset = (size_t)token_to_log_idx_rope * config_.num_key_value_heads * head_dim;
1637 gpuErrchk(cudaMemcpyAsync(h_k_pre_rope.data(), d_batch_k_proj_out + k_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost, stream));
1638 gpuErrchk(cudaStreamSynchronize(stream));
1639 std::string str_k_pre_rope = ""; for(float val : h_k_pre_rope) { str_k_pre_rope += std::to_string(val) + " "; }
1640 }
1641 }
1642 }
1643
1644 rope_batch_cuda(d_batch_q_proj_out, d_batch_k_proj_out, all_freqs_cis_dev, num_tokens_in_batch,
1646 current_model_pos, config_.is_gguf_file_loaded, stream);
1647 gpuErrchk(cudaStreamSynchronize(stream));
1648
1649 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 0 && head_dim > 0) { // Existing logging condition
1650 int log_elements_rope = std::min(3, head_dim);
1651 for (int token_to_log_idx_rope = 0; token_to_log_idx_rope < std::min(num_tokens_in_batch, 2); ++token_to_log_idx_rope) {
1652 if (d_batch_q_proj_out && config_.num_attention_heads > 0) {
1653 std::vector<float> h_q_post_rope(log_elements_rope);
1654 size_t q_log_offset = (size_t)token_to_log_idx_rope * config_.num_attention_heads * head_dim;
1655 gpuErrchk(cudaMemcpy(h_q_post_rope.data(), d_batch_q_proj_out + q_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost));
1656 std::string str_q_post_rope = ""; for(float val : h_q_post_rope) { str_q_post_rope += std::to_string(val) + " "; }
1657 }
1658 if (d_batch_k_proj_out && config_.num_key_value_heads > 0) {
1659 std::vector<float> h_k_post_rope(log_elements_rope);
1660 size_t k_log_offset = (size_t)token_to_log_idx_rope * config_.num_key_value_heads * head_dim;
1661 gpuErrchk(cudaMemcpy(h_k_post_rope.data(), d_batch_k_proj_out + k_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost));
1662 std::string str_k_post_rope = ""; for(float val : h_k_post_rope) { str_k_post_rope += std::to_string(val) + " "; }
1663 }
1664 }
1665 }
1666
1667 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 0 && head_dim > 0 && config_.num_key_value_heads > 0) { // Existing logging
1668 int log_elements = std::min(3, head_dim);
1669 if (d_batch_k_proj_out) {
1670 std::vector<float> h_k_log_token0(log_elements);
1671 gpuErrchk(cudaMemcpyAsync(h_k_log_token0.data(), d_batch_k_proj_out, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream));
1672 std::vector<float> h_k_log_token1(log_elements);
1673 if (num_tokens_in_batch > 1) { gpuErrchk(cudaMemcpyAsync(h_k_log_token1.data(), d_batch_k_proj_out + n_kv_dim, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream));}
1674 gpuErrchk(cudaStreamSynchronize(stream));
1675 }
1676 if (d_batch_v_proj_out) {
1677 std::vector<float> h_v_log_token0(log_elements);
1678 gpuErrchk(cudaMemcpyAsync(h_v_log_token0.data(), d_batch_v_proj_out, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream));
1679 std::vector<float> h_v_log_token1(log_elements);
1680 if (num_tokens_in_batch > 1) { gpuErrchk(cudaMemcpyAsync(h_v_log_token1.data(), d_batch_v_proj_out + n_kv_dim, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream));}
1681 gpuErrchk(cudaStreamSynchronize(stream));
1682 }
1683 }
1684
1685 float* d_layer_k_cache_ptr = kv_cache->layers[l_model_idx].k_dev_fp32;
1686 float* d_layer_v_cache_ptr = kv_cache->layers[l_model_idx].v_dev_fp32;
1687 update_kv_cache_batch_cuda(d_layer_k_cache_ptr, d_batch_k_proj_out, current_model_pos, num_tokens_in_batch,
1688 config_.num_key_value_heads, head_dim, kv_cache->max_seq_len_config_, stream);
1689 update_kv_cache_batch_cuda(d_layer_v_cache_ptr, d_batch_v_proj_out, current_model_pos, num_tokens_in_batch,
1690 config_.num_key_value_heads, head_dim, kv_cache->max_seq_len_config_, stream);
1691 gpuErrchk(cudaStreamSynchronize(stream));
1692
1693 float current_attention_scale = 1.0f / sqrtf((float)head_dim);
1694 attention_batch_prefill_cuda(d_batch_q_proj_out, nullptr, nullptr,
1695 d_layer_k_cache_ptr, d_layer_v_cache_ptr,
1696 d_batch_attn_heads_concat_out, num_tokens_in_batch, current_model_pos,
1698 config_.num_key_value_heads, head_dim, current_attention_scale, stream, nullptr);
1699 const float* w_o_layer_ptr = w_o_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
1700 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, hidden_size, hidden_size, &alpha,
1701 d_batch_attn_heads_concat_out, hidden_size, w_o_layer_ptr, hidden_size, &beta,
1702 d_batch_attn_final_proj_out, hidden_size, stream, "O_PROJ_GEN");
1703
1704 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && hidden_size > 0) { /* ... */ }
1705 add_residual_batch_cuda(d_batch_residual_ffn_in, d_batch_attn_final_proj_out, d_batch_residual_attn_in,
1706 num_tokens_in_batch, hidden_size, stream);
1707 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && hidden_size > 0) { /* ... */ }
1708
1709
1710 rmsnorm_batch_cuda(d_batch_x_norm_out_ffn, d_batch_residual_ffn_in,
1711 layers[l_model_idx].post_attention_layernorm_dev,
1712 num_tokens_in_batch, hidden_size, config_.rms_norm_eps, stream);
1713 // RMSNORM_FFN_OUT LOGGING (Unchanged) - Logging Point 3
1714 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && hidden_size > 0) { /* ... */ }
1715
1716
1717 const float* w1_layer_ptr = w_gate_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
1718 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, ffn_intermediate_dim, hidden_size, &alpha,
1719 d_batch_x_norm_out_ffn, hidden_size, w1_layer_ptr, ffn_intermediate_dim, &beta,
1720 d_batch_ffn_gate_proj_out, ffn_intermediate_dim, stream, "FFN_GATE_PROJ_GEN");
1721 // FFN_GATE_PROJ_OUT LOGGING (Unchanged) - Logging Point 4
1722 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && ffn_intermediate_dim > 0) { /* ... */ }
1723
1724
1725 const float* w3_layer_ptr = w_up_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
1726 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, ffn_intermediate_dim, hidden_size, &alpha,
1727 d_batch_x_norm_out_ffn, hidden_size, w3_layer_ptr, ffn_intermediate_dim, &beta,
1728 d_batch_ffn_up_proj_out, ffn_intermediate_dim, stream, "FFN_UP_PROJ_GEN");
1729 // FFN_UP_PROJ_OUT LOGGING (Unchanged) - Logging Point 5
1730 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && ffn_intermediate_dim > 0) { /* ... */ }
1731
1732
1733 swiglu_batch_cuda(d_batch_ffn_swiglu_out, d_batch_ffn_gate_proj_out, d_batch_ffn_up_proj_out,
1734 num_tokens_in_batch, ffn_intermediate_dim, stream);
1735 // FFN_SWIGLU_OUT LOGGING (Unchanged) - Logging Point 6
1736 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && ffn_intermediate_dim > 0) { /* ... */ }
1737
1738
1739 const float* w2_layer_ptr = w_down_f32_dev_ + (size_t)l_gpu_idx * ffn_intermediate_dim * hidden_size;
1740 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, hidden_size, ffn_intermediate_dim, &alpha,
1741 d_batch_ffn_swiglu_out, ffn_intermediate_dim, w2_layer_ptr, hidden_size, &beta,
1742 d_batch_ffn_down_proj_out, hidden_size, stream, "FFN_DOWN_PROJ_GEN");
1743 // FFN_DOWN_PROJ_OUT LOGGING (Unchanged) - Logging Point 7
1744 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && hidden_size > 0) { /* ... */ }
1745
1746
1747 add_residual_batch_cuda(d_batch_layer_output, d_batch_ffn_down_proj_out, d_batch_residual_ffn_in,
1748 num_tokens_in_batch, hidden_size, stream);
1749 // POST_RESIDUAL_FFN LOGGING (Unchanged) - Logging Point 8
1750 if (l_model_idx == config_.num_cpu_offload_layers && num_tokens_in_batch > 1 && hidden_size > 0) { /* ... */ }
1751
1752
1753 d_batch_x_ptr = d_batch_layer_output;
1754 Logger::info("[FWD_DEV_BATCH_PREFILL_LAYER_END] Layer " + std::to_string(l_model_idx) + " finished. Next d_batch_x_ptr: " + Logger::ptrToString(d_batch_x_ptr));
1755 }
1756
1757 if (num_tokens_in_batch > 0) {
1758 std::vector<float> h_last_token_hidden_state(config_.hidden_size);
1759 size_t offset_last_token_hidden_state = (size_t)(num_tokens_in_batch - 1) * config_.hidden_size;
1760
1761 gpuErrchk(cudaMemcpyAsync(h_last_token_hidden_state.data(),
1762 d_batch_x_ptr + offset_last_token_hidden_state,
1763 config_.hidden_size * sizeof(float),
1764 cudaMemcpyDeviceToHost, stream));
1765 gpuErrchk(cudaStreamSynchronize(stream));
1766 Logger::log_vector_stats("[FWD_DEV_BATCH_PREFILL_LAST_TOKEN_HIDDEN_STATE_PRE_FINAL_RMSNORM]", h_last_token_hidden_state, 20);
1767 }
1768 rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
1769 final_norm_dev,
1770 num_tokens_in_batch, hidden_size, config_.rms_norm_eps, stream);
1771
1772 if (config_.num_cpu_offload_layers < config_.num_hidden_layers && num_tokens_in_batch > 0 && hidden_size > 0) { /* ... */ }
1773 float* d_logits_last_token;
1774 gpuErrchk(cudaMalloc(&d_logits_last_token, (size_t)vocab_size * sizeof(float)));
1775
1776 // Only calculate logits for the last token in the batch for prefill output
1777 float* d_last_token_activations_for_logits = d_batch_x_norm_out_attn + (size_t)(num_tokens_in_batch - 1) * hidden_size;
1778
1779 matvec_f32_f32_cuda(cublas_handle_, lm_head_f32_dev_, d_last_token_activations_for_logits,
1780 d_logits_last_token, vocab_size, hidden_size, stream);
1781
1782 std::vector<float> h_logits(vocab_size);
1783 gpuErrchk(cudaMemcpyAsync(h_logits.data(), d_logits_last_token, (size_t)vocab_size * sizeof(float),
1784 cudaMemcpyDeviceToHost, stream));
1785 gpuErrchk(cudaStreamSynchronize(stream));
1786
1787 Logger::log_vector_stats("[FWD_DEV_BATCH_PREFILL_FINAL_LOGITS]", h_logits, 20);
1788
1789 if (config_.num_hidden_layers > config_.num_cpu_offload_layers && kv_cache != nullptr && num_tokens_in_batch > 0) {
1790 int first_gpu_layer_model_idx = config_.num_cpu_offload_layers;
1791 if (static_cast<size_t>(first_gpu_layer_model_idx) < kv_cache->layers.size()) {
1792 const KVCacheLayer& cache_layer_to_log = kv_cache->layers[first_gpu_layer_model_idx];
1793 const float* d_k_cache_ptr = cache_layer_to_log.k_dev_fp32;
1794 const float* d_v_cache_ptr = cache_layer_to_log.v_dev_fp32;
1795 const int num_kv_h = config_.num_key_value_heads;
1796 const int local_n_kv_dim_for_log = num_kv_h * head_dim;
1797 const int log_elems_kv = std::min(3, head_dim);
1798
1799 if (d_k_cache_ptr && d_v_cache_ptr && log_elems_kv > 0 && local_n_kv_dim_for_log > 0) {
1800 for (int tk_idx = 0; tk_idx < num_tokens_in_batch; ++tk_idx) {
1801 int cache_pos_for_token = current_model_pos + tk_idx;
1802 if (cache_pos_for_token >= kv_cache->max_seq_len_config_) {
1803 Logger::warning("[KVDUMP_POST_BATCH_PREFILL] L" + std::to_string(first_gpu_layer_model_idx) +
1804 " Token " + std::to_string(tk_idx) + " (CachePos " + std::to_string(cache_pos_for_token) +
1805 ") would be out of bounds (" + std::to_string(kv_cache->max_seq_len_config_) + "). Skipping.");
1806 continue;
1807 }
1808 for (int kvh_idx = 0; kvh_idx < num_kv_h; ++kvh_idx) {
1809 size_t offset_in_cache = (size_t)cache_pos_for_token * local_n_kv_dim_for_log + (size_t)kvh_idx * head_dim;
1810
1811 std::vector<float> h_k_dump(log_elems_kv);
1812 gpuErrchk(cudaMemcpy(h_k_dump.data(), d_k_cache_ptr + offset_in_cache, log_elems_kv * sizeof(float), cudaMemcpyDeviceToHost));
1813 std::string str_k_dump = ""; for(float val : h_k_dump) { str_k_dump += std::to_string(val) + " "; }
1814 std::vector<float> h_v_dump(log_elems_kv);
1815 gpuErrchk(cudaMemcpy(h_v_dump.data(), d_v_cache_ptr + offset_in_cache, log_elems_kv * sizeof(float), cudaMemcpyDeviceToHost));
1816 std::string str_v_dump = ""; for(float val : h_v_dump) { str_v_dump += std::to_string(val) + " "; }
1817 }
1818 }
1819 } else {
1820 Logger::warning("[KVDUMP_POST_BATCH_PREFILL] L" + std::to_string(first_gpu_layer_model_idx) +
1821 " cannot log K/V cache: null pointers, log_elems_kv <= 0, or local_n_kv_dim_for_log <=0.");
1822 }
1823 } else {
1824 Logger::warning("[KVDUMP_POST_BATCH_PREFILL] First GPU layer index " + std::to_string(first_gpu_layer_model_idx) +
1825 " out of bounds for kv_cache->layers (size " + std::to_string(kv_cache->layers.size()) + ")");
1826 }
1827 }
1828 gpuErrchk(cudaFree(d_logits_last_token));
1829 Logger::info("[FWD_DEV_BATCH_PREFILL_EXIT] Function finished.");
1830 return h_logits;
1831}
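
Note that the prefill path above returns logits only for the last token of the batch: it offsets into the normalized activations by (num_tokens_in_batch - 1) * hidden_size and runs a single matrix-vector product against the LM head. A CPU-side sketch of that step is shown below for clarity; it is purely illustrative (the real code runs matvec_f32_f32_cuda on device buffers) and assumes a row-major [vocab_size, hidden_size] F32 LM head.

    #include <cstddef>
    #include <vector>

    static std::vector<float> last_token_logits_sketch(
        const std::vector<float>& normed_batch,  // [num_tokens, hidden_size]
        const std::vector<float>& lm_head,       // [vocab_size, hidden_size]
        int num_tokens, int hidden_size, int vocab_size) {
      // Activation row of the last token in the batch.
      const float* x = normed_batch.data() + (size_t)(num_tokens - 1) * hidden_size;
      std::vector<float> logits(vocab_size);
      for (int v = 0; v < vocab_size; ++v) {
        float acc = 0.0f;
        for (int i = 0; i < hidden_size; ++i)
          acc += lm_head[(size_t)v * hidden_size + i] * x[i];
        logits[v] = acc;
      }
      return logits;
    }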
1832std::vector<std::vector<float>> TinyLlamaModel::forward_device_batch_generation(
1833 float* d_batch_input_embeddings, // Device pointer to [num_tokens_in_batch, config_.hidden_size]
1834 const std::vector<int>& token_positions, // Position of each token in its respective sequence
1835 const std::vector<int>& original_sequence_indices, // Original sequence index for each token
1836 int num_tokens_in_batch,
1837 KVCache* kv_cache,
1838 cudaStream_t stream) {
1839 Logger::info("[FWD_DEV_BATCH_GENERATION_ENTRY] num_tokens_in_batch: " + std::to_string(num_tokens_in_batch) +
1840 ", d_batch_input_embeddings: " + Logger::ptrToString(d_batch_input_embeddings));
1841
1842 if (token_positions.size() != static_cast<size_t>(num_tokens_in_batch)) {
1843 Logger::error("[FWD_DEV_BATCH_GENERATION] token_positions size mismatch. Expected: " +
1844 std::to_string(num_tokens_in_batch) + " Got: " + std::to_string(token_positions.size()));
1845 return {};
1846 }
1847 if (original_sequence_indices.size() != static_cast<size_t>(num_tokens_in_batch)) {
1848 Logger::error("[FWD_DEV_BATCH_GENERATION] original_sequence_indices size mismatch. Expected: " +
1849 std::to_string(num_tokens_in_batch) + " Got: " + std::to_string(original_sequence_indices.size()));
1850 return {};
1851 }
1852 const int hidden_size = config_.hidden_size;
1853 const int head_dim = config_.hidden_size / config_.num_attention_heads;
1854 const int ffn_intermediate_dim = config_.intermediate_size;
1855 const int n_kv_dim = config_.num_key_value_heads * head_dim;
1856 const int vocab_size = config_.vocab_size;
1857
1858 const size_t batch_hidden_size_bytes = (size_t)num_tokens_in_batch * hidden_size * sizeof(float);
1859 const size_t batch_intermediate_size_bytes = (size_t)num_tokens_in_batch * ffn_intermediate_dim * sizeof(float);
1860 const size_t batch_q_size_bytes = (size_t)num_tokens_in_batch * hidden_size * sizeof(float);
1861 const size_t batch_kv_size_bytes = (size_t)num_tokens_in_batch * n_kv_dim * sizeof(float);
1862
1863 float* d_batch_x_norm_out_attn;
1864 float* d_batch_q_proj_out;
1865 float* d_batch_k_proj_out;
1866 float* d_batch_v_proj_out;
1867 float* d_batch_attn_heads_concat_out;
1868 float* d_batch_attn_final_proj_out;
1869 float* d_batch_residual_attn_in;
1870 float* d_batch_residual_ffn_in;
1871 float* d_batch_x_norm_out_ffn;
1872 float* d_batch_ffn_gate_proj_out;
1873 float* d_batch_ffn_up_proj_out;
1874 float* d_batch_ffn_swiglu_out;
1875 float* d_batch_ffn_down_proj_out;
1876 float* d_batch_layer_output = nullptr;
1877
1878 resize_persistent_batch_buffers_if_needed(num_tokens_in_batch);
1879
1880 d_batch_x_norm_out_attn = d_persistent_batch_norm_out_;
1881 d_batch_q_proj_out = d_persistent_q_batch_;
1882 d_batch_k_proj_out = d_persistent_k_batch_;
1883 d_batch_v_proj_out = d_persistent_v_batch_;
1884 d_batch_attn_heads_concat_out = d_persistent_attn_output_;
1885 d_batch_attn_final_proj_out = d_persistent_attn_proj_out_;
1886 d_batch_residual_attn_in = d_persistent_batch_residual_;
1887 d_batch_residual_ffn_in = d_persistent_batch_residual_ + num_tokens_in_batch * hidden_size;
1888 d_batch_x_norm_out_ffn = d_persistent_batch_norm_out_;
1889 d_batch_ffn_gate_proj_out = d_persistent_gate_proj_out_;
1890 d_batch_ffn_up_proj_out = d_persistent_up_proj_out_;
1891 d_batch_ffn_swiglu_out = d_persistent_swiglu_out_;
1892 d_batch_ffn_down_proj_out = d_persistent_mlp_down_out_;
1893 d_batch_layer_output = d_persistent_batch_input_;
1894
1895 const float alpha = 1.0f, beta = 0.0f;
1896
1897 cublasStatus_t stream_status = cublasSetStream(cublas_handle_, stream);
1898 if (stream_status != CUBLAS_STATUS_SUCCESS) {
1899 Logger::fatal("cublasSetStream failed in forward_device_batch_generation");
1900 throw std::runtime_error("cublasSetStream failed");
1901 }
1902
1903 float* d_batch_x_ptr = d_batch_input_embeddings;
1904
1905 Logger::info("[FWD_DEV_BATCH_GENERATION_MAIN_LOOP_ENTRY] num_cpu_offload_layers: " + std::to_string(config_.num_cpu_offload_layers) +
1906 ", total_hidden_layers: " + std::to_string(config_.num_hidden_layers));
1907
1908 for (int l_model_idx = config_.num_cpu_offload_layers; l_model_idx < config_.num_hidden_layers; ++l_model_idx) {
1909 int l_gpu_idx = l_model_idx - config_.num_cpu_offload_layers;
1910 Logger::info("[FWD_DEV_BATCH_GENERATION_LAYER_START] Processing Layer: model_idx=" + std::to_string(l_model_idx) + ", gpu_idx=" + std::to_string(l_gpu_idx) +
1911 ". Current d_batch_x_ptr: " + Logger::ptrToString(d_batch_x_ptr));
1912
1913 gpuErrchk(cudaMemcpyAsync(d_batch_residual_attn_in, d_batch_x_ptr, batch_hidden_size_bytes, cudaMemcpyDeviceToDevice, stream));
1914
1915 rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
1916 layers[l_model_idx].input_layernorm_dev,
1917 num_tokens_in_batch, hidden_size, config_.rms_norm_eps, stream);
1919
1920 const float* w_q_layer_ptr = w_q_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
1921 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, hidden_size, hidden_size, &alpha,
1922 d_batch_x_norm_out_attn, hidden_size, w_q_layer_ptr, hidden_size, &beta,
1923 d_batch_q_proj_out, hidden_size, stream, "Q_PROJ_GEN");
1924
1925 const float* w_k_layer_ptr = w_k_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
1926 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, n_kv_dim, hidden_size, &alpha,
1927 d_batch_x_norm_out_attn, hidden_size, w_k_layer_ptr, n_kv_dim, &beta,
1928 d_batch_k_proj_out, n_kv_dim, stream, "K_PROJ_GEN");
1929
1930 const float* w_v_layer_ptr = w_v_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
1931 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, n_kv_dim, hidden_size, &alpha,
1932 d_batch_x_norm_out_attn, hidden_size, w_v_layer_ptr, n_kv_dim, &beta,
1933 d_batch_v_proj_out, n_kv_dim, stream, "V_PROJ_GEN");
1934
1935 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
1936 int current_pos = token_positions[token_idx];
1937
1938 float* q_token_ptr = d_batch_q_proj_out + (size_t)token_idx * config_.num_attention_heads * head_dim;
1939 float* k_token_ptr = d_batch_k_proj_out + (size_t)token_idx * config_.num_key_value_heads * head_dim;
1940
1941 rope_cuda(q_token_ptr, config_.num_attention_heads, head_dim, all_freqs_cis_dev,
1942 current_pos, config_.is_gguf_file_loaded, stream);
1943 rope_cuda(k_token_ptr, config_.num_key_value_heads, head_dim, all_freqs_cis_dev,
1944 current_pos, config_.is_gguf_file_loaded, stream);
1945 }
1946// Update KV cache for each token at its specific position with sequence-aware offsets
1947float* d_layer_k_cache_ptr = kv_cache->layers[l_model_idx].k_dev_fp32;
1948float* d_layer_v_cache_ptr = kv_cache->layers[l_model_idx].v_dev_fp32;
1949
1950for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
1951 int current_pos = token_positions[token_idx];
1952 int sequence_idx = original_sequence_indices[token_idx];
1953
1954 // Calculate sequence-specific offset in the cache
1955 int sequence_cache_offset = sequence_idx * kv_cache->max_seq_len_config_;
1956 int actual_cache_pos = sequence_cache_offset + current_pos;
1957
1958 const float* k_token_ptr = d_batch_k_proj_out + (size_t)token_idx * config_.num_key_value_heads * head_dim;
1959 const float* v_token_ptr = d_batch_v_proj_out + (size_t)token_idx * config_.num_key_value_heads * head_dim;
1960
1961 // Update individual positions in the KV cache with sequence-specific offsets
1962 for (int kvh = 0; kvh < config_.num_key_value_heads; ++kvh) {
1963 const float* current_k_head_ptr = k_token_ptr + kvh * head_dim;
1964 const float* current_v_head_ptr = v_token_ptr + kvh * head_dim;
1965
1966 update_kv_cache_cuda(d_layer_k_cache_ptr, current_k_head_ptr, actual_cache_pos,
1967 kvh, kv_cache->allocated_max_seq_len * kv_cache->max_batch_size,
1968 kv_cache->allocated_num_kv_heads,
1969 kv_cache->allocated_head_dim, stream);
1970
1971 update_kv_cache_cuda(d_layer_v_cache_ptr, current_v_head_ptr, actual_cache_pos,
1972 kvh, kv_cache->allocated_max_seq_len * kv_cache->max_batch_size,
1973 kv_cache->allocated_num_kv_heads,
1974 kv_cache->allocated_head_dim, stream);
1975 }
1976}
1977 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
1978 int current_pos = token_positions[token_idx];
1979
1980 float* q_token_ptr = d_batch_q_proj_out + (size_t)token_idx * config_.num_attention_heads * head_dim;
1981 float* attn_output_token_ptr = d_batch_attn_heads_concat_out + (size_t)token_idx * config_.num_attention_heads * head_dim;
1982
1983 float scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));
1984
1986 selective_k_dequant_buffer_dev_ && selective_v_dequant_buffer_dev_) {
1987 KVCacheLayer& batch_kv_layer = kv_cache->layers[l_model_idx];
1988 attention_cuda_selective_dequant(
1989 q_token_ptr,
1990 batch_kv_layer.k_dev_quantized,
1991 batch_kv_layer.v_dev_quantized,
1992 batch_kv_layer.k_dev_scales,
1993 batch_kv_layer.v_dev_scales,
1994 selective_k_dequant_buffer_dev_,
1995 selective_v_dequant_buffer_dev_,
1996 attn_output_token_ptr,
1998 current_pos + 1,
1999 head_dim,
2000 scale,
2001 kv_cache->allocated_max_seq_len,
2002 kv_cache->allocated_num_kv_heads,
2003 stream
2004 );
2005 } else {
2006 // Use optimized kernels if enabled, fallback to standard if needed
2008 attention_cuda_optimized(q_token_ptr, d_layer_k_cache_ptr, d_layer_v_cache_ptr,
2009 attn_output_token_ptr, config_.num_attention_heads, current_pos + 1, head_dim,
2010 scale, kv_cache->allocated_max_seq_len, kv_cache->allocated_num_kv_heads, stream);
2011 } else {
2012 attention_cuda(q_token_ptr, d_layer_k_cache_ptr, d_layer_v_cache_ptr,
2013 attn_output_token_ptr, config_.num_attention_heads, current_pos + 1, head_dim,
2014 scale, kv_cache->allocated_max_seq_len, kv_cache->allocated_num_kv_heads, stream);
2015 }
2016 }
2017 }
2018
2019 const float* w_o_layer_ptr = w_o_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
2020 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, hidden_size, hidden_size, &alpha,
2021 d_batch_attn_heads_concat_out, hidden_size, w_o_layer_ptr, hidden_size, &beta,
2022 d_batch_attn_final_proj_out, hidden_size, stream, "O_PROJ");
2023
2024 add_residual_batch_cuda(d_batch_residual_ffn_in, d_batch_attn_final_proj_out, d_batch_residual_attn_in,
2025 num_tokens_in_batch, hidden_size, stream);
2026
2027 rmsnorm_batch_cuda(d_batch_x_norm_out_ffn, d_batch_residual_ffn_in,
2028 layers[l_model_idx].post_attention_layernorm_dev,
2029 num_tokens_in_batch, hidden_size, config_.rms_norm_eps, stream);
2030
2031 const float* w1_layer_ptr = w_gate_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
2032 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, ffn_intermediate_dim, hidden_size, &alpha,
2033 d_batch_x_norm_out_ffn, hidden_size, w1_layer_ptr, ffn_intermediate_dim, &beta,
2034 d_batch_ffn_gate_proj_out, ffn_intermediate_dim, stream, "FFN_GATE_PROJ_GEN");
2035
2036 const float* w3_layer_ptr = w_up_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
2037 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, ffn_intermediate_dim, hidden_size, &alpha,
2038 d_batch_x_norm_out_ffn, hidden_size, w3_layer_ptr, ffn_intermediate_dim, &beta,
2039 d_batch_ffn_up_proj_out, ffn_intermediate_dim, stream, "FFN_UP_PROJ_GEN");
2040
2041 swiglu_batch_cuda(d_batch_ffn_swiglu_out, d_batch_ffn_gate_proj_out, d_batch_ffn_up_proj_out,
2042 num_tokens_in_batch, ffn_intermediate_dim, stream);
2043
2044 const float* w2_layer_ptr = w_down_f32_dev_ + (size_t)l_gpu_idx * ffn_intermediate_dim * hidden_size;
2045 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, hidden_size, ffn_intermediate_dim, &alpha,
2046 d_batch_ffn_swiglu_out, ffn_intermediate_dim, w2_layer_ptr, hidden_size, &beta,
2047 d_batch_ffn_down_proj_out, hidden_size, stream, "FFN_DOWN_PROJ_GEN");
2048
2049 add_residual_batch_cuda(d_batch_layer_output, d_batch_ffn_down_proj_out, d_batch_residual_ffn_in,
2050 num_tokens_in_batch, hidden_size, stream);
2051
2052 d_batch_x_ptr = d_batch_layer_output;
2053 Logger::info("[FWD_DEV_BATCH_GENERATION_LAYER_END] Layer " + std::to_string(l_model_idx) + " finished. Next d_batch_x_ptr: " + Logger::ptrToString(d_batch_x_ptr));
2054 }
2055
2056 rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
2057 final_norm_dev,
2058 num_tokens_in_batch, hidden_size, config_.rms_norm_eps, stream);
2059
2060 // Calculate logits for ALL tokens in the batch (not just the last one)
2061 float* d_logits_batch;
2062 gpuErrchk(cudaMalloc(&d_logits_batch, (size_t)num_tokens_in_batch * vocab_size * sizeof(float)));
2063
2064 // Use GEMM instead of individual MatVec calls for efficiency
2065 smart_gemm_batch_cuda(false, true, num_tokens_in_batch, vocab_size, hidden_size, &alpha,
2066 d_batch_x_norm_out_attn, hidden_size, lm_head_f32_dev_, vocab_size, &beta,
2067 d_logits_batch, vocab_size, stream, "LM_HEAD_GEN");
2068
2069 // Copy logits back to host for all tokens
2070 std::vector<std::vector<float>> all_logits(num_tokens_in_batch, std::vector<float>(vocab_size));
2071 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
2072 gpuErrchk(cudaMemcpyAsync(all_logits[token_idx].data(),
2073 d_logits_batch + (size_t)token_idx * vocab_size,
2074 vocab_size * sizeof(float),
2075 cudaMemcpyDeviceToHost, stream));
2076 }
2077 gpuErrchk(cudaStreamSynchronize(stream));
2078
2079 Logger::info("[FWD_DEV_BATCH_GENERATION_FINAL_LOGITS] Calculated logits for " + std::to_string(num_tokens_in_batch) + " tokens");
2080 gpuErrchk(cudaFree(d_logits_batch));
2081 Logger::info("[FWD_DEV_BATCH_GENERATION_EXIT] Function finished.");
2082 return all_logits;
2083}
2084#endif // HAS_CUDA
2085
2087 const std::vector<float>& batch_input_activations, // [num_tokens, hidden_size]
2088 int num_tokens_in_batch,
2089 int num_cpu_layers_to_process,
2090 int start_pos_in_sequence, // Starting position of this batch in the overall sequence (for KVCache)
2091 KVCache* kv_cache,
2092 const std::vector<int>& prompt_lengths) {
2093
2094 if (!cpu_batch_processor_) {
2095 cpu_batch_processor_ = std::make_unique<CPUBatchProcessor>(this);
2096 }
2097
2098 return cpu_batch_processor_->forward_cpu_batch(
2099 batch_input_activations,
2100 num_tokens_in_batch,
2101 num_cpu_layers_to_process,
2102 start_pos_in_sequence,
2103 kv_cache,
2104 prompt_lengths
2105 );
2106}
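
The wrapper above constructs its CPUBatchProcessor lazily on first use and then delegates every call to it. A minimal sketch of that pattern, with illustrative stand-in names rather than the project's classes:

    #include <memory>

    struct WorkerSketch {
      int run(int n) { return n; }  // stand-in for the delegated forward pass
    };

    class OwnerSketch {
     public:
      int forward(int n) {
        if (!worker_) worker_ = std::make_unique<WorkerSketch>();  // construct once, on demand
        return worker_->run(n);                                    // then always delegate
      }
     private:
      std::unique_ptr<WorkerSketch> worker_;
    };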
2107
2108// Smart GEMM wrapper that chooses between BF16 Tensor Cores and FP32 based on batch size
2109void TinyLlamaModel::smart_gemm_batch_cuda(bool transa_user, bool transb_user,
2110 int m_user, int n_user, int k_user,
2111 const float* alpha_user,
2112 const float* A_f32_user, int lda_user,
2113 const float* B_f32_user, int ldb_user,
2114 const float* beta_user,
2115 float* C_f32_user, int ldc_user,
2116 cudaStream_t stream,
2117 const char* operation_name) {
2118
2119 // Tensor Cores are most beneficial for larger batch sizes
2120 const int tensor_core_threshold = 4; // Use BF16 Tensor Cores when batch size >= 4
2121 bool use_tensor_cores = use_bf16_tensor_cores_ && (m_user >= tensor_core_threshold);
2122
2123 // Determine which BF16 weight to use based on operation name and weight pointer
2124 uint16_t* bf16_weight_ptr = nullptr;
2125
2126 if (use_tensor_cores) {
2127 Logger::info("[SMART_GEMM] Using BF16 Tensor Cores for " + std::string(operation_name) +
2128 " (batch_size=" + std::to_string(m_user) + " >= " + std::to_string(tensor_core_threshold) + ")");
2129
2130 // Ensure BF16 weights are loaded
2132
2133 // Map F32 weight pointer to corresponding BF16 weight pointer
2134 if (B_f32_user == w_q_f32_dev_) {
2135 bf16_weight_ptr = w_q_bf16_dev_;
2136 } else if (B_f32_user == w_k_f32_dev_) {
2137 bf16_weight_ptr = w_k_bf16_dev_;
2138 } else if (B_f32_user == w_v_f32_dev_) {
2139 bf16_weight_ptr = w_v_bf16_dev_;
2140 } else if (B_f32_user == w_o_f32_dev_) {
2141 bf16_weight_ptr = w_o_bf16_dev_;
2142 } else if (B_f32_user == w_gate_f32_dev_) {
2143 bf16_weight_ptr = w_gate_bf16_dev_;
2144 } else if (B_f32_user == w_up_f32_dev_) {
2145 bf16_weight_ptr = w_up_bf16_dev_;
2146 } else if (B_f32_user == w_down_f32_dev_) {
2147 bf16_weight_ptr = w_down_bf16_dev_;
2148 } else {
2149 // Map layer-specific pointers by calculating offset from base pointer
2150 size_t offset_bytes = 0;
2151 bool found = false;
2152
2153 // Check if it's a layer-specific pointer within the concatenated weights
2154 if (B_f32_user >= w_q_f32_dev_ && B_f32_user < (w_q_f32_dev_ + config_.num_hidden_layers * config_.hidden_size * config_.hidden_size)) {
2155 offset_bytes = (B_f32_user - w_q_f32_dev_) * sizeof(float);
2156 bf16_weight_ptr = w_q_bf16_dev_ + (offset_bytes / sizeof(float)); // BF16 copy has the same element count, so index by float elements
2157 found = true;
2158 } else if (B_f32_user >= w_k_f32_dev_ && w_k_f32_dev_ && B_f32_user < (w_k_f32_dev_ + config_.num_hidden_layers * config_.hidden_size * config_.num_key_value_heads * (config_.hidden_size / config_.num_attention_heads))) {
2159 offset_bytes = (B_f32_user - w_k_f32_dev_) * sizeof(float);
2160 bf16_weight_ptr = w_k_bf16_dev_ + (offset_bytes / sizeof(float));
2161 found = true;
2162 } else if (B_f32_user >= w_v_f32_dev_ && w_v_f32_dev_ && B_f32_user < (w_v_f32_dev_ + config_.num_hidden_layers * config_.hidden_size * config_.num_key_value_heads * (config_.hidden_size / config_.num_attention_heads))) {
2163 offset_bytes = (B_f32_user - w_v_f32_dev_) * sizeof(float);
2164 bf16_weight_ptr = w_v_bf16_dev_ + (offset_bytes / sizeof(float));
2165 found = true;
2166 } else if (B_f32_user >= w_o_f32_dev_ && w_o_f32_dev_ && B_f32_user < (w_o_f32_dev_ + config_.num_hidden_layers * config_.hidden_size * config_.hidden_size)) {
2167 offset_bytes = (B_f32_user - w_o_f32_dev_) * sizeof(float);
2168 bf16_weight_ptr = w_o_bf16_dev_ + (offset_bytes / sizeof(float));
2169 found = true;
2170 } else if (B_f32_user >= w_gate_f32_dev_ && w_gate_f32_dev_ && B_f32_user < (w_gate_f32_dev_ + config_.num_hidden_layers * config_.hidden_size * config_.intermediate_size)) {
2171 offset_bytes = (B_f32_user - w_gate_f32_dev_) * sizeof(float);
2172 bf16_weight_ptr = w_gate_bf16_dev_ + (offset_bytes / sizeof(float));
2173 found = true;
2174 } else if (B_f32_user >= w_up_f32_dev_ && w_up_f32_dev_ && B_f32_user < (w_up_f32_dev_ + config_.num_hidden_layers * config_.hidden_size * config_.intermediate_size)) {
2175 offset_bytes = (B_f32_user - w_up_f32_dev_) * sizeof(float);
2176 bf16_weight_ptr = w_up_bf16_dev_ + (offset_bytes / sizeof(float));
2177 found = true;
2178 } else if (B_f32_user >= w_down_f32_dev_ && w_down_f32_dev_ && B_f32_user < (w_down_f32_dev_ + config_.num_hidden_layers * config_.intermediate_size * config_.hidden_size)) {
2179 offset_bytes = (B_f32_user - w_down_f32_dev_) * sizeof(float);
2180 bf16_weight_ptr = w_down_bf16_dev_ + (offset_bytes / sizeof(float));
2181 found = true;
2182 }
2183
2184 if (!found) {
2185 // If we can't identify the weight, fall back to FP32
2186 Logger::warning("[SMART_GEMM] Unknown weight pointer for " + std::string(operation_name) +
2187 ", falling back to FP32");
2188 use_tensor_cores = false;
2189 }
2190 }
2191
2192 if (use_tensor_cores && bf16_weight_ptr) {
2193 try {
2194 // Use mixed precision: FP32 input x BF16 weights = FP32 output
2195 gemm_f32_to_bf16_f32_cuda(cublas_handle_, transa_user, transb_user,
2196 m_user, n_user, k_user, alpha_user,
2197 A_f32_user, lda_user, bf16_weight_ptr, ldb_user,
2198 beta_user, C_f32_user, ldc_user, stream);
2199 Logger::info("[SMART_GEMM] Successfully used BF16 Tensor Cores for " + std::string(operation_name));
2200 return;
2201 } catch (const std::exception& e) {
2202 Logger::warning("[SMART_GEMM] BF16 Tensor Cores failed for " + std::string(operation_name) +
2203 ": " + e.what() + ". Falling back to FP32.");
2204 use_tensor_cores = false;
2205 }
2206 }
2207 }
2208
2209 // Fallback to standard FP32 GEMM
2210 Logger::info("[SMART_GEMM] Using FP32 GEMM for " + std::string(operation_name) +
2211 " (batch_size=" + std::to_string(m_user) + " < " + std::to_string(tensor_core_threshold) +
2212 " or Tensor Cores unavailable)");
2213 gemm_f32_f32_cuda(cublas_handle_, transa_user, transb_user,
2214 m_user, n_user, k_user, alpha_user,
2215 A_f32_user, lda_user, B_f32_user, ldb_user,
2216 beta_user, C_f32_user, ldc_user, stream);
2217}
2218
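Note on the weight-pointer translation above (listing lines 2149-2182): the F32 and BF16 concatenated weight buffers are assumed to hold the same weights in the same order, so a pointer into the F32 buffer maps to its BF16 counterpart by element offset alone; for the K/V projections the per-layer slice spans hidden_size * num_key_value_heads * (hidden_size / num_attention_heads) elements, matching the bounds checked in the listing. The following standalone sketch is illustrative only (not project code): it uses a truncation-based BF16 encoding and made-up buffer sizes to show the arithmetic.

// Minimal, self-contained sketch (not project code): given a pointer into a
// concatenated F32 weight buffer, the matching BF16 weights live at the same
// element index in a parallel BF16 buffer, so only the element offset matters.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Hypothetical BF16 encoding used for the sketch: keep the upper 16 bits of the
// IEEE-754 float (truncation; real converters may round to nearest even).
static uint16_t f32_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

static float bf16_to_f32(uint16_t h) {
  uint32_t bits = static_cast<uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  const int num_layers = 4;
  const int per_layer_elems = 8;  // stands in for hidden_size * hidden_size, etc.

  // Concatenated F32 buffer and its BF16 mirror, same element order.
  std::vector<float> w_f32(num_layers * per_layer_elems);
  for (size_t i = 0; i < w_f32.size(); ++i) w_f32[i] = 0.25f * static_cast<float>(i);
  std::vector<uint16_t> w_bf16(w_f32.size());
  for (size_t i = 0; i < w_f32.size(); ++i) w_bf16[i] = f32_to_bf16(w_f32[i]);

  // A caller hands us a pointer to layer 2's slice of the F32 buffer.
  const float* layer2_f32 = w_f32.data() + 2 * per_layer_elems;

  // Translate it: the element offset from the F32 base indexes the BF16 buffer
  // directly (dividing a byte offset by sizeof(uint16_t) would land twice too far).
  size_t elem_offset = static_cast<size_t>(layer2_f32 - w_f32.data());
  const uint16_t* layer2_bf16 = w_bf16.data() + elem_offset;

  // The two slices describe the same weights (up to BF16 precision).
  assert(bf16_to_f32(layer2_bf16[0]) == w_f32[2 * per_layer_elems]);  // 4.0 is exact in BF16
  std::cout << "layer 2, element 0: f32=" << layer2_f32[0]
            << " bf16->f32=" << bf16_to_f32(layer2_bf16[0]) << "\n";
  return 0;
}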
#define SAFE_SQRT(x)
static void log_vector_stats(const std::string &name, const std::vector< float > &v, int n_show=5)
Definition logger.cpp:160
static void debug(const std::string &message)
Definition logger.cpp:131
static void warning(const std::string &message)
Definition logger.cpp:139
static std::string ptrToString(const void *ptr)
Definition logger.cpp:225
static void info(const std::string &message)
Definition logger.cpp:135
static void error(const std::string &message)
Definition logger.cpp:143
static void fatal(const std::string &message)
Definition logger.cpp:151
Main class for loading tensors from SafeTensors format files (single or sharded).
static bool load_model_config_from_json(const std::string &model_path_or_dir, ModelConfig &config_to_populate)
Loads model configuration from a JSON file corresponding to a .safetensors model path.
bool use_bf16_tensor_cores_
Definition model.h:481
~TinyLlamaModel()
Destructor. Cleans up all allocated resources.
Definition model.cpp:330
std::vector< block_q6_K > embed_tokens_q6k
Definition model.h:488
void ensure_up_proj_dequantized(int layer_idx)
std::vector< float > final_norm_f32
Definition model.h:486
std::vector< uint16_t > final_norm
Definition model.h:485
void ensure_v_proj_dequantized(int layer_idx)
std::vector< block_q4_K > lm_head_q4k
Definition model.h:487
std::vector< float > forward_cpu_logits_batch(const std::vector< float > &final_batch_activations, int num_tokens_in_batch)
Definition model.cpp:1063
friend void map_gguf_weights(const GGUFData &gguf, TinyLlamaModel &model)
std::vector< block_q6_K > lm_head_q6k
Definition model.h:488
std::vector< std::pair< float, float > > precomputed_freqs_cis_
Definition model.h:554
TinyLlamaModel(const ModelConfig &config, const SafeTensorsLoader &loader)
Construct a TinyLlamaModel from a SafeTensorsLoader.
Definition model.cpp:144
std::string model_path_
Definition model.h:557
std::vector< LayerWeights > layers
Definition model.h:491
std::vector< block_q8_0 > embed_tokens_q8_0
Definition model.h:489
ModelConfig config_
Definition model.h:480
void ensure_o_proj_dequantized(int layer_idx)
void clear_layer_dequantized_weights(int layer_idx)
std::vector< block_q4_K > embed_tokens_q4k
Definition model.h:487
void smart_gemm_batch_cuda(bool transa_user, bool transb_user, int m_user, int n_user, int k_user, const float *alpha_user, const float *A_f32_user, int lda_user, const float *B_f32_user, int ldb_user, const float *beta_user, float *C_f32_user, int ldc_user, cudaStream_t stream, const char *operation_name="GEMM")
Definition model.cpp:2109
void ensure_k_proj_dequantized(int layer_idx)
std::unique_ptr< class CPUBatchProcessor > cpu_batch_processor_
Definition model.h:560
std::vector< block_q8_0 > lm_head_q8_0
Definition model.h:489
std::vector< uint16_t > lm_head
Definition model.h:484
void ensure_f32_concatenated_weights_loaded()
std::vector< std::vector< float > > forward_cpu_batch_generation(const std::vector< float > &batch_input_activations, const std::vector< int > &token_positions, const std::vector< int > &original_sequence_indices, int num_tokens_in_batch, KVCache *kv_cache)
Definition model.cpp:1127
std::vector< uint16_t > embed_tokens
Definition model.h:483
std::vector< block_q8_K > embed_tokens_q8k
Definition model.h:490
void ensure_bf16_concatenated_weights_loaded()
void ensure_q_proj_dequantized(int layer_idx)
void initialize_weights(const SafeTensorsLoader *loader, const GGUFData *gguf)
Definition model.cpp:38
std::vector< float > forward_cpu_batch(const std::vector< float > &batch_input_activations, int num_tokens_in_batch, int num_cpu_layers_to_process, int start_pos_in_sequence, KVCache *kv_cache, const std::vector< int > &prompt_lengths={})
Definition model.cpp:2086
void ensure_down_proj_dequantized(int layer_idx)
void ensure_gate_proj_dequantized(int layer_idx)
std::vector< float > embed_tokens_f32
Definition model.h:486
std::vector< float > forward(std::vector< float > &input, int n_tokens, KVCache *kv_cache, const std::vector< int > *attention_mask)
Run the forward pass for the model on CPU layers.
Definition model.cpp:536
void ensure_lm_head_dequantized()
std::unique_ptr< GGUFData > gguf_data_
Definition model.h:556
std::vector< float > lm_head_f32
Definition model.h:486
std::vector< block_q8_K > lm_head_q8k
Definition model.h:490
GGUFData load_gguf_meta(const std::string &filename, bool use_mmap)
Loads GGUF metadata and optionally memory-maps tensor data.
Parser for GGUF (GPT-Generated Unified Format) files.
Logging utilities for the TinyLlama implementation.
ModelConfig parse_model_config_from_gguf(const GGUFData &gguf)
void log_vector_summary(const std::string &name, const std::vector< float > &v, int head_count=5)
Definition utils.cpp:207
Constants used throughout the TinyLlama model implementation.
void dequantize_vector_q6k_to_f32(const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q6_K blocks to a vector of float32.
void dequantize_vector_q8_0_to_f32(const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q8_0 blocks to a vector of float32.
void dequantize_vector_q4k_to_f32(const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q4_K blocks to a vector of float32.
void dequantize_q8_k(const std::vector< block_q8_K > &q_data, std::vector< float > &x, int n, bool log_this_block)
Weight quantization structures and functions for model compression.
SafeTensors format loader for efficient tensor loading, supporting single and sharded models.
Complete representation of a GGUF file's contents.
std::vector< uint8_t > tensor_data
void * mapped_tensor_data
Key-Value cache for a single transformer layer.
Definition model.h:130
std::vector< float > v
Definition model.h:132
std::vector< float > k
Definition model.h:131
Complete Key-Value cache for all transformer layers.
Definition model.h:151
int max_batch_size
Definition model.h:159
int max_seq_len_config_
Definition model.h:163
std::vector< KVCacheLayer > layers
Definition model.h:152
int seq_len
Definition model.h:155
std::vector< int > batch_seq_lens
Definition model.h:158
int current_batch_size
Definition model.h:160
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
int hidden_size
Definition model.h:81
int vocab_size
Definition model.h:86
std::string architecture
Definition model.h:96
float rms_norm_eps
Definition model.h:88
int num_attention_heads
Definition model.h:83
bool use_mmap_for_gguf
Definition model.h:102
int intermediate_size
Definition model.h:82
int num_cpu_offload_layers
Definition model.h:104
bool enable_memory_efficient_layers
Definition model.h:107
bool is_gguf_file_loaded
Definition model.h:101
bool use_kvcache_quantization
Definition model.h:103
int num_hidden_layers
Definition model.h:85
bool use_optimized_cuda_kernels
Definition model.h:110
int num_key_value_heads
Definition model.h:84
int max_position_embeddings
Definition model.h:87
void apply_rope_vector(std::vector< float > &x, int num_heads, int head_dim, int current_token_pos, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
Definition utils.cpp:428
void matvec_q8k_f32_vector_cpu(const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:399
void matvec_q6k_f32_vector_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:763
void matvec_bf16_f32_vector_cpu(const std::vector< uint16_t > &mat_bf16, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
Definition utils.cpp:1025
void matvec_f32_f32_vector_cpu(const std::vector< float > &mat_f32, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
Definition utils.cpp:349
void simd_scaled_add(float *dst, const float *src, float scale, int n)
Definition utils.cpp:92
void matmul_q4k_f32_batch_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:988
std::vector< float > bf16vec_to_float_vec(const std::vector< uint16_t > &v_bf16)
Definition utils.cpp:198
void softmax_vector_cpu(const std::vector< float > &x, std::vector< float > &out)
Definition utils.cpp:675
float simd_dot_product(const float *a, const float *b, int n)
Definition utils.cpp:35
void matvec_q4k_f32_vector_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:816
void matmul_q8_0_f32_batch_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:869
void matmul_f32_f32_batch_cpu(const std::vector< float > &mat_weights, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:709
void silu_cpu(const std::vector< float > &x, std::vector< float > &out)
Definition utils.cpp:700
void matmul_q6k_f32_batch_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:950
void rmsnorm_batch_cpu(const std::vector< float > &x_batch, const std::vector< float > &weight, std::vector< float > &out_batch, int num_tokens, int hidden_size, float eps)
Definition utils.cpp:613
void rmsnorm_vector_cpu(const std::vector< float > &x, const std::vector< float > &weight, std::vector< float > &out, float eps)
Definition utils.cpp:648
void matvec_q8_0_f32_vector_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:293
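The member index above lists BF16 weight storage as std::vector<uint16_t> (for example embed_tokens, final_norm, lm_head) together with helpers such as bf16vec_to_float_vec and matvec_bf16_f32_vector_cpu, and the GEMM dispatcher logs its scheme as "FP32 input x BF16 weights = FP32 output". The sketch below is a minimal CPU reference of that mixed-precision idea, assuming the common BF16 layout (the high 16 bits of an IEEE-754 float32); the function name and row-major layout here are illustrative and are not asserted to match the project's matvec_bf16_f32_vector_cpu.

// Minimal CPU sketch of the "FP32 activations x BF16 weights -> FP32 output"
// scheme referenced above. Assumes BF16 is stored as the high 16 bits of an
// IEEE-754 float32 (hence the shift); names here are illustrative only.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

static float bf16_to_f32(uint16_t h) {
  uint32_t bits = static_cast<uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// out[r] = sum over c of W_bf16[r * cols + c] * x_f32[c], accumulated in float.
static void matvec_bf16_f32_ref(const std::vector<uint16_t>& w_bf16,
                                const std::vector<float>& x_f32,
                                std::vector<float>& out_f32,
                                int rows, int cols) {
  out_f32.assign(rows, 0.0f);
  for (int r = 0; r < rows; ++r) {
    float acc = 0.0f;  // FP32 accumulation confines the precision loss to the stored weights
    for (int c = 0; c < cols; ++c) {
      acc += bf16_to_f32(w_bf16[static_cast<size_t>(r) * cols + c]) * x_f32[c];
    }
    out_f32[r] = acc;
  }
}

int main() {
  // 2x3 weight matrix {1, 2, 3; 4, 5, 6} stored row-major as BF16.
  auto f32_to_bf16 = [](float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
  };
  std::vector<float> w = {1, 2, 3, 4, 5, 6};
  std::vector<uint16_t> w_bf16(w.size());
  for (size_t i = 0; i < w.size(); ++i) w_bf16[i] = f32_to_bf16(w[i]);

  std::vector<float> x = {1.0f, 0.5f, 2.0f};
  std::vector<float> y;
  matvec_bf16_f32_ref(w_bf16, x, y, /*rows=*/2, /*cols=*/3);
  std::cout << y[0] << " " << y[1] << "\n";  // expected: 8 and 18.5
  return 0;
}

Keeping the accumulator in FP32 is what makes the weight-only BF16 storage cheap in memory without compounding rounding error across the reduction; that is the property the BF16 tensor-core path in the listing relies on before it falls back to the plain FP32 GEMM.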