Logger::info("Processing weights from GGUF data source...");
Logger::info("[INIT_WEIGHTS_GGUF] map_gguf_weights(*gguf, *this) CALLED (using function parameter).");
Logger::info("[INIT_WEIGHTS_GGUF] map_gguf_weights(*gguf_data_, *this) CALLED (using member gguf_data_).");
Logger::error("[INIT_WEIGHTS_GGUF] map_gguf_weights failed - tensor data not available. No GGUF weights mapped.");
Logger::info("[INIT_WEIGHTS_GGUF] Using lazy dequantization to prevent OOM");
Logger::info("[INIT_WEIGHTS_GGUF_DEQUANT] Successfully converted final_norm (BF16) to final_norm_f32. Size: " + std::to_string(this->final_norm_f32.size()));
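// Lazy dequantization strategy: the small norm vectors are converted to F32 eagerly, while the
// large lm_head and per-layer projection tensors (deferred below) stay in their quantized GGUF
// form until the ensure_*_dequantized() helpers expand them on first use, keeping peak
// load-time memory low.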
Logger::info("[INIT_WEIGHTS_GGUF] Deferring lm_head dequantization until needed to save memory");
Logger::info("[INIT_WEIGHTS_GGUF] Deferring all layer weight dequantization until layers are used");
for (int l = 0; l < nhl; ++l) {
  if (lw.input_layernorm_f32.empty() && !lw.input_layernorm.empty()) {
    if (!lw.input_layernorm_f32.empty())
      Logger::info(" L" + std::to_string(l) + " input_layernorm_f32 populated from BF16. Size: " + std::to_string(lw.input_layernorm_f32.size()));
  if (lw.post_attention_layernorm_f32.empty() && !lw.post_attention_layernorm.empty()) {
    if (!lw.post_attention_layernorm_f32.empty())
      Logger::info(" L" + std::to_string(l) + " post_attention_layernorm_f32 populated from BF16. Size: " + std::to_string(lw.post_attention_layernorm_f32.size()));
for (int l = 0; l < nhl; ++l) {
  const auto& lw = layers[l];
  if (lw.input_layernorm_f32.empty()) {
    Logger::error("[INIT_WEIGHTS_GGUF_CHECK] Layer " + std::to_string(l) + ": input_layernorm_f32 is EMPTY post-GGUF. This WILL cause GPU init errors if this layer is on GPU.");
  if (lw.post_attention_layernorm_f32.empty()) {
    Logger::error("[INIT_WEIGHTS_GGUF_CHECK] Layer " + std::to_string(l) + ": post_attention_layernorm_f32 is EMPTY post-GGUF. This WILL cause GPU init errors if this layer is on GPU.");
Logger::info("[INIT_WEIGHTS_GGUF] Finished per-layer NORM F32 vector checks post-GGUF.");
Logger::fatal("TinyLlamaModel::initialize_weights called with neither GGUF nor SafeTensors loader. Cannot initialize weights.");
throw std::runtime_error("Model weights source (GGUF or SafeTensors) not provided to initialize_weights.");
Logger::info("Finished initializing model weights logic block.");
Logger::error("[INIT_WEIGHTS_FINAL_CHECK] final_norm_f32 is EMPTY. This WILL cause errors if final normalization is needed in F32.");
Logger::error("[INIT_WEIGHTS_FINAL_CHECK] embed_tokens_f32 is EMPTY. This WILL cause errors if token embeddings are needed in F32.");
Logger::info("Constructing TinyLlamaModel from SafeTensorsLoader (is_gguf_file_loaded set to false).");
Logger::info("TinyLlamaModel construction from SafeTensorsLoader complete.");
    const std::string& model_path)
    : model_path_(model_path)
    , cublas_handle_(nullptr), token_embedding_table_dev_(nullptr), lm_head_dev_(nullptr), final_norm_dev(nullptr),
      w_q_dev_(nullptr), w_k_dev_(nullptr), w_v_dev_(nullptr), w_o_dev_(nullptr),
      w_gate_dev_(nullptr), w_up_dev_(nullptr), w_down_dev_(nullptr), all_freqs_cis_dev(nullptr),
      x_dev_(nullptr), x_norm_dev_(nullptr), x_resid1_dev_(nullptr), x_resid2_dev_(nullptr),
      q_dev_(nullptr), k_dev_(nullptr), v_dev_(nullptr), attn_out_dev_(nullptr), attn_proj_dev_(nullptr),
      gate_vec_dev_(nullptr), up_vec_dev_(nullptr), swiglu_vec_dev_(nullptr), mlp_down_dev_(nullptr), logits_dev_(nullptr),
      token_embedding_table_f32_dev_(nullptr), lm_head_f32_dev_(nullptr),
      w_q_f32_dev_(nullptr), w_k_f32_dev_(nullptr), w_v_f32_dev_(nullptr), w_o_f32_dev_(nullptr),
      w_gate_f32_dev_(nullptr), w_up_f32_dev_(nullptr), w_down_f32_dev_(nullptr)
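// Every device pointer is explicitly initialized to nullptr in the member-init list above, so
// the destructor can test each pointer before cudaFree() regardless of which (if any) GPU
// buffers were actually allocated for this run.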
  Logger::info("TinyLlamaModel constructor entered. Model path (from string): " + model_path);
  this->config_ = initial_config;
  if (this->model_path_.empty() && !model_path.empty()) {
  std::unique_ptr<SafeTensorsLoader> loader = nullptr;
  if (!this->model_path_.empty() && this->model_path_.size() > 5 &&
      this->model_path_.substr(this->model_path_.size() - 5) == ".gguf") {
    bool force_mmap_for_gguf_load = cli_mmap_preference;
    Logger::info("TinyLlamaModel GGUF path: Using mmap setting " + std::string(force_mmap_for_gguf_load ? "true" : "false") +
                 " for gguf_meta/weight loading based on CLI mmap preference: " + std::string(cli_mmap_preference ? "true" : "false"));
    this->config_ = config_from_gguf;
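    // CLI layer-split convention (as reflected in the log messages below): a negative GPU-layer
    // request means "all layers on GPU", and a request >= num_hidden_layers also resolves to zero
    // CPU-offloaded layers; any other positive request leaves the leading layers on the CPU and
    // stores the result in config_.num_cpu_offload_layers.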
    if (cli_gpu_layer_request < 0) {
      Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI hint < 0 (all GPU). num_cpu_offload_layers set to 0.");
    } else if (cli_gpu_layer_request == 0) {
      Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI GPU layer request (" + std::to_string(cli_gpu_layer_request) + ") >= total layers. num_cpu_offload_layers set to 0.");
      Logger::info("TinyLlamaModel GGUF Ctor CALC: Partial GPU. CLI GPU req: " + std::to_string(cli_gpu_layer_request) + ". num_cpu_offload_layers set to " + std::to_string(this->config_.num_cpu_offload_layers));
      Logger::warning("TinyLlamaModel GGUF Ctor CALC: num_hidden_layers from GGUF is <= 0. Defaulting num_cpu_offload_layers to 0. CLI GPU req: " + std::to_string(cli_gpu_layer_request));
      ", this->config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
    } catch (const std::exception& e) {
      Logger::error("Failed to load or parse GGUF file: " + std::string(e.what()));
    }
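  // The .safetensors branch below follows the same pattern as the GGUF branch: hyperparameters
  // come from an adjacent config.json when it can be loaded, otherwise the initial_config
  // defaults plus CLI overrides are kept, before the SafeTensorsLoader itself is constructed.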
  else if (model_path.size() > 12 &&
           model_path.substr(model_path.size() - 12) == ".safetensors") {
    Logger::info("SafeTensors file detected: " + model_path);
    if (json_loaded_successfully) {
      Logger::info("Successfully loaded and parsed config.json for SafeTensors model.");
      this->config_ = config_from_json;
      Logger::warning("Failed to load config.json or it was not found for SafeTensors model. Proceeding with initial_config defaults and CLI overrides.");
    if (cli_gpu_layer_request < 0) {
    } else if (cli_gpu_layer_request == 0) {
      Logger::warning("SafeTensors path: num_hidden_layers is 0 from JSON/default. Defaulting num_cpu_offload_layers to 0 despite CLI GPU request: " + std::to_string(cli_gpu_layer_request));
      loader = std::make_unique<SafeTensorsLoader>(model_path);
      Logger::info("SafeTensorsLoader initialized for: " + model_path);
    } catch (const std::exception& e) {
      Logger::error("Failed to initialize SafeTensorsLoader: " + std::string(e.what()));
    throw std::runtime_error("Unsupported model file type. Please use .gguf or .safetensors");
    ", config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
    Logger::warning("Clamping num_cpu_offload_layers: was < 0, set to 0.");
  if (this->config_.num_hidden_layers > 0 && this->config_.num_cpu_offload_layers > this->config_.num_hidden_layers) {
      ") exceeds total hidden layers (" + std::to_string(this->config_.num_hidden_layers) +
      "). Clamping to " + std::to_string(this->config_.num_hidden_layers) + " (all CPU).");
    ", this->config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
  Logger::info("Final ModelConfig (before initialize_weights/rope):");
  if (this->gguf_data_->tensor_infos_map.empty()) {
    Logger::error("[CTOR_GGUF_PRE_INIT_W] CRITICAL: gguf_data_->tensor_infos_map is EMPTY. Weights will not be loaded by map_gguf_weights.");
    Logger::error("[CTOR_GGUF_PRE_INIT_W] CRITICAL: config_.is_gguf_file_loaded is TRUE, but gguf_data_ pointer IS NULL. Weights cannot be loaded.");
    Logger::info("[CTOR_GGUF_PRE_INIT_W] Not a GGUF file load context (e.g., SafeTensors). Skipping gguf_data_ check here.");
  Logger::info("TinyLlamaModel (from path string) constructed and initialized successfully.");
    std::unique_ptr<GGUFData> gguf_data_from_session)
    : config_(config_from_session),
      gguf_data_(std::move(gguf_data_from_session)),
      model_path_("loaded_from_gguf_data_memory")
    , cublas_handle_(nullptr), token_embedding_table_dev_(nullptr), lm_head_dev_(nullptr), final_norm_dev(nullptr),
      w_q_dev_(nullptr), w_k_dev_(nullptr), w_v_dev_(nullptr), w_o_dev_(nullptr),
      w_gate_dev_(nullptr), w_up_dev_(nullptr), w_down_dev_(nullptr), all_freqs_cis_dev(nullptr),
      x_dev_(nullptr), x_norm_dev_(nullptr), x_resid1_dev_(nullptr), x_resid2_dev_(nullptr),
      q_dev_(nullptr), k_dev_(nullptr), v_dev_(nullptr), attn_out_dev_(nullptr), attn_proj_dev_(nullptr),
      gate_vec_dev_(nullptr), up_vec_dev_(nullptr), swiglu_vec_dev_(nullptr), mlp_down_dev_(nullptr), logits_dev_(nullptr),
      token_embedding_table_f32_dev_(nullptr), lm_head_f32_dev_(nullptr),
      w_q_f32_dev_(nullptr), w_k_f32_dev_(nullptr), w_v_f32_dev_(nullptr), w_o_f32_dev_(nullptr),
      w_gate_f32_dev_(nullptr), w_up_f32_dev_(nullptr), w_down_f32_dev_(nullptr)
  Logger::info("TinyLlamaModel constructor entered (with pre-loaded GGUFData). Model path placeholder: " + model_path_);
  if (this->config_.num_hidden_layers > 0 && this->config_.num_cpu_offload_layers > this->config_.num_hidden_layers) {
      ") exceeds total hidden layers (" + std::to_string(this->config_.num_hidden_layers) +
      "). Clamping to " + std::to_string(this->config_.num_hidden_layers) + " layers on CPU (all CPU).");
  Logger::info("TinyLlamaModel (with pre-loaded GGUFData) constructed and initialized successfully.");
  if (active_num_gpu_layers > 0) {
    Logger::info("Freeing TinyLlamaModel CUDA resources...");
    if (cublas_handle_) {
      cublasStatus_t cublas_status = cublasDestroy(cublas_handle_);
      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        Logger::error("cuBLAS handle destruction failed with error code: " + std::to_string(cublas_status));
      cublas_handle_ = nullptr;
    if (cublas_handle_) {
      cublasStatus_t cublas_status = cublasDestroy(cublas_handle_);
      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        Logger::error("cuBLAS handle destruction failed with error code: " + std::to_string(cublas_status));
      cublas_handle_ = nullptr;
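  // The remaining device buffers (weights, activations, and KV dequantization scratch) are
  // released below; each pointer is reset to nullptr after cudaFree so a partially-initialized
  // model or repeated teardown stays safe.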
  if (active_num_gpu_layers > 0) {
    if (final_norm_dev) {
      gpuErrchk(cudaFree(final_norm_dev));
      final_norm_dev = nullptr;
    for (auto& layer : layers) {
      if (layer.input_layernorm_dev) {
        gpuErrchk(cudaFree(layer.input_layernorm_dev));
        layer.input_layernorm_dev = nullptr;
      if (layer.post_attention_layernorm_dev) {
        gpuErrchk(cudaFree(layer.post_attention_layernorm_dev));
        layer.post_attention_layernorm_dev = nullptr;
    if (all_freqs_cis_dev) {
      gpuErrchk(cudaFree(all_freqs_cis_dev));
      all_freqs_cis_dev = nullptr;
    if (token_embedding_table_dev_) {
      gpuErrchk(cudaFree(token_embedding_table_dev_));
      token_embedding_table_dev_ = nullptr;
    gpuErrchk(cudaFree(lm_head_dev_));
    lm_head_dev_ = nullptr;
    gpuErrchk(cudaFree(w_q_dev_));
    gpuErrchk(cudaFree(w_k_dev_));
    gpuErrchk(cudaFree(w_v_dev_));
    gpuErrchk(cudaFree(w_o_dev_));
    gpuErrchk(cudaFree(w_gate_dev_));
    w_gate_dev_ = nullptr;
    gpuErrchk(cudaFree(w_up_dev_));
    gpuErrchk(cudaFree(w_down_dev_));
    w_down_dev_ = nullptr;
    if (token_embedding_table_f32_dev_) {
      gpuErrchk(cudaFree(token_embedding_table_f32_dev_));
      token_embedding_table_f32_dev_ = nullptr;
    if (lm_head_f32_dev_) {
      gpuErrchk(cudaFree(lm_head_f32_dev_));
      lm_head_f32_dev_ = nullptr;
    gpuErrchk(cudaFree(w_q_f32_dev_));
    w_q_f32_dev_ = nullptr;
    gpuErrchk(cudaFree(w_k_f32_dev_));
    w_k_f32_dev_ = nullptr;
    gpuErrchk(cudaFree(w_v_f32_dev_));
    w_v_f32_dev_ = nullptr;
    gpuErrchk(cudaFree(w_o_f32_dev_));
    w_o_f32_dev_ = nullptr;
    if (w_gate_f32_dev_) {
      gpuErrchk(cudaFree(w_gate_f32_dev_));
      w_gate_f32_dev_ = nullptr;
    gpuErrchk(cudaFree(w_up_f32_dev_));
    w_up_f32_dev_ = nullptr;
    if (w_down_f32_dev_) {
      gpuErrchk(cudaFree(w_down_f32_dev_));
      w_down_f32_dev_ = nullptr;
    gpuErrchk(cudaFree(x_dev_));
    gpuErrchk(cudaFree(x_norm_dev_));
    x_norm_dev_ = nullptr;
    gpuErrchk(cudaFree(x_resid1_dev_));
    x_resid1_dev_ = nullptr;
    gpuErrchk(cudaFree(x_resid2_dev_));
    x_resid2_dev_ = nullptr;
    gpuErrchk(cudaFree(q_dev_));
    gpuErrchk(cudaFree(k_dev_));
    gpuErrchk(cudaFree(v_dev_));
    gpuErrchk(cudaFree(attn_out_dev_));
    attn_out_dev_ = nullptr;
    if (attn_proj_dev_) {
      gpuErrchk(cudaFree(attn_proj_dev_));
      attn_proj_dev_ = nullptr;
    gpuErrchk(cudaFree(gate_vec_dev_));
    gate_vec_dev_ = nullptr;
    gpuErrchk(cudaFree(up_vec_dev_));
    up_vec_dev_ = nullptr;
    if (swiglu_vec_dev_) {
      gpuErrchk(cudaFree(swiglu_vec_dev_));
      swiglu_vec_dev_ = nullptr;
    gpuErrchk(cudaFree(mlp_down_dev_));
    mlp_down_dev_ = nullptr;
    gpuErrchk(cudaFree(logits_dev_));
    logits_dev_ = nullptr;
    if (dequant_k_cache_buffer_dev_) {
      gpuErrchk(cudaFree(dequant_k_cache_buffer_dev_));
      dequant_k_cache_buffer_dev_ = nullptr;
    if (dequant_v_cache_buffer_dev_) {
      gpuErrchk(cudaFree(dequant_v_cache_buffer_dev_));
      dequant_v_cache_buffer_dev_ = nullptr;
    if (selective_k_dequant_buffer_dev_) {
      gpuErrchk(cudaFree(selective_k_dequant_buffer_dev_));
      selective_k_dequant_buffer_dev_ = nullptr;
    if (selective_v_dequant_buffer_dev_) {
      gpuErrchk(cudaFree(selective_v_dequant_buffer_dev_));
      selective_v_dequant_buffer_dev_ = nullptr;
    free_persistent_batch_buffers();
    Logger::info("Freed persistent GPU workspace buffers.");
    Logger::info("Finished freeing TinyLlamaModel CUDA weight memory.");
    Logger::info("CPU-only mode: No GPU resources to free.");
    std::vector<float>& input,
    int n_tokens, KVCache* kv_cache,
    const std::vector<int>* attention_mask) {
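  // Single-token CPU decode: for each CPU-resident layer this runs RMSNorm, lazily dequantizes
  // the projection weights it needs, computes grouped-query attention against the per-layer KV
  // cache, applies the SwiGLU MLP, and adds both residual connections, updating `input`
  // (hidden_size floats) in place.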
  int head_dim = hs / n_heads;
  bool log_first_gen_step = (n_tokens == 0);
  bool log_this_step = log_first_gen_step || (n_tokens == 12) || (n_tokens == 13);
    Logger::info("[CPU_FWD_MEM] Starting layer " + std::to_string(l) + " processing");
    bool log_this_layer = log_this_step && (l == 0);
    if (log_this_layer) {
      Logger::info("[CPU_FWD] ------ START Layer " + std::to_string(l) + " (pos=" + std::to_string(n_tokens) + ") ------");
    const auto& lw = layers[l];
    std::vector<float> x_norm_vec1(hs);
    const std::vector<float>& w_input_norm_vec =
        lw.input_layernorm_f32.empty()
            : lw.input_layernorm_f32;
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": Allocating QKV vectors");
    std::vector<float> q_vec(hs), k_vec(n_kv_heads * head_dim), v_vec(n_kv_heads * head_dim);
    bool enable_debug_logging = (l == 0);
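    // The ensure_*_proj_dequantized() helpers materialize a layer's projection weights in a
    // CPU-usable format on first use; if none of the supported source formats
    // (f32, q8k, q8, q4k, q6k, bf16) is present, the layer cannot run on the CPU, hence the
    // runtime_error throws below.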
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_q_proj_dequantized");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_q_proj_dequantized completed");
    else throw std::runtime_error("Layer " + std::to_string(l) + ": No Q proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_k_proj_dequantized");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_k_proj_dequantized completed");
    else throw std::runtime_error("Layer " + std::to_string(l) + ": No K proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_v_proj_dequantized");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_v_proj_dequantized completed");
    else throw std::runtime_error("Layer " + std::to_string(l) + ": No V proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
    if (static_cast<size_t>(l) < kv_cache->layers.size()) {
      if (static_cast<size_t>(n_tokens) >= layer_max_seq_len && layer_max_seq_len > 0) {
        Logger::error("KV Cache access out of bounds in CPU forward. Layer " + std::to_string(l) +
                      ", n_tokens: " + std::to_string(n_tokens) +
                      ", configured layer_max_seq_len: " + std::to_string(layer_max_seq_len) + ". Skipping KV update.");
      } else if (layer_max_seq_len == 0 && n_tokens > 0) {
        Logger::error("KV Cache layer_max_seq_len is 0, but n_tokens > 0. Layer " + std::to_string(l) + ". Skipping KV update.");
        for (int h = 0; h < n_kv_heads; ++h) {
          std::copy(k_vec.begin() + h * head_dim, k_vec.begin() + (h + 1) * head_dim,
                    kv_layer.k.begin() + n_tokens * (n_kv_heads * head_dim) + h * head_dim);
          std::copy(v_vec.begin() + h * head_dim, v_vec.begin() + (h + 1) * head_dim,
                    kv_layer.v.begin() + n_tokens * (n_kv_heads * head_dim) + h * head_dim);
      Logger::error("KV Cache layer index " + std::to_string(l) +
                    " out of bounds for kv_cache->layers.size() = " + std::to_string(kv_cache->layers.size()));
    std::vector<float> attn_out_vec(hs);
    std::vector<float> x_resid1_vec = input;
    float att_scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
    std::fill(attn_out_vec.begin(), attn_out_vec.end(), 0.0f);
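    // Grouped-query attention on the CPU: each of the n_heads query heads attends over the KV
    // cache rows of its group's shared KV head (kv_head_idx = h / (n_heads / n_kv_heads)).
    // Scores are scaled by att_scale = 1/sqrt(head_dim) before the cached V rows are accumulated
    // into attn_out_vec.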
    for (int h = 0; h < n_heads; ++h) {
      std::vector<float> q_head(head_dim);
      std::copy(q_vec.begin() + h * head_dim, q_vec.begin() + (h + 1) * head_dim, q_head.begin());
      std::vector<float> current_multihead_attn_out(head_dim, 0.0f);
      int kv_cache_num_kv_heads = n_kv_heads;
      int kv_group = n_heads / kv_cache_num_kv_heads;
      int kv_head_idx = h / kv_group;
      if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
        int current_seq_len = n_tokens + 1;
        std::vector<float> scores(current_seq_len);
        for (int t = 0; t < current_seq_len; ++t) {
          for (int d = 0; d < head_dim; ++d) {
            score += q_head[d] * kv_layer.k[t * (n_kv_heads * head_dim) + kv_head_idx * head_dim + d];
          scores[t] = score * att_scale;
        for (int t = 0; t < current_seq_len; ++t) {
          for (int d = 0; d < head_dim; ++d) {
            current_multihead_attn_out[d] += scores[t] * kv_layer.v[t * (n_kv_heads * head_dim) + kv_head_idx * head_dim + d];
        std::copy(current_multihead_attn_out.begin(), current_multihead_attn_out.end(), attn_out_vec.begin() + h * head_dim);
    std::vector<float> attn_proj_vec(hs);
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_o_proj_dequantized");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_o_proj_dequantized completed");
    else throw std::runtime_error("Layer " + std::to_string(l) + ": No O proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
    for (size_t i = 0; i < input.size(); ++i) input[i] = x_resid1_vec[i] + attn_proj_vec[i];
    std::vector<float> x_norm_vec2(hs);
    std::vector<float> x_resid2_vec = input;
    const std::vector<float>& w_post_attn_norm_vec =
        lw.post_attention_layernorm_f32.empty()
            : lw.post_attention_layernorm_f32;
    std::vector<float> gate_vec(is), up_vec(is);
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_gate_proj_dequantized");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_gate_proj_dequantized completed");
    else throw std::runtime_error("Layer " + std::to_string(l) + ": No Gate proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_up_proj_dequantized");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_up_proj_dequantized completed");
    else throw std::runtime_error("Layer " + std::to_string(l) + ": No Up proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
    std::vector<float> silu_out_vec(is);
    std::vector<float> swiglu_result_vec(is);
    for (size_t i = 0; i < is; ++i) swiglu_result_vec[i] = silu_out_vec[i] * up_vec[i];
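    // SwiGLU: swiglu_result_vec is the element-wise product of the SiLU-activated gate
    // projection (silu_out_vec) and the up projection; the down projection below then maps the
    // intermediate width `is` back to hidden_size.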
    std::vector<float> mlp_out_vec(hs);
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_down_proj_dequantized");
    Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_down_proj_dequantized completed");
    else throw std::runtime_error("Layer " + std::to_string(l) + ": No Down proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
    for (size_t i = 0; i < input.size(); ++i) input[i] = x_resid2_vec[i] + mlp_out_vec[i];
    if (log_this_layer) {
      Logger::info("[CPU_FWD] ------ END Layer " + std::to_string(l) + " (pos=" + std::to_string(n_tokens) + ") ------");
    int layer_to_clear = l - 2;
    if (layer_to_clear < first_gpu_layer) {
  Logger::info("[CPU_FWD] All layers processed on CPU. Performing final RMSNorm and Logits.");
  const std::vector<float>& w_final_norm_vec =
  std::vector<float> x_final_norm_vec(hs);
  std::vector<float> logits(vs);
  bool enable_lm_head_debug_logging = true;
  else throw std::runtime_error("No valid LM Head weights (f32, q8k, q8, q4k, q6k, bf16) found for CPU final stage.");
  if (log_this_step || log_first_gen_step) {
    log_vector_summary("[CPU_FWD] Final Logits (all CPU, pos=" + std::to_string(n_tokens) + ")", logits, 15);
std::vector<float> TinyLlamaModel::forward_device(
    const std::vector<int>* attention_mask, cudaStream_t stream) {
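  // Single-token decode for the GPU-resident layers: x_input_dev is expected to be model_->x_dev_
  // holding the current hidden state. The loop below walks layers
  // [num_cpu_layers, total_model_layers) and the logits are copied back to the host at the end.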
    Logger::fatal("Number of attention heads is zero during forward_device.");
    throw std::runtime_error("Division by zero: n_heads is zero.");
  int head_dim = hs / n_heads;
  int num_gpu_layers = total_model_layers - num_cpu_layers;
  if (num_gpu_layers <= 0) {
    Logger::warning("forward_device called with no GPU layers to process (num_gpu_layers = " + std::to_string(num_gpu_layers) + "). Returning empty.");
    Logger::error("forward_device called with null x_input_dev. This should be model_->x_dev_.");
  cublasStatus_t stream_status = cublasSetStream(cublas_handle_, stream);
  gpuErrchk(cudaMemcpyAsync(h_x_input_dev.data(), x_input_dev, config_.hidden_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
  gpuErrchk(cudaStreamSynchronize(stream));
  if (stream_status != CUBLAS_STATUS_SUCCESS) {
  float* current_x_dev = x_input_dev;
  for (int l_gpu_idx = 0; l_gpu_idx < num_gpu_layers; ++l_gpu_idx) {
    int l_model_idx = num_cpu_layers + l_gpu_idx;
    const float* lw_in_norm_dev = layers[l_model_idx].input_layernorm_dev;
    const float* lw_post_norm_dev = layers[l_model_idx].post_attention_layernorm_dev;
    gpuErrchk(cudaMemcpyAsync(x_resid1_dev_, x_dev_, hs * sizeof(float), cudaMemcpyDeviceToDevice, stream));
    if (!lw_in_norm_dev) {
      throw std::runtime_error("[TM::fw_dev pos=" + std::to_string(pos) + " L" + std::to_string(l_model_idx) +
                               "] Error: input_layernorm_dev is nullptr. GPU layer cannot proceed.");
      rmsnorm_vector_cuda_optimized(x_dev_, lw_in_norm_dev, x_norm_dev_, hs, eps, stream);
      rmsnorm_vector_cuda(x_dev_, lw_in_norm_dev, x_norm_dev_, hs, eps, stream);
    if (w_q_f32_dev_ && w_k_f32_dev_ && w_v_f32_dev_) {
      const float* w_q_layer_ptr = w_q_f32_dev_ + (size_t)l_gpu_idx * hs * hs;
      const float* w_k_layer_ptr = w_k_f32_dev_ + (size_t)l_gpu_idx * n_kv_heads * head_dim * hs;
      const float* w_v_layer_ptr = w_v_f32_dev_ + (size_t)l_gpu_idx * n_kv_heads * head_dim * hs;
      matvec_f32_f32_cuda(cublas_handle_, w_q_layer_ptr, x_norm_dev_, q_dev_, hs, hs, stream);
      matvec_f32_f32_cuda(cublas_handle_, w_k_layer_ptr, x_norm_dev_, k_dev_, n_kv_heads * head_dim, hs, stream);
      matvec_f32_f32_cuda(cublas_handle_, w_v_layer_ptr, x_norm_dev_, v_dev_, n_kv_heads * head_dim, hs, stream);
      Logger::error("GPU L" + std::to_string(l_model_idx) + " (gpu_idx " + std::to_string(l_gpu_idx) + "): No valid concatenated QKV weights.");
      return {};
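    // Only the GPU-resident layers are packed into the big w_*_f32_dev_ arrays, so the per-layer
    // slice is indexed by l_gpu_idx (not l_model_idx): Q is hs*hs per layer while K/V are
    // (n_kv_heads*head_dim)*hs per layer, matching the GQA projection shapes.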
    if (static_cast<size_t>(l_model_idx) < kv_cache->layers.size()) {
        for (int kvh = 0; kvh < n_kv_heads; ++kvh) {
          const float* current_k_head_ptr_fp32 = k_dev_ + kvh * head_dim;
          const float* current_v_head_ptr_fp32 = v_dev_ + kvh * head_dim;
          size_t token_head_offset_quant = (static_cast<size_t>(pos) * n_kv_heads + kvh) * head_dim;
          int8_t* k_quant_target_ptr = current_kv_layer.k_dev_quantized + token_head_offset_quant;
          int8_t* v_quant_target_ptr = current_kv_layer.v_dev_quantized + token_head_offset_quant;
          size_t scale_offset = static_cast<size_t>(pos) * n_kv_heads + kvh;
          float* k_scale_target_ptr = current_kv_layer.k_dev_scales + scale_offset;
          float* v_scale_target_ptr = current_kv_layer.v_dev_scales + scale_offset;
          quantize_fp32_to_int8_symmetric_per_tensor_cuda(current_k_head_ptr_fp32, k_quant_target_ptr, k_scale_target_ptr, head_dim, stream);
          quantize_fp32_to_int8_symmetric_per_tensor_cuda(current_v_head_ptr_fp32, v_quant_target_ptr, v_scale_target_ptr, head_dim, stream);
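        // Quantized KV cache path: each (position, kv_head) slice of head_dim floats is stored as
        // symmetric INT8 plus one float scale (k_dev_scales / v_dev_scales), roughly quartering
        // KV-cache memory at the cost of a dequantization step before attention.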
        for (int kvh = 0; kvh < n_kv_heads; ++kvh) {
          const float* current_k_head_ptr = k_dev_ + kvh * head_dim;
          const float* current_v_head_ptr = v_dev_ + kvh * head_dim;
          update_kv_cache_cuda(current_kv_layer.k_dev_fp32, current_k_head_ptr, pos,
                               kvh, kv_cache->allocated_max_seq_len,
                               kv_cache->allocated_num_kv_heads,
                               kv_cache->allocated_head_dim, stream);
          update_kv_cache_cuda(current_kv_layer.v_dev_fp32, current_v_head_ptr, pos,
                               kvh, kv_cache->allocated_max_seq_len,
                               kv_cache->allocated_num_kv_heads,
                               kv_cache->allocated_head_dim, stream);
      Logger::error("KVCache layer index " + std::to_string(l_model_idx) + " out of bounds for kv_cache->layers access in forward_device.");
    float scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));
    const float* attention_k_cache_ptr_dev = nullptr;
    const float* attention_v_cache_ptr_dev = nullptr;
      Logger::info("[GPU L" + std::to_string(l_model_idx) + "] Using SELECTIVE KVCache dequantization");
      attention_k_cache_ptr_dev = attention_kv_layer.k_dev_fp32;
      attention_v_cache_ptr_dev = attention_kv_layer.v_dev_fp32;
    float current_attention_scale = 1.0f / sqrtf((float)head_dim);
        selective_k_dequant_buffer_dev_ && selective_v_dequant_buffer_dev_) {
      attention_cuda_selective_dequant(
          attention_kv_layer.k_dev_quantized,
          attention_kv_layer.v_dev_quantized,
          attention_kv_layer.k_dev_scales,
          attention_kv_layer.v_dev_scales,
          selective_k_dequant_buffer_dev_,
          selective_v_dequant_buffer_dev_,
          current_attention_scale,
          kv_cache->allocated_max_seq_len,
      attention_cuda_optimized(
          attention_k_cache_ptr_dev,
          attention_v_cache_ptr_dev,
          current_attention_scale,
          kv_cache->allocated_max_seq_len,
          attention_k_cache_ptr_dev,
          attention_v_cache_ptr_dev,
          current_attention_scale,
          kv_cache->allocated_max_seq_len,
      const float* lw_o_proj_f32_dev = w_o_f32_dev_ + (size_t)l_gpu_idx * hs * hs;
      matvec_f32_f32_cuda(cublas_handle_, lw_o_proj_f32_dev, attn_out_dev_, attn_proj_dev_, hs, hs, stream);
      Logger::error("GPU L" + std::to_string(l_model_idx) + " (gpu_idx " + std::to_string(l_gpu_idx) + "): No valid O proj weights (FP32/BF16).");
      return {};
    add_residual_cuda(attn_proj_dev_, x_resid1_dev_, current_x_dev, hs, stream);
    gpuErrchk(cudaMemcpyAsync(x_resid2_dev_, current_x_dev, hs * sizeof(float), cudaMemcpyDeviceToDevice, stream));
    if (!lw_post_norm_dev) {
      Logger::error("Missing post_attention_layernorm_dev for GPU layer model_idx=" + std::to_string(l_model_idx));
      return {};
    }
      rmsnorm_vector_cuda_optimized(current_x_dev, lw_post_norm_dev, x_norm_dev_, hs, eps, stream);
      rmsnorm_vector_cuda(current_x_dev, lw_post_norm_dev, x_norm_dev_, hs, eps, stream);
      const float* w_o_layer_ptr = w_o_f32_dev_ + (size_t)l_gpu_idx * hs * hs;
      matvec_f32_f32_cuda(cublas_handle_, w_o_layer_ptr, attn_out_dev_, attn_proj_dev_, hs, hs, stream);
      Logger::error("GPU L" + std::to_string(l_model_idx) + ": No valid O projection weights.");
      return {};
    add_residual_cuda(attn_proj_dev_, x_resid1_dev_, current_x_dev, hs, stream);
    gpuErrchk(cudaMemcpyAsync(x_resid2_dev_, current_x_dev, hs * sizeof(float), cudaMemcpyDeviceToDevice, stream));
    if (!lw_post_norm_dev) {
      Logger::error("Missing post_attention_layernorm_dev for GPU layer model_idx=" + std::to_string(l_model_idx));
      return {};
      rmsnorm_vector_cuda_optimized(current_x_dev, lw_post_norm_dev, x_norm_dev_, hs, eps, stream);
      rmsnorm_vector_cuda(current_x_dev, lw_post_norm_dev, x_norm_dev_, hs, eps, stream);
    if (w_gate_f32_dev_ && w_up_f32_dev_) {
      const float* w_gate_layer_ptr = w_gate_f32_dev_ + (size_t)l_gpu_idx * is * hs;
      const float* w_up_layer_ptr = w_up_f32_dev_ + (size_t)l_gpu_idx * is * hs;
      matvec_f32_f32_cuda(cublas_handle_, w_gate_layer_ptr, x_norm_dev_, gate_vec_dev_, is, hs, stream);
      matvec_f32_f32_cuda(cublas_handle_, w_up_layer_ptr, x_norm_dev_, up_vec_dev_, is, hs, stream);
      Logger::error("GPU L" + std::to_string(l_model_idx) + ": No valid Gate/Up projection weights.");
    swiglu_cuda(gate_vec_dev_, up_vec_dev_, swiglu_vec_dev_, is, stream);
    if (w_down_f32_dev_) {
      const float* w_down_layer_ptr = w_down_f32_dev_ + (size_t)l_gpu_idx * hs * is;
      matvec_f32_f32_cuda(cublas_handle_, w_down_layer_ptr, swiglu_vec_dev_, mlp_down_dev_, hs, is, stream);
      Logger::error("GPU L" + std::to_string(l_model_idx) + ": No valid Down projection weights.");
    add_residual_cuda(mlp_down_dev_, x_resid2_dev_, current_x_dev, hs, stream);
    rmsnorm_vector_cuda_optimized(x_dev_, final_norm_dev, x_norm_dev_, hs, eps, stream);
    rmsnorm_vector_cuda(x_dev_, final_norm_dev, x_norm_dev_, hs, eps, stream);
    matvec_bf16_f32_cuda(cublas_handle_, lm_head_dev_, x_norm_dev_, logits_dev_,
    Logger::error("LM head (lm_head_dev_ for BF16) is null. Cannot calculate logits on GPU.");
  gpuErrchk(cudaStreamSynchronize(stream));
  std::vector<float> logits(vs);
  gpuErrchk(cudaMemcpy(logits.data(), logits_dev_, vs * sizeof(float), cudaMemcpyDeviceToHost));
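// Final-stage helper for CPU batching: applies the final RMSNorm to every token's activation in
// the batch and multiplies by whichever LM-head representation is available (F32, Q8_0, Q6_K,
// Q4_K, or BF16 converted on the fly), returning num_tokens_in_batch * vocab_size logits.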
    const std::vector<float>& final_batch_activations,
    int num_tokens_in_batch) {
  if (final_batch_activations.size() != (size_t)num_tokens_in_batch * config_.hidden_size) {
    Logger::error("[CPU_LOGITS_BATCH] final_batch_activations size mismatch. Expected: " +
                  std::to_string(final_batch_activations.size()));
  std::vector<float> final_batch_norm_out(num_tokens_in_batch * hs);
  const std::vector<float>& w_final_norm_vec =
  if (w_final_norm_vec.empty()) {
    Logger::error("[CPU_LOGITS_BATCH] Final RMSNorm weights are empty (neither f32 nor bf16 available).");
  rmsnorm_batch_cpu(final_batch_activations, w_final_norm_vec, final_batch_norm_out, num_tokens_in_batch, hs, eps);
  std::vector<float> batch_logits_out(num_tokens_in_batch * vs);
    Logger::info("[CPU_LOGITS_BATCH] Using F32 LM Head weights.");
                              num_tokens_in_batch, vs, hs);
    Logger::info("[CPU_LOGITS_BATCH] Using Q8_0 LM Head weights.");
                              num_tokens_in_batch, vs, hs);
    Logger::info("[CPU_LOGITS_BATCH] Using Q6_K LM Head weights.");
                              num_tokens_in_batch, vs, hs);
    Logger::info("[CPU_LOGITS_BATCH] Using Q4_K LM Head weights.");
                              num_tokens_in_batch, vs, hs);
  } else if (!lm_head.empty()) {
    Logger::info("[CPU_LOGITS_BATCH] Using BF16 LM Head weights (converting to F32 for matmul).");
    if (lm_head_f32_temp.empty()) {
      Logger::error("[CPU_LOGITS_BATCH] Failed to convert BF16 LM Head to F32.");
                              num_tokens_in_batch, vs, hs);
    Logger::error("[CPU_LOGITS_BATCH] No valid LM Head weights found (F32, Q8_0, Q6_K, Q4_K, BF16).");
  return batch_logits_out;
    const std::vector<float>& batch_input_activations,
    const std::vector<int>& token_positions,
    const std::vector<int>& original_sequence_indices,
    int num_tokens_in_batch,
  Logger::info("[CPU_BATCH_GEN] Entry: num_tokens=" + std::to_string(num_tokens_in_batch));
  std::string pos_str = "token_positions=[";
  for (int i = 0; i < std::min(num_tokens_in_batch, 3); ++i) {
    pos_str += std::to_string(token_positions[i]) + " ";
  std::string seq_str = "original_sequence_indices=[";
  for (int i = 0; i < std::min(num_tokens_in_batch, 3); ++i) {
    seq_str += std::to_string(original_sequence_indices[i]) + " ";
  Logger::info("[CPU_BATCH_GEN] " + pos_str + ", " + seq_str);
  if (batch_input_activations.size() != (size_t)num_tokens_in_batch * config_.hidden_size) {
    Logger::error("[CPU_BATCH_GENERATION] batch_input_activations size mismatch. Expected: " +
                  std::to_string(batch_input_activations.size()));
  if (token_positions.size() != static_cast<size_t>(num_tokens_in_batch)) {
    Logger::error("[CPU_BATCH_GENERATION] token_positions size mismatch. Expected: " +
                  std::to_string(num_tokens_in_batch) + " Got: " + std::to_string(token_positions.size()));
    Logger::error("[CPU_BATCH_GENERATION] Error: num_attention_heads is zero.");
  int head_dim = hs / n_heads;
  float attention_scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));
  int kv_group = n_heads / n_kv_heads;
  std::vector<float> current_batch_activations = batch_input_activations;
    const auto& lw = layers[l];
    std::vector<float> batch_x_norm1(current_batch_activations.size());
    const std::vector<float>& w_input_norm_vec =
        lw.input_layernorm_f32.empty()
            : lw.input_layernorm_f32;
    rmsnorm_batch_cpu(current_batch_activations, w_input_norm_vec, batch_x_norm1, num_tokens_in_batch, hs, eps);
    std::vector<float> residual_batch_component_attn = current_batch_activations;
    std::vector<float> q_batch((size_t)num_tokens_in_batch * hs);
    std::vector<float> k_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
    std::vector<float> v_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
    if (!lw.q_proj_f32.empty()) {
    } else if (!lw.q_proj_q8_0.empty()) {
    } else if (!lw.q_proj_q6k.empty()) {
    } else if (!lw.q_proj_q4k.empty()) {
      Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No Q proj weights found for CPU (batched)");
    if (!lw.k_proj_f32.empty()) {
    } else if (!lw.k_proj_q8_0.empty()) {
    } else if (!lw.k_proj_q6k.empty()) {
    } else if (!lw.k_proj_q4k.empty()) {
      Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No K proj weights found for CPU (batched)");
    if (!lw.v_proj_f32.empty()) {
    } else if (!lw.v_proj_q8_0.empty()) {
    } else if (!lw.v_proj_q6k.empty()) {
    } else if (!lw.v_proj_q4k.empty()) {
      Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No V proj weights found for CPU (batched)");
    std::vector<float> batch_attn_output((size_t)num_tokens_in_batch * hs);
    #pragma omp parallel if(num_tokens_in_batch > 1)
      std::vector<float> q_token(hs);
      std::vector<float> k_token(n_kv_heads * head_dim);
      std::vector<float> v_token(n_kv_heads * head_dim);
      std::vector<float> scores_buffer;
      for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
        int pos = token_positions[token_idx];
        std::copy(q_batch.begin() + (size_t)token_idx * hs,
                  q_batch.begin() + (size_t)(token_idx + 1) * hs,
        std::copy(k_batch.begin() + (size_t)token_idx * n_kv_heads * head_dim,
                  k_batch.begin() + (size_t)(token_idx + 1) * n_kv_heads * head_dim,
        std::copy(v_batch.begin() + (size_t)token_idx * n_kv_heads * head_dim,
                  v_batch.begin() + (size_t)(token_idx + 1) * n_kv_heads * head_dim,
        if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
          auto& layer_cache = kv_cache->layers[l];
          int seq_idx = original_sequence_indices[token_idx];
          int kv_offset = (sequence_base_offset + pos) * n_kv_heads * head_dim;
          #pragma omp critical
            if (kv_offset + n_kv_heads * head_dim <= static_cast<int>(layer_cache.k.size())) {
              std::copy(k_token.begin(), k_token.end(), layer_cache.k.begin() + kv_offset);
              std::copy(v_token.begin(), v_token.end(), layer_cache.v.begin() + kv_offset);
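            // The KV cache vectors are shared across the OpenMP worker threads, so the bounds
            // check and the two copies are serialized under `omp critical`; the guard keeps
            // concurrent writes from different tokens from interleaving mid-copy.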
        std::copy(q_token.begin(), q_token.end(), q_batch.begin() + (size_t)token_idx * hs);
        int seq_idx = original_sequence_indices[token_idx];
        scores_buffer.resize(history_len);
        const float* q_token_ptr = q_batch.data() + (size_t)token_idx * hs;
        float* attn_output_ptr = batch_attn_output.data() + (size_t)token_idx * hs;
        if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
          const auto& layer_cache = kv_cache->layers[l];
          for (int h = 0; h < n_heads; ++h) {
            int kv_head_idx = h / kv_group;
            const float* q_head_ptr = q_token_ptr + h * head_dim;
            float* head_output_ptr = attn_output_ptr + h * head_dim;
            for (int t = 0; t < history_len; ++t) {
              int seq_idx = original_sequence_indices[token_idx];
              const float* k_ptr = layer_cache.k.data() + (sequence_base_offset + t) * n_kv_heads * head_dim + kv_head_idx * head_dim;
#if defined(__AVX2__) || defined(__SSE2__) || defined(__ARM_NEON)
              for (int d = 0; d < head_dim; ++d) {
                score += q_head_ptr[d] * k_ptr[d];
              scores_buffer[t] = score * attention_scale;
            std::fill(head_output_ptr, head_output_ptr + head_dim, 0.0f);
            for (int t = 0; t < history_len; ++t) {
              int seq_idx = original_sequence_indices[token_idx];
              const float* v_ptr = layer_cache.v.data() + (sequence_base_offset + t) * n_kv_heads * head_dim + kv_head_idx * head_dim;
              float score = scores_buffer[t];
#if defined(__AVX2__) || defined(__SSE2__) || defined(__ARM_NEON)
              for (int d = 0; d < head_dim; ++d) {
                head_output_ptr[d] += score * v_ptr[d];
        std::fill(attn_output_ptr, attn_output_ptr + hs, 0.0f);
    std::vector<float> batch_attn_proj_out((size_t)num_tokens_in_batch * hs);
    if (!lw.o_proj_f32.empty()) {
    } else if (!lw.o_proj_q8_0.empty()) {
    } else if (!lw.o_proj_q6k.empty()) {
    } else if (!lw.o_proj_q4k.empty()) {
      Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No O proj weights found for CPU");
    for (size_t i = 0; i < current_batch_activations.size(); ++i) {
      current_batch_activations[i] = residual_batch_component_attn[i] + batch_attn_proj_out[i];
    std::vector<float> residual_batch_component_mlp = current_batch_activations;
    std::vector<float> batch_x_norm2(current_batch_activations.size());
    const std::vector<float>& w_post_attn_norm_vec =
        lw.post_attention_layernorm_f32.empty()
            : lw.post_attention_layernorm_f32;
    rmsnorm_batch_cpu(current_batch_activations, w_post_attn_norm_vec, batch_x_norm2, num_tokens_in_batch, hs, eps);
    std::vector<float> batch_gate_proj_out((size_t)num_tokens_in_batch * is);
    std::vector<float> batch_up_proj_out((size_t)num_tokens_in_batch * is);
    if (!lw.gate_proj_f32.empty()) {
    } else if (!lw.gate_proj_q8_0.empty()) {
    } else if (!lw.gate_proj_q6k.empty()) {
    } else if (!lw.gate_proj_q4k.empty()) {
      Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No gate_proj weights found for CPU");
    if (!lw.up_proj_f32.empty()) {
    } else if (!lw.up_proj_q8_0.empty()) {
    } else if (!lw.up_proj_q6k.empty()) {
    } else if (!lw.up_proj_q4k.empty()) {
      Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No up_proj weights found for CPU");
    std::vector<float> batch_swiglu_out((size_t)num_tokens_in_batch * is);
    for (size_t i = 0; i < batch_gate_proj_out.size(); ++i) {
      float gate_val = batch_gate_proj_out[i];
      float silu_gate_val = gate_val / (1.0f + std::exp(-gate_val));
      batch_swiglu_out[i] = silu_gate_val * batch_up_proj_out[i];
    std::vector<float> batch_mlp_down_proj_out((size_t)num_tokens_in_batch * hs);
    if (!lw.down_proj_f32.empty()) {
      matmul_f32_f32_batch_cpu(lw.down_proj_f32, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
    } else if (!lw.down_proj_q8_0.empty()) {
    } else if (!lw.down_proj_q6k.empty()) {
      matmul_q6k_f32_batch_cpu(lw.down_proj_q6k, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
    } else if (!lw.down_proj_q4k.empty()) {
      matmul_q4k_f32_batch_cpu(lw.down_proj_q4k, batch_swiglu_out, batch_mlp_down_proj_out, num_tokens_in_batch, hs, is);
      Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No down_proj weights found for CPU");
    for (size_t i = 0; i < current_batch_activations.size(); ++i) {
      current_batch_activations[i] = residual_batch_component_mlp[i] + batch_mlp_down_proj_out[i];
  if (kv_cache && num_tokens_in_batch > 0) {
    for (int i = 0; i < num_tokens_in_batch; ++i) {
      int seq_idx = original_sequence_indices[i];
      int pos = token_positions[i];
      if (seq_idx >= 0 && seq_idx < kv_cache->current_batch_size) {
        max_positions_per_seq[seq_idx] = std::max(max_positions_per_seq[seq_idx], pos);
      if (max_positions_per_seq[seq_idx] >= 0) {
        int old_batch_seq_len = kv_cache->batch_seq_lens[seq_idx];
        kv_cache->batch_seq_lens[seq_idx] = max_positions_per_seq[seq_idx] + 1;
        Logger::info("[CPU_BATCH_GEN] KV Length Max Update: seq_idx=" + std::to_string(seq_idx) +
                     ", old_batch_seq_len=" + std::to_string(old_batch_seq_len) +
                     ", new_batch_seq_len=" + std::to_string(max_positions_per_seq[seq_idx] + 1));
    int max_pos = *std::max_element(token_positions.begin(), token_positions.end());
  std::vector<std::vector<float>> all_logits(num_tokens_in_batch, std::vector<float>(vs));
  for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
    std::copy(batch_logits.begin() + (size_t)token_idx * vs,
              batch_logits.begin() + (size_t)(token_idx + 1) * vs,
              all_logits[token_idx].begin());
std::vector<float> TinyLlamaModel::forward_device_batch_prefill(
    float* d_batch_input_embeddings,
    int num_tokens_in_batch,
    int current_model_pos,
    cudaStream_t stream) {
  Logger::info("[FWD_DEV_BATCH_PREFILL_ENTRY] num_tokens_in_batch: " + std::to_string(num_tokens_in_batch) +
               ", current_model_pos: " + std::to_string(current_model_pos) +
  Logger::debug("[FWD_DEV_BATCH_PREFILL_PARAMS] hidden_size: " + std::to_string(hidden_size) +
                ", head_dim: " + std::to_string(head_dim) +
                ", ffn_intermediate_dim: " + std::to_string(ffn_intermediate_dim) +
                ", n_kv_dim: " + std::to_string(n_kv_dim) +
                ", vocab_size: " + std::to_string(vocab_size) +
  float* d_batch_x_ptr = d_batch_input_embeddings;
  float* d_batch_x_norm_out_attn;
  float* d_batch_q_proj_out;
  float* d_batch_k_proj_out;
  float* d_batch_v_proj_out;
  float* d_batch_attn_heads_concat_out;
  float* d_batch_attn_final_proj_out;
  float* d_batch_residual_attn_in;
  float* d_batch_residual_ffn_in;
  float* d_batch_x_norm_out_ffn;
  float* d_batch_ffn_gate_proj_out;
  float* d_batch_ffn_up_proj_out;
  float* d_batch_ffn_swiglu_out;
  float* d_batch_ffn_down_proj_out;
  float* d_batch_layer_output = nullptr;
  size_t batch_hidden_size_elems = (size_t)num_tokens_in_batch * hidden_size;
  size_t batch_kv_proj_size_elems = (size_t)num_tokens_in_batch * n_kv_dim;
  size_t batch_ffn_intermediate_elems = (size_t)num_tokens_in_batch * ffn_intermediate_dim;
  size_t batch_hidden_size_bytes = batch_hidden_size_elems * sizeof(float);
  size_t batch_kv_proj_size_bytes = batch_kv_proj_size_elems * sizeof(float);
  size_t batch_ffn_intermediate_bytes = batch_ffn_intermediate_elems * sizeof(float);
  resize_persistent_batch_buffers_if_needed(num_tokens_in_batch);
  d_batch_x_norm_out_attn = d_persistent_batch_norm_out_;
  d_batch_q_proj_out = d_persistent_q_batch_;
  d_batch_k_proj_out = d_persistent_k_batch_;
  d_batch_v_proj_out = d_persistent_v_batch_;
  d_batch_attn_heads_concat_out = d_persistent_attn_output_;
  d_batch_attn_final_proj_out = d_persistent_attn_proj_out_;
  d_batch_residual_attn_in = d_persistent_batch_residual_;
  d_batch_residual_ffn_in = d_persistent_batch_residual_ + num_tokens_in_batch * hidden_size;
  d_batch_x_norm_out_ffn = d_persistent_batch_norm_out_;
  d_batch_ffn_gate_proj_out = d_persistent_gate_proj_out_;
  d_batch_ffn_up_proj_out = d_persistent_up_proj_out_;
  d_batch_ffn_swiglu_out = d_persistent_swiglu_out_;
  d_batch_ffn_down_proj_out = d_persistent_mlp_down_out_;
  d_batch_layer_output = d_persistent_batch_input_;
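  // All intermediate batch tensors alias persistent workspace buffers that are grown on demand by
  // resize_persistent_batch_buffers_if_needed(), so no per-call cudaMalloc is needed. The
  // attention and FFN norm outputs deliberately reuse the same d_persistent_batch_norm_out_
  // buffer, and the residual workspace is split into two halves (attention half, FFN half).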
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cublasStatus_t stream_status = cublasSetStream(cublas_handle_, stream);
  if (stream_status != CUBLAS_STATUS_SUCCESS) {
    Logger::fatal("cublasSetStream failed in forward_device_batch_prefill");
    throw std::runtime_error("cublasSetStream failed");
    Logger::info("[FWD_DEV_BATCH_PREFILL_LAYER_START] Processing Layer: model_idx=" + std::to_string(l_model_idx) +
                 ", gpu_idx=" + std::to_string(l_gpu_idx) +
    gpuErrchk(cudaMemcpyAsync(d_batch_residual_attn_in, d_batch_x_ptr, batch_hidden_size_bytes, cudaMemcpyDeviceToDevice, stream));
    rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
                       layers[l_model_idx].input_layernorm_dev,
    const float* w_q_layer_ptr = w_q_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
                    d_batch_x_norm_out_attn, hidden_size, w_q_layer_ptr, hidden_size, &beta,
                    d_batch_q_proj_out, hidden_size, stream, "Q_PROJ_GEN");
    Logger::info("[GPU_Q_PROJ] Layer=" + std::to_string(l_model_idx) +
    int log_elements_common = std::min(static_cast<int>(head_dim), 3);
    if (log_elements_common <= 0 && hidden_size > 0) log_elements_common = std::min(static_cast<int>(hidden_size), 3);
    if (log_elements_common > 0) {
      std::vector<float> h_sample_t0(log_elements_common);
      gpuErrchk(cudaMemcpyAsync(h_sample_t0.data(), d_batch_q_proj_out, log_elements_common * sizeof(float), cudaMemcpyDeviceToHost, stream));
      if (num_tokens_in_batch <= 1) gpuErrchk(cudaStreamSynchronize(stream));
      std::string str_t0 = "";
      for (float val : h_sample_t0) { str_t0 += std::to_string(val) + " "; }
      Logger::debug("[FWD_DEV_BATCH_PREFILL_LAYER_L" + std::to_string(l_model_idx) +
                    "] Q_PROJ_OUT (T0, H0, first " + std::to_string(log_elements_common) + "): " + str_t0);
      if (num_tokens_in_batch > 1) {
        std::vector<float> h_sample_t1(log_elements_common);
        gpuErrchk(cudaMemcpyAsync(h_sample_t1.data(), d_batch_q_proj_out + hidden_size, log_elements_common * sizeof(float), cudaMemcpyDeviceToHost, stream));
        gpuErrchk(cudaStreamSynchronize(stream));
        std::string str_t1 = "";
        for (float val : h_sample_t1) { str_t1 += std::to_string(val) + " "; }
        Logger::debug("[FWD_DEV_BATCH_PREFILL_LAYER_L" + std::to_string(l_model_idx) +
                      "] Q_PROJ_OUT (T1, H0, first " + std::to_string(log_elements_common) + "): " + str_t1);
    const float* w_k_layer_ptr = w_k_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
                    d_batch_x_norm_out_attn, hidden_size, w_k_layer_ptr, n_kv_dim, &beta,
                    d_batch_k_proj_out, n_kv_dim, stream, "K_PROJ_GEN");
    const float* w_v_layer_ptr = w_v_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
                    d_batch_x_norm_out_attn, hidden_size, w_v_layer_ptr, n_kv_dim, &beta,
                    d_batch_v_proj_out, n_kv_dim, stream, "V_PROJ_GEN");
    int log_elements_rope = std::min(3, head_dim);
    for (int token_to_log_idx_rope = 0; token_to_log_idx_rope < std::min(num_tokens_in_batch, 2); ++token_to_log_idx_rope) {
      std::vector<float> h_q_pre_rope(log_elements_rope);
      gpuErrchk(cudaMemcpyAsync(h_q_pre_rope.data(), d_batch_q_proj_out + q_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost, stream));
      gpuErrchk(cudaStreamSynchronize(stream));
      std::string str_q_pre_rope = "";
      for (float val : h_q_pre_rope) { str_q_pre_rope += std::to_string(val) + " "; }
      std::vector<float> h_k_pre_rope(log_elements_rope);
      gpuErrchk(cudaMemcpyAsync(h_k_pre_rope.data(), d_batch_k_proj_out + k_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost, stream));
      gpuErrchk(cudaStreamSynchronize(stream));
      std::string str_k_pre_rope = "";
      for (float val : h_k_pre_rope) { str_k_pre_rope += std::to_string(val) + " "; }
    rope_batch_cuda(d_batch_q_proj_out, d_batch_k_proj_out, all_freqs_cis_dev, num_tokens_in_batch,
    gpuErrchk(cudaStreamSynchronize(stream));
    int log_elements_rope = std::min(3, head_dim);
    for (int token_to_log_idx_rope = 0; token_to_log_idx_rope < std::min(num_tokens_in_batch, 2); ++token_to_log_idx_rope) {
      std::vector<float> h_q_post_rope(log_elements_rope);
      gpuErrchk(cudaMemcpy(h_q_post_rope.data(), d_batch_q_proj_out + q_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost));
      std::string str_q_post_rope = "";
      for (float val : h_q_post_rope) { str_q_post_rope += std::to_string(val) + " "; }
      std::vector<float> h_k_post_rope(log_elements_rope);
      gpuErrchk(cudaMemcpy(h_k_post_rope.data(), d_batch_k_proj_out + k_log_offset, log_elements_rope * sizeof(float), cudaMemcpyDeviceToHost));
      std::string str_k_post_rope = "";
      for (float val : h_k_post_rope) { str_k_post_rope += std::to_string(val) + " "; }
    int log_elements = std::min(3, head_dim);
    if (d_batch_k_proj_out) {
      std::vector<float> h_k_log_token0(log_elements);
      gpuErrchk(cudaMemcpyAsync(h_k_log_token0.data(), d_batch_k_proj_out, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream));
      std::vector<float> h_k_log_token1(log_elements);
      if (num_tokens_in_batch > 1) { gpuErrchk(cudaMemcpyAsync(h_k_log_token1.data(), d_batch_k_proj_out + n_kv_dim, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream)); }
      gpuErrchk(cudaStreamSynchronize(stream));
    if (d_batch_v_proj_out) {
      std::vector<float> h_v_log_token0(log_elements);
      gpuErrchk(cudaMemcpyAsync(h_v_log_token0.data(), d_batch_v_proj_out, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream));
      std::vector<float> h_v_log_token1(log_elements);
      if (num_tokens_in_batch > 1) { gpuErrchk(cudaMemcpyAsync(h_v_log_token1.data(), d_batch_v_proj_out + n_kv_dim, log_elements * sizeof(float), cudaMemcpyDeviceToHost, stream)); }
      gpuErrchk(cudaStreamSynchronize(stream));
    float* d_layer_k_cache_ptr = kv_cache->layers[l_model_idx].k_dev_fp32;
    float* d_layer_v_cache_ptr = kv_cache->layers[l_model_idx].v_dev_fp32;
    update_kv_cache_batch_cuda(d_layer_k_cache_ptr, d_batch_k_proj_out, current_model_pos, num_tokens_in_batch,
    update_kv_cache_batch_cuda(d_layer_v_cache_ptr, d_batch_v_proj_out, current_model_pos, num_tokens_in_batch,
    gpuErrchk(cudaStreamSynchronize(stream));
    float current_attention_scale = 1.0f / sqrtf((float)head_dim);
    attention_batch_prefill_cuda(d_batch_q_proj_out, nullptr, nullptr,
                                 d_layer_k_cache_ptr, d_layer_v_cache_ptr,
                                 d_batch_attn_heads_concat_out, num_tokens_in_batch, current_model_pos,
    const float* w_o_layer_ptr = w_o_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
                    d_batch_attn_heads_concat_out, hidden_size, w_o_layer_ptr, hidden_size, &beta,
                    d_batch_attn_final_proj_out, hidden_size, stream, "O_PROJ_GEN");
    add_residual_batch_cuda(d_batch_residual_ffn_in, d_batch_attn_final_proj_out, d_batch_residual_attn_in,
                            num_tokens_in_batch, hidden_size, stream);
    rmsnorm_batch_cuda(d_batch_x_norm_out_ffn, d_batch_residual_ffn_in,
                       layers[l_model_idx].post_attention_layernorm_dev,
    const float* w1_layer_ptr = w_gate_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
                    d_batch_x_norm_out_ffn, hidden_size, w1_layer_ptr, ffn_intermediate_dim, &beta,
                    d_batch_ffn_gate_proj_out, ffn_intermediate_dim, stream, "FFN_GATE_PROJ_GEN");
    const float* w3_layer_ptr = w_up_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
                    d_batch_x_norm_out_ffn, hidden_size, w3_layer_ptr, ffn_intermediate_dim, &beta,
                    d_batch_ffn_up_proj_out, ffn_intermediate_dim, stream, "FFN_UP_PROJ_GEN");
    swiglu_batch_cuda(d_batch_ffn_swiglu_out, d_batch_ffn_gate_proj_out, d_batch_ffn_up_proj_out,
                      num_tokens_in_batch, ffn_intermediate_dim, stream);
    const float* w2_layer_ptr = w_down_f32_dev_ + (size_t)l_gpu_idx * ffn_intermediate_dim * hidden_size;
                    d_batch_ffn_swiglu_out, ffn_intermediate_dim, w2_layer_ptr, hidden_size, &beta,
                    d_batch_ffn_down_proj_out, hidden_size, stream, "FFN_DOWN_PROJ_GEN");
    add_residual_batch_cuda(d_batch_layer_output, d_batch_ffn_down_proj_out, d_batch_residual_ffn_in,
                            num_tokens_in_batch, hidden_size, stream);
    d_batch_x_ptr = d_batch_layer_output;
    Logger::info("[FWD_DEV_BATCH_PREFILL_LAYER_END] Layer " + std::to_string(l_model_idx) +
                 " finished. Next d_batch_x_ptr: " + Logger::ptrToString(d_batch_x_ptr));
  if (num_tokens_in_batch > 0) {
    size_t offset_last_token_hidden_state = (size_t)(num_tokens_in_batch - 1) * config_.hidden_size;
    gpuErrchk(cudaMemcpyAsync(h_last_token_hidden_state.data(),
                              d_batch_x_ptr + offset_last_token_hidden_state,
                              cudaMemcpyDeviceToHost, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
    Logger::log_vector_stats("[FWD_DEV_BATCH_PREFILL_LAST_TOKEN_HIDDEN_STATE_PRE_FINAL_RMSNORM]", h_last_token_hidden_state, 20);
    rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
    float* d_logits_last_token;
    gpuErrchk(cudaMalloc(&d_logits_last_token, (size_t)vocab_size * sizeof(float)));
    float* d_last_token_activations_for_logits = d_batch_x_norm_out_attn + (size_t)(num_tokens_in_batch - 1) * hidden_size;
    matvec_f32_f32_cuda(cublas_handle_, lm_head_f32_dev_, d_last_token_activations_for_logits,
                        d_logits_last_token, vocab_size, hidden_size, stream);
    std::vector<float> h_logits(vocab_size);
    gpuErrchk(cudaMemcpyAsync(h_logits.data(), d_logits_last_token, (size_t)vocab_size * sizeof(float),
                              cudaMemcpyDeviceToHost, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
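    // Prefill only needs logits for the very last token of the batch (the next-token prediction
    // point), so a single vocab_size-sized buffer is allocated, filled from the last token's
    // normalized activations via the F32 LM head, and copied back to the host.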
    if (static_cast<size_t>(first_gpu_layer_model_idx) < kv_cache->layers.size()) {
      const KVCacheLayer& cache_layer_to_log = kv_cache->layers[first_gpu_layer_model_idx];
      const float* d_k_cache_ptr = cache_layer_to_log.k_dev_fp32;
      const float* d_v_cache_ptr = cache_layer_to_log.v_dev_fp32;
      const int local_n_kv_dim_for_log = num_kv_h * head_dim;
      const int log_elems_kv = std::min(3, head_dim);
      if (d_k_cache_ptr && d_v_cache_ptr && log_elems_kv > 0 && local_n_kv_dim_for_log > 0) {
        for (int tk_idx = 0; tk_idx < num_tokens_in_batch; ++tk_idx) {
          int cache_pos_for_token = current_model_pos + tk_idx;
            Logger::warning("[KVDUMP_POST_BATCH_PREFILL] L" + std::to_string(first_gpu_layer_model_idx) +
                            " Token " + std::to_string(tk_idx) + " (CachePos " + std::to_string(cache_pos_for_token) +
                            ") would be out of bounds (" + std::to_string(kv_cache->max_seq_len_config_) + "). Skipping.");
          for (int kvh_idx = 0; kvh_idx < num_kv_h; ++kvh_idx) {
            size_t offset_in_cache = (size_t)cache_pos_for_token * local_n_kv_dim_for_log + (size_t)kvh_idx * head_dim;
            std::vector<float> h_k_dump(log_elems_kv);
            gpuErrchk(cudaMemcpy(h_k_dump.data(), d_k_cache_ptr + offset_in_cache, log_elems_kv * sizeof(float), cudaMemcpyDeviceToHost));
            std::string str_k_dump = "";
            for (float val : h_k_dump) { str_k_dump += std::to_string(val) + " "; }
            std::vector<float> h_v_dump(log_elems_kv);
            gpuErrchk(cudaMemcpy(h_v_dump.data(), d_v_cache_ptr + offset_in_cache, log_elems_kv * sizeof(float), cudaMemcpyDeviceToHost));
            std::string str_v_dump = "";
            for (float val : h_v_dump) { str_v_dump += std::to_string(val) + " "; }
        Logger::warning("[KVDUMP_POST_BATCH_PREFILL] L" + std::to_string(first_gpu_layer_model_idx) +
                        " cannot log K/V cache: null pointers, log_elems_kv <= 0, or local_n_kv_dim_for_log <= 0.");
      Logger::warning("[KVDUMP_POST_BATCH_PREFILL] First GPU layer index " + std::to_string(first_gpu_layer_model_idx) +
                      " out of bounds for kv_cache->layers (size " + std::to_string(kv_cache->layers.size()) + ")");
    gpuErrchk(cudaFree(d_logits_last_token));
    Logger::info("[FWD_DEV_BATCH_PREFILL_EXIT] Function finished.");
1832std::vector<std::vector<float>> TinyLlamaModel::forward_device_batch_generation(
1833 float* d_batch_input_embeddings,
1834 const std::vector<int>& token_positions,
1835 const std::vector<int>& original_sequence_indices,
1836 int num_tokens_in_batch,
1838 cudaStream_t stream) {
1839 Logger::info(
"[FWD_DEV_BATCH_GENERATION_ENTRY] num_tokens_in_batch: " + std::to_string(num_tokens_in_batch) +
1842 if (token_positions.size() !=
static_cast<size_t>(num_tokens_in_batch)) {
1843 Logger::error(
"[FWD_DEV_BATCH_GENERATION] token_positions size mismatch. Expected: " +
1844 std::to_string(num_tokens_in_batch) +
" Got: " + std::to_string(token_positions.size()));
1847 if (original_sequence_indices.size() !=
static_cast<size_t>(num_tokens_in_batch)) {
1848 Logger::error(
"[CPU_BATCH_GENERATION] original_sequence_indices size mismatch. Expected: " +
1849 std::to_string(num_tokens_in_batch) +
" Got: " + std::to_string(original_sequence_indices.size()));
  const size_t batch_hidden_size_bytes       = (size_t)num_tokens_in_batch * hidden_size * sizeof(float);
  const size_t batch_intermediate_size_bytes = (size_t)num_tokens_in_batch * ffn_intermediate_dim * sizeof(float);
  const size_t batch_q_size_bytes            = (size_t)num_tokens_in_batch * hidden_size * sizeof(float);
  const size_t batch_kv_size_bytes           = (size_t)num_tokens_in_batch * n_kv_dim * sizeof(float);

  float* d_batch_x_norm_out_attn;
  float* d_batch_q_proj_out;
  float* d_batch_k_proj_out;
  float* d_batch_v_proj_out;
  float* d_batch_attn_heads_concat_out;
  float* d_batch_attn_final_proj_out;
  float* d_batch_residual_attn_in;
  float* d_batch_residual_ffn_in;
  float* d_batch_x_norm_out_ffn;
  float* d_batch_ffn_gate_proj_out;
  float* d_batch_ffn_up_proj_out;
  float* d_batch_ffn_swiglu_out;
  float* d_batch_ffn_down_proj_out;
  float* d_batch_layer_output = nullptr;
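  // The scratch pointers above alias persistent, pre-sized device buffers rather
  // than per-call cudaMalloc allocations. Note that the residual buffer is split
  // in two: the first num_tokens_in_batch * hidden_size floats hold the attention
  // residual input, the second half the FFN residual input. The two norm outputs
  // share one buffer because they are never live at the same time.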
  resize_persistent_batch_buffers_if_needed(num_tokens_in_batch);

  d_batch_x_norm_out_attn       = d_persistent_batch_norm_out_;
  d_batch_q_proj_out            = d_persistent_q_batch_;
  d_batch_k_proj_out            = d_persistent_k_batch_;
  d_batch_v_proj_out            = d_persistent_v_batch_;
  d_batch_attn_heads_concat_out = d_persistent_attn_output_;
  d_batch_attn_final_proj_out   = d_persistent_attn_proj_out_;
  d_batch_residual_attn_in      = d_persistent_batch_residual_;
  d_batch_residual_ffn_in       = d_persistent_batch_residual_ + num_tokens_in_batch * hidden_size;
  d_batch_x_norm_out_ffn        = d_persistent_batch_norm_out_;
  d_batch_ffn_gate_proj_out     = d_persistent_gate_proj_out_;
  d_batch_ffn_up_proj_out       = d_persistent_up_proj_out_;
  d_batch_ffn_swiglu_out        = d_persistent_swiglu_out_;
  d_batch_ffn_down_proj_out     = d_persistent_mlp_down_out_;
  d_batch_layer_output          = d_persistent_batch_input_;
  const float alpha = 1.0f, beta = 0.0f;

  cublasStatus_t stream_status = cublasSetStream(cublas_handle_, stream);
  if (stream_status != CUBLAS_STATUS_SUCCESS) {
    Logger::fatal("cublasSetStream failed in forward_device_batch_generation");
    throw std::runtime_error("cublasSetStream failed");
  }

  float* d_batch_x_ptr = d_batch_input_embeddings;
  // Per-layer loop over the GPU-resident layers. The loop header is not shown in
  // this excerpt; the bounds below are an assumption (GPU layers follow the
  // CPU-offloaded ones), with l_model_idx the absolute layer index and l_gpu_idx
  // the index into the concatenated GPU weight blocks. The projection GEMMs are
  // routed through smart_gemm_batch_cuda (inferred from the operation-name
  // arguments); their leading size arguments are elided below.
  for (int l_gpu_idx = 0; l_gpu_idx < num_gpu_layers_ /* assumed bound */; ++l_gpu_idx) {
    const int l_model_idx = l_gpu_idx + num_cpu_offload_layers_ /* assumed offset */;

    Logger::info("[FWD_DEV_BATCH_GENERATION_LAYER_START] Processing Layer: model_idx=" +
                 std::to_string(l_model_idx) + ", gpu_idx=" + std::to_string(l_gpu_idx));

    // Keep the layer input around as the attention residual.
    gpuErrchk(cudaMemcpyAsync(d_batch_residual_attn_in, d_batch_x_ptr, batch_hidden_size_bytes,
                              cudaMemcpyDeviceToDevice, stream));

    // Pre-attention RMSNorm over the whole batch.
    rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
                       layers[l_model_idx].input_layernorm_dev,
                       /* token count, hidden size, eps, stream: elided */);

    // Q/K/V projections for the whole batch.
    const float* w_q_layer_ptr = w_q_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
    smart_gemm_batch_cuda(/* transpose flags, m, n, k elided */ &alpha,
                          d_batch_x_norm_out_attn, hidden_size, w_q_layer_ptr, hidden_size, &beta,
                          d_batch_q_proj_out, hidden_size, stream, "Q_PROJ_GEN");

    const float* w_k_layer_ptr = w_k_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
    smart_gemm_batch_cuda(/* ... */ &alpha,
                          d_batch_x_norm_out_attn, hidden_size, w_k_layer_ptr, n_kv_dim, &beta,
                          d_batch_k_proj_out, n_kv_dim, stream, "K_PROJ_GEN");

    const float* w_v_layer_ptr = w_v_f32_dev_ + (size_t)l_gpu_idx * n_kv_dim * hidden_size;
    smart_gemm_batch_cuda(/* ... */ &alpha,
                          d_batch_x_norm_out_attn, hidden_size, w_v_layer_ptr, n_kv_dim, &beta,
                          d_batch_v_proj_out, n_kv_dim, stream, "V_PROJ_GEN");

    // RoPE is applied per token at its absolute position (the rotation calls
    // themselves are elided in this excerpt).
    for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
      int current_pos = token_positions[token_idx];
      /* rotary position embedding applied to this token's Q and K rows at current_pos */
    }

    // Append this batch's K/V vectors to the per-layer cache.
    float* d_layer_k_cache_ptr = kv_cache->layers[l_model_idx].k_dev_fp32;
    float* d_layer_v_cache_ptr = kv_cache->layers[l_model_idx].v_dev_fp32;

    for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
      int current_pos = token_positions[token_idx];
      int sequence_idx = original_sequence_indices[token_idx];
      int sequence_cache_offset;  // per-sequence base offset in the cache; its derivation from sequence_idx is elided
      int actual_cache_pos = sequence_cache_offset + current_pos;

      // Assumed row strides: one n_kv_dim-wide row per token in the K/V projection outputs.
      const float* k_token_ptr = d_batch_k_proj_out + (size_t)token_idx * n_kv_dim;
      const float* v_token_ptr = d_batch_v_proj_out + (size_t)token_idx * n_kv_dim;

      for (int kvh = 0; kvh < num_kv_h; ++kvh) {  // loop header assumed
        const float* current_k_head_ptr = k_token_ptr + kvh * head_dim;
        const float* current_v_head_ptr = v_token_ptr + kvh * head_dim;

        update_kv_cache_cuda(d_layer_k_cache_ptr, current_k_head_ptr, actual_cache_pos,
                             /* kv-head index argument elided */
                             kv_cache->allocated_num_kv_heads,
                             kv_cache->allocated_head_dim, stream);
        update_kv_cache_cuda(d_layer_v_cache_ptr, current_v_head_ptr, actual_cache_pos,
                             kv_cache->allocated_num_kv_heads,
                             kv_cache->allocated_head_dim, stream);
      }
    }

    // Per-token attention over the cache.
    for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
      int current_pos = token_positions[token_idx];
      // Assumed per-token views into the batch buffers.
      const float* q_token_ptr = d_batch_q_proj_out + (size_t)token_idx * hidden_size;
      float* attn_output_token_ptr = d_batch_attn_heads_concat_out + (size_t)token_idx * hidden_size;
      const auto& batch_kv_layer = kv_cache->layers[l_model_idx];  // assumed source of the quantized cache views

      float scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));

      if (/* quantized KV cache in use && */ selective_k_dequant_buffer_dev_ && selective_v_dequant_buffer_dev_) {
        // Attention kernel that dequantizes only the cache rows it actually reads.
        attention_cuda_selective_dequant(
            /* query pointer and geometry arguments elided */
            batch_kv_layer.k_dev_quantized,
            batch_kv_layer.v_dev_quantized,
            batch_kv_layer.k_dev_scales,
            batch_kv_layer.v_dev_scales,
            selective_k_dequant_buffer_dev_,
            selective_v_dequant_buffer_dev_,
            attn_output_token_ptr,
            /* ... */
            kv_cache->allocated_max_seq_len,
            kv_cache->allocated_num_kv_heads,
            /* remaining arguments elided */);
      } else if (/* optimized kernel path enabled */) {
        attention_cuda_optimized(q_token_ptr, d_layer_k_cache_ptr, d_layer_v_cache_ptr,
                                 /* ... */ scale, kv_cache->allocated_max_seq_len,
                                 kv_cache->allocated_num_kv_heads, stream);
      } else {
        attention_cuda(q_token_ptr, d_layer_k_cache_ptr, d_layer_v_cache_ptr,
                       /* ... */ scale, kv_cache->allocated_max_seq_len,
                       kv_cache->allocated_num_kv_heads, stream);
      }
    }

    // Output projection, residual add, FFN (SwiGLU), second residual add.
    const float* w_o_layer_ptr = w_o_f32_dev_ + (size_t)l_gpu_idx * hidden_size * hidden_size;
    smart_gemm_batch_cuda(/* ... */ &alpha,
                          d_batch_attn_heads_concat_out, hidden_size, w_o_layer_ptr, hidden_size, &beta,
                          d_batch_attn_final_proj_out, hidden_size, stream, "O_PROJ");

    add_residual_batch_cuda(d_batch_residual_ffn_in, d_batch_attn_final_proj_out, d_batch_residual_attn_in,
                            num_tokens_in_batch, hidden_size, stream);

    rmsnorm_batch_cuda(d_batch_x_norm_out_ffn, d_batch_residual_ffn_in,
                       layers[l_model_idx].post_attention_layernorm_dev,
                       /* token count, hidden size, eps, stream: elided */);

    const float* w1_layer_ptr = w_gate_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
    smart_gemm_batch_cuda(/* ... */ &alpha,
                          d_batch_x_norm_out_ffn, hidden_size, w1_layer_ptr, ffn_intermediate_dim, &beta,
                          d_batch_ffn_gate_proj_out, ffn_intermediate_dim, stream, "FFN_GATE_PROJ_GEN");

    const float* w3_layer_ptr = w_up_f32_dev_ + (size_t)l_gpu_idx * hidden_size * ffn_intermediate_dim;
    smart_gemm_batch_cuda(/* ... */ &alpha,
                          d_batch_x_norm_out_ffn, hidden_size, w3_layer_ptr, ffn_intermediate_dim, &beta,
                          d_batch_ffn_up_proj_out, ffn_intermediate_dim, stream, "FFN_UP_PROJ_GEN");

    swiglu_batch_cuda(d_batch_ffn_swiglu_out, d_batch_ffn_gate_proj_out, d_batch_ffn_up_proj_out,
                      num_tokens_in_batch, ffn_intermediate_dim, stream);

    const float* w2_layer_ptr = w_down_f32_dev_ + (size_t)l_gpu_idx * ffn_intermediate_dim * hidden_size;
    smart_gemm_batch_cuda(/* ... */ &alpha,
                          d_batch_ffn_swiglu_out, ffn_intermediate_dim, w2_layer_ptr, hidden_size, &beta,
                          d_batch_ffn_down_proj_out, hidden_size, stream, "FFN_DOWN_PROJ_GEN");

    add_residual_batch_cuda(d_batch_layer_output, d_batch_ffn_down_proj_out, d_batch_residual_ffn_in,
                            num_tokens_in_batch, hidden_size, stream);

    d_batch_x_ptr = d_batch_layer_output;
    Logger::info("[FWD_DEV_BATCH_GENERATION_LAYER_END] Layer " + std::to_string(l_model_idx) +
                 " finished. Next d_batch_x_ptr: " + Logger::ptrToString(d_batch_x_ptr));
  }
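  // After the last layer, one more RMSNorm is applied to the whole batch (reusing
  // the attention-norm scratch buffer), followed by a single LM-head GEMM that
  // produces vocab_size logits for every token in the batch.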
  rmsnorm_batch_cuda(d_batch_x_norm_out_attn, d_batch_x_ptr,
                     /* final-norm weights and remaining arguments elided */);

  float* d_logits_batch;
  gpuErrchk(cudaMalloc(&d_logits_batch, (size_t)num_tokens_in_batch * vocab_size * sizeof(float)));

  smart_gemm_batch_cuda(/* transpose flags, m, n, k elided */ &alpha,
                        d_batch_x_norm_out_attn, hidden_size, lm_head_f32_dev_, vocab_size, &beta,
                        d_logits_batch, vocab_size, stream, "LM_HEAD_GEN");

  std::vector<std::vector<float>> all_logits(num_tokens_in_batch, std::vector<float>(vocab_size));
  for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
    gpuErrchk(cudaMemcpyAsync(all_logits[token_idx].data(),
                              d_logits_batch + (size_t)token_idx * vocab_size,
                              vocab_size * sizeof(float),
                              cudaMemcpyDeviceToHost, stream));
  }
  gpuErrchk(cudaStreamSynchronize(stream));

  Logger::info("[FWD_DEV_BATCH_GENERATION_FINAL_LOGITS] Calculated logits for " +
               std::to_string(num_tokens_in_batch) + " tokens");
  gpuErrchk(cudaFree(d_logits_batch));
  Logger::info("[FWD_DEV_BATCH_GENERATION_EXIT] Function finished.");
  return all_logits;
}
2087 const std::vector<float>& batch_input_activations,
2088 int num_tokens_in_batch,
2089 int num_cpu_layers_to_process,
2090 int start_pos_in_sequence,
2092 const std::vector<int>& prompt_lengths) {
2099 batch_input_activations,
2100 num_tokens_in_batch,
2101 num_cpu_layers_to_process,
2102 start_pos_in_sequence,
void TinyLlamaModel::smart_gemm_batch_cuda(bool transa_user, bool transb_user,
                                           int m_user, int n_user, int k_user,
                                           const float* alpha_user,
                                           const float* A_f32_user, int lda_user,
                                           const float* B_f32_user, int ldb_user,
                                           const float* beta_user,
                                           float* C_f32_user, int ldc_user,
                                           cudaStream_t stream,
                                           const char* operation_name) {
  // Heuristic dispatch: small batches use the plain FP32 GEMM; larger batches go
  // through BF16 Tensor Cores when a BF16 copy of the weight block is available.
  const int tensor_core_threshold = 4;
  // Exact condition partly elided in the original; the member flag and the
  // batch-size threshold are implied by the log messages below.
  bool use_tensor_cores = use_bf16_tensor_cores_ && (m_user >= tensor_core_threshold);

  uint16_t* bf16_weight_ptr = nullptr;

  if (use_tensor_cores) {
    Logger::info("[SMART_GEMM] Using BF16 Tensor Cores for " + std::string(operation_name) +
                 " (batch_size=" + std::to_string(m_user) +
                 " >= " + std::to_string(tensor_core_threshold) + ")");

    // Map the FP32 weight pointer to its BF16 twin: first try an exact match
    // against each block's base pointer, then fall back to translating an
    // offset into one of the concatenated blocks.
    if (B_f32_user == w_q_f32_dev_) {
      bf16_weight_ptr = w_q_bf16_dev_;
    } else if (B_f32_user == w_k_f32_dev_) {
      bf16_weight_ptr = w_k_bf16_dev_;
    } else if (B_f32_user == w_v_f32_dev_) {
      bf16_weight_ptr = w_v_bf16_dev_;
    } else if (B_f32_user == w_o_f32_dev_) {
      bf16_weight_ptr = w_o_bf16_dev_;
    } else if (B_f32_user == w_gate_f32_dev_) {
      bf16_weight_ptr = w_gate_bf16_dev_;
    } else if (B_f32_user == w_up_f32_dev_) {
      bf16_weight_ptr = w_up_bf16_dev_;
    } else if (B_f32_user == w_down_f32_dev_) {
      bf16_weight_ptr = w_down_bf16_dev_;
    } else {
      // B_f32_user points into the interior of one of the blocks (a per-layer
      // offset). Both representations store one value per element, so the
      // element offset carries over directly; the range checks' upper bounds
      // are elided here.
      size_t offset_bytes = 0;
      if (B_f32_user > w_q_f32_dev_ /* && within the Q block */) {
        offset_bytes = (B_f32_user - w_q_f32_dev_) * sizeof(float);
        bf16_weight_ptr = w_q_bf16_dev_ + (offset_bytes / sizeof(float));
      } else if (B_f32_user > w_k_f32_dev_ /* && within the K block */) {
        offset_bytes = (B_f32_user - w_k_f32_dev_) * sizeof(float);
        bf16_weight_ptr = w_k_bf16_dev_ + (offset_bytes / sizeof(float));
      } else if (B_f32_user > w_v_f32_dev_ /* && within the V block */) {
        offset_bytes = (B_f32_user - w_v_f32_dev_) * sizeof(float);
        bf16_weight_ptr = w_v_bf16_dev_ + (offset_bytes / sizeof(float));
      } else if (B_f32_user > w_o_f32_dev_ /* && within the O block */) {
        offset_bytes = (B_f32_user - w_o_f32_dev_) * sizeof(float);
        bf16_weight_ptr = w_o_bf16_dev_ + (offset_bytes / sizeof(float));
      } else if (B_f32_user > w_gate_f32_dev_ /* && within the gate block */) {
        offset_bytes = (B_f32_user - w_gate_f32_dev_) * sizeof(float);
        bf16_weight_ptr = w_gate_bf16_dev_ + (offset_bytes / sizeof(float));
      } else if (B_f32_user > w_up_f32_dev_ /* && within the up block */) {
        offset_bytes = (B_f32_user - w_up_f32_dev_) * sizeof(float);
        bf16_weight_ptr = w_up_bf16_dev_ + (offset_bytes / sizeof(float));
      } else if (B_f32_user > w_down_f32_dev_ /* && within the down block */) {
        offset_bytes = (B_f32_user - w_down_f32_dev_) * sizeof(float);
        bf16_weight_ptr = w_down_bf16_dev_ + (offset_bytes / sizeof(float));
      } else {
        Logger::warning("[SMART_GEMM] Unknown weight pointer for " + std::string(operation_name) +
                        ", falling back to FP32");
        use_tensor_cores = false;
      }
    }
  }

  if (use_tensor_cores && bf16_weight_ptr) {
    try {
      // FP32 activations x BF16 weights, with FP32 accumulation and output.
      gemm_f32_to_bf16_f32_cuda(cublas_handle_, transa_user, transb_user,
                                m_user, n_user, k_user, alpha_user,
                                A_f32_user, lda_user, bf16_weight_ptr, ldb_user,
                                beta_user, C_f32_user, ldc_user, stream);
      Logger::info("[SMART_GEMM] Successfully used BF16 Tensor Cores for " + std::string(operation_name));
      return;  // skip the FP32 fallback below
    } catch (const std::exception& e) {
      Logger::warning("[SMART_GEMM] BF16 Tensor Cores failed for " + std::string(operation_name) +
                      ": " + e.what() + ". Falling back to FP32.");
      use_tensor_cores = false;
    }
  }

  Logger::info("[SMART_GEMM] Using FP32 GEMM for " + std::string(operation_name) +
               " (batch_size=" + std::to_string(m_user) +
               " < " + std::to_string(tensor_core_threshold) +
               " or Tensor Cores unavailable)");
  gemm_f32_f32_cuda(cublas_handle_, transa_user, transb_user,
                    m_user, n_user, k_user, alpha_user,
                    A_f32_user, lda_user, B_f32_user, ldb_user,
                    beta_user, C_f32_user, ldc_user, stream);
}
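// The BF16 path above assumes every FP32 weight block has a pre-converted BF16
// twin (w_q_bf16_dev_ and friends). The conversion itself is just keeping the
// sign, the 8 exponent bits and the top 7 mantissa bits of each float,
// optionally with round-to-nearest-even. A minimal standalone CUDA sketch of
// such a conversion (names are illustrative, not the kernels used in this file;
// NaN handling omitted for brevity):
__global__ void f32_to_bf16_example_kernel(const float* __restrict__ src,
                                           uint16_t* __restrict__ dst, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i >= n) return;
  uint32_t bits = __float_as_uint(src[i]);
  // Round to nearest even before dropping the low 16 mantissa bits.
  uint32_t rounding = 0x7FFFu + ((bits >> 16) & 1u);
  dst[i] = (uint16_t)((bits + rounding) >> 16);
}

// Host-side helper launching the conversion on the given stream.
static void convert_f32_to_bf16_example(const float* d_src, uint16_t* d_dst,
                                        size_t n, cudaStream_t stream) {
  const int block = 256;
  const int grid = (int)((n + block - 1) / block);
  f32_to_bf16_example_kernel<<<grid, block, 0, stream>>>(d_src, d_dst, n);
}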