TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
TinyLlamaModel Class Reference

Main transformer model class for TinyLlama.

#include <model.h>

Collaboration diagram for TinyLlamaModel (graph not shown in this text export).

Public Member Functions

 TinyLlamaModel (const ModelConfig &config, const SafeTensorsLoader &loader)
 Construct a TinyLlamaModel from a SafeTensorsLoader.
 
 TinyLlamaModel (const ModelConfig &initial_config, const std::string &model_path)
 Construct a TinyLlamaModel from a model path (GGUF or SafeTensors).
 
 TinyLlamaModel (const ModelConfig &config_from_session, std::unique_ptr< GGUFData > gguf_data_from_session)
 Construct a TinyLlamaModel from pre-loaded GGUFData.
 
 ~TinyLlamaModel ()
 Destructor. Cleans up all allocated resources.
 
std::vector< float > forward (std::vector< float > &input, int n_tokens, KVCache *kv_cache, const std::vector< int > *attention_mask)
 Run the forward pass for the model on CPU layers.
 
void ensure_q_proj_dequantized (int layer_idx)
 
void ensure_k_proj_dequantized (int layer_idx)
 
void ensure_v_proj_dequantized (int layer_idx)
 
void ensure_o_proj_dequantized (int layer_idx)
 
void ensure_gate_proj_dequantized (int layer_idx)
 
void ensure_up_proj_dequantized (int layer_idx)
 
void ensure_down_proj_dequantized (int layer_idx)
 
void ensure_lm_head_dequantized ()
 
void ensure_embed_tokens_dequantized ()
 
void ensure_f32_concatenated_weights_loaded ()
 
void ensure_layer_weights_on_gpu (int layer_idx)
 
void free_layer_gpu_weights (int layer_idx)
 
void clear_layer_dequantized_weights (int layer_idx)
 
void initialize_gpu_and_rope ()
 
void ensure_bf16_concatenated_weights_loaded ()
 
void free_bf16_concatenated_weights ()
 
void smart_gemm_batch_cuda (bool transa_user, bool transb_user, int m_user, int n_user, int k_user, const float *alpha_user, const float *A_f32_user, int lda_user, const float *B_f32_user, int ldb_user, const float *beta_user, float *C_f32_user, int ldc_user, cudaStream_t stream, const char *operation_name="GEMM")
 
const ModelConfig & get_config () const
 
const std::vector< uint16_t > & get_lm_head () const
 
const std::vector< uint16_t > & get_embed_tokens () const
 
std::vector< LayerWeights > & get_layers ()
 
std::vector< float > lookup_embedding (int token_id)
 Lookup the embedding vector for a given token ID.
 
int get_vocab_size () const
 Get the vocabulary size for the model.
 
const GGUFData * get_gguf_data () const
 
GGUFData * get_gguf_data_ptr ()
 
void initialize_rope_freqs ()
 
std::vector< float > forward_cpu_batch (const std::vector< float > &batch_input_activations, int num_tokens_in_batch, int num_cpu_layers_to_process, int start_pos_in_sequence, KVCache *kv_cache, const std::vector< int > &prompt_lengths={})
 
std::vector< float > forward_cpu_logits_batch (const std::vector< float > &final_batch_activations, int num_tokens_in_batch)
 
std::vector< std::vector< float > > forward_cpu_batch_generation (const std::vector< float > &batch_input_activations, const std::vector< int > &token_positions, const std::vector< int > &original_sequence_indices, int num_tokens_in_batch, KVCache *kv_cache)
 

Private Member Functions

void initialize_weights (const SafeTensorsLoader *loader, const GGUFData *gguf)
 

Private Attributes

ModelConfig config_
 
bool use_bf16_tensor_cores_ = false
 
std::vector< uint16_t > embed_tokens
 
std::vector< uint16_t > lm_head
 
std::vector< uint16_t > final_norm
 
std::vector< float > embed_tokens_f32
 
std::vector< float > lm_head_f32
 
std::vector< float > final_norm_f32
 
std::vector< block_q4_K > embed_tokens_q4k
 
std::vector< block_q4_K > lm_head_q4k
 
std::vector< block_q4_K > final_norm_q4k
 
std::vector< block_q6_K > embed_tokens_q6k
 
std::vector< block_q6_K > lm_head_q6k
 
std::vector< block_q6_K > final_norm_q6k
 
std::vector< block_q8_0 > embed_tokens_q8_0
 
std::vector< block_q8_0 > lm_head_q8_0
 
std::vector< block_q8_K > embed_tokens_q8k
 
std::vector< block_q8_K > lm_head_q8k
 
std::vector< LayerWeights > layers
 
std::vector< std::pair< float, float > > precomputed_freqs_cis_
 
std::unique_ptr< GGUFData > gguf_data_
 
std::string model_path_
 
bool f32_concatenated_weights_loaded_ = false
 
std::unique_ptr< class CPUBatchProcessor > cpu_batch_processor_
 

Friends

class CPUBatchProcessor
 
void map_gguf_weights (const GGUFData &gguf, TinyLlamaModel &model)
 

Detailed Description

Main transformer model class for TinyLlama.

Handles weight loading, forward pass, and GPU/CPU offloading logic. Supports both GGUF and SafeTensors formats.

Definition at line 285 of file model.h.
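
The following is a minimal usage sketch, not taken from the repository's examples: the model path and the token id are placeholders, and passing a null KVCache pointer for a single illustrative step is an assumption (a real generation loop would pass a properly sized cache).

#include <string>
#include <vector>
#include "model.h"  // TinyLlamaModel, ModelConfig, KVCache

int main() {
    ModelConfig cfg;                    // defaults; GGUF/JSON metadata may override fields
    cfg.num_cpu_offload_layers = 0;     // treated as a CLI hint: 0 means "all layers on CPU"

    // Construct from a model path; ".gguf" or ".safetensors" selects the loader.
    TinyLlamaModel model(cfg, std::string("models/tinyllama.gguf"));  // hypothetical path

    // Embed one token (id 1 assumed valid) and run the CPU-resident layers once.
    std::vector<float> activation = model.lookup_embedding(/*token_id=*/1);
    std::vector<float> out = model.forward(activation, /*n_tokens=*/0,
                                           /*kv_cache=*/nullptr, /*attention_mask=*/nullptr);

    // With all layers on CPU, forward() returns the final logits.
    return out.size() == static_cast<size_t>(model.get_vocab_size()) ? 0 : 1;
}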

Constructor & Destructor Documentation

◆ TinyLlamaModel() [1/3]

TinyLlamaModel::TinyLlamaModel ( const ModelConfig &  config,
const SafeTensorsLoader &  loader 
)

Construct a TinyLlamaModel from a SafeTensorsLoader.

Parameters
config: Model configuration.
loader: SafeTensorsLoader instance.

Definition at line 144 of file model.cpp.

146 : config_(config) { // Copies the potentially faulty config first
147 config_.is_gguf_file_loaded = false; // Explicitly set to false for SafeTensors path
148 Logger::info("Constructing TinyLlamaModel from SafeTensorsLoader (is_gguf_file_loaded set to false).");
149 initialize_weights(&loader, nullptr);
151 Logger::info("TinyLlamaModel construction from SafeTensorsLoader complete.");
152}
static void info(const std::string &message)
Definition logger.cpp:135
ModelConfig config_
Definition model.h:480
void initialize_weights(const SafeTensorsLoader *loader, const GGUFData *gguf)
Definition model.cpp:38
std::vector< float > forward(std::vector< float > &input, int n_tokens, KVCache *kv_cache, const std::vector< int > *attention_mask)
Run the forward pass for the model on CPU layers.
Definition model.cpp:536
bool is_gguf_file_loaded
Definition model.h:101

References config_, Logger::info(), initialize_gpu_and_rope(), initialize_weights(), and ModelConfig::is_gguf_file_loaded.

◆ TinyLlamaModel() [2/3]

TinyLlamaModel::TinyLlamaModel ( const ModelConfig &  initial_config,
const std::string &  model_path 
)

Construct a TinyLlamaModel from a model path (GGUF or SafeTensors).

Parameters
initial_config: Initial model configuration (may be overridden by file metadata).
model_path: Path to the model file or directory.

Definition at line 154 of file model.cpp.

157#ifdef HAS_CUDA
158 , cublas_handle_(nullptr), token_embedding_table_dev_(nullptr), lm_head_dev_(nullptr), final_norm_dev(nullptr), w_q_dev_(nullptr), w_k_dev_(nullptr), w_v_dev_(nullptr), w_o_dev_(nullptr), w_gate_dev_(nullptr), w_up_dev_(nullptr), w_down_dev_(nullptr), all_freqs_cis_dev(nullptr), x_dev_(nullptr), x_norm_dev_(nullptr), x_resid1_dev_(nullptr), x_resid2_dev_(nullptr), q_dev_(nullptr), k_dev_(nullptr), v_dev_(nullptr), attn_out_dev_(nullptr), attn_proj_dev_(nullptr), gate_vec_dev_(nullptr), up_vec_dev_(nullptr), swiglu_vec_dev_(nullptr), mlp_down_dev_(nullptr), logits_dev_(nullptr), token_embedding_table_f32_dev_(nullptr), lm_head_f32_dev_(nullptr), w_q_f32_dev_(nullptr), w_k_f32_dev_(nullptr), w_v_f32_dev_(nullptr), w_o_f32_dev_(nullptr), w_gate_f32_dev_(nullptr), w_up_f32_dev_(nullptr), w_down_f32_dev_(nullptr)
159#endif
160{
161 Logger::info("TinyLlamaModel constructor entered. Model path (from string): " + model_path);
162 int cli_gpu_layer_request = initial_config.num_cpu_offload_layers;
163 bool cli_mmap_preference = initial_config.use_mmap_for_gguf;
164 this->config_ = initial_config;
165 if (this->model_path_.empty() && !model_path.empty()) {
166 this->model_path_ = model_path;
167 }
168 std::unique_ptr<SafeTensorsLoader> loader = nullptr;
169 if (!this->model_path_.empty() && this->model_path_.size() > 5 &&
170 this->model_path_.substr(this->model_path_.size() - 5) == ".gguf") {
171 Logger::info("GGUF file detected by path in Model Constructor: " + this->model_path_);
172 try {
174 Logger::info("TinyLlamaModel GGUF path: Using mmap setting " + std::string(force_mmap_for_gguf_load ? "true" : "false") +
175 " for gguf_meta/weight loading based on CLI mmap preference: " +
176 std::string(cli_mmap_preference ? "true" : "false"));
177
178 this->gguf_data_ = std::make_unique<GGUFData>(load_gguf_meta(this->model_path_, force_mmap_for_gguf_load));
179
181 this->config_ = config_from_gguf;
183 this->config_.is_gguf_file_loaded = true;
184 if (cli_gpu_layer_request < 0) {
186 Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI hint < 0 (all GPU). num_cpu_offload_layers set to 0.");
187 } else if (cli_gpu_layer_request == 0) {
189 Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI hint == 0 (all CPU). num_cpu_offload_layers set to num_hidden_layers (" + std::to_string(this->config_.num_cpu_offload_layers) + ").");
190 } else { // CLI hint > 0, meaning cli_gpu_layer_request is the number of desired GPU layers
191 if (this->config_.num_hidden_layers > 0) {
192 if (cli_gpu_layer_request >= this->config_.num_hidden_layers) {
193 this->config_.num_cpu_offload_layers = 0; // More GPU layers requested than available -> all on GPU
194 Logger::info("TinyLlamaModel GGUF Ctor CALC: CLI GPU layer request ("+ std::to_string(cli_gpu_layer_request) +") >= total layers. num_cpu_offload_layers set to 0.");
195 } else {
197 Logger::info("TinyLlamaModel GGUF Ctor CALC: Partial GPU. CLI GPU req: " + std::to_string(cli_gpu_layer_request) + ". num_cpu_offload_layers set to " + std::to_string(this->config_.num_cpu_offload_layers));
198 }
199 } else { // num_hidden_layers is 0 or negative, something is wrong with GGUF. Default to all CPU.
201 Logger::warning("TinyLlamaModel GGUF Ctor CALC: num_hidden_layers from GGUF is <= 0. Defaulting num_cpu_offload_layers to 0. CLI GPU req: " + std::to_string(cli_gpu_layer_request));
202 }
203 }
204 Logger::info("TinyLlamaModel GGUF Ctor: POST-CALC (within GGUF block) final num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
205 Logger::info("[CTOR_GGUF_DEBUG_L1860] After CLI hint logic: this->config_.num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers) +
206 ", this->config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
207 } catch (const std::exception& e) {
208 Logger::error("Failed to load or parse GGUF file: " + std::string(e.what()));
209 throw;
210 }
211 } else if (model_path.size() > 12 &&
212 model_path.substr(model_path.size() - 12) == ".safetensors") {
213 Logger::info("SafeTensors file detected: " + model_path);
216
217 // For SafeTensors, start with JSON config, then layer CLI preferences.
219 Logger::info("Successfully loaded and parsed config.json for SafeTensors model.");
220 this->config_ = config_from_json; // Base is from JSON
221 } else {
222 Logger::warning("Failed to load config.json or it was not found for SafeTensors model. Proceeding with initial_config defaults and CLI overrides.");
223 }
224 this->config_.is_gguf_file_loaded = false;
225 this->config_.use_mmap_for_gguf = cli_mmap_preference; // This field is GGUF specific, but store CLI pref anyway.
226
227 if (cli_gpu_layer_request < 0) {
229 } else if (cli_gpu_layer_request == 0) {
231 } else {
232 if (this->config_.num_hidden_layers > 0) {
233 if (cli_gpu_layer_request >= this->config_.num_hidden_layers) {
235 } else {
237 }
238 } else {
239 this->config_.num_cpu_offload_layers = 0; // Fallback if num_hidden_layers not known
240 Logger::warning("SafeTensors path: num_hidden_layers is 0 from JSON/default. Defaulting num_cpu_offload_layers to 0 despite CLI GPU request: " + std::to_string(cli_gpu_layer_request));
241 }
242 }
243 Logger::info("SafeTensors path: Calculated num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
244
245 try {
246 loader = std::make_unique<SafeTensorsLoader>(model_path);
247 Logger::info("SafeTensorsLoader initialized for: " + model_path);
248 } catch (const std::exception& e) {
249 Logger::error("Failed to initialize SafeTensorsLoader: " + std::string(e.what()));
250 throw;
251 }
252 } else {
253 throw std::runtime_error(
254 "Unsupported model file type. Please use .gguf or .safetensors");
255 }
256
257 Logger::info("TinyLlamaModel constructor: After specific loader block. Current config_.num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers) +
258 ", config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
259 Logger::info("TinyLlamaModel constructor: Current config_.use_mmap_for_gguf = " + std::string(this->config_.use_mmap_for_gguf ? "true" : "false"));
260
261 if (this->config_.num_cpu_offload_layers < 0) { // Should not happen if logic above is correct for -1 CLI hint
263 Logger::warning("Clamping num_cpu_offload_layers: was < 0, set to 0.");
264 }
265 if (this->config_.num_hidden_layers > 0 && this->config_.num_cpu_offload_layers > this->config_.num_hidden_layers) {
266 Logger::warning("Clamping num_cpu_offload_layers: Requested CPU offload layers (" + std::to_string(this->config_.num_cpu_offload_layers) +
267 ") exceeds total hidden layers (" + std::to_string(this->config_.num_hidden_layers) +
268 "). Clamping to " + std::to_string(this->config_.num_hidden_layers) + " (all CPU).");
270 }
271 Logger::info("TinyLlamaModel constructor: Final clamped num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
272 Logger::info("[CTOR_DEBUG_L1921] End of Model Ctor (before initialize_weights/rope call): this->config_.num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers) +
273 ", this->config_.num_hidden_layers = " + std::to_string(this->config_.num_hidden_layers));
274 Logger::info("Final ModelConfig (before initialize_weights/rope):");
275 Logger::info(" hidden_size: " + std::to_string(config_.hidden_size));
276 Logger::info(" intermediate_size: " + std::to_string(config_.intermediate_size));
277 Logger::info(" num_attention_heads: " + std::to_string(config_.num_attention_heads));
278 Logger::info(" num_key_value_heads: " + std::to_string(config_.num_key_value_heads));
279 Logger::info(" num_hidden_layers: " + std::to_string(config_.num_hidden_layers));
280 Logger::info(" vocab_size: " + std::to_string(config_.vocab_size));
281 Logger::info(" max_position_embeddings: " + std::to_string(config_.max_position_embeddings));
282 Logger::info(" architecture: " + config_.architecture);
283 Logger::info(" is_gguf_file_loaded: " + std::string(config_.is_gguf_file_loaded ? "true" : "false"));
284 Logger::info(" use_mmap_for_gguf: " + std::string(config_.use_mmap_for_gguf ? "true" : "false"));
285 // --- BEGIN GGUFData Integrity Check ---
286 if (this->config_.is_gguf_file_loaded && this->gguf_data_) {
287 if (this->gguf_data_->tensor_infos_map.empty()) {
288 Logger::error("[CTOR_GGUF_PRE_INIT_W] CRITICAL: gguf_data_->tensor_infos_map is EMPTY. Weights will not be loaded by map_gguf_weights.");
289 }
290 } else if (this->config_.is_gguf_file_loaded && !this->gguf_data_) {
291 Logger::error("[CTOR_GGUF_PRE_INIT_W] CRITICAL: config_.is_gguf_file_loaded is TRUE, but gguf_data_ pointer IS NULL. Weights cannot be loaded.");
292 } else if (!this->config_.is_gguf_file_loaded) {
293 Logger::info("[CTOR_GGUF_PRE_INIT_W] Not a GGUF file load context (e.g., SafeTensors). Skipping gguf_data_ check here.");
294 }
295 // --- END GGUFData Integrity Check ---
296 initialize_weights(loader.get(), this->gguf_data_.get());
298
299 Logger::info("TinyLlamaModel (from path string) constructed and initialized successfully.");
300}
static void warning(const std::string &message)
Definition logger.cpp:139
static void error(const std::string &message)
Definition logger.cpp:143
static bool load_model_config_from_json(const std::string &model_path_or_dir, ModelConfig &config_to_populate)
Loads model configuration from a JSON file corresponding to a .safetensors model path.
std::string model_path_
Definition model.h:557
std::unique_ptr< GGUFData > gguf_data_
Definition model.h:556
GGUFData load_gguf_meta(const std::string &filename, bool use_mmap)
Loads GGUF metadata and optionally memory-maps tensor data.
ModelConfig parse_model_config_from_gguf(const GGUFData &gguf)
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
int hidden_size
Definition model.h:81
int vocab_size
Definition model.h:86
std::string architecture
Definition model.h:96
int num_attention_heads
Definition model.h:83
bool use_mmap_for_gguf
Definition model.h:102
int intermediate_size
Definition model.h:82
int num_cpu_offload_layers
Definition model.h:104
int num_hidden_layers
Definition model.h:85
int num_key_value_heads
Definition model.h:84
int max_position_embeddings
Definition model.h:87

References ModelConfig::architecture, config_, Logger::error(), gguf_data_, ModelConfig::hidden_size, Logger::info(), initialize_gpu_and_rope(), initialize_weights(), ModelConfig::intermediate_size, ModelConfig::is_gguf_file_loaded, load_gguf_meta(), SafeTensorsLoader::load_model_config_from_json(), ModelConfig::max_position_embeddings, model_path_, ModelConfig::num_attention_heads, ModelConfig::num_cpu_offload_layers, ModelConfig::num_hidden_layers, ModelConfig::num_key_value_heads, parse_model_config_from_gguf(), ModelConfig::use_mmap_for_gguf, ModelConfig::vocab_size, and Logger::warning().
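
This constructor treats the incoming num_cpu_offload_layers value as a command-line hint for the number of GPU layers and rewrites it as a CPU-offload count. A standalone sketch of that mapping follows; the partial-offload formula is inferred from the log messages above and is an assumption.

// Hedged reconstruction of the layer-split rule applied in this constructor.
int compute_cpu_offload_layers(int cli_gpu_layer_request, int num_hidden_layers) {
    if (cli_gpu_layer_request < 0)  return 0;                  // < 0: run all layers on GPU
    if (cli_gpu_layer_request == 0) return num_hidden_layers;  // 0: run all layers on CPU
    if (num_hidden_layers <= 0)     return 0;                  // bad/missing metadata: fall back
    if (cli_gpu_layer_request >= num_hidden_layers) return 0;  // asked for more GPU layers than exist
    return num_hidden_layers - cli_gpu_layer_request;          // partial offload (inferred formula)
}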

◆ TinyLlamaModel() [3/3]

TinyLlamaModel::TinyLlamaModel ( const ModelConfig &  config_from_session,
std::unique_ptr< GGUFData >  gguf_data_from_session 
)

Construct a TinyLlamaModel from pre-loaded GGUFData.

Parameters
config_from_session: Model configuration.
gguf_data_from_session: Unique pointer to GGUFData.

Definition at line 301 of file model.cpp.

305 model_path_("loaded_from_gguf_data_memory")
306#ifdef HAS_CUDA
307 // Initialize all CUDA pointers to nullptr as in the other constructor
308 , cublas_handle_(nullptr), token_embedding_table_dev_(nullptr), lm_head_dev_(nullptr), final_norm_dev(nullptr), w_q_dev_(nullptr), w_k_dev_(nullptr), w_v_dev_(nullptr), w_o_dev_(nullptr), w_gate_dev_(nullptr), w_up_dev_(nullptr), w_down_dev_(nullptr), all_freqs_cis_dev(nullptr), x_dev_(nullptr), x_norm_dev_(nullptr), x_resid1_dev_(nullptr), x_resid2_dev_(nullptr), q_dev_(nullptr), k_dev_(nullptr), v_dev_(nullptr), attn_out_dev_(nullptr), attn_proj_dev_(nullptr), gate_vec_dev_(nullptr), up_vec_dev_(nullptr), swiglu_vec_dev_(nullptr), mlp_down_dev_(nullptr), logits_dev_(nullptr), token_embedding_table_f32_dev_(nullptr), lm_head_f32_dev_(nullptr), w_q_f32_dev_(nullptr), w_k_f32_dev_(nullptr), w_v_f32_dev_(nullptr), w_o_f32_dev_(nullptr), w_gate_f32_dev_(nullptr), w_up_f32_dev_(nullptr), w_down_f32_dev_(nullptr)
309#endif
310{
311 Logger::info("TinyLlamaModel constructor entered (with pre-loaded GGUFData). Model path placeholder: " + model_path_);
312 this->config_.is_gguf_file_loaded = true; // Ensure this is set
313
314 if (this->config_.num_cpu_offload_layers < 0) {
316 }
317 if (this->config_.num_hidden_layers > 0 && this->config_.num_cpu_offload_layers > this->config_.num_hidden_layers) {
318 Logger::warning("Requested CPU offload layers (" + std::to_string(this->config_.num_cpu_offload_layers) +
319 ") exceeds total hidden layers (" + std::to_string(this->config_.num_hidden_layers) +
320 "). Clamping to " + std::to_string(this->config_.num_hidden_layers) + " layers on CPU (all CPU).");
322 }
323 Logger::info("TinyLlamaModel (pre-loaded GGUF): Final clamped num_cpu_offload_layers = " + std::to_string(this->config_.num_cpu_offload_layers));
324
325 initialize_weights(nullptr, gguf_data_.get()); // Pass raw GGUFData pointer
327 Logger::info("TinyLlamaModel (with pre-loaded GGUFData) constructed and initialized successfully.");
328}

References config_, gguf_data_, Logger::info(), initialize_gpu_and_rope(), initialize_weights(), ModelConfig::is_gguf_file_loaded, model_path_, ModelConfig::num_cpu_offload_layers, ModelConfig::num_hidden_layers, and Logger::warning().

◆ ~TinyLlamaModel()

TinyLlamaModel::~TinyLlamaModel ( )

Destructor. Cleans up all allocated resources.

Definition at line 330 of file model.cpp.

330 {
331#ifdef HAS_CUDA
332 // Only perform GPU cleanup if GPU layers were actually used
334 if (active_num_gpu_layers > 0) {
335 Logger::info("Freeing TinyLlamaModel CUDA resources...");
336 if (cublas_handle_) {
339 Logger::error("cuBLAS handle destruction failed with error code: " +
340 std::to_string(cublas_status));
341 }
342 cublas_handle_ = nullptr;
343 Logger::info("cuBLAS handle destroyed.");
344 }
345 } else {
346 // CPU-only mode: just clean up cuBLAS handle if it exists
347 if (cublas_handle_) {
350 Logger::error("cuBLAS handle destruction failed with error code: " +
351 std::to_string(cublas_status));
352 }
353 cublas_handle_ = nullptr;
354 }
355 }
356 // Continue GPU cleanup only if GPU layers were active
357 if (active_num_gpu_layers > 0) {
358 if (final_norm_dev) {
360 final_norm_dev = nullptr;
361 }
362
363 for (auto& layer : layers) {
364 if (layer.input_layernorm_dev) {
365 gpuErrchk(cudaFree(layer.input_layernorm_dev));
366 layer.input_layernorm_dev = nullptr;
367 }
368 if (layer.post_attention_layernorm_dev) {
369 gpuErrchk(cudaFree(layer.post_attention_layernorm_dev));
370 layer.post_attention_layernorm_dev = nullptr;
371 }
372 }
373
374 if (all_freqs_cis_dev) {
376 all_freqs_cis_dev = nullptr;
377 }
381 }
382 if (lm_head_dev_) {
384 lm_head_dev_ = nullptr;
385 }
386 if (w_q_dev_) {
388 w_q_dev_ = nullptr;
389 }
390 if (w_k_dev_) {
392 w_k_dev_ = nullptr;
393 }
394 if (w_v_dev_) {
396 w_v_dev_ = nullptr;
397 }
398 if (w_o_dev_) {
400 w_o_dev_ = nullptr;
401 }
402 if (w_gate_dev_) {
404 w_gate_dev_ = nullptr;
405 }
406 if (w_up_dev_) {
408 w_up_dev_ = nullptr;
409 }
410 if (w_down_dev_) {
412 w_down_dev_ = nullptr;
413 }
417 }
418 if (lm_head_f32_dev_) {
420 lm_head_f32_dev_ = nullptr;
421 }
422 if (w_q_f32_dev_) {
424 w_q_f32_dev_ = nullptr;
425 }
426 if (w_k_f32_dev_) {
428 w_k_f32_dev_ = nullptr;
429 }
430 if (w_v_f32_dev_) {
432 w_v_f32_dev_ = nullptr;
433 }
434 if (w_o_f32_dev_) {
436 w_o_f32_dev_ = nullptr;
437 }
438 if (w_gate_f32_dev_) {
440 w_gate_f32_dev_ = nullptr;
441 }
442 if (w_up_f32_dev_) {
444 w_up_f32_dev_ = nullptr;
445 }
446 if (w_down_f32_dev_) {
448 w_down_f32_dev_ = nullptr;
449 }
450
451 if (x_dev_) {
453 x_dev_ = nullptr;
454 }
455 if (x_norm_dev_) {
457 x_norm_dev_ = nullptr;
458 }
459 if (x_resid1_dev_) {
461 x_resid1_dev_ = nullptr;
462 }
463 if (x_resid2_dev_) {
465 x_resid2_dev_ = nullptr;
466 }
467 if (q_dev_) {
469 q_dev_ = nullptr;
470 }
471 if (k_dev_) {
473 k_dev_ = nullptr;
474 }
475 if (v_dev_) {
477 v_dev_ = nullptr;
478 }
479 if (attn_out_dev_) {
481 attn_out_dev_ = nullptr;
482 }
483 if (attn_proj_dev_) {
485 attn_proj_dev_ = nullptr;
486 }
487 if (gate_vec_dev_) {
489 gate_vec_dev_ = nullptr;
490 }
491 if (up_vec_dev_) {
493 up_vec_dev_ = nullptr;
494 }
495 if (swiglu_vec_dev_) {
497 swiglu_vec_dev_ = nullptr;
498 }
499 if (mlp_down_dev_) {
501 mlp_down_dev_ = nullptr;
502 }
503 if (logits_dev_) {
505 logits_dev_ = nullptr;
506 }
507 // Free KVCache dequantization buffers
511 }
515 }
516 // Free selective KVCache dequantization buffers
520 }
524 }
525
526 // Free persistent batch processing buffers
528
529 Logger::info("Freed persistent GPU workspace buffers.");
530 Logger::info("Finished freeing TinyLlamaModel CUDA weight memory.");
531 } else {
532 Logger::info("CPU-only mode: No GPU resources to free.");
533 }
534#endif
535}
std::vector< LayerWeights > layers
Definition model.h:491

References config_, Logger::error(), Logger::info(), layers, ModelConfig::num_cpu_offload_layers, and ModelConfig::num_hidden_layers.

Member Function Documentation

◆ clear_layer_dequantized_weights()

void TinyLlamaModel::clear_layer_dequantized_weights ( int  layer_idx)

Definition at line 62 of file weight_management.cpp.

62 {
63 if (layer_idx < 0 || layer_idx >= static_cast<int>(layers.size())) {
64 Logger::warning("clear_layer_dequantized_weights: Invalid layer index " + std::to_string(layer_idx));
65 return;
66 }
67
68 Logger::info("Clearing dequantized weights for layer " + std::to_string(layer_idx) + " to save memory.");
69
70 auto& layer = layers[layer_idx];
71 layer.q_proj_f32.clear();
72 layer.q_proj_f32.shrink_to_fit();
73 layer.k_proj_f32.clear();
74 layer.k_proj_f32.shrink_to_fit();
75 layer.v_proj_f32.clear();
76 layer.v_proj_f32.shrink_to_fit();
77 layer.o_proj_f32.clear();
78 layer.o_proj_f32.shrink_to_fit();
79 layer.gate_proj_f32.clear();
80 layer.gate_proj_f32.shrink_to_fit();
81 layer.up_proj_f32.clear();
82 layer.up_proj_f32.shrink_to_fit();
83 layer.down_proj_f32.clear();
84 layer.down_proj_f32.shrink_to_fit();
85}

References forward(), Logger::info(), layers, and Logger::warning().

Referenced by forward().
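
forward() pairs this method with config_.enable_memory_efficient_layers to keep only a small window of layers dequantized at once. The sketch below mirrors the layer_to_clear = l - 2 pattern visible in the forward() listing; it is illustrative, not the shipped code.

#include "model.h"  // TinyLlamaModel

void run_cpu_layers_with_cleanup(TinyLlamaModel& model, int num_cpu_layers,
                                 bool memory_efficient) {
    for (int l = 0; l < num_cpu_layers; ++l) {
        // ... per-layer attention and MLP work happens here ...
        if (memory_efficient) {
            int layer_to_clear = l - 2;   // keep roughly two layers' f32 weights resident
            if (layer_to_clear >= 0) {
                model.clear_layer_dequantized_weights(layer_to_clear);
            }
        }
    }
}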

◆ ensure_bf16_concatenated_weights_loaded()

void TinyLlamaModel::ensure_bf16_concatenated_weights_loaded ( )

Definition at line 943 of file weight_management.cpp.

943 {
944 Logger::info("CPU-only build: ensure_bf16_concatenated_weights_loaded is a no-op");
945}

References Logger::info().

Referenced by smart_gemm_batch_cuda().

◆ ensure_down_proj_dequantized()

void TinyLlamaModel::ensure_down_proj_dequantized ( int  layer_idx)

Definition at line 164 of file weight_management.cpp.

164 {
165 if (layer_idx < 0 || layer_idx >= layers.size()) return;
166 auto& lw = layers[layer_idx];
167 if (!lw.down_proj_f32.empty()) return;
168
169 int hs = config_.hidden_size;
171 size_t down_proj_elements = static_cast<size_t>(hs) * is;
172
173 if (!lw.down_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.down_proj_q6k, lw.down_proj_f32, down_proj_elements, 0);
174 else if (!lw.down_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.down_proj_q4k, lw.down_proj_f32, down_proj_elements, 0);
175 else if (!lw.down_proj_q8k.empty()) dequantize_q8_k(lw.down_proj_q8k, lw.down_proj_f32, down_proj_elements, false);
176 else if (!lw.down_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.down_proj_q8_0, lw.down_proj_f32, down_proj_elements, 0);
177 else if (!lw.down_proj.empty()) lw.down_proj_f32 = bf16vec_to_float_vec(lw.down_proj);
178}
void dequantize_vector_q6k_to_f32(const std::vector< block_q6_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q6_K blocks to a vector of float32.
void dequantize_vector_q8_0_to_f32(const std::vector< block_q8_0 > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q8_0 blocks to a vector of float32.
void dequantize_vector_q4k_to_f32(const std::vector< block_q4_K > &q_weights, std::vector< float > &f32_weights, size_t total_num_elements, int log_first_n_blocks)
Dequantizes a vector of Q4_K blocks to a vector of float32.
void dequantize_q8_k(const std::vector< block_q8_K > &q_data, std::vector< float > &x, int n, bool log_this_block)
std::vector< float > bf16vec_to_float_vec(const std::vector< uint16_t > &v_bf16)
Definition utils.cpp:198

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, ModelConfig::intermediate_size, and layers.

Referenced by forward(), CPUBatchProcessor::forward_cpu_batch(), and forward_cpu_batch_generation().

◆ ensure_embed_tokens_dequantized()

void TinyLlamaModel::ensure_embed_tokens_dequantized ( )

Definition at line 10 of file weight_management.cpp.

10 {
11 if (!this->embed_tokens_f32.empty()) return;
12
13 size_t total_elements_embed = static_cast<size_t>(config_.vocab_size) * config_.hidden_size;
14 if (!this->embed_tokens_q6k.empty()) {
16 } else if (!this->embed_tokens_q4k.empty()) {
18 } else if (!this->embed_tokens_q8k.empty()) {
20 } else if (!this->embed_tokens_q8_0.empty()) {
22 } else if (!this->embed_tokens.empty()) {
24 }
25}
std::vector< block_q6_K > embed_tokens_q6k
Definition model.h:488
std::vector< block_q8_0 > embed_tokens_q8_0
Definition model.h:489
std::vector< block_q4_K > embed_tokens_q4k
Definition model.h:487
std::vector< uint16_t > embed_tokens
Definition model.h:483
std::vector< block_q8_K > embed_tokens_q8k
Definition model.h:490
std::vector< float > embed_tokens_f32
Definition model.h:486

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), embed_tokens, embed_tokens_f32, embed_tokens_q4k, embed_tokens_q6k, embed_tokens_q8_0, embed_tokens_q8k, forward(), ModelConfig::hidden_size, and ModelConfig::vocab_size.

Referenced by initialize_gpu_and_rope().

◆ ensure_f32_concatenated_weights_loaded()

void TinyLlamaModel::ensure_f32_concatenated_weights_loaded ( )

Definition at line 939 of file weight_management.cpp.

939 {
940 Logger::info("CPU-only build: ensure_f32_concatenated_weights_loaded is a no-op");
941}

References Logger::info().

◆ ensure_gate_proj_dequantized()

void TinyLlamaModel::ensure_gate_proj_dequantized ( int  layer_idx)

Definition at line 132 of file weight_management.cpp.

132 {
133 if (layer_idx < 0 || layer_idx >= layers.size()) return;
134 auto& lw = layers[layer_idx];
135 if (!lw.gate_proj_f32.empty()) return;
136
137 int hs = config_.hidden_size;
139 size_t gate_proj_elements = static_cast<size_t>(is) * hs;
140
141 if (!lw.gate_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.gate_proj_q6k, lw.gate_proj_f32, gate_proj_elements, 0);
142 else if (!lw.gate_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.gate_proj_q4k, lw.gate_proj_f32, gate_proj_elements, 0);
143 else if (!lw.gate_proj_q8k.empty()) dequantize_q8_k(lw.gate_proj_q8k, lw.gate_proj_f32, gate_proj_elements, false);
144 else if (!lw.gate_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.gate_proj_q8_0, lw.gate_proj_f32, gate_proj_elements, 0);
145 else if (!lw.gate_proj.empty()) lw.gate_proj_f32 = bf16vec_to_float_vec(lw.gate_proj);
146}

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, ModelConfig::intermediate_size, and layers.

Referenced by forward(), CPUBatchProcessor::forward_cpu_batch(), and forward_cpu_batch_generation().

◆ ensure_k_proj_dequantized()

void TinyLlamaModel::ensure_k_proj_dequantized ( int  layer_idx)

Definition at line 87 of file weight_management.cpp.

87 {
88 if (layer_idx < 0 || layer_idx >= layers.size()) return;
89 auto& lw = layers[layer_idx];
90 if (!lw.k_proj_f32.empty()) return;
91
93 size_t k_proj_elements = static_cast<size_t>(config_.num_key_value_heads * (hs / config_.num_attention_heads)) * hs;
94
95 if (!lw.k_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.k_proj_q6k, lw.k_proj_f32, k_proj_elements, 0);
96 else if (!lw.k_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.k_proj_q4k, lw.k_proj_f32, k_proj_elements, 0);
97 else if (!lw.k_proj_q8k.empty()) dequantize_q8_k(lw.k_proj_q8k, lw.k_proj_f32, k_proj_elements, false);
98 else if (!lw.k_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.k_proj_q8_0, lw.k_proj_f32, k_proj_elements, 0);
99 else if (!lw.k_proj.empty()) lw.k_proj_f32 = bf16vec_to_float_vec(lw.k_proj);
100}

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, layers, ModelConfig::num_attention_heads, and ModelConfig::num_key_value_heads.

Referenced by forward(), CPUBatchProcessor::forward_cpu_batch(), and forward_cpu_batch_generation().

◆ ensure_layer_weights_on_gpu()

void TinyLlamaModel::ensure_layer_weights_on_gpu ( int  layer_idx)

◆ ensure_lm_head_dequantized()

void TinyLlamaModel::ensure_lm_head_dequantized ( )

Definition at line 27 of file weight_management.cpp.

27 {
28 if (!this->lm_head_f32.empty()) return;
29
30 size_t total_elements_lm_head = static_cast<size_t>(config_.vocab_size) * config_.hidden_size;
31 if (!this->lm_head_q6k.empty()) {
33 } else if (!this->lm_head_q4k.empty()) {
35 } else if (!this->lm_head_q8k.empty()) {
37 } else if (!this->lm_head_q8_0.empty()) {
39 } else if (!this->lm_head.empty()) {
41 }
42}
std::vector< block_q4_K > lm_head_q4k
Definition model.h:487
std::vector< block_q6_K > lm_head_q6k
Definition model.h:488
std::vector< block_q8_0 > lm_head_q8_0
Definition model.h:489
std::vector< uint16_t > lm_head
Definition model.h:484
std::vector< float > lm_head_f32
Definition model.h:486
std::vector< block_q8_K > lm_head_q8k
Definition model.h:490

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, lm_head, lm_head_f32, lm_head_q4k, lm_head_q6k, lm_head_q8_0, lm_head_q8k, and ModelConfig::vocab_size.

Referenced by forward(), and initialize_gpu_and_rope().

◆ ensure_o_proj_dequantized()

void TinyLlamaModel::ensure_o_proj_dequantized ( int  layer_idx)

Definition at line 117 of file weight_management.cpp.

117 {
118 if (layer_idx < 0 || layer_idx >= layers.size()) return;
119 auto& lw = layers[layer_idx];
120 if (!lw.o_proj_f32.empty()) return;
121
122 int hs = config_.hidden_size;
123 size_t o_proj_elements = static_cast<size_t>(hs) * hs;
124
125 if (!lw.o_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.o_proj_q6k, lw.o_proj_f32, o_proj_elements, 0);
126 else if (!lw.o_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.o_proj_q4k, lw.o_proj_f32, o_proj_elements, 0);
127 else if (!lw.o_proj_q8k.empty()) dequantize_q8_k(lw.o_proj_q8k, lw.o_proj_f32, o_proj_elements, false);
128 else if (!lw.o_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.o_proj_q8_0, lw.o_proj_f32, o_proj_elements, 0);
129 else if (!lw.o_proj.empty()) lw.o_proj_f32 = bf16vec_to_float_vec(lw.o_proj);
130}

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, and layers.

Referenced by forward(), CPUBatchProcessor::forward_cpu_batch(), and forward_cpu_batch_generation().

◆ ensure_q_proj_dequantized()

void TinyLlamaModel::ensure_q_proj_dequantized ( int  layer_idx)

Definition at line 44 of file weight_management.cpp.

44 {
45 if (layer_idx < 0 || layer_idx >= layers.size()) return;
46 auto& lw = layers[layer_idx];
47 if (!lw.q_proj_f32.empty()) return;
48
50 size_t q_proj_elements = static_cast<size_t>(hs) * hs;
51 Logger::info("[DEQUANT_MEM] Layer " + std::to_string(layer_idx) + ": Q-proj dequantization starting, elements=" + std::to_string(q_proj_elements));
52
53 if (!lw.q_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.q_proj_q6k, lw.q_proj_f32, q_proj_elements, 0);
54 else if (!lw.q_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.q_proj_q4k, lw.q_proj_f32, q_proj_elements, 0);
55 else if (!lw.q_proj_q8k.empty()) dequantize_q8_k(lw.q_proj_q8k, lw.q_proj_f32, q_proj_elements, true);
56 else if (!lw.q_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.q_proj_q8_0, lw.q_proj_f32, q_proj_elements, 0);
57 else if (!lw.q_proj.empty()) lw.q_proj_f32 = bf16vec_to_float_vec(lw.q_proj);
58
59 Logger::info("[DEQUANT_MEM] Layer " + std::to_string(layer_idx) + ": Q-proj dequantization completed, f32 size=" + std::to_string(lw.q_proj_f32.size()));
60}

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, Logger::info(), and layers.

Referenced by forward(), CPUBatchProcessor::forward_cpu_batch(), and forward_cpu_batch_generation().

◆ ensure_up_proj_dequantized()

void TinyLlamaModel::ensure_up_proj_dequantized ( int  layer_idx)

Definition at line 148 of file weight_management.cpp.

148 {
149 if (layer_idx < 0 || layer_idx >= layers.size()) return;
150 auto& lw = layers[layer_idx];
151 if (!lw.up_proj_f32.empty()) return;
152
153 int hs = config_.hidden_size;
155 size_t up_proj_elements = static_cast<size_t>(is) * hs;
156
157 if (!lw.up_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.up_proj_q6k, lw.up_proj_f32, up_proj_elements, 0);
158 else if (!lw.up_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.up_proj_q4k, lw.up_proj_f32, up_proj_elements, 0);
159 else if (!lw.up_proj_q8k.empty()) dequantize_q8_k(lw.up_proj_q8k, lw.up_proj_f32, up_proj_elements, false);
160 else if (!lw.up_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.up_proj_q8_0, lw.up_proj_f32, up_proj_elements, 0);
161 else if (!lw.up_proj.empty()) lw.up_proj_f32 = bf16vec_to_float_vec(lw.up_proj);
162}

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, ModelConfig::intermediate_size, and layers.

Referenced by forward(), CPUBatchProcessor::forward_cpu_batch(), and forward_cpu_batch_generation().

◆ ensure_v_proj_dequantized()

void TinyLlamaModel::ensure_v_proj_dequantized ( int  layer_idx)

Definition at line 102 of file weight_management.cpp.

102 {
103 if (layer_idx < 0 || layer_idx >= layers.size()) return;
104 auto& lw = layers[layer_idx];
105 if (!lw.v_proj_f32.empty()) return;
106
107 int hs = config_.hidden_size;
108 size_t v_proj_elements = static_cast<size_t>(config_.num_key_value_heads * (hs / config_.num_attention_heads)) * hs;
109
110 if (!lw.v_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.v_proj_q6k, lw.v_proj_f32, v_proj_elements, 0);
111 else if (!lw.v_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.v_proj_q4k, lw.v_proj_f32, v_proj_elements, 0);
112 else if (!lw.v_proj_q8k.empty()) dequantize_q8_k(lw.v_proj_q8k, lw.v_proj_f32, v_proj_elements, false);
113 else if (!lw.v_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.v_proj_q8_0, lw.v_proj_f32, v_proj_elements, 0);
114 else if (!lw.v_proj.empty()) lw.v_proj_f32 = bf16vec_to_float_vec(lw.v_proj);
115}

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), forward(), ModelConfig::hidden_size, layers, ModelConfig::num_attention_heads, and ModelConfig::num_key_value_heads.

Referenced by forward(), CPUBatchProcessor::forward_cpu_batch(), and forward_cpu_batch_generation().
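
The element counts used by the ensure_*_dequantized methods above follow directly from the config: Q/O projections are [hs x hs], K/V projections are [kv_dim x hs] with kv_dim = num_key_value_heads * (hs / num_attention_heads), gate/up are [intermediate_size x hs], and down is [hs x intermediate_size]. The worked example below uses the common TinyLlama-1.1B shape (hs=2048, is=5632, 32 heads, 4 KV heads) purely as assumed sample values.

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t hs = 2048, is = 5632, n_heads = 32, n_kv_heads = 4;  // example shape only
    const std::size_t kv_dim = n_kv_heads * (hs / n_heads);                // 4 * 64 = 256

    std::printf("q/o proj elements:     %zu\n", hs * hs);      // 4,194,304
    std::printf("k/v proj elements:     %zu\n", kv_dim * hs);  // 524,288
    std::printf("gate/up proj elements: %zu\n", is * hs);      // 11,534,336
    std::printf("down proj elements:    %zu\n", hs * is);      // 11,534,336
    return 0;
}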

◆ forward()

std::vector< float > TinyLlamaModel::forward ( std::vector< float > &  input,
int  n_tokens,
KVCache *  kv_cache,
const std::vector< int > *  attention_mask 
)

Run the forward pass for the model on CPU layers.

Parameters
input: Input vector (modified in-place).
n_tokens: Current token position.
kv_cache: Pointer to the key-value cache.
attention_mask: Optional attention mask.
Returns
Output logits or intermediate activations.

Definition at line 536 of file model.cpp.

539 {
540 Logger::info("[CPU_FWD] Entered. Processing up to layer " + std::to_string(config_.num_cpu_offload_layers -1) + ". Input n_tokens: " + std::to_string(n_tokens));
541
542 int hs = config_.hidden_size;
543 int vs = config_.vocab_size;
547 int head_dim = hs / n_heads;
548 float eps = config_.rms_norm_eps;
550
551 bool log_first_gen_step = (n_tokens == 0);
552 bool log_this_step = log_first_gen_step || (n_tokens == 12) || (n_tokens == 13);
553
554 // Layer processing loop - ONLY for CPU-offloaded layers
555 for (int l = 0; l < config_.num_cpu_offload_layers; ++l) {
556 Logger::info("[CPU_FWD_MEM] Starting layer " + std::to_string(l) + " processing");
557
558 bool log_this_layer = log_this_step && (l == 0); // Log details only for layer 0 on specific steps
559 if (log_this_layer) {
560 Logger::info("[CPU_FWD] ------ START Layer " + std::to_string(l) +
561 " (pos=" + std::to_string(n_tokens) + ") ------");
562 log_vector_summary("Layer " + std::to_string(l) + " Input (input)", input);
563 }
564
565 const auto& lw = layers[l];
566 std::vector<float> x_norm_vec1(hs);
567 const std::vector<float>& w_input_norm_vec =
568 lw.input_layernorm_f32.empty()
569 ? bf16vec_to_float_vec(lw.input_layernorm)
570 : lw.input_layernorm_f32;
572 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": Allocating QKV vectors");
573 std::vector<float> q_vec(hs), k_vec(n_kv_heads * head_dim), v_vec(n_kv_heads * head_dim);
574 bool enable_debug_logging = (l == 0);
575 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_q_proj_dequantized");
577 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_q_proj_dequantized completed");
578 if (!lw.q_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.q_proj_f32, x_norm_vec1, q_vec, hs, hs);
579 else if (!lw.q_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.q_proj_q8k, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
580 else if (!lw.q_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.q_proj_q8_0, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
581 else if (!lw.q_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.q_proj_q4k, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
582 else if (!lw.q_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.q_proj_q6k, x_norm_vec1, q_vec, hs, hs, enable_debug_logging);
583 else if (!lw.q_proj.empty()) matvec_bf16_f32_vector_cpu(lw.q_proj, x_norm_vec1, q_vec, hs, hs); // BF16 from SafeTensors
584 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Q proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
585
586 // ... K, V projections ...
587 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_k_proj_dequantized");
589 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_k_proj_dequantized completed");
590 if (!lw.k_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.k_proj_f32, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs);
595 else if (!lw.k_proj.empty()) matvec_bf16_f32_vector_cpu(lw.k_proj, x_norm_vec1, k_vec, n_kv_heads * head_dim, hs);
596 else throw std::runtime_error("Layer " + std::to_string(l) + ": No K proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
597
598 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_v_proj_dequantized");
600 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_v_proj_dequantized completed");
601 if (!lw.v_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.v_proj_f32, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs);
606 else if (!lw.v_proj.empty()) matvec_bf16_f32_vector_cpu(lw.v_proj, x_norm_vec1, v_vec, n_kv_heads * head_dim, hs);
607 else throw std::runtime_error("Layer " + std::to_string(l) + ": No V proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
608
611 if (kv_cache) {
612 if (static_cast<size_t>(l) < kv_cache->layers.size()) {
613 KVCacheLayer& kv_layer = kv_cache->layers[l];
614 size_t layer_max_seq_len = static_cast<size_t>(kv_cache->max_seq_len_config_);
615 if (static_cast<size_t>(n_tokens) >= layer_max_seq_len && layer_max_seq_len > 0) {
616 Logger::error("KV Cache access out of bounds in CPU forward. Layer " + std::to_string(l) +
617 ", n_tokens: " + std::to_string(n_tokens) +
618 ", configured layer_max_seq_len: " + std::to_string(layer_max_seq_len) + ". Skipping KV update.");
619 } else if (layer_max_seq_len == 0 && n_tokens > 0) {
620 Logger::error("KV Cache layer_max_seq_len is 0, but n_tokens > 0. Layer " + std::to_string(l) + ". Skipping KV update.");
621 } else {
622 for(int h=0; h < n_kv_heads; ++h) {
623 std::copy(k_vec.begin() + h * head_dim, k_vec.begin() + (h+1) * head_dim, kv_layer.k.begin() + n_tokens * (n_kv_heads * head_dim) + h * head_dim);
624 std::copy(v_vec.begin() + h * head_dim, v_vec.begin() + (h+1) * head_dim, kv_layer.v.begin() + n_tokens * (n_kv_heads * head_dim) + h * head_dim);
625 }
626 }
627 } else {
628 Logger::error("KV Cache layer index " + std::to_string(l) + " out of bounds for kv_cache->layers.size() = " + std::to_string(kv_cache->layers.size()));
629 }
630 }
631
632 std::vector<float> attn_out_vec(hs);
633 std::vector<float> x_resid1_vec = input; // Store residual
634 float att_scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
635 std::fill(attn_out_vec.begin(), attn_out_vec.end(), 0.0f);
636 for (int h = 0; h < n_heads; ++h) {
637 std::vector<float> q_head(head_dim);
638 std::copy(q_vec.begin() + h * head_dim, q_vec.begin() + (h + 1) * head_dim, q_head.begin());
639 std::vector<float> current_multihead_attn_out(head_dim, 0.0f);
640 int kv_cache_num_kv_heads = n_kv_heads; // from KVCache struct if available
642 int kv_head_idx = h / kv_group;
643
644 if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
645 const KVCacheLayer& kv_layer = kv_cache->layers[l];
646 int current_seq_len = n_tokens + 1;
647 std::vector<float> scores(current_seq_len);
648 for (int t = 0; t < current_seq_len; ++t) {
649 float score = 0.0f;
650 for (int d = 0; d < head_dim; ++d) {
651 score += q_head[d] * kv_layer.k[t * (n_kv_heads * head_dim) + kv_head_idx * head_dim + d];
652 }
653 scores[t] = score * att_scale;
654 }
655 softmax_vector_cpu(scores, scores); // In-place softmax
656 for (int t = 0; t < current_seq_len; ++t) {
657 for (int d = 0; d < head_dim; ++d) {
659 }
660 }
661 }
662 std::copy(current_multihead_attn_out.begin(), current_multihead_attn_out.end(), attn_out_vec.begin() + h * head_dim);
663 }
664
665
666 std::vector<float> attn_proj_vec(hs);
667 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_o_proj_dequantized");
669 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_o_proj_dequantized completed");
670 if(!lw.o_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.o_proj_f32, attn_out_vec, attn_proj_vec, hs, hs);
675 else if(!lw.o_proj.empty()) matvec_bf16_f32_vector_cpu(lw.o_proj, attn_out_vec, attn_proj_vec, hs, hs);
676 else throw std::runtime_error("Layer " + std::to_string(l) + ": No O proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
677
678 for(size_t i=0; i<input.size(); ++i) input[i] = x_resid1_vec[i] + attn_proj_vec[i]; // Update input by reference
679
680 // MLP part
681 std::vector<float> x_norm_vec2(hs);
682 std::vector<float> x_resid2_vec = input; // Store residual for MLP
683 const std::vector<float>& w_post_attn_norm_vec =
684 lw.post_attention_layernorm_f32.empty()
685 ? bf16vec_to_float_vec(lw.post_attention_layernorm)
686 : lw.post_attention_layernorm_f32;
688
689 std::vector<float> gate_vec(is), up_vec(is);
690 // Gate-projection
691 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_gate_proj_dequantized");
693 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_gate_proj_dequantized completed");
694 if(!lw.gate_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.gate_proj_f32, x_norm_vec2, gate_vec, is, hs);
695 else if (!lw.gate_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.gate_proj_q8k, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
696 else if (!lw.gate_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.gate_proj_q8_0, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
697 else if (!lw.gate_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.gate_proj_q4k, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
698 else if (!lw.gate_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.gate_proj_q6k, x_norm_vec2, gate_vec, is, hs, enable_debug_logging);
699 else if(!lw.gate_proj.empty()) matvec_bf16_f32_vector_cpu(lw.gate_proj, x_norm_vec2, gate_vec, is, hs);
700 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Gate proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
701
702 // Up-projection
703 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_up_proj_dequantized");
705 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_up_proj_dequantized completed");
706 if(!lw.up_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.up_proj_f32, x_norm_vec2, up_vec, is, hs);
707 else if (!lw.up_proj_q8k.empty() && config_.is_gguf_file_loaded) matvec_q8k_f32_vector_cpu(lw.up_proj_q8k, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
708 else if (!lw.up_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.up_proj_q8_0, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
709 else if (!lw.up_proj_q4k.empty() && config_.is_gguf_file_loaded) matvec_q4k_f32_vector_cpu(lw.up_proj_q4k, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
710 else if (!lw.up_proj_q6k.empty() && config_.is_gguf_file_loaded) matvec_q6k_f32_vector_cpu(lw.up_proj_q6k, x_norm_vec2, up_vec, is, hs, enable_debug_logging);
711 else if(!lw.up_proj.empty()) matvec_bf16_f32_vector_cpu(lw.up_proj, x_norm_vec2, up_vec, is, hs);
712 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Up proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
713
714 std::vector<float> silu_out_vec(is);
716
717 std::vector<float> swiglu_result_vec(is);
718 for(size_t i=0; i<is; ++i) swiglu_result_vec[i] = silu_out_vec[i] * up_vec[i];
719
720 std::vector<float> mlp_out_vec(hs);
721 // Down-projection
722 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": About to ensure_down_proj_dequantized");
724 Logger::info("[CPU_FWD_MEM] Layer " + std::to_string(l) + ": ensure_down_proj_dequantized completed");
725 if(!lw.down_proj_f32.empty()) matvec_f32_f32_vector_cpu(lw.down_proj_f32, swiglu_result_vec, mlp_out_vec, hs, is);
727 else if (!lw.down_proj_q8_0.empty() && config_.is_gguf_file_loaded) matvec_q8_0_f32_vector_cpu(lw.down_proj_q8_0, swiglu_result_vec, mlp_out_vec, hs, is, enable_debug_logging);
730 else if(!lw.down_proj.empty()) matvec_bf16_f32_vector_cpu(lw.down_proj, swiglu_result_vec, mlp_out_vec, hs, is);
731 else throw std::runtime_error("Layer " + std::to_string(l) + ": No Down proj weights (f32, q8k, q8, q4k, q6k, bf16) for CPU");
732
733 for(size_t i=0; i<input.size(); ++i) input[i] = x_resid2_vec[i] + mlp_out_vec[i]; // Update input by reference
734
735
736 if (log_this_layer) {
737 Logger::info("[CPU_FWD] ------ END Layer " + std::to_string(l) +
738 " (pos=" + std::to_string(n_tokens) + ") ------");
739 }
741 int layer_to_clear = l - 2;
745 }
746 }
747 }
748
750 Logger::info("[CPU_FWD] All layers processed on CPU. Performing final RMSNorm and Logits.");
751 const std::vector<float>& w_final_norm_vec =
754 std::vector<float> x_final_norm_vec(hs);
756
757 std::vector<float> logits(vs);
759 bool enable_lm_head_debug_logging = true; // Always log LM head for debugging
765 else if (!lm_head.empty()) matvec_bf16_f32_vector_cpu(lm_head, x_final_norm_vec, logits, vs, hs); // Fallback for BF16 SafeTensors
766 else throw std::runtime_error("No valid LM Head weights (f32, q8k, q8, q4k, q6k, bf16) found for CPU final stage.");
767
769 log_vector_summary("[CPU_FWD] Final Logits (all CPU, pos=" + std::to_string(n_tokens) + ")", logits, 15);
770 }
771 return logits; // Return final logits if all layers were CPU
772 }
773
774 Logger::info("[CPU_FWD] Finished processing " + std::to_string(config_.num_cpu_offload_layers) + " CPU layers. Output is intermediate activation.");
775 return input; // Return the intermediate activations if not all layers were processed here.
776}
void ensure_up_proj_dequantized(int layer_idx)
std::vector< float > final_norm_f32
Definition model.h:486
std::vector< uint16_t > final_norm
Definition model.h:485
void ensure_v_proj_dequantized(int layer_idx)
std::vector< std::pair< float, float > > precomputed_freqs_cis_
Definition model.h:554
void ensure_o_proj_dequantized(int layer_idx)
void clear_layer_dequantized_weights(int layer_idx)
void ensure_k_proj_dequantized(int layer_idx)
void ensure_q_proj_dequantized(int layer_idx)
void ensure_down_proj_dequantized(int layer_idx)
void ensure_gate_proj_dequantized(int layer_idx)
void ensure_lm_head_dequantized()
void log_vector_summary(const std::string &name, const std::vector< float > &v, int head_count=5)
Definition utils.cpp:207
Key-Value cache for a single transformer layer.
Definition model.h:130
float rms_norm_eps
Definition model.h:88
bool enable_memory_efficient_layers
Definition model.h:107
void apply_rope_vector(std::vector< float > &x, int num_heads, int head_dim, int current_token_pos, const std::vector< std::pair< float, float > > &all_freqs_cis, int max_pos_embeddings, bool use_adjacent_pairing)
Definition utils.cpp:428
void matvec_q8k_f32_vector_cpu(const std::vector< block_q8_K > &mat_q8k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:399
void matvec_q6k_f32_vector_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:763
void matvec_bf16_f32_vector_cpu(const std::vector< uint16_t > &mat_bf16, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
Definition utils.cpp:1025
void matvec_f32_f32_vector_cpu(const std::vector< float > &mat_f32, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols)
Definition utils.cpp:349
void softmax_vector_cpu(const std::vector< float > &x, std::vector< float > &out)
Definition utils.cpp:675
void matvec_q4k_f32_vector_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:816
void silu_cpu(const std::vector< float > &x, std::vector< float > &out)
Definition utils.cpp:700
void rmsnorm_vector_cpu(const std::vector< float > &x, const std::vector< float > &weight, std::vector< float > &out, float eps)
Definition utils.cpp:648
void matvec_q8_0_f32_vector_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &vec_f32, std::vector< float > &out_f32, int rows, int cols, bool log_first_block)
Definition utils.cpp:293

References apply_rope_vector(), bf16vec_to_float_vec(), clear_layer_dequantized_weights(), config_, ModelConfig::enable_memory_efficient_layers, ensure_down_proj_dequantized(), ensure_gate_proj_dequantized(), ensure_k_proj_dequantized(), ensure_lm_head_dequantized(), ensure_o_proj_dequantized(), ensure_q_proj_dequantized(), ensure_up_proj_dequantized(), ensure_v_proj_dequantized(), Logger::error(), final_norm, final_norm_f32, ModelConfig::hidden_size, Logger::info(), ModelConfig::intermediate_size, ModelConfig::is_gguf_file_loaded, KVCacheLayer::k, KVCache::layers, layers, lm_head, lm_head_f32, lm_head_q4k, lm_head_q6k, lm_head_q8_0, lm_head_q8k, log_vector_summary(), matvec_bf16_f32_vector_cpu(), matvec_f32_f32_vector_cpu(), matvec_q4k_f32_vector_cpu(), matvec_q6k_f32_vector_cpu(), matvec_q8_0_f32_vector_cpu(), matvec_q8k_f32_vector_cpu(), ModelConfig::max_position_embeddings, KVCache::max_seq_len_config_, ModelConfig::num_attention_heads, ModelConfig::num_cpu_offload_layers, ModelConfig::num_hidden_layers, ModelConfig::num_key_value_heads, precomputed_freqs_cis_, ModelConfig::rms_norm_eps, rmsnorm_vector_cpu(), silu_cpu(), softmax_vector_cpu(), KVCacheLayer::v, and ModelConfig::vocab_size.

Referenced by clear_layer_dequantized_weights(), ensure_down_proj_dequantized(), ensure_embed_tokens_dequantized(), ensure_gate_proj_dequantized(), ensure_k_proj_dequantized(), ensure_lm_head_dequantized(), ensure_o_proj_dequantized(), ensure_q_proj_dequantized(), ensure_up_proj_dequantized(), and ensure_v_proj_dequantized().
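
The attention loop above uses grouped-query attention: kv_group = n_heads / n_kv_heads query heads share each cached K/V head, and query head h reads KV head h / kv_group. A tiny standalone illustration follows; the head counts are assumed example values, not read from the model.

#include <cstdio>

int main() {
    const int n_heads = 32, n_kv_heads = 4;       // example shape only
    const int kv_group = n_heads / n_kv_heads;    // 8 query heads per KV head
    for (int h = 0; h < n_heads; h += kv_group) {
        std::printf("query heads %2d..%2d -> kv head %d\n",
                    h, h + kv_group - 1, h / kv_group);
    }
    return 0;
}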

◆ forward_cpu_batch()

std::vector< float > TinyLlamaModel::forward_cpu_batch ( const std::vector< float > &  batch_input_activations,
int  num_tokens_in_batch,
int  num_cpu_layers_to_process,
int  start_pos_in_sequence,
KVCache *  kv_cache,
const std::vector< int > &  prompt_lengths = {} 
)

Definition at line 2086 of file model.cpp.

2092 {
2093
2094 if (!cpu_batch_processor_) {
2095 cpu_batch_processor_ = std::make_unique<CPUBatchProcessor>(this);
2096 }
2097
2098 return cpu_batch_processor_->forward_cpu_batch(
2103 kv_cache,
2105 );
2106}
std::unique_ptr< class CPUBatchProcessor > cpu_batch_processor_
Definition model.h:560

References cpu_batch_processor_.
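
A hedged sketch (not from the repository) of one way to drive this entry point for a prompt prefill: embeddings are packed row-major as [num_tokens x hidden_size], matching the size check in forward_cpu_batch_generation. The helper name and the choice to process all CPU-offloaded layers are assumptions.

#include <cstddef>
#include <vector>
#include "model.h"  // TinyLlamaModel, KVCache

std::vector<float> prefill_cpu(TinyLlamaModel& model, const std::vector<int>& prompt,
                               KVCache* kv_cache) {
    const int hs = model.get_config().hidden_size;
    std::vector<float> acts;
    acts.reserve(prompt.size() * static_cast<std::size_t>(hs));
    for (int tok : prompt) {
        std::vector<float> row = model.lookup_embedding(tok);  // one hs-sized row per token
        acts.insert(acts.end(), row.begin(), row.end());
    }
    return model.forward_cpu_batch(acts, static_cast<int>(prompt.size()),
                                   model.get_config().num_cpu_offload_layers,
                                   /*start_pos_in_sequence=*/0, kv_cache);
}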

◆ forward_cpu_batch_generation()

std::vector< std::vector< float > > TinyLlamaModel::forward_cpu_batch_generation ( const std::vector< float > &  batch_input_activations,
const std::vector< int > &  token_positions,
const std::vector< int > &  original_sequence_indices,
int  num_tokens_in_batch,
KVCache *  kv_cache 
)

Definition at line 1127 of file model.cpp.

1132 {
1133
1134 Logger::info("[CPU_BATCH_GEN] Entry: num_tokens=" + std::to_string(num_tokens_in_batch));
1135 std::string pos_str = "token_positions=[";
1136 for (int i = 0; i < std::min(num_tokens_in_batch, 3); ++i) {
1137 pos_str += std::to_string(token_positions[i]) + " ";
1138 }
1139 pos_str += "]";
1140 std::string seq_str = "original_sequence_indices=[";
1141 for (int i = 0; i < std::min(num_tokens_in_batch, 3); ++i) {
1142 seq_str += std::to_string(original_sequence_indices[i]) + " ";
1143 }
1144 seq_str += "]";
1145 Logger::info("[CPU_BATCH_GEN] " + pos_str + ", " + seq_str);
1146 if (batch_input_activations.size() != (size_t)num_tokens_in_batch * config_.hidden_size) {
1147 Logger::error("[CPU_BATCH_GENERATION] batch_input_activations size mismatch. Expected: " +
1148 std::to_string((size_t)num_tokens_in_batch * config_.hidden_size) + " Got: " +
1149 std::to_string(batch_input_activations.size()));
1150 return {};
1151 }
1152
1153 if (token_positions.size() != static_cast<size_t>(num_tokens_in_batch)) {
1154 Logger::error("[CPU_BATCH_GENERATION] token_positions size mismatch. Expected: " +
1155 std::to_string(num_tokens_in_batch) + " Got: " + std::to_string(token_positions.size()));
1156 return {};
1157 }
1158
1159 int hs = config_.hidden_size;
1163 if (n_heads == 0) {
1164 Logger::error("[CPU_BATCH_GENERATION] Error: num_attention_heads is zero.");
1165 return {};
1166 }
1167 int head_dim = hs / n_heads;
1168 float eps = config_.rms_norm_eps;
1171 float attention_scale = 1.0f / SAFE_SQRT(static_cast<float>(head_dim));
1172 int vs = config_.vocab_size;
1173 int kv_group = n_heads / n_kv_heads; // Pre-calculate GQA grouping
1174
1176 for (int l = 0; l < config_.num_cpu_offload_layers; ++l) {
1177 const auto& lw = layers[l];
1178
1179 // Batch RMSNorm for attention
1180 std::vector<float> batch_x_norm1(current_batch_activations.size());
1181 const std::vector<float>& w_input_norm_vec =
1182 lw.input_layernorm_f32.empty()
1183 ? bf16vec_to_float_vec(lw.input_layernorm)
1184 : lw.input_layernorm_f32;
1186
1188
1189 // Batch Q, K, V projections
1190 std::vector<float> q_batch((size_t)num_tokens_in_batch * hs);
1191 std::vector<float> k_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
1192 std::vector<float> v_batch((size_t)num_tokens_in_batch * n_kv_heads * head_dim);
1193
1194 // Q Projection (batched)
1195 if (!lw.q_proj_f32.empty()) {
1197 } else if (!lw.q_proj_q8_0.empty()) {
1199 } else if (!lw.q_proj_q6k.empty()) {
1201 } else if (!lw.q_proj_q4k.empty()) {
1203 } else {
1204 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No Q proj weights found for CPU (batched)");
1205 return {};
1206 }
1207
1208 // K Projection (batched)
1209 if (!lw.k_proj_f32.empty()) {
1211 } else if (!lw.k_proj_q8_0.empty()) {
1213 } else if (!lw.k_proj_q6k.empty()) {
1215 } else if (!lw.k_proj_q4k.empty()) {
1217 } else {
1218 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No K proj weights found for CPU (batched)");
1219 return {};
1220 }
1221
1222 // V Projection (batched)
1223 if (!lw.v_proj_f32.empty()) {
1225 } else if (!lw.v_proj_q8_0.empty()) {
1227 } else if (!lw.v_proj_q6k.empty()) {
1229 } else if (!lw.v_proj_q4k.empty()) {
1231 } else {
1232 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No V proj weights found for CPU (batched)");
1233 return {};
1234 }
1235
1236 // Optimized RoPE, KV cache update, and attention with OpenMP and SIMD
1237 std::vector<float> batch_attn_output((size_t)num_tokens_in_batch * hs);
1238
1239 // Ensure weights are dequantized before parallel processing
1247
1248 #pragma omp parallel if(num_tokens_in_batch > 1)
1249 {
1250 // Thread-local buffers to avoid allocations in loop
1251 std::vector<float> q_token(hs);
1252 std::vector<float> k_token(n_kv_heads * head_dim);
1253 std::vector<float> v_token(n_kv_heads * head_dim);
1254 std::vector<float> scores_buffer;
1255
1256 #pragma omp for
1257 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
1259
1260 // Extract Q, K, V for this token (reuse thread-local buffers)
1261 std::copy(q_batch.begin() + (size_t)token_idx * hs,
1262 q_batch.begin() + (size_t)(token_idx + 1) * hs,
1263 q_token.begin());
1264 std::copy(k_batch.begin() + (size_t)token_idx * n_kv_heads * head_dim,
1265 k_batch.begin() + (size_t)(token_idx + 1) * n_kv_heads * head_dim,
1266 k_token.begin());
1267 std::copy(v_batch.begin() + (size_t)token_idx * n_kv_heads * head_dim,
1268 v_batch.begin() + (size_t)(token_idx + 1) * n_kv_heads * head_dim,
1269 v_token.begin());
1270
1271 // Apply RoPE individually
1274
1275 // Update KV cache at specific position - Sequence-Major Layout
1276 if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
1277 auto& layer_cache = kv_cache->layers[l];
1278 // Use sequence-major layout to match prefill behavior
1280 int sequence_base_offset = seq_idx * kv_cache->max_seq_len_config_;
1282 #pragma omp critical
1283 {
1284 if (kv_offset + n_kv_heads * head_dim <= static_cast<int>(layer_cache.k.size())) {
1285 std::copy(k_token.begin(), k_token.end(), layer_cache.k.begin() + kv_offset);
1286 std::copy(v_token.begin(), v_token.end(), layer_cache.v.begin() + kv_offset);
1287 }
1288 }
1289 }
1290
1291 // Copy back RoPE'd Q values to batch
1292 std::copy(q_token.begin(), q_token.end(), q_batch.begin() + (size_t)token_idx * hs);
1293
1294 // SIMD-optimized attention computation for this token
1296 int history_len = (seq_idx < kv_cache->current_batch_size) ? kv_cache->batch_seq_lens[seq_idx] : pos + 1;
1297 scores_buffer.resize(history_len);
1298
1299 const float* q_token_ptr = q_batch.data() + (size_t)token_idx * hs;
1301
1302 if (kv_cache && static_cast<size_t>(l) < kv_cache->layers.size()) {
1303 const auto& layer_cache = kv_cache->layers[l];
1304
1305 // Process all heads for this token efficiently with SIMD
1306 for (int h = 0; h < n_heads; ++h) {
1307 int kv_head_idx = h / kv_group; // Use pre-calculated kv_group
1308 const float* q_head_ptr = q_token_ptr + h * head_dim;
1310
1311 // SIMD-optimized attention score computation
1312 for (int t = 0; t < history_len; ++t) {
1313 // sequence-major layout: each sequence has contiguous region
1315 int sequence_base_offset = seq_idx * kv_cache->max_seq_len_config_;
1316 const float* k_ptr = layer_cache.k.data() + (sequence_base_offset + t) * n_kv_heads * head_dim + kv_head_idx * head_dim;
1317
1318 // Use SIMD dot product for Q·K computation
1319#if defined(__AVX2__) || defined(__SSE2__) || defined(__ARM_NEON)
1321#else
1322 float score = 0.0f;
1323 for (int d = 0; d < head_dim; ++d) {
1324 score += q_head_ptr[d] * k_ptr[d];
1325 }
1326#endif
1328 }
1329
1330 // Softmax
1332
1333 // SIMD-optimized weighted sum with V
1334 std::fill(head_output_ptr, head_output_ptr + head_dim, 0.0f);
1335 for (int t = 0; t < history_len; ++t) {
1336 // Sequence-major layout: each sequence has contiguous region
1338 int sequence_base_offset = seq_idx * kv_cache->max_seq_len_config_;
1339 const float* v_ptr = layer_cache.v.data() + (sequence_base_offset + t) * n_kv_heads * head_dim + kv_head_idx * head_dim;
1340 float score = scores_buffer[t];
1341
1342 // Use SIMD scaled vector addition for score * V accumulation
1343#if defined(__AVX2__) || defined(__SSE2__) || defined(__ARM_NEON)
1345#else
1346 for (int d = 0; d < head_dim; ++d) {
1347 head_output_ptr[d] += score * v_ptr[d];
1348 }
1349#endif
1350 }
1351 }
1352 } else {
1353 std::fill(attn_output_ptr, attn_output_ptr + hs, 0.0f);
1354 }
1355 }
1356 }
1357 // O-Projection (batched)
1358 std::vector<float> batch_attn_proj_out((size_t)num_tokens_in_batch * hs);
1359 if(!lw.o_proj_f32.empty()) {
1361 } else if (!lw.o_proj_q8_0.empty()) {
1363 } else if (!lw.o_proj_q6k.empty()) {
1365 } else if (!lw.o_proj_q4k.empty()) {
1367 } else {
1368 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No O proj weights found for CPU");
1369 return {};
1370 }
1371
1372 // First Residual Connection (batched)
1373 for(size_t i=0; i < current_batch_activations.size(); ++i) {
1375 }
1376
1377 // MLP processing (batched where possible)
1379 std::vector<float> batch_x_norm2(current_batch_activations.size());
1380
1381 const std::vector<float>& w_post_attn_norm_vec =
1382 lw.post_attention_layernorm_f32.empty()
1383 ? bf16vec_to_float_vec(lw.post_attention_layernorm)
1384 : lw.post_attention_layernorm_f32;
1386
1387 std::vector<float> batch_gate_proj_out((size_t)num_tokens_in_batch * is);
1388 std::vector<float> batch_up_proj_out((size_t)num_tokens_in_batch * is);
1389
1390 // Gate and Up projections (batched)
1391 if (!lw.gate_proj_f32.empty()) {
1393 } else if (!lw.gate_proj_q8_0.empty()) {
1395 } else if (!lw.gate_proj_q6k.empty()) {
1397 } else if (!lw.gate_proj_q4k.empty()) {
1399 } else {
1400 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No gate_proj weights found for CPU");
1401 return {};
1402 }
1403
1404 if (!lw.up_proj_f32.empty()) {
1406 } else if (!lw.up_proj_q8_0.empty()) {
1408 } else if (!lw.up_proj_q6k.empty()) {
1410 } else if (!lw.up_proj_q4k.empty()) {
1412 } else {
1413 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No up_proj weights found for CPU");
1414 return {};
1415 }
1416
1417 // SwiGLU (batched)
1418 std::vector<float> batch_swiglu_out((size_t)num_tokens_in_batch * is);
1419 for (size_t i = 0; i < batch_gate_proj_out.size(); ++i) {
1421 float silu_gate_val = gate_val / (1.0f + std::exp(-gate_val));
1423 }
1424
1425 // Down Projection (batched)
1426 std::vector<float> batch_mlp_down_proj_out((size_t)num_tokens_in_batch * hs);
1427 if (!lw.down_proj_f32.empty()) {
1429 } else if (!lw.down_proj_q8_0.empty()) {
1431 } else if (!lw.down_proj_q6k.empty()) {
1433 } else if (!lw.down_proj_q4k.empty()) {
1435 } else {
1436 Logger::error("[CPU_BATCH_GENERATION] Layer " + std::to_string(l) + ": No down_proj weights found for CPU");
1437 return {};
1438 }
1439
1440 // Second Residual Connection (batched)
1441 for(size_t i = 0; i < current_batch_activations.size(); ++i) {
1443 }
1444 }
1445
1446 // Update KV cache sequence length
1447 if (kv_cache && num_tokens_in_batch > 0) {
1448 // For batch mode, track positions per sequence
1449 if (kv_cache->current_batch_size > 0) {
1450 // Update batch_seq_lens based on the highest position seen for each sequence
1451 std::vector<int> max_positions_per_seq(kv_cache->current_batch_size, -1);
1452
1453 for (int i = 0; i < num_tokens_in_batch; ++i) {
1455 int pos = token_positions[i];
1456
1457 if (seq_idx >= 0 && seq_idx < kv_cache->current_batch_size) {
1459 }
1460 }
1461
1462 // Update batch_seq_lens to reflect new positions (pos + 1 since pos is 0-indexed)
1463 for (int seq_idx = 0; seq_idx < kv_cache->current_batch_size; ++seq_idx) {
1464 if (max_positions_per_seq[seq_idx] >= 0) {
1465 kv_cache->batch_seq_lens[seq_idx] = max_positions_per_seq[seq_idx] + 1;
1466 Logger::info("[CPU_BATCH_GEN] KV Length Max Update: seq_idx=" + std::to_string(seq_idx) +
1467 ", old_batch_seq_len=" + std::to_string(kv_cache->batch_seq_lens[seq_idx]) +
1468 ", new_batch_seq_len=" + std::to_string(max_positions_per_seq[seq_idx] + 1));
1469 }
1470 }
1471 // For single-sequence compatibility, update seq_len to the max
1472 kv_cache->seq_len = *std::max_element(kv_cache->batch_seq_lens.begin(),
1473 kv_cache->batch_seq_lens.begin() + kv_cache->current_batch_size);
1474 } else {
1475 // Fallback for single sequence mode
1476 int max_pos = *std::max_element(token_positions.begin(), token_positions.end());
1477 kv_cache->seq_len = std::max(kv_cache->seq_len, max_pos + 1);
1478 }
1479 }
1480
1481 // Final normalization and logits calculation for ALL tokens
1483
1484 // Convert flat logits to per-token vectors
1485 std::vector<std::vector<float>> all_logits(num_tokens_in_batch, std::vector<float>(vs));
1486 for (int token_idx = 0; token_idx < num_tokens_in_batch; ++token_idx) {
1487 std::copy(batch_logits.begin() + (size_t)token_idx * vs,
1488 batch_logits.begin() + (size_t)(token_idx + 1) * vs,
1489 all_logits[token_idx].begin());
1490 }
1491 return all_logits;
1492}
#define SAFE_SQRT(x)
std::vector< float > forward_cpu_logits_batch(const std::vector< float > &final_batch_activations, int num_tokens_in_batch)
Definition model.cpp:1063
void simd_scaled_add(float *dst, const float *src, float scale, int n)
Definition utils.cpp:92
void matmul_q4k_f32_batch_cpu(const std::vector< block_q4_K > &mat_q4k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:988
float simd_dot_product(const float *a, const float *b, int n)
Definition utils.cpp:35
void matmul_q8_0_f32_batch_cpu(const std::vector< block_q8_0 > &mat_q8_0, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:869
void matmul_f32_f32_batch_cpu(const std::vector< float > &mat_weights, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:709
void matmul_q6k_f32_batch_cpu(const std::vector< block_q6_K > &mat_q6k, const std::vector< float > &batch_input_activations, std::vector< float > &batch_output_activations, int num_tokens, int output_dim, int input_dim)
Definition utils.cpp:950
void rmsnorm_batch_cpu(const std::vector< float > &x_batch, const std::vector< float > &weight, std::vector< float > &out_batch, int num_tokens, int hidden_size, float eps)
Definition utils.cpp:613

References apply_rope_vector(), KVCache::batch_seq_lens, bf16vec_to_float_vec(), config_, KVCache::current_batch_size, ensure_down_proj_dequantized(), ensure_gate_proj_dequantized(), ensure_k_proj_dequantized(), ensure_o_proj_dequantized(), ensure_q_proj_dequantized(), ensure_up_proj_dequantized(), ensure_v_proj_dequantized(), Logger::error(), forward_cpu_logits_batch(), ModelConfig::hidden_size, Logger::info(), ModelConfig::intermediate_size, ModelConfig::is_gguf_file_loaded, KVCache::layers, layers, matmul_f32_f32_batch_cpu(), matmul_q4k_f32_batch_cpu(), matmul_q6k_f32_batch_cpu(), matmul_q8_0_f32_batch_cpu(), ModelConfig::max_position_embeddings, KVCache::max_seq_len_config_, ModelConfig::num_attention_heads, ModelConfig::num_cpu_offload_layers, ModelConfig::num_key_value_heads, precomputed_freqs_cis_, ModelConfig::rms_norm_eps, rmsnorm_batch_cpu(), SAFE_SQRT, KVCache::seq_len, simd_dot_product(), simd_scaled_add(), softmax_vector_cpu(), and ModelConfig::vocab_size.
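Two indexing conventions in the listing above are easy to miss: the KV cache is laid out sequence-major (each sequence owns a contiguous block of max_seq_len_config_ timesteps), and grouped-query attention maps several query heads onto one KV head via kv_group = n_heads / n_kv_heads. The small, self-contained sketch below isolates that arithmetic; the struct is illustrative only, not the real KVCacheLayer.

#include <cstddef>

struct KvIndex {
  int n_heads;        // query heads
  int n_kv_heads;     // key/value heads (GQA)
  int head_dim;       // hidden_size / n_heads
  int max_seq_len;    // kv_cache->max_seq_len_config_

  // Offset (in floats) of the K or V vector for (sequence, timestep, query head).
  std::size_t kv_offset(int seq_idx, int t, int query_head) const {
    const int kv_group = n_heads / n_kv_heads;      // query heads sharing one KV head
    const int kv_head_idx = query_head / kv_group;  // KV head serving this query head
    const std::size_t sequence_base = (std::size_t)seq_idx * max_seq_len;
    return (sequence_base + t) * (std::size_t)n_kv_heads * head_dim
           + (std::size_t)kv_head_idx * head_dim;
  }
};

// Usage: layer_cache.k.data() + idx.kv_offset(seq, t, h) is the key vector read
// when scoring query head h of a token in sequence seq against cached timestep t.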

◆ forward_cpu_logits_batch()

std::vector< float > TinyLlamaModel::forward_cpu_logits_batch ( const std::vector< float > &  final_batch_activations,
int  num_tokens_in_batch 
)

Definition at line 1063 of file model.cpp.

1065 {
1066
1067 if (final_batch_activations.size() != (size_t)num_tokens_in_batch * config_.hidden_size) {
1068 Logger::error("[CPU_LOGITS_BATCH] final_batch_activations size mismatch. Expected: " +
1069 std::to_string((size_t)num_tokens_in_batch * config_.hidden_size) + " Got: " +
1070 std::to_string(final_batch_activations.size()));
1071 return {};
1072 }
1073
1074 int hs = config_.hidden_size;
1075 int vs = config_.vocab_size;
1076 float eps = config_.rms_norm_eps;
1077
1078 // 1. Final RMSNorm
1079 std::vector<float> final_batch_norm_out(num_tokens_in_batch * hs);
1080 const std::vector<float>& w_final_norm_vec =
1081     final_norm_f32.empty() ? bf16vec_to_float_vec(final_norm) : final_norm_f32;
1083 if (w_final_norm_vec.empty()) {
1084 Logger::error("[CPU_LOGITS_BATCH] Final RMSNorm weights are empty (neither f32 nor bf16 available).");
1085 return {};
1086 }
1087
1090
1091 // 2. Batched LM Head multiplication
1092 std::vector<float> batch_logits_out(num_tokens_in_batch * vs);
1093
1094 if (!lm_head_f32.empty()) {
1095 Logger::info("[CPU_LOGITS_BATCH] Using F32 LM Head weights.");
1098 } else if (!lm_head_q8_0.empty() && config_.is_gguf_file_loaded) {
1099 Logger::info("[CPU_LOGITS_BATCH] Using Q8_0 LM Head weights.");
1102 } else if (!lm_head_q6k.empty() && config_.is_gguf_file_loaded) {
1103 Logger::info("[CPU_LOGITS_BATCH] Using Q6_K LM Head weights.");
1106 } else if (!lm_head_q4k.empty() && config_.is_gguf_file_loaded) {
1107 Logger::info("[CPU_LOGITS_BATCH] Using Q4_K LM Head weights.");
1110 } else if (!lm_head.empty()) { // BF16 SafeTensors weights
1111 Logger::info("[CPU_LOGITS_BATCH] Using BF16 LM Head weights (converting to F32 for matmul).");
1112 std::vector<float> lm_head_f32_temp = bf16vec_to_float_vec(lm_head);
1113 if (lm_head_f32_temp.empty()) {
1114 Logger::error("[CPU_LOGITS_BATCH] Failed to convert BF16 LM Head to F32.");
1115 return {};
1116 }
1119 } else {
1120 Logger::error("[CPU_LOGITS_BATCH] No valid LM Head weights found (F32, Q8_0, Q6_K, Q4_K, BF16).");
1121 return {};
1122 }
1123
1124 return batch_logits_out;
1125}

References bf16vec_to_float_vec(), config_, Logger::error(), final_norm, final_norm_f32, ModelConfig::hidden_size, Logger::info(), ModelConfig::is_gguf_file_loaded, lm_head, lm_head_f32, lm_head_q4k, lm_head_q6k, lm_head_q8_0, matmul_f32_f32_batch_cpu(), matmul_q4k_f32_batch_cpu(), matmul_q6k_f32_batch_cpu(), matmul_q8_0_f32_batch_cpu(), ModelConfig::rms_norm_eps, rmsnorm_batch_cpu(), and ModelConfig::vocab_size.

Referenced by forward_cpu_batch_generation().
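For orientation, here is a naive reference of the same two steps: a final RMSNorm per token followed by a row-major [vocab_size x hidden_size] F32 LM-head multiply. The real method dispatches to the quantized matmul_*_batch_cpu kernels when GGUF weights are loaded; this sketch only pins down the shapes and math, and the function name is ours.

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> logits_reference(const std::vector<float>& acts,     // T * hs activations
                                    const std::vector<float>& norm_w,   // hs final-norm weights
                                    const std::vector<float>& lm_head,  // vs * hs, row-major
                                    int T, int hs, int vs, float eps) {
  std::vector<float> logits((std::size_t)T * vs, 0.0f);
  std::vector<float> normed(hs);
  for (int t = 0; t < T; ++t) {
    const float* x = acts.data() + (std::size_t)t * hs;
    double ss = 0.0;
    for (int i = 0; i < hs; ++i) ss += (double)x[i] * x[i];          // sum of squares
    const float inv_rms = 1.0f / std::sqrt((float)(ss / hs) + eps);  // RMSNorm scale
    for (int i = 0; i < hs; ++i) normed[i] = x[i] * inv_rms * norm_w[i];
    for (int v = 0; v < vs; ++v) {                                   // one logit per vocab row
      const float* row = lm_head.data() + (std::size_t)v * hs;
      float acc = 0.0f;
      for (int i = 0; i < hs; ++i) acc += normed[i] * row[i];
      logits[(std::size_t)t * vs + v] = acc;
    }
  }
  return logits;
}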

◆ free_bf16_concatenated_weights()

void TinyLlamaModel::free_bf16_concatenated_weights ( )

Definition at line 947 of file weight_management.cpp.

947 {
948 Logger::info("CPU-only build: free_bf16_concatenated_weights is a no-op");
949}

References Logger::info().

◆ free_layer_gpu_weights()

void TinyLlamaModel::free_layer_gpu_weights ( int  layer_idx)

◆ get_config()

const ModelConfig & TinyLlamaModel::get_config ( ) const
inline

Definition at line 425 of file model.h.

425{ return config_; }

References config_.

◆ get_embed_tokens()

const std::vector< uint16_t > & TinyLlamaModel::get_embed_tokens ( ) const
inline

Definition at line 429 of file model.h.

429{ return embed_tokens; }

References embed_tokens.

◆ get_gguf_data()

const GGUFData * TinyLlamaModel::get_gguf_data ( ) const
inline

Definition at line 446 of file model.h.

446 {
447 return gguf_data_ ? gguf_data_.get() : nullptr;
448 }

References gguf_data_.

◆ get_gguf_data_ptr()

GGUFData * TinyLlamaModel::get_gguf_data_ptr ( )
inline

Definition at line 450 of file model.h.

450{ return gguf_data_.get(); }

References gguf_data_.

◆ get_layers()

std::vector< LayerWeights > & TinyLlamaModel::get_layers ( )
inline

Definition at line 431 of file model.h.

431{ return layers; }

References layers.

◆ get_lm_head()

const std::vector< uint16_t > & TinyLlamaModel::get_lm_head ( ) const
inline

Definition at line 427 of file model.h.

427{ return lm_head; }

References lm_head.

◆ get_vocab_size()

int TinyLlamaModel::get_vocab_size ( ) const

Get the vocabulary size for the model.

Returns
Vocabulary size.

Definition at line 244 of file model_utils.cpp.

244 {
245 return config_.vocab_size;
246}

References config_, and ModelConfig::vocab_size.

◆ initialize_gpu_and_rope()

void TinyLlamaModel::initialize_gpu_and_rope ( )

Definition at line 15 of file gpu_initialization.cpp.

15 {
16 Logger::info("[INIT_GPU_ROPE_DEBUG_L1113] Absolute Start of initialize_gpu_and_rope: config_.num_cpu_offload_layers = " + std::to_string(config_.num_cpu_offload_layers) +
17 ", config_.num_hidden_layers = " + std::to_string(config_.num_hidden_layers));
18 Logger::info("[GPU_ROPE_INIT_ENTRY] Entered initialize_gpu_and_rope. Requested CPU Offload Layers: " + std::to_string(config_.num_cpu_offload_layers) + ", Total Hidden Layers: " + std::to_string(config_.num_hidden_layers));
22 int vs = config_.vocab_size;
25
29 Logger::warning("Requested CPU offload layers (" + std::to_string(config_.num_cpu_offload_layers) +
30 ") exceeds total hidden layers (" + std::to_string(nhl) +
31 "). Clamping to " + std::to_string(nhl) + " layers on CPU.");
33 }
36
37 Logger::info("Effective CPU layers for this init: " + std::to_string(active_num_cpu_layers) + ", Effective GPU layers for this init: " + std::to_string(active_num_gpu_layers));
38
39 if (hs <= 0) throw std::runtime_error("Invalid model config: hidden_size must be positive.");
40 if (vs <= 0) throw std::runtime_error("Invalid model config: vocab_size must be positive.");
41 if (n_heads <= 0) throw std::runtime_error("Invalid model config: num_attention_heads must be positive.");
42 if (n_kv_heads <= 0) throw std::runtime_error("Invalid model config: num_key_value_heads must be positive.");
43 if (hs % n_heads != 0) throw std::runtime_error("Invalid model config: hidden_size not divisible by num_attention_heads.");
44
45 int kv_dim = (hs / n_heads) * n_kv_heads;
46 int head_dim = hs / n_heads;
47
48 Logger::info("Precomputing RoPE frequencies on CPU (always done).");
51 float theta = config_.rope_theta;
52 for (int pos = 0; pos < max_seq_len; ++pos) {
53 for (int i_rope = 0; i_rope < head_dim; i_rope += 2) {
54 float freq = std::pow(theta, -((float)i_rope) / head_dim);
55 float angle = pos * freq;
56 precomputed_freqs_cis_[(pos * head_dim / 2) + (i_rope / 2)] = {std::cos(angle), std::sin(angle)};
57 }
58 }
59 Logger::info("Finished precomputing RoPE cos/sin frequencies on CPU.");
60
61#ifdef HAS_CUDA
62#define SAFE_CUDA_FREE(ptr) if(ptr) { cudaFree(ptr); ptr = nullptr; }
63
64 if (active_num_gpu_layers == 0) {
65 Logger::info("No layers assigned to GPU (active_num_gpu_layers = 0). Cleaning up existing CUDA resources and skipping GPU initialization.");
66
68 for (int i = 0; i < nhl; ++i) {
71 }
85
87 return;
88 }
89
90 Logger::info("Initializing CUDA resources for " + std::to_string(active_num_gpu_layers) + " GPU layers.");
91 if (!cublas_handle_) {
94 throw std::runtime_error("Failed to initialize cuBLAS: " + std::to_string(cublas_status));
95 }
96 Logger::info("cuBLAS handle created successfully.");
97
98 // Check for BF16 Tensor Core support
99 this->use_bf16_tensor_cores_ = false; // Default to false
101 int current_device;
102 gpuErrchk(cudaGetDevice(&current_device));
104
105 bool has_bf16_tensor_core_hw = ((props.major == 7 && props.minor == 5) || props.major >= 8);
107
110 Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) + ") supports BF16 Tensor Cores AND model dimensions (hs: " + std::to_string(config_.hidden_size) + ", vs: " + std::to_string(config_.vocab_size) + ") are compatible. Enabling Tensor Core path for matvec_bf16_f32.");
111 this->use_bf16_tensor_cores_ = true;
112 } else {
113 Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) + ") supports BF16 Tensor Cores, BUT model dimensions (hs: " + std::to_string(config_.hidden_size) + " (must be div by 8), vs: " + std::to_string(config_.vocab_size) + " (must be div by 4)) are NOT compatible. Disabling Tensor Core path for matvec_bf16_f32.");
114 }
115 } else {
116 Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) + ") does not meet criteria for BF16 Tensor Core use (requires CC >= 7.5). Disabling Tensor Core path for matvec_bf16_f32.");
117 }
118 }
119
120 if (final_norm_f32.empty() && !final_norm.empty()) {
121 Logger::info("Converting final_norm (BF16) to FP32 for GPU.");
123 }
124 if (!final_norm_f32.empty()) {
126 gpuErrchk(cudaMalloc(&final_norm_dev, final_norm_f32.size() * sizeof(float)));
128 Logger::info("Copied final_norm weights (FP32) to GPU.");
129 } else {
130 Logger::warning("Final norm weights (FP32) are empty, skipping GPU copy. This might be an issue if GPU layers are expected to use it.");
131 }
132
133 for (int i = 0; i < active_num_cpu_layers; ++i) {
134 if (static_cast<size_t>(i) < layers.size()) {
137 }
138 }
139 Logger::info("Copying layer norm weights (FP32) to GPU for layers " + std::to_string(active_num_cpu_layers) + " to " + std::to_string(nhl - 1));
140 Logger::info("[INIT_DEBUG_PRE_LOOP] Active CPU layers: " + std::to_string(active_num_cpu_layers));
141 if (nhl > 0 && layers.size() > 0) {
142 Logger::info("[INIT_DEBUG_PRE_LOOP] layers[0].input_layernorm_f32.empty(): " + std::string(layers[0].input_layernorm_f32.empty() ? "YES" : "NO") +
143 ", Size: " + std::to_string(layers[0].input_layernorm_f32.size()));
144 }
145 for (int i = active_num_cpu_layers; i < nhl; ++i) {
146 if (static_cast<size_t>(i) >= layers.size()) {
147 Logger::error("Layer index " + std::to_string(i) + " out of bounds for layers vector (size: " + std::to_string(layers.size()) + ")");
148 continue;
149 }
152
153 if (layers[i].input_layernorm_f32.empty() && !layers[i].input_layernorm.empty()) {
154 layers[i].input_layernorm_f32 = bf16vec_to_float_vec(layers[i].input_layernorm);
155 }
156 if (layers[i].post_attention_layernorm_f32.empty() && !layers[i].post_attention_layernorm.empty()) {
157 layers[i].post_attention_layernorm_f32 = bf16vec_to_float_vec(layers[i].post_attention_layernorm);
158 }
159
160 if (!layers[i].input_layernorm_f32.empty()) {
161 gpuErrchk(cudaMalloc(&layers[i].input_layernorm_dev, layers[i].input_layernorm_f32.size() * sizeof(float)));
162 gpuErrchk(cudaMemcpy(layers[i].input_layernorm_dev, layers[i].input_layernorm_f32.data(), layers[i].input_layernorm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
163 if (i == active_num_cpu_layers) {
164 Logger::info("[INIT_DEBUG] layers[" + std::to_string(i) + "].input_layernorm_dev allocated. Pointer: " + Logger::ptrToString(layers[i].input_layernorm_dev) +
165 ", Size used for malloc: " + std::to_string(layers[i].input_layernorm_f32.size() * sizeof(float)) + " bytes (" +
166 std::to_string(layers[i].input_layernorm_f32.size()) + " elements). Host vector empty: " + (layers[i].input_layernorm_f32.empty() ? "YES" : "NO"));
167 }
168 } else {
169 throw std::runtime_error("GPU Layer " + std::to_string(i) + ": input_layernorm_f32 weights are empty. Cannot offload to GPU without them.");
170 }
171
172 if (!layers[i].post_attention_layernorm_f32.empty()) {
173 gpuErrchk(cudaMalloc(&layers[i].post_attention_layernorm_dev, layers[i].post_attention_layernorm_f32.size() * sizeof(float)));
174 gpuErrchk(cudaMemcpy(layers[i].post_attention_layernorm_dev, layers[i].post_attention_layernorm_f32.data(), layers[i].post_attention_layernorm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
175 } else {
176 throw std::runtime_error("GPU Layer " + std::to_string(i) + ": post_attention_layernorm_f32 weights are empty. Cannot offload to GPU without them.");
177 }
178 }
179 Logger::info("Finished processing layer norm weights for GPU layers.");
180
181
186
187 if (active_num_gpu_layers > 0) {
188 if (!embed_tokens.empty()) {
191 Logger::info("Copied token_embedding_table (bf16 direct from model.embed_tokens) to GPU.");
193 }
194 else if (!embed_tokens_f32.empty()) {
195 std::vector<uint16_t> bf16_data(embed_tokens_f32.size());
196 #pragma omp parallel for
197 for (int i = 0; i < (int)embed_tokens_f32.size(); ++i) {
199 }
202 Logger::info("Converted token_embedding_table (fp32 source -> bf16) to GPU.");
204 }
205 else if (!embed_tokens_q8_0.empty()) {
206 std::vector<float> temp_f32_data(embed_tokens_q8_0.size() * GGML_QK8_0);
207 #pragma omp parallel for
208 for (int i = 0; i < (int)embed_tokens_q8_0.size(); ++i) {
210 }
211 std::vector<uint16_t> bf16_data(temp_f32_data.size());
212 #pragma omp parallel for
213 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
215 }
218 Logger::info("Dequantized token_embedding_table (Q8_0 -> fp32 -> bf16) to GPU.");
220 }
221 else if (!embed_tokens_q4k.empty()) {
222 std::vector<float> temp_f32_data(embed_tokens_q4k.size() * GGML_QK_K);
223 #pragma omp parallel for
224 for (int i = 0; i < (int)embed_tokens_q4k.size(); ++i) {
226 }
227 std::vector<uint16_t> bf16_data(temp_f32_data.size());
228 #pragma omp parallel for
229 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
231 }
234 Logger::info("Dequantized token_embedding_table (Q4_K -> fp32 -> bf16) to GPU.");
236 }
237 else if (!embed_tokens_q6k.empty()) {
238 std::vector<float> temp_f32_data(embed_tokens_q6k.size() * GGML_QK_K);
239 #pragma omp parallel for
240 for (int i = 0; i < (int)embed_tokens_q6k.size(); ++i) {
242 }
243 std::vector<uint16_t> bf16_data(temp_f32_data.size());
244 #pragma omp parallel for
245 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
247 }
250 Logger::info("Dequantized token_embedding_table (Q6_K -> fp32 -> bf16) to GPU.");
252 }
253
255 Logger::info("[INIT_DEBUG] token_embedding_table_dev_ (BF16 on GPU) processed. Pointer: " + Logger::ptrToString(token_embedding_table_dev_) +
256 ". Flag token_embeddings_processed_to_gpu_bf16: YES");
257 }
259 Logger::warning("Token embeddings were not processed to GPU as BF16, despite GPU layers being active. This might indicate missing source embedding data in the model structure or an unhandled GGUF type for embeddings.");
260 }
261 } else {
262 Logger::info("No GPU layers active, skipping token embedding table processing for GPU.");
263 }
264
269
270 if (active_num_gpu_layers > 0) {
271 if (!lm_head.empty()) {
272 gpuErrchk(cudaMalloc(&lm_head_dev_, lm_head.size() * sizeof(uint16_t)));
274 Logger::info("Copied lm_head (bf16 direct from model.lm_head) to GPU.");
276 }
277 else if (!lm_head_f32.empty()) {
278 std::vector<uint16_t> bf16_data(lm_head_f32.size());
279 #pragma omp parallel for
280 for (int i = 0; i < (int)lm_head_f32.size(); ++i) {
282 }
285 Logger::info("Converted lm_head (fp32 source -> bf16) to GPU.");
287 }
288 else if (!lm_head_q8_0.empty()) {
289 std::vector<float> temp_f32_data(lm_head_q8_0.size() * GGML_QK8_0);
290 #pragma omp parallel for
291 for (int i = 0; i < (int)lm_head_q8_0.size(); ++i) {
293 }
294 std::vector<uint16_t> bf16_data(temp_f32_data.size());
295 #pragma omp parallel for
296 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
298 }
301 Logger::info("Dequantized lm_head (Q8_0 -> fp32 -> bf16) to GPU.");
303 }
304 else if (!lm_head_q4k.empty()) {
305 std::vector<float> temp_f32_data(lm_head_q4k.size() * GGML_QK_K);
306 #pragma omp parallel for
307 for (int i = 0; i < (int)lm_head_q4k.size(); ++i) {
309 }
310 std::vector<uint16_t> bf16_data(temp_f32_data.size());
311 #pragma omp parallel for
312 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
314 }
317 Logger::info("Dequantized lm_head (Q4_K -> fp32 -> bf16) to GPU.");
319 }
320 else if (!lm_head_q6k.empty()) {
321 std::vector<float> temp_f32_data(lm_head_q6k.size() * GGML_QK_K);
322 #pragma omp parallel for
323 for (int i = 0; i < (int)lm_head_q6k.size(); ++i) {
325 }
326 std::vector<uint16_t> bf16_data(temp_f32_data.size());
327 #pragma omp parallel for
328 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
330 }
333 Logger::info("Dequantized lm_head (Q6_K -> fp32 -> bf16) to GPU.");
335 }
336
338 Logger::warning("LM head was not processed to GPU as BF16, despite GPU layers being active. This might indicate missing source LM head data in the model structure or an unhandled GGUF type for LM head.");
339 }
340 } else {
341 Logger::info("No GPU layers active, skipping LM head processing for GPU.");
342 }
343
345
346 if (active_num_gpu_layers > 0) {
347 if (!lm_head_f32.empty()) {
348 gpuErrchk(cudaMalloc(&lm_head_f32_dev_, lm_head_f32.size() * sizeof(float)));
350 Logger::info("[INIT_GPU_ROPE] Copied lm_head_f32 (host FP32) to GPU for lm_head_f32_dev_. Pointer: " + Logger::ptrToString(lm_head_f32_dev_));
351 } else {
352 Logger::error("[INIT_GPU_ROPE] Host lm_head_f32 is EMPTY. Cannot populate lm_head_f32_dev_. This WILL CAUSE a cublasSgemm error in the final matvec. Check model loading and initialize_weights logic for lm_head_f32 population.");
353 lm_head_f32_dev_ = nullptr;
354 }
355 } else {
356 lm_head_f32_dev_ = nullptr;
357 }
358
359
360 Logger::info("Finished processing embedding and LM head tables for GPU.");
361
363 if (active_num_gpu_layers > 0) {
364 if (!precomputed_freqs_cis_.empty()) {
367 std::vector<float> flat_host_freqs; flat_host_freqs.reserve(total_freq_elements);
368 for (const auto& p : precomputed_freqs_cis_) { flat_host_freqs.push_back(p.first); flat_host_freqs.push_back(p.second); }
370 Logger::info("Copied all precomputed RoPE frequencies to persistent GPU buffer.");
371 } else {
372 Logger::warning("Host precomputed_freqs_cis_ is empty. Skipping GPU RoPE buffer allocation. This WILL cause issues if GPU layers use RoPE.");
373 }
374 Logger::info("Finished processing RoPE frequencies for GPU.");
375 } else {
376 Logger::info("No GPU layers active, skipping RoPE GPU buffer allocation.");
377 }
378
379 if (active_num_gpu_layers > 0) {
380 Logger::info("Allocating/Reallocating persistent GPU workspace buffers for " + std::to_string(active_num_gpu_layers) + " GPU layers.");
381 size_t hs_bytes = (size_t)hs * sizeof(float);
382 size_t is_bytes = (size_t)is * sizeof(float);
383 size_t vs_bytes = (size_t)vs * sizeof(float);
384 size_t k_dev_size_bytes = (size_t)n_kv_heads * head_dim * sizeof(float);
385 size_t v_dev_size_bytes = (size_t)n_kv_heads * head_dim * sizeof(float);
386
387#define REALLOC_GPU_WORKSPACE(ptr, sz) SAFE_CUDA_FREE(ptr); gpuErrchk(cudaMalloc(&ptr, sz));
402 Logger::info("Finished allocating/reallocating GPU workspace buffers.");
403 } else {
404 Logger::info("No GPU layers active, skipping GPU workspace buffer allocation.");
408 }
409
410 if (active_num_gpu_layers > 0) {
413
419 Logger::info("Allocated SELECTIVE KVCache dequantization buffers (K and V) on GPU. Size per buffer: " +
420 std::to_string(selective_buffer_bytes / (1024.0 * 1024.0)) + " MB (vs " +
421 std::to_string((static_cast<size_t>(config_.max_position_embeddings) * n_kv_heads * head_dim * sizeof(float)) / (1024.0 * 1024.0)) + " MB for full buffers)");
422 } else {
423 Logger::warning("Selective KVCache dequantization buffer size is 0. Skipping allocation.");
426 }
427
430 } else {
435 }
436
441
442 std::vector<uint16_t> h_q, h_k, h_v, h_o, h_gate, h_up, h_down;
447
448 Logger::info("Concatenating BF16 weights for GPU layers on host (zero-padding if missing for a layer)...");
449 for (int i = 0; i < active_num_gpu_layers; ++i) {
451 const auto& lw = layers[model_layer_idx];
452
453 if (!lw.q_proj.empty()) {
454 h_q.insert(h_q.end(), lw.q_proj.begin(), lw.q_proj.end());
455 } else {
456 h_q.insert(h_q.end(), layer_q_size, bfloat16::ZERO);
457 }
458
459 if (!lw.k_proj.empty()) {
460 h_k.insert(h_k.end(), lw.k_proj.begin(), lw.k_proj.end());
461 } else {
462 h_k.insert(h_k.end(), layer_k_size, bfloat16::ZERO);
463 }
464
465 if (!lw.v_proj.empty()) {
466 h_v.insert(h_v.end(), lw.v_proj.begin(), lw.v_proj.end());
467 } else {
468 h_v.insert(h_v.end(), layer_v_size, bfloat16::ZERO);
469 }
470
471 if (!lw.o_proj.empty()) {
472 h_o.insert(h_o.end(), lw.o_proj.begin(), lw.o_proj.end());
473 } else {
474 h_o.insert(h_o.end(), layer_o_size, bfloat16::ZERO);
475 }
476
477 if (!lw.gate_proj.empty()) {
478 h_gate.insert(h_gate.end(), lw.gate_proj.begin(), lw.gate_proj.end());
479 } else {
481 }
482
483 if (!lw.up_proj.empty()) {
484 h_up.insert(h_up.end(), lw.up_proj.begin(), lw.up_proj.end());
485 } else {
486 h_up.insert(h_up.end(), layer_up_size, bfloat16::ZERO);
487 }
488
489 if (!lw.down_proj.empty()) {
490 h_down.insert(h_down.end(), lw.down_proj.begin(), lw.down_proj.end());
491 } else {
493 }
494 }
495
496#define ALLOC_COPY_CONCAT_BF16(dev_ptr, host_vec, weight_name_str) \
497 SAFE_CUDA_FREE(dev_ptr); \
498 if (!host_vec.empty()) { \
499 gpuErrchk(cudaMalloc(&dev_ptr, host_vec.size() * sizeof(uint16_t))); \
500 gpuErrchk(cudaMemcpy(dev_ptr, host_vec.data(), host_vec.size() * sizeof(uint16_t), cudaMemcpyHostToDevice)); \
501 Logger::info("Copied concatenated " weight_name_str " (BF16) to GPU for GPU layers."); \
502 } else if (active_num_gpu_layers > 0) { \
503 Logger::info("Host vector for concatenated " weight_name_str " (BF16) is empty. Skipping GPU copy."); \
504 }
505
509#undef ALLOC_COPY_CONCAT_BF16
510
511 } else {
512 Logger::info("Skipping BF16 concatenated layer weight processing (first GPU layer appears not to use BF16 q_proj, or no GPU layers).");
515 }
516
517 Logger::info("DEFERRING concatenated F32 weight processing for GPU layers to save memory during initialization");
518 Logger::info("Concatenated F32 weights will be processed on-demand during first inference");
519
522
523 // Free BF16 concatenated weights
526
527 Logger::info("Finished deferring concatenated F32 weight processing for GPU layers.");
528
529 // Allocate persistent batch processing buffers for GPU memory optimization
530 if (active_num_gpu_layers > 0) {
532 }
533
534#undef SAFE_CUDA_FREE
535#else
536 if (active_num_gpu_layers > 0 && nhl > 0) {
537 Logger::warning("CUDA not available, but " + std::to_string(active_num_gpu_layers) + " layer(s) were configured for GPU. Model will run entirely on CPU.");
538 } else {
539 Logger::info("CUDA not available or no GPU layers configured. Model will run entirely on CPU.");
540 }
541#endif
542}
static std::string ptrToString(const void *ptr)
Definition logger.cpp:225
bool use_bf16_tensor_cores_
Definition model.h:481
void ensure_embed_tokens_dequantized()
constexpr size_t GGML_QK8_0
Definition gguf_parser.h:43
constexpr size_t GGML_QK_K
Block size constants for different quantization formats.
Definition gguf_parser.h:42
constexpr uint16_t ZERO
void dequantize_q4_k_m(const block_q4_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
void dequantize_q8_0_block(const block_q8_0 *qblock, float *output)
Dequantizes a Q8_0 block to float32.
void dequantize_q6_k(const block_q6_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
float rope_theta
Definition model.h:89
uint16_t float32_to_bfloat16(float val)
Definition utils.cpp:136

References bf16vec_to_float_vec(), config_, dequantize_q4_k_m(), dequantize_q6_k(), dequantize_q8_0_block(), embed_tokens, embed_tokens_f32, embed_tokens_q4k, embed_tokens_q6k, embed_tokens_q8_0, ensure_embed_tokens_dequantized(), ensure_lm_head_dequantized(), Logger::error(), final_norm, final_norm_f32, float32_to_bfloat16(), GGML_QK8_0, GGML_QK_K, ModelConfig::hidden_size, Logger::info(), ModelConfig::intermediate_size, layers, lm_head, lm_head_f32, lm_head_q4k, lm_head_q6k, lm_head_q8_0, ModelConfig::max_position_embeddings, ModelConfig::num_attention_heads, ModelConfig::num_cpu_offload_layers, ModelConfig::num_hidden_layers, ModelConfig::num_key_value_heads, precomputed_freqs_cis_, Logger::ptrToString(), ModelConfig::rope_theta, use_bf16_tensor_cores_, ModelConfig::vocab_size, Logger::warning(), and bfloat16::ZERO.

Referenced by TinyLlamaModel(), TinyLlamaModel(), and TinyLlamaModel().
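Several of the GPU staging branches above convert between FP32 and BF16 via float32_to_bfloat16(). For readers unfamiliar with the format, a minimal sketch of the round trip follows; this version simply truncates the low mantissa bits, whereas the project's converter in utils.cpp may round to nearest-even, so treat it as illustrative rather than a drop-in.

#include <cstdint>
#include <cstring>

// BF16 keeps the sign, the 8-bit exponent and the top 7 mantissa bits of a float32.
static uint16_t f32_to_bf16_truncate(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 16);           // drop the low 16 mantissa bits
}

static float bf16_to_f32(uint16_t h) {
  uint32_t bits = (uint32_t)h << 16;        // re-expand into a float32 bit pattern
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}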

◆ initialize_rope_freqs()

void TinyLlamaModel::initialize_rope_freqs ( )

Definition at line 184 of file model_utils.cpp.

184 {
185 Logger::info("[ROPE_FREQ_ENTRY] Entered initialize_rope_freqs.");
186
187 Logger::info("[ROPE_FREQ_CHECK] num_attention_heads: " + std::to_string(config_.num_attention_heads));
188 if (config_.num_attention_heads == 0) {
189 Logger::error("Cannot initialize RoPE frequencies: num_attention_heads is zero.");
190 return;
191 }
193 Logger::info("[ROPE_FREQ_CHECK] calculated head_dim: " + std::to_string(head_dim));
194 if (head_dim == 0) {
195 Logger::error("Cannot initialize RoPE frequencies: calculated head_dim is zero.");
196 return;
197 }
198 Logger::info("[ROPE_FREQ_CHECK] head_dim % 2 check. head_dim: " + std::to_string(head_dim));
199 if (head_dim % 2 != 0) {
200 Logger::error("Cannot initialize RoPE frequencies: head_dim must be even.");
201 return;
202 }
203
204 Logger::info("[ROPE_INIT] Initializing RoPE with head_dim=" + std::to_string(head_dim) +
205 ", configured max_pos_emb=" + std::to_string(config_.max_position_embeddings) +
206 ", using internal rope::MAX_SEQUENCE_LENGTH=" + std::to_string(rope::MAX_SEQUENCE_LENGTH) +
207 ", configured rope_theta=" + std::to_string(config_.rope_theta));
208
209
210 if (precomputed_freqs_cis_.empty()) {
212 size_t required_size = (static_cast<size_t>(max_seq_len) * head_dim) / 2;
213 if (required_size == 0) {
214 Logger::warning("RoPE precomputation resulted in zero size. Max seq len: " +
215 std::to_string(max_seq_len) + ", head_dim: " + std::to_string(head_dim));
216 return;
217 }
219
220 float rope_theta = config_.rope_theta > 0 ? config_.rope_theta : rope::ROPE_THETA;
221
222 for (int pos = 0; pos < max_seq_len; ++pos) {
223 for (int i = 0; i < head_dim; i += 2) {
224 float freq = 1.0f / std::pow(rope_theta, float(i) / head_dim);
225 float val = static_cast<float>(pos) * freq;
226 float cos_val = std::cos(val);
227 float sin_val = std::sin(val);
228 size_t flat_idx = (static_cast<size_t>(pos) * head_dim / 2) + (i / 2);
229 if (flat_idx < precomputed_freqs_cis_.size()){
231 } else {
232 Logger::error("RoPE precomputation index out of bounds: " + std::to_string(flat_idx) +
233 " vs size " + std::to_string(precomputed_freqs_cis_.size()));
234 return;
235 }
236 }
237 }
238 Logger::info("Precomputed RoPE frequencies on CPU. Size: " + std::to_string(precomputed_freqs_cis_.size()));
239 } else {
240 Logger::info("RoPE frequencies already precomputed.");
241 }
242}
constexpr float ROPE_THETA
constexpr int MAX_SEQUENCE_LENGTH

References config_, Logger::error(), ModelConfig::hidden_size, Logger::info(), ModelConfig::max_position_embeddings, rope::MAX_SEQUENCE_LENGTH, ModelConfig::num_attention_heads, precomputed_freqs_cis_, ModelConfig::rope_theta, rope::ROPE_THETA, and Logger::warning().
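The table built above stores one (cos, sin) pair per position and per even dimension index, flattened as pos * (head_dim / 2) + i / 2. Below is a self-contained sketch of the same precomputation with standalone names; the member function writes into precomputed_freqs_cis_ instead of returning a vector.

#include <cmath>
#include <utility>
#include <vector>

std::vector<std::pair<float, float>> build_rope_table(int max_seq_len, int head_dim,
                                                      float rope_theta) {
  // One (cos, sin) entry per position and per pair of dimensions.
  std::vector<std::pair<float, float>> table((std::size_t)max_seq_len * head_dim / 2);
  for (int pos = 0; pos < max_seq_len; ++pos) {
    for (int i = 0; i < head_dim; i += 2) {
      float freq = 1.0f / std::pow(rope_theta, (float)i / head_dim);  // per-dimension frequency
      float angle = pos * freq;
      table[(std::size_t)pos * (head_dim / 2) + i / 2] = {std::cos(angle), std::sin(angle)};
    }
  }
  return table;
}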

◆ initialize_weights()

void TinyLlamaModel::initialize_weights ( const SafeTensorsLoader *  loader,
const GGUFData *  gguf 
)
private

Definition at line 38 of file model.cpp.

39 {
40 Logger::info("Initializing model weights...");
44 int vs = config_.vocab_size;
45 layers.resize(nhl);
46
47 if (gguf) {
48 Logger::info("Processing weights from GGUF data source...");
49
50 if (gguf && (gguf->mapped_tensor_data || !gguf->tensor_data.empty())) {
51 map_gguf_weights(*gguf, *this);
52 Logger::info("[INIT_WEIGHTS_GGUF] map_gguf_weights(*gguf, *this) CALLED (using function parameter).");
53 } else if (gguf_data_ && (gguf_data_->mapped_tensor_data || !gguf_data_->tensor_data.empty())) {
55 Logger::info("[INIT_WEIGHTS_GGUF] map_gguf_weights(*gguf_data_, *this) CALLED (using member gguf_data_).");
56 } else {
57 Logger::error("[INIT_WEIGHTS_GGUF] map_gguf_weights failed - tensor data not available. No GGUF weights mapped.");
58 }
59
60 // LAZY DEQUANTIZATION: Only dequantize what's immediately needed
61 Logger::info("[INIT_WEIGHTS_GGUF] Using lazy dequantization to prevent OOM");
62
63 // Only dequantize embed_tokens and final_norm immediately (small and always needed)
64 if (this->embed_tokens_f32.empty()) {
65 size_t total_elements_embed = static_cast<size_t>(config_.vocab_size) * config_.hidden_size;
66 if (!this->embed_tokens_q6k.empty()) {
68 } else if (!this->embed_tokens_q4k.empty()) {
70 } else if (!this->embed_tokens_q8k.empty()) {
72 } else if (!this->embed_tokens_q8_0.empty()) {
74 } else if (!this->embed_tokens.empty()) {
76 }
77 if (!this->embed_tokens_f32.empty()) {
78 Logger::info("[INIT_WEIGHTS_GGUF_DEQUANT] embed_tokens_f32 populated. Size: " + std::to_string(this->embed_tokens_f32.size()));
79 }
80 }
81
82 if (this->final_norm_f32.empty()) {
83 if (!this->final_norm.empty()) {
85 if (!this->final_norm_f32.empty()) {
86 Logger::info("[INIT_WEIGHTS_GGUF_DEQUANT] Successfully converted final_norm (BF16) to final_norm_f32. Size: " + std::to_string(this->final_norm_f32.size()));
87 }
88 }
89 }
90
91 // DEFER lm_head dequantization until actually needed (it's huge)
92 Logger::info("[INIT_WEIGHTS_GGUF] Deferring lm_head dequantization until needed to save memory");
93
94 // DEFER all layer weight dequantization until the layer is actually used
95 Logger::info("[INIT_WEIGHTS_GGUF] Deferring all layer weight dequantization until layers are used");
96
97 // Only populate layer norms immediately (small and needed for validation)
98 for (int l = 0; l < nhl; ++l) {
99 auto& lw = layers[l];
100
101 if (lw.input_layernorm_f32.empty() && !lw.input_layernorm.empty()) {
102 lw.input_layernorm_f32 = bf16vec_to_float_vec(lw.input_layernorm);
103 if (!lw.input_layernorm_f32.empty()) Logger::info(" L" + std::to_string(l) + " input_layernorm_f32 populated from BF16. Size: " + std::to_string(lw.input_layernorm_f32.size()));
104 }
105 if (lw.post_attention_layernorm_f32.empty() && !lw.post_attention_layernorm.empty()) {
106 lw.post_attention_layernorm_f32 = bf16vec_to_float_vec(lw.post_attention_layernorm);
107 if (!lw.post_attention_layernorm_f32.empty()) Logger::info(" L" + std::to_string(l) + " post_attention_layernorm_f32 populated from BF16. Size: " + std::to_string(lw.post_attention_layernorm_f32.size()));
108 }
109 }
110
111 // Validation checks for layer norms
112 for (int l = 0; l < nhl; ++l) {
113 const auto& lw = layers[l];
114 if (lw.input_layernorm_f32.empty()) {
115 Logger::error("[INIT_WEIGHTS_GGUF_CHECK] Layer " + std::to_string(l) +
116 ": input_layernorm_f32 is EMPTY post-GGUF. This WILL cause GPU init errors if this layer is on GPU.");
117 }
118 if (lw.post_attention_layernorm_f32.empty()) {
119 Logger::error("[INIT_WEIGHTS_GGUF_CHECK] Layer " + std::to_string(l) +
120 ": post_attention_layernorm_f32 is EMPTY post-GGUF. This WILL cause GPU init errors if this layer is on GPU.");
121 }
122 }
123 Logger::info("[INIT_WEIGHTS_GGUF] Finished per-layer NORM F32 vector checks post-GGUF.");
124
125 } else {
126 Logger::fatal("TinyLlamaModel::initialize_weights called with neither GGUF nor SafeTensors loader. Cannot initialize weights.");
127 throw std::runtime_error("Model weights source (GGUF or SafeTensors) not provided to initialize_weights.");
128 }
129
130 Logger::info("Finished initializing model weights logic block.");
131
132 if (this->final_norm_f32.empty()) {
133 Logger::error("[INIT_WEIGHTS_FINAL_CHECK] final_norm_f32 is EMPTY. This WILL cause errors if final normalization is needed in F32.");
134 } else {
135 Logger::info("[INIT_WEIGHTS_FINAL_CHECK] final_norm_f32 is POPULATED. Size: " + std::to_string(this->final_norm_f32.size()));
136 }
137
138 if (this->embed_tokens_f32.empty()) {
139 Logger::error("[INIT_WEIGHTS_FINAL_CHECK] embed_tokens_f32 is EMPTY. This WILL cause errors if token embeddings are needed in F32.");
140 } else {
141 Logger::info("[INIT_WEIGHTS_FINAL_CHECK] embed_tokens_f32 is POPULATED. Size: " + std::to_string(this->embed_tokens_f32.size()));
142 }
143}
static void fatal(const std::string &message)
Definition logger.cpp:151
friend void map_gguf_weights(const GGUFData &gguf, TinyLlamaModel &model)

References bf16vec_to_float_vec(), config_, dequantize_q8_k(), dequantize_vector_q4k_to_f32(), dequantize_vector_q6k_to_f32(), dequantize_vector_q8_0_to_f32(), embed_tokens, embed_tokens_f32, embed_tokens_q4k, embed_tokens_q6k, embed_tokens_q8_0, embed_tokens_q8k, Logger::error(), Logger::fatal(), final_norm, final_norm_f32, gguf_data_, ModelConfig::hidden_size, Logger::info(), ModelConfig::intermediate_size, layers, map_gguf_weights, GGUFData::mapped_tensor_data, ModelConfig::num_hidden_layers, GGUFData::tensor_data, and ModelConfig::vocab_size.

Referenced by TinyLlamaModel(), TinyLlamaModel(), and TinyLlamaModel().
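The lazy-dequantization strategy above keeps the quantized GGUF blocks resident and defers the large F32 conversions to the ensure_*_dequantized() family. A hedged sketch of that guard follows, using the Q8_0 helpers referenced on this page; the block type, constant and dequantize function are the project's, while the wrapper itself is illustrative.

#include <cstddef>
#include <vector>
#include "gguf_parser.h"   // assumed to provide block_q8_0, GGML_QK8_0, dequantize_q8_0_block

// Materialize the F32 copy only on first use, mirroring the empty() checks used by
// ensure_lm_head_dequantized() and the per-layer ensure_*_proj_dequantized() variants.
void ensure_dequantized_q8_0(std::vector<float>& f32_out,
                             const std::vector<block_q8_0>& q8_blocks) {
  if (!f32_out.empty() || q8_blocks.empty()) return;   // already done, or nothing to convert
  f32_out.resize(q8_blocks.size() * GGML_QK8_0);
  for (std::size_t i = 0; i < q8_blocks.size(); ++i)
    dequantize_q8_0_block(&q8_blocks[i], f32_out.data() + i * GGML_QK8_0);
}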

◆ lookup_embedding()

std::vector< float > TinyLlamaModel::lookup_embedding ( int  token_id)

Lookup the embedding vector for a given token ID.

Parameters
token_id  The token ID to lookup.
Returns
The embedding vector as a std::vector<float>.

Definition at line 11 of file model_utils.cpp.

11 {
13 int vs = config_.vocab_size;
14
16 Logger::error("Token ID out of bounds in lookup_embedding: " +
17 std::to_string(token_id));
18 return std::vector<float>(hs, 0.0f);
19 }
20
21 std::vector<float> embedding_vec(hs, 0.0f);
22
23 if (!embed_tokens_q4k.empty()) {
24 if (hs % GGML_QK_K != 0) {
25 Logger::error("Hidden size (" + std::to_string(hs) +
26 ") is not divisible by GGML_QK_K (" +
27 std::to_string(GGML_QK_K) + ") for Q4_K embedding lookup.");
28 return embedding_vec;
29 }
30
31 size_t blocks_per_row = hs / GGML_QK_K;
34
35 if (end_block_idx > embed_tokens_q4k.size()) {
37 "Calculated block index out of bounds for Q4_K embedding table. "
38 "Token: " +
39 std::to_string(token_id) +
40 ", StartBlock: " + std::to_string(start_block_idx) +
41 ", EndBlock: " + std::to_string(end_block_idx) +
42 ", TableSize: " + std::to_string(embed_tokens_q4k.size()));
43 return embedding_vec;
44 }
45
47 for (size_t block_n = 0; block_n < blocks_per_row; ++block_n) {
50
51 size_t dest_offset = block_n * GGML_QK_K;
52
53 size_t elements_to_copy = SAFE_MIN((size_t)GGML_QK_K, (size_t)(hs - dest_offset));
55 elements_to_copy * sizeof(float));
56 }
57 return embedding_vec;
58 }
59
60 else if (!embed_tokens_q8_0.empty()) {
61 if (hs % GGML_QK8_0 != 0) {
62 Logger::error("Hidden size (" + std::to_string(hs) +
63 ") is not divisible by GGML_QK8_0 (" +
64 std::to_string(GGML_QK8_0) +
65 ") for Q8_0 embedding lookup.");
66 return embedding_vec;
67 }
68 size_t blocks_per_row = hs / GGML_QK8_0;
71
72 if (end_block_idx > embed_tokens_q8_0.size()) {
74 "Calculated block index out of bounds for Q8_0 embedding table. "
75 "Token: " +
76 std::to_string(token_id) +
77 ", StartBlock: " + std::to_string(start_block_idx) +
78 ", EndBlock: " + std::to_string(end_block_idx) +
79 ", TableSize: " + std::to_string(embed_tokens_q8_0.size()));
80 return embedding_vec;
81 }
82
84
85 for (size_t block_n = 0; block_n < blocks_per_row; ++block_n) {
89 size_t elements_to_copy = SAFE_MIN(static_cast<size_t>(GGML_QK8_0), static_cast<size_t>(hs - dest_offset));
91 elements_to_copy * sizeof(float));
92
93 }
94
95 if (token_id < 2) {
96 float sum = 0.0f, min_val = embedding_vec[0], max_val = embedding_vec[0];
97 for (int i = 0; i < hs; ++i) {
99 min_val = std::min(min_val, embedding_vec[i]);
100 max_val = std::max(max_val, embedding_vec[i]);
101 }
102 Logger::info("[Q8_0_EMBED_FINAL] Token " + std::to_string(token_id) +
103 " embedding stats: sum=" + std::to_string(sum) +
104 ", mean=" + std::to_string(sum / hs) +
105 ", min=" + std::to_string(min_val) +
106 ", max=" + std::to_string(max_val) +
107 ", first_4=[" + std::to_string(embedding_vec[0]) +
108 ", " + std::to_string(embedding_vec[1]) +
109 ", " + std::to_string(embedding_vec[2]) +
110 ", " + std::to_string(embedding_vec[3]) + "]");
111 }
112 return embedding_vec;
113 }
114
115 else if (!embed_tokens_q6k.empty()) {
116 if (hs % GGML_QK_K != 0) {
117 Logger::error("Hidden size (" + std::to_string(hs) +
118 ") is not divisible by GGML_QK_K (" +
119 std::to_string(GGML_QK_K) + ") for Q6_K embedding lookup.");
120 return embedding_vec;
121 }
122 size_t blocks_per_row = hs / GGML_QK_K;
125
126 if (end_block_idx > embed_tokens_q6k.size()) {
128 "Calculated block index out of bounds for Q6_K embedding table. "
129 "Token: " +
130 std::to_string(token_id) +
131 ", StartBlock: " + std::to_string(start_block_idx) +
132 ", EndBlock: " + std::to_string(end_block_idx) +
133 ", TableSize: " + std::to_string(embed_tokens_q6k.size()));
134 return embedding_vec;
135 }
136
138 for (size_t block_n = 0; block_n < blocks_per_row; ++block_n) {
141 size_t dest_offset = block_n * GGML_QK_K;
142 size_t elements_to_copy = SAFE_MIN(static_cast<size_t>(GGML_QK_K), static_cast<size_t>(hs - dest_offset));
144 elements_to_copy * sizeof(float));
145 }
146 return embedding_vec;
147 }
148
149 else if (!embed_tokens_f32.empty()) {
150 size_t offset = (size_t)token_id * hs;
151 if (offset + hs > embed_tokens_f32.size()) {
152 Logger::error("Embedding offset out of bounds in F32 lookup for token: " +
153 std::to_string(token_id));
154 return embedding_vec;
155 }
156
157 std::copy(embed_tokens_f32.begin() + offset,
158 embed_tokens_f32.begin() + offset + hs, embedding_vec.begin());
159 return embedding_vec;
160
161 } else if (!embed_tokens.empty()) {
162 size_t offset = (size_t)token_id * hs;
163 if (offset + hs > embed_tokens.size()) {
165 "Embedding offset out of bounds in BF16 lookup for token: " +
166 std::to_string(token_id));
167 return embedding_vec;
168 }
169 std::vector<uint16_t> token_embedding_bf16(
170 embed_tokens.begin() + offset, embed_tokens.begin() + offset + hs);
171
173 return embedding_vec;
174
175 } else {
177 "No valid embedding table found (Q4_K, Q8_0, Q6_K, F32, BF16) for token: " +
178 std::to_string(token_id));
179
180 return embedding_vec;
181 }
182}
#define SAFE_MIN(a, b)

References bf16vec_to_float_vec(), config_, dequantize_q4_k_m(), dequantize_q6_k(), dequantize_q8_0_block(), embed_tokens, embed_tokens_f32, embed_tokens_q4k, embed_tokens_q6k, embed_tokens_q8_0, Logger::error(), GGML_QK8_0, GGML_QK_K, ModelConfig::hidden_size, Logger::info(), SAFE_MIN, and ModelConfig::vocab_size.
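The block arithmetic is the same for every quantized branch: a row of hidden_size values spans hidden_size / block_size consecutive blocks, and a token's row starts at token_id * blocks_per_row. Below is a condensed sketch for the Q8_0 case; it reuses the project's block helpers, but the wrapper function is ours.

#include <cstddef>
#include <vector>
#include "gguf_parser.h"   // assumed to provide block_q8_0, GGML_QK8_0, dequantize_q8_0_block

std::vector<float> lookup_q8_0_row(const std::vector<block_q8_0>& table,
                                   int token_id, int hidden_size) {
  std::vector<float> row(hidden_size, 0.0f);
  if (hidden_size % GGML_QK8_0 != 0) return row;                // mirrors the divisibility check
  const std::size_t blocks_per_row = hidden_size / GGML_QK8_0;
  const std::size_t start_block = (std::size_t)token_id * blocks_per_row;
  if (start_block + blocks_per_row > table.size()) return row;  // mirrors the bounds check
  for (std::size_t b = 0; b < blocks_per_row; ++b)              // dequantize block by block
    dequantize_q8_0_block(&table[start_block + b], row.data() + b * GGML_QK8_0);
  return row;
}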

◆ smart_gemm_batch_cuda()

void TinyLlamaModel::smart_gemm_batch_cuda ( bool  transa_user,
bool  transb_user,
int  m_user,
int  n_user,
int  k_user,
const float *  alpha_user,
const float *  A_f32_user,
int  lda_user,
const float *  B_f32_user,
int  ldb_user,
const float *  beta_user,
float *  C_f32_user,
int  ldc_user,
cudaStream_t  stream,
const char *  operation_name = "GEMM" 
)

Definition at line 2109 of file model.cpp.

2117 {
2118
2119 // Tensor Cores are most beneficial for larger batch sizes
2120 const int tensor_core_threshold = 4; // Use BF16 Tensor Cores when batch size >= 4
2122
2123 // Determine which BF16 weight to use based on operation name and weight pointer
2124 uint16_t* bf16_weight_ptr = nullptr;
2125
2126 if (use_tensor_cores) {
2127 Logger::info("[SMART_GEMM] Using BF16 Tensor Cores for " + std::string(operation_name) +
2128 " (batch_size=" + std::to_string(m_user) + " >= " + std::to_string(tensor_core_threshold) + ")");
2129
2130 // Ensure BF16 weights are loaded
2132
2133 // Map F32 weight pointer to corresponding BF16 weight pointer
2134 if (B_f32_user == w_q_f32_dev_) {
2136 } else if (B_f32_user == w_k_f32_dev_) {
2138 } else if (B_f32_user == w_v_f32_dev_) {
2140 } else if (B_f32_user == w_o_f32_dev_) {
2142 } else if (B_f32_user == w_gate_f32_dev_) {
2144 } else if (B_f32_user == w_up_f32_dev_) {
2146 } else if (B_f32_user == w_down_f32_dev_) {
2148 } else {
2149 // Map layer-specific pointers by calculating offset from base pointer
2150 size_t offset_bytes = 0;
2151 bool found = false;
2152
2153 // Check if it's a layer-specific pointer within the concatenated weights
2155 offset_bytes = (B_f32_user - w_q_f32_dev_) * sizeof(float);
2157 found = true;
2159 offset_bytes = (B_f32_user - w_k_f32_dev_) * sizeof(float);
2161 found = true;
2163 offset_bytes = (B_f32_user - w_v_f32_dev_) * sizeof(float);
2165 found = true;
2167 offset_bytes = (B_f32_user - w_o_f32_dev_) * sizeof(float);
2169 found = true;
2171 offset_bytes = (B_f32_user - w_gate_f32_dev_) * sizeof(float);
2173 found = true;
2175 offset_bytes = (B_f32_user - w_up_f32_dev_) * sizeof(float);
2177 found = true;
2179 offset_bytes = (B_f32_user - w_down_f32_dev_) * sizeof(float);
2181 found = true;
2182 }
2183
2184 if (!found) {
2185 // If we can't identify the weight, fall back to FP32
2186 Logger::warning("[SMART_GEMM] Unknown weight pointer for " + std::string(operation_name) +
2187 ", falling back to FP32");
2188 use_tensor_cores = false;
2189 }
2190 }
2191
2193 try {
2194 // Use mixed precision: FP32 input x BF16 weights = FP32 output
2199 Logger::info("[SMART_GEMM] Successfully used BF16 Tensor Cores for " + std::string(operation_name));
2200 return;
2201 } catch (const std::exception& e) {
2202 Logger::warning("[SMART_GEMM] BF16 Tensor Cores failed for " + std::string(operation_name) +
2203 ": " + e.what() + ". Falling back to FP32.");
2204 use_tensor_cores = false;
2205 }
2206 }
2207 }
2208
2209 // Fallback to standard FP32 GEMM
2210 Logger::info("[SMART_GEMM] Using FP32 GEMM for " + std::string(operation_name) +
2211 " (batch_size=" + std::to_string(m_user) + " < " + std::to_string(tensor_core_threshold) +
2212 " or Tensor Cores unavailable)");
2217}
void ensure_bf16_concatenated_weights_loaded()

References config_, ensure_bf16_concatenated_weights_loaded(), ModelConfig::hidden_size, Logger::info(), ModelConfig::intermediate_size, ModelConfig::num_attention_heads, ModelConfig::num_hidden_layers, ModelConfig::num_key_value_heads, use_bf16_tensor_cores_, and Logger::warning().
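The dispatch decision reduces to a simple heuristic: BF16 Tensor Cores pay off only when enough rows are batched, otherwise the FP32 path avoids the conversion overhead. A one-line sketch of that rule follows; the threshold of 4 matches the constant in the listing, while the helper name is ours.

// Hedged sketch of the smart_gemm_batch_cuda dispatch rule.
inline bool should_use_bf16_tensor_cores(bool gpu_supports_bf16_tc, int batch_rows) {
  const int kTensorCoreThreshold = 4;   // below this, FP32 GEMM is cheaper overall
  return gpu_supports_bf16_tc && batch_rows >= kTensorCoreThreshold;
}

// Usage: if should_use_bf16_tensor_cores(use_bf16_tensor_cores_, m_user) is false,
// the call falls straight through to the standard FP32 GEMM path.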

Friends And Related Symbol Documentation

◆ CPUBatchProcessor

Definition at line 477 of file model.h.

◆ map_gguf_weights

void map_gguf_weights ( const GGUFData &  gguf,
TinyLlamaModel &  model 
)
friend

Referenced by initialize_weights().

Member Data Documentation

◆ config_

ModelConfig TinyLlamaModel::config_
private

◆ cpu_batch_processor_

std::unique_ptr<class CPUBatchProcessor> TinyLlamaModel::cpu_batch_processor_
private

Definition at line 560 of file model.h.

Referenced by forward_cpu_batch().

◆ embed_tokens

std::vector<uint16_t> TinyLlamaModel::embed_tokens
private

◆ embed_tokens_f32

std::vector<float> TinyLlamaModel::embed_tokens_f32
private

◆ embed_tokens_q4k

std::vector<block_q4_K> TinyLlamaModel::embed_tokens_q4k
private

◆ embed_tokens_q6k

std::vector<block_q6_K> TinyLlamaModel::embed_tokens_q6k
private

◆ embed_tokens_q8_0

std::vector<block_q8_0> TinyLlamaModel::embed_tokens_q8_0
private

◆ embed_tokens_q8k

std::vector<block_q8_K> TinyLlamaModel::embed_tokens_q8k
private

Definition at line 490 of file model.h.

Referenced by ensure_embed_tokens_dequantized(), and initialize_weights().

◆ f32_concatenated_weights_loaded_

bool TinyLlamaModel::f32_concatenated_weights_loaded_ = false
private

Definition at line 558 of file model.h.

◆ final_norm

std::vector<uint16_t> TinyLlamaModel::final_norm
private

◆ final_norm_f32

std::vector<float> TinyLlamaModel::final_norm_f32
private

◆ final_norm_q4k

std::vector<block_q4_K> TinyLlamaModel::final_norm_q4k
private

Definition at line 487 of file model.h.

◆ final_norm_q6k

std::vector<block_q6_K> TinyLlamaModel::final_norm_q6k
private

Definition at line 488 of file model.h.

◆ gguf_data_

std::unique_ptr<GGUFData> TinyLlamaModel::gguf_data_
private

◆ layers

std::vector<LayerWeights> TinyLlamaModel::layers
private

◆ lm_head

std::vector<uint16_t> TinyLlamaModel::lm_head
private

◆ lm_head_f32

std::vector<float> TinyLlamaModel::lm_head_f32
private

◆ lm_head_q4k

std::vector<block_q4_K> TinyLlamaModel::lm_head_q4k
private

◆ lm_head_q6k

std::vector<block_q6_K> TinyLlamaModel::lm_head_q6k
private

◆ lm_head_q8_0

std::vector<block_q8_0> TinyLlamaModel::lm_head_q8_0
private

◆ lm_head_q8k

std::vector<block_q8_K> TinyLlamaModel::lm_head_q8k
private

Definition at line 490 of file model.h.

Referenced by ensure_lm_head_dequantized(), and forward().

◆ model_path_

std::string TinyLlamaModel::model_path_
private

Definition at line 557 of file model.h.

Referenced by TinyLlamaModel(), and TinyLlamaModel().

◆ precomputed_freqs_cis_

std::vector<std::pair<float, float> > TinyLlamaModel::precomputed_freqs_cis_
private

◆ use_bf16_tensor_cores_

bool TinyLlamaModel::use_bf16_tensor_cores_ = false
private

Definition at line 481 of file model.h.

Referenced by initialize_gpu_and_rope(), and smart_gemm_batch_cuda().


The documentation for this class was generated from the following files: