TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
Loading...
Searching...
No Matches
model.h
Go to the documentation of this file.
1#ifndef MODEL_H
2#define MODEL_H
3
4#include <cstdint>
5#include <functional>
6#include <nlohmann/json.hpp>
7#include <string>
8#include <unordered_map>
9#include <vector>
10
11#include "safetensors_loader.h"
12#ifdef HAS_CUDA
13// Use safe headers only for Windows CUDA 12.1+ workaround, normal headers everywhere else
14#if defined(WINDOWS_CUDA_12_1_WORKAROUND) && defined(_WIN32)
15#include "cuda_safe_headers.h"
16#else
17// Normal CUDA header inclusion for non-problematic platforms (Ubuntu, etc.)
18#include <cuda_runtime.h>
19#include <cublas_v2.h>
20#include <cuda_fp16.h>
21#include <cuda_bf16.h>
22#endif
23
24#include "cuda_kernels.h"
25#endif
26#include <memory>
27
28#include "quantization.h"
29
/// Enumeration of tensor names used in the TinyLlama model.
enum class TensorName {
  Q_PROJ,
  K_PROJ,
  V_PROJ,
  O_PROJ,
  GATE_PROJ,
  UP_PROJ,
  DOWN_PROJ,
  TOKEN_EMBD,  // Restored: tensor_name_to_string() reports it between DOWN_PROJ and LM_HEAD.
  LM_HEAD,
  UNKNOWN
};

/// Returns the spelling of a TensorName enumerator.
///
/// @param tn Tensor identifier to convert.
/// @return The enumerator's name; "UNKNOWN" for TensorName::UNKNOWN and for
///         any value that has no dedicated case.
static std::string tensor_name_to_string(TensorName tn) {
  switch (tn) {
    case TensorName::Q_PROJ:
      return "Q_PROJ";
    case TensorName::K_PROJ:
      return "K_PROJ";
    case TensorName::V_PROJ:
      return "V_PROJ";
    case TensorName::O_PROJ:
      return "O_PROJ";
    case TensorName::GATE_PROJ:
      return "GATE_PROJ";
    case TensorName::UP_PROJ:
      return "UP_PROJ";
    case TensorName::DOWN_PROJ:
      return "DOWN_PROJ";
    case TensorName::TOKEN_EMBD:
      return "TOKEN_EMBD";
    case TensorName::LM_HEAD:
      return "LM_HEAD";
    default:
      // Covers TensorName::UNKNOWN and out-of-range values.
      return "UNKNOWN";
  }
}
73
89 float rope_theta;
90 std::string hidden_act;
91 std::string torch_dtype;
94 int unk_token_id = -1;
95 int pad_token_id = -1;
96 std::string architecture;
97 std::string model_name;
98 std::string chat_template_type;
99 std::string pre_tokenizer_type;
102 bool use_mmap_for_gguf = true; // Whether to use mmap for GGUF files, defaults to true
106 // Memory management: Enable layer-wise weight eviction to prevent OOM
110 bool use_optimized_cuda_kernels = true; // Re-enabled: fixed performance issues with simpler implementations
111
112 enum class TokenizerFamily {
113 UNKNOWN,
114 LLAMA_SENTENCEPIECE, // For Llama 2 and similar SentencePiece BPE
115 LLAMA3_TIKTOKEN // For Llama 3's Tiktoken-based BPE
116 };
118};
119
120struct GGUFData;
121struct ModelConfig;
123
131 std::vector<float> k; // Key cache (CPU)
132 std::vector<float> v; // Value cache (CPU)
133#ifdef HAS_CUDA
134 float* k_dev_fp32 = nullptr; // Original FP32 Key cache (GPU device pointer)
135 float* v_dev_fp32 = nullptr; // Original FP32 Value cache (GPU device pointer)
136
137 int8_t* k_dev_quantized = nullptr; // Quantized INT8 Key cache (GPU device pointer)
138 int8_t* v_dev_quantized = nullptr; // Quantized INT8 Value cache (GPU device pointer)
139 float* k_dev_scales = nullptr; // Scales for K cache (GPU device pointer)
140 float* v_dev_scales = nullptr; // Scales for V cache (GPU device pointer)
141#endif
142};
143
151struct KVCache {
152 std::vector<KVCacheLayer> layers;
154 // Single-sequence mode (legacy compatibility)
155 int seq_len = 0;
157 // Multi-sequence mode (new batch functionality)
158 std::vector<int> batch_seq_lens;
175 void initialize(const ModelConfig& config,
176 int total_num_model_layers, int num_gpu_layers_to_allocate,
177 int max_seq_len_arg, int num_kv_heads, int head_dim,
178 int max_batch_size_arg = 1);
179
180 void clear_data() {
181 // Single-sequence mode (legacy compatibility)
182 seq_len = 0;
183
184 // Multi-sequence mode
186 batch_seq_lens.clear();
187
188 // For batch processing, we MUST clear the actual KV data to prevent cross-sequence contamination
189 for (auto& layer : layers) {
190 std::fill(layer.k.begin(), layer.k.end(), 0.0f);
191 std::fill(layer.v.begin(), layer.v.end(), 0.0f);
192 }
193
194 // Logger::debug("[KVCache] clear_data() called. seq_len reset to 0. K/V vectors cleared for batch processing.");
195 }
196
201 void initialize_batch(int batch_size) {
202 if (batch_size > max_batch_size) {
203 Logger::warning("Requested batch size " + std::to_string(batch_size) +
204 " exceeds max batch size " + std::to_string(max_batch_size) +
205 ". Using max batch size.");
206 batch_size = max_batch_size;
207 }
208 current_batch_size = batch_size;
209 batch_seq_lens.resize(batch_size, 0);
210 }
211
212 void destroy_gpu_resources(); // Implementation moved to kv_cache.cpp
213
214#ifdef HAS_CUDA
215 int allocated_num_layers = 0;
216 int allocated_max_seq_len = 0;
217 int allocated_num_kv_heads = 0;
218 int allocated_head_dim = 0;
220 ~KVCache() {
222 }
223#else
227#endif
228};
229
230using ForwardDiagCallback = std::function<void(
231 int layer, const std::string& name, const std::vector<float>& v)>;
232
239 std::vector<uint16_t> input_layernorm;
240 std::vector<uint16_t> post_attention_layernorm;
241
242 std::vector<uint16_t> q_proj;
243 std::vector<uint16_t> k_proj;
244 std::vector<uint16_t> v_proj;
245 std::vector<uint16_t> o_proj;
246
247 std::vector<uint16_t> gate_proj;
248 std::vector<uint16_t> up_proj;
249 std::vector<uint16_t> down_proj;
250
251 std::vector<float> input_layernorm_f32;
255 std::vector<block_q4_K> q_proj_q4k, k_proj_q4k, v_proj_q4k, o_proj_q4k;
256 std::vector<block_q4_K> gate_proj_q4k, up_proj_q4k, down_proj_q4k;
257 std::vector<block_q6_K> q_proj_q6k, k_proj_q6k, v_proj_q6k, o_proj_q6k;
258 std::vector<block_q6_K> gate_proj_q6k, up_proj_q6k, down_proj_q6k;
259 std::vector<block_q8_0> q_proj_q8_0, k_proj_q8_0, v_proj_q8_0, o_proj_q8_0;
260 std::vector<block_q8_0> gate_proj_q8_0, up_proj_q8_0, down_proj_q8_0;
261 std::vector<block_q8_K> q_proj_q8k, k_proj_q8k, v_proj_q8k, o_proj_q8k;
262 std::vector<block_q8_K> gate_proj_q8k, up_proj_q8k, down_proj_q8k;
263
264#ifdef HAS_CUDA
265
266 float* input_layernorm_dev = nullptr;
267 float* post_attention_layernorm_dev = nullptr;
268
269 // Individual layer device pointers for JIT weight loading
270 float* q_proj_f32_dev = nullptr;
271 float* k_proj_f32_dev = nullptr;
272 float* v_proj_f32_dev = nullptr;
273 float* o_proj_f32_dev = nullptr;
274 float* gate_proj_f32_dev = nullptr;
275 float* up_proj_f32_dev = nullptr;
276 float* down_proj_f32_dev = nullptr;
277#endif
278};
279
286 public:
292 TinyLlamaModel(const ModelConfig& config, const SafeTensorsLoader& loader);
293
299 TinyLlamaModel(const ModelConfig& initial_config, const std::string& model_path);
300
306 TinyLlamaModel(const ModelConfig& config_from_session,
307 std::unique_ptr<GGUFData> gguf_data_from_session);
308
313
322 std::vector<float> forward(
323 std::vector<float>& input,
324 int n_tokens, KVCache* kv_cache,
325 const std::vector<int>* attention_mask);
326
327 void ensure_q_proj_dequantized(int layer_idx);
328 void ensure_k_proj_dequantized(int layer_idx);
329 void ensure_v_proj_dequantized(int layer_idx);
330 void ensure_o_proj_dequantized(int layer_idx);
331 void ensure_gate_proj_dequantized(int layer_idx);
332 void ensure_up_proj_dequantized(int layer_idx);
333 void ensure_down_proj_dequantized(int layer_idx);
337 void ensure_layer_weights_on_gpu(int layer_idx);
338 void free_layer_gpu_weights(int layer_idx);
339 void clear_layer_dequantized_weights(int layer_idx);
341
342 // BF16 Tensor Core weight management
345
346 // Smart GEMM wrapper that chooses between BF16 Tensor Cores and FP32 based on batch size
347 void smart_gemm_batch_cuda(bool transa_user, bool transb_user,
348 int m_user, int n_user, int k_user,
349 const float* alpha_user,
350 const float* A_f32_user, int lda_user,
351 const float* B_f32_user, int ldb_user,
352 const float* beta_user,
353 float* C_f32_user, int ldc_user,
354 cudaStream_t stream,
355 const char* operation_name = "GEMM");
356
357#ifdef HAS_CUDA
367 std::vector<float> forward_device(
368 float* x_input_dev,
369 int pos,
370 KVCache* cache,
371 const std::vector<int>* attention_mask = nullptr,
372 cudaStream_t stream = 0);
373
374 float* get_x_dev() { return x_dev_; }
375
376 void forward_device(int token_id, int pos, KVCache* kv_cache,
377 cudaStream_t stream = 0);
378 void forward_device_token(int token_id, int pos, KVCache* kv_cache, cudaStream_t stream = 0);
379
380 std::vector<float> forward_device_batch_prefill(
381 float* d_batch_input_hidden_states, // Device pointer to [num_tokens_in_batch, config_.hidden_size]
382 int num_tokens_in_batch,
383 int start_pos_in_kv_cache, // Typically 0 for prefill
384 KVCache* kv_cache,
385 cudaStream_t stream
386 );
387
388 std::vector<std::vector<float>> forward_device_batch_generation(
389 float* d_batch_input_hidden_states, // Device pointer to [num_tokens_in_batch, config_.hidden_size]
390 const std::vector<int>& token_positions, // Position of each token in its respective sequence
391 const std::vector<int>& original_sequence_indices, // Original sequence index for each token
392 int num_tokens_in_batch,
393 KVCache* kv_cache,
394 cudaStream_t stream
395 );
396
397 // Memory management for layer-wise weight eviction
398
399 // GPU workspace buffers
400
401 // Persistent batch processing buffers to eliminate per-forward-pass allocations
402 static constexpr int MAX_BATCH_TOKENS = 2048; // Maximum tokens we can process in one batch
403
404 // Persistent GPU buffers for batch processing (allocated once, reused)
405 float* d_persistent_batch_input_ = nullptr; // [MAX_BATCH_TOKENS, hidden_size]
406 float* d_persistent_batch_norm_out_ = nullptr; // [MAX_BATCH_TOKENS, hidden_size]
407 float* d_persistent_batch_residual_ = nullptr; // [MAX_BATCH_TOKENS, hidden_size]
408 float* d_persistent_q_batch_ = nullptr; // [MAX_BATCH_TOKENS, hidden_size]
409 float* d_persistent_k_batch_ = nullptr; // [MAX_BATCH_TOKENS, n_kv_heads * head_dim]
410 float* d_persistent_v_batch_ = nullptr; // [MAX_BATCH_TOKENS, n_kv_heads * head_dim]
411 float* d_persistent_attn_output_ = nullptr; // [MAX_BATCH_TOKENS, hidden_size]
412 float* d_persistent_attn_proj_out_ = nullptr; // [MAX_BATCH_TOKENS, hidden_size]
413 float* d_persistent_gate_proj_out_ = nullptr; // [MAX_BATCH_TOKENS, intermediate_size]
414 float* d_persistent_up_proj_out_ = nullptr; // [MAX_BATCH_TOKENS, intermediate_size]
415 float* d_persistent_swiglu_out_ = nullptr; // [MAX_BATCH_TOKENS, intermediate_size]
416 float* d_persistent_mlp_down_out_ = nullptr; // [MAX_BATCH_TOKENS, hidden_size]
417
418 // Buffer management functions
419 void allocate_persistent_batch_buffers();
420 void free_persistent_batch_buffers();
421 void resize_persistent_batch_buffers_if_needed(int required_batch_size);
422
423#endif // HAS_CUDA
424
425 const ModelConfig& get_config() const { return config_; }
426
427 const std::vector<uint16_t>& get_lm_head() const { return lm_head; }
428
429 const std::vector<uint16_t>& get_embed_tokens() const { return embed_tokens; }
430
431 std::vector<LayerWeights>& get_layers() { return layers; }
432
438 std::vector<float> lookup_embedding(int token_id);
439
444 int get_vocab_size() const;
445
446 const GGUFData* get_gguf_data() const {
447 return gguf_data_ ? gguf_data_.get() : nullptr;
448 }
449
451
453
454 std::vector<float> forward_cpu_batch(
455 const std::vector<float>& batch_input_activations,
456 int num_tokens_in_batch,
457 int num_cpu_layers_to_process,
458 int start_pos_in_sequence,
459 KVCache* kv_cache,
460 const std::vector<int>& prompt_lengths = {}
461 );
462
463 std::vector<float> forward_cpu_logits_batch(
464 const std::vector<float>& final_batch_activations,
465 int num_tokens_in_batch
466 );
467
468 std::vector<std::vector<float>> forward_cpu_batch_generation(
469 const std::vector<float>& batch_input_activations,
470 const std::vector<int>& token_positions,
471 const std::vector<int>& original_sequence_indices,
472 int num_tokens_in_batch,
473 KVCache* kv_cache
474 );
475
476 friend void map_gguf_weights(const GGUFData& gguf, TinyLlamaModel& model);
477 friend class CPUBatchProcessor;
478
479 private:
482
483 std::vector<uint16_t> embed_tokens;
484 std::vector<uint16_t> lm_head;
485 std::vector<uint16_t> final_norm;
487 std::vector<block_q4_K> embed_tokens_q4k, lm_head_q4k, final_norm_q4k;
488 std::vector<block_q6_K> embed_tokens_q6k, lm_head_q6k, final_norm_q6k;
489 std::vector<block_q8_0> embed_tokens_q8_0, lm_head_q8_0;
490 std::vector<block_q8_K> embed_tokens_q8k, lm_head_q8k;
491 std::vector<LayerWeights> layers;
492
493#ifdef HAS_CUDA
494 float* final_norm_dev = nullptr;
495 float* all_freqs_cis_dev = nullptr;
496 uint16_t* token_embedding_table_dev_ = nullptr;
497 uint16_t* w_q_dev_ = nullptr;
498 uint16_t* w_k_dev_ = nullptr;
499 uint16_t* w_v_dev_ = nullptr;
500 uint16_t* w_o_dev_ = nullptr;
501 uint16_t* w_gate_dev_ = nullptr;
502 uint16_t* w_up_dev_ = nullptr;
503 uint16_t* w_down_dev_ = nullptr;
504 uint16_t* lm_head_dev_ = nullptr;
505 float* token_embedding_table_f32_dev_ = nullptr;
506 float* w_q_f32_dev_ = nullptr;
507 float* w_k_f32_dev_ = nullptr;
508 float* w_v_f32_dev_ = nullptr;
509 float* w_o_f32_dev_ = nullptr;
510 float* w_gate_f32_dev_ = nullptr;
511 float* w_up_f32_dev_ = nullptr;
512 float* w_down_f32_dev_ = nullptr;
513 float* lm_head_f32_dev_ = nullptr;
514 cublasHandle_t cublas_handle_ = nullptr;
515
516 float* x_dev_ = nullptr;
517 float* x_norm_dev_ = nullptr;
518 float* x_resid1_dev_ = nullptr;
519 float* x_resid2_dev_ = nullptr;
520 float* q_dev_ = nullptr;
521 float* k_dev_ = nullptr;
522 float* v_dev_ = nullptr;
523 float* attn_out_dev_ = nullptr;
524 float* attn_proj_dev_ = nullptr;
525 float* gate_vec_dev_ = nullptr;
526 float* up_vec_dev_ = nullptr;
527 float* swiglu_vec_dev_ = nullptr;
528 float* mlp_down_dev_ = nullptr;
529 float* logits_dev_ = nullptr;
530
531 // Temporary buffers for KVCache dequantization
532 float* dequant_k_cache_buffer_dev_ = nullptr; // For KVCache dequantization (full cache size)
533 float* dequant_v_cache_buffer_dev_ = nullptr; // For KVCache dequantization (full cache size)
534
535 // Selective KVCache dequantization buffers (much smaller - only per head per token)
536 float* selective_k_dequant_buffer_dev_ = nullptr; // Small buffer for selective K dequantization
537 float* selective_v_dequant_buffer_dev_ = nullptr; // Small buffer for selective V dequantization
538 size_t selective_dequant_buffer_size_ = 0; // Size of selective buffers in elements
539
540 // GPU workspace buffers
541
542 // BF16 weight device pointers for Tensor Core acceleration
543 uint16_t* w_q_bf16_dev_ = nullptr;
544 uint16_t* w_k_bf16_dev_ = nullptr;
545 uint16_t* w_v_bf16_dev_ = nullptr;
546 uint16_t* w_o_bf16_dev_ = nullptr;
547 uint16_t* w_gate_bf16_dev_ = nullptr;
548 uint16_t* w_up_bf16_dev_ = nullptr;
549 uint16_t* w_down_bf16_dev_ = nullptr;
550 bool bf16_concatenated_weights_loaded_ = false;
551
552#endif
553
554 std::vector<std::pair<float, float>> precomputed_freqs_cis_;
555
556 std::unique_ptr<GGUFData> gguf_data_;
557 std::string model_path_;
559
560 std::unique_ptr<class CPUBatchProcessor> cpu_batch_processor_;
561
562 void initialize_weights(const SafeTensorsLoader* loader,
563 const GGUFData* gguf);
564
565};
566
567ModelConfig parse_model_config(const nlohmann::json& json);
568
569int argmax(const std::vector<float>& v);
570
571float bfloat16_to_float32(uint16_t b16);
572
573void rmsnorm(const std::vector<float>& x, const std::vector<uint16_t>& weight,
574 float eps, std::vector<float>& out);
575
576void matvec_bf16_f32(const std::vector<uint16_t>& mat,
577 const std::vector<float>& vec, std::vector<float>& out,
578 int M, int N);
579
580void softmax(std::vector<float>& x);
581
582struct KVCache;
583
584float bfloat16_to_float32(uint16_t b16);
585std::vector<uint16_t> uint8_vector_to_uint16_vector(
586 const std::vector<uint8_t>& bytes, size_t numel);
587
588void log_vector_summary(const std::string& name, const std::vector<float>& v,
589 int head_count = 5);
590
591void log_vector_summary_batch(const std::string& name, const std::vector<float>& batch_vector,
592 int num_tokens_in_batch, int single_token_vector_size,
593 int head_count = 5);
594
595#endif
static void warning(const std::string &message)
Definition logger.cpp:139
Main class for loading tensors from SafeTensors format files (single or sharded)
Main transformer model class for TinyLlama.
Definition model.h:285
const GGUFData * get_gguf_data() const
Definition model.h:446
bool use_bf16_tensor_cores_
Definition model.h:481
void free_layer_gpu_weights(int layer_idx)
~TinyLlamaModel()
Destructor. Cleans up all allocated resources.
Definition model.cpp:330
bool f32_concatenated_weights_loaded_
Definition model.h:558
std::vector< LayerWeights > & get_layers()
Definition model.h:431
std::vector< block_q6_K > embed_tokens_q6k
Definition model.h:488
std::vector< float > lookup_embedding(int token_id)
Lookup the embedding vector for a given token ID.
const ModelConfig & get_config() const
Definition model.h:425
void ensure_up_proj_dequantized(int layer_idx)
std::vector< float > final_norm_f32
Definition model.h:486
void free_bf16_concatenated_weights()
std::vector< block_q4_K > final_norm_q4k
Definition model.h:487
void initialize_rope_freqs()
std::vector< uint16_t > final_norm
Definition model.h:485
int get_vocab_size() const
Get the vocabulary size for the model.
void ensure_v_proj_dequantized(int layer_idx)
std::vector< block_q6_K > final_norm_q6k
Definition model.h:488
std::vector< block_q4_K > lm_head_q4k
Definition model.h:487
std::vector< float > forward_cpu_logits_batch(const std::vector< float > &final_batch_activations, int num_tokens_in_batch)
Definition model.cpp:1063
friend void map_gguf_weights(const GGUFData &gguf, TinyLlamaModel &model)
std::vector< block_q6_K > lm_head_q6k
Definition model.h:488
void ensure_layer_weights_on_gpu(int layer_idx)
const std::vector< uint16_t > & get_embed_tokens() const
Definition model.h:429
std::vector< std::pair< float, float > > precomputed_freqs_cis_
Definition model.h:554
std::string model_path_
Definition model.h:557
void ensure_embed_tokens_dequantized()
std::vector< LayerWeights > layers
Definition model.h:491
std::vector< block_q8_0 > embed_tokens_q8_0
Definition model.h:489
ModelConfig config_
Definition model.h:480
void ensure_o_proj_dequantized(int layer_idx)
void clear_layer_dequantized_weights(int layer_idx)
std::vector< block_q4_K > embed_tokens_q4k
Definition model.h:487
void smart_gemm_batch_cuda(bool transa_user, bool transb_user, int m_user, int n_user, int k_user, const float *alpha_user, const float *A_f32_user, int lda_user, const float *B_f32_user, int ldb_user, const float *beta_user, float *C_f32_user, int ldc_user, cudaStream_t stream, const char *operation_name="GEMM")
Definition model.cpp:2109
void ensure_k_proj_dequantized(int layer_idx)
const std::vector< uint16_t > & get_lm_head() const
Definition model.h:427
std::unique_ptr< class CPUBatchProcessor > cpu_batch_processor_
Definition model.h:560
std::vector< block_q8_0 > lm_head_q8_0
Definition model.h:489
std::vector< uint16_t > lm_head
Definition model.h:484
void ensure_f32_concatenated_weights_loaded()
std::vector< std::vector< float > > forward_cpu_batch_generation(const std::vector< float > &batch_input_activations, const std::vector< int > &token_positions, const std::vector< int > &original_sequence_indices, int num_tokens_in_batch, KVCache *kv_cache)
Definition model.cpp:1127
std::vector< uint16_t > embed_tokens
Definition model.h:483
std::vector< block_q8_K > embed_tokens_q8k
Definition model.h:490
void ensure_bf16_concatenated_weights_loaded()
void ensure_q_proj_dequantized(int layer_idx)
void initialize_weights(const SafeTensorsLoader *loader, const GGUFData *gguf)
Definition model.cpp:38
std::vector< float > forward_cpu_batch(const std::vector< float > &batch_input_activations, int num_tokens_in_batch, int num_cpu_layers_to_process, int start_pos_in_sequence, KVCache *kv_cache, const std::vector< int > &prompt_lengths={})
Definition model.cpp:2086
void ensure_down_proj_dequantized(int layer_idx)
void ensure_gate_proj_dequantized(int layer_idx)
GGUFData * get_gguf_data_ptr()
Definition model.h:450
std::vector< float > embed_tokens_f32
Definition model.h:486
std::vector< float > forward(std::vector< float > &input, int n_tokens, KVCache *kv_cache, const std::vector< int > *attention_mask)
Run the forward pass for the model on CPU layers.
Definition model.cpp:536
void ensure_lm_head_dequantized()
std::unique_ptr< GGUFData > gguf_data_
Definition model.h:556
std::vector< float > lm_head_f32
Definition model.h:486
std::vector< block_q8_K > lm_head_q8k
Definition model.h:490
Safe CUDA header inclusion wrapper for Windows CUDA 12.1+ compatibility.
static std::string tensor_name_to_string(TensorName tn)
Definition model.h:49
TensorName
Enumeration of tensor names used in the TinyLlama model.
Definition model.h:36
std::vector< uint16_t > uint8_vector_to_uint16_vector(const std::vector< uint8_t > &bytes, size_t numel)
Definition utils.cpp:176
void rmsnorm(const std::vector< float > &x, const std::vector< uint16_t > &weight, float eps, std::vector< float > &out)
ModelConfig parse_model_config(const nlohmann::json &json)
ModelConfig parse_model_config_from_gguf(const GGUFData &gguf)
void log_vector_summary(const std::string &name, const std::vector< float > &v, int head_count=5)
Definition utils.cpp:207
void log_vector_summary_batch(const std::string &name, const std::vector< float > &batch_vector, int num_tokens_in_batch, int single_token_vector_size, int head_count=5)
void matvec_bf16_f32(const std::vector< uint16_t > &mat, const std::vector< float > &vec, std::vector< float > &out, int M, int N)
int argmax(const std::vector< float > &v)
Definition utils.cpp:185
float bfloat16_to_float32(uint16_t b16)
Definition utils.cpp:144
void softmax(std::vector< float > &x)
std::function< void(int layer, const std::string &name, const std::vector< float > &v)> ForwardDiagCallback
Definition model.h:231
Weight quantization structures and functions for model compression.
SafeTensors format loader for efficient tensor loading, supporting single and sharded models.
nlohmann::json json
Definition server.cpp:54
Complete representation of a GGUF file's contents.
Key-Value cache for a single transformer layer.
Definition model.h:130
std::vector< float > v
Definition model.h:132
std::vector< float > k
Definition model.h:131
Complete Key-Value cache for all transformer layers.
Definition model.h:151
int max_batch_size
Definition model.h:159
void initialize_batch(int batch_size)
Initialize batch mode with specified number of sequences.
Definition model.h:201
int max_seq_len_config_
Definition model.h:163
void initialize(const ModelConfig &config, int total_num_model_layers, int num_gpu_layers_to_allocate, int max_seq_len_arg, int num_kv_heads, int head_dim, int max_batch_size_arg=1)
Initializes the KV cache with given dimensions.
Definition kv_cache.cpp:10
int total_model_layers_
Definition model.h:162
void clear_data()
Definition model.h:180
std::vector< KVCacheLayer > layers
Definition model.h:152
~KVCache()
Definition model.h:224
int seq_len
Definition model.h:155
void destroy_gpu_resources()
Definition kv_cache.cpp:217
std::vector< int > batch_seq_lens
Definition model.h:158
int current_batch_size
Definition model.h:160
Structure holding all weights for a single transformer layer.
Definition model.h:238
std::vector< uint16_t > post_attention_layernorm
Definition model.h:240
std::vector< block_q6_K > down_proj_q6k
Definition model.h:258
std::vector< block_q4_K > k_proj_q4k
Definition model.h:255
std::vector< block_q6_K > k_proj_q6k
Definition model.h:257
std::vector< float > input_layernorm_f32
Definition model.h:251
std::vector< block_q8_K > o_proj_q8k
Definition model.h:261
std::vector< uint16_t > gate_proj
Definition model.h:247
std::vector< uint16_t > v_proj
Definition model.h:244
std::vector< uint16_t > input_layernorm
Definition model.h:239
std::vector< block_q4_K > v_proj_q4k
Definition model.h:255
std::vector< block_q4_K > up_proj_q4k
Definition model.h:256
std::vector< block_q8_0 > o_proj_q8_0
Definition model.h:259
std::vector< float > up_proj_f32
Definition model.h:254
std::vector< uint16_t > o_proj
Definition model.h:245
std::vector< block_q8_K > down_proj_q8k
Definition model.h:262
std::vector< block_q4_K > down_proj_q4k
Definition model.h:256
std::vector< block_q4_K > gate_proj_q4k
Definition model.h:256
std::vector< float > v_proj_f32
Definition model.h:253
std::vector< block_q6_K > v_proj_q6k
Definition model.h:257
std::vector< block_q8_K > up_proj_q8k
Definition model.h:262
std::vector< block_q6_K > up_proj_q6k
Definition model.h:258
std::vector< block_q8_0 > v_proj_q8_0
Definition model.h:259
std::vector< float > k_proj_f32
Definition model.h:253
std::vector< block_q8_K > v_proj_q8k
Definition model.h:261
std::vector< block_q8_0 > gate_proj_q8_0
Definition model.h:260
std::vector< block_q6_K > q_proj_q6k
Definition model.h:257
std::vector< block_q8_K > k_proj_q8k
Definition model.h:261
std::vector< block_q6_K > gate_proj_q6k
Definition model.h:258
std::vector< block_q8_K > gate_proj_q8k
Definition model.h:262
std::vector< float > gate_proj_f32
Definition model.h:254
std::vector< float > o_proj_f32
Definition model.h:253
std::vector< uint16_t > down_proj
Definition model.h:249
std::vector< block_q8_0 > q_proj_q8_0
Definition model.h:259
std::vector< block_q8_0 > k_proj_q8_0
Definition model.h:259
std::vector< uint16_t > up_proj
Definition model.h:248
std::vector< block_q4_K > o_proj_q4k
Definition model.h:255
std::vector< float > q_proj_f32
Definition model.h:253
std::vector< uint16_t > q_proj
Definition model.h:242
std::vector< block_q8_K > q_proj_q8k
Definition model.h:261
std::vector< float > post_attention_layernorm_f32
Definition model.h:252
std::vector< float > down_proj_f32
Definition model.h:254
std::vector< block_q6_K > o_proj_q6k
Definition model.h:257
std::vector< block_q8_0 > down_proj_q8_0
Definition model.h:260
std::vector< block_q8_0 > up_proj_q8_0
Definition model.h:260
std::vector< block_q4_K > q_proj_q4k
Definition model.h:255
std::vector< uint16_t > k_proj
Definition model.h:243
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
int hidden_size
Definition model.h:81
int vocab_size
Definition model.h:86
int pad_token_id
Definition model.h:95
std::string chat_template_string
Definition model.h:100
std::string pre_tokenizer_type
Definition model.h:99
std::string architecture
Definition model.h:96
std::string model_name
Definition model.h:97
float rms_norm_eps
Definition model.h:88
int num_attention_heads
Definition model.h:83
std::string chat_template_type
Definition model.h:98
bool use_mmap_for_gguf
Definition model.h:102
int intermediate_size
Definition model.h:82
int eos_token_id
Definition model.h:93
int num_cpu_offload_layers
Definition model.h:104
bool enable_memory_efficient_layers
Definition model.h:107
std::string torch_dtype
Definition model.h:91
bool is_gguf_file_loaded
Definition model.h:101
bool use_kvcache_quantization
Definition model.h:103
float rope_theta
Definition model.h:89
int num_hidden_layers
Definition model.h:85
bool use_optimized_cuda_kernels
Definition model.h:110
int num_key_value_heads
Definition model.h:84
TokenizerFamily
Definition model.h:112
bool enable_prefill_chunking
Definition model.h:109
int bos_token_id
Definition model.h:92
std::string hidden_act
Definition model.h:90
TokenizerFamily tokenizer_family
Definition model.h:117
int unk_token_id
Definition model.h:94
int max_position_embeddings
Definition model.h:87