} else if (!this->lm_head.empty()) {
if (!lw.q_proj_f32.empty()) return;
Logger::info("[DEQUANT_MEM] Layer " + std::to_string(layer_idx) +
             ": Q-proj dequantization completed, f32 size=" + std::to_string(lw.q_proj_f32.size()));
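
The guard and the log line above come from the ensure_q_proj_dequantized path: the F32 copy is materialized only on first use and then cached in the layer. Below is a minimal, self-contained sketch of that dequantize-on-first-use pattern; the Q8Block type and dequantize_q8 helper are illustrative stand-ins, not the codebase's block_q8_0 or its dequantizer.

#include <cstddef>
#include <vector>

// Illustrative stand-ins for a quantized block and its dequantizer.
struct Q8Block { float scale; signed char qs[32]; };

static void dequantize_q8(const std::vector<Q8Block>& src, std::vector<float>& dst) {
  dst.clear();
  dst.reserve(src.size() * 32);
  for (const Q8Block& b : src)
    for (int i = 0; i < 32; ++i) dst.push_back(b.scale * static_cast<float>(b.qs[i]));
}

// Dequantize-on-first-use: return the cached F32 weights, expanding the
// quantized blocks only when the cache is still empty.
const std::vector<float>& ensure_dequantized(const std::vector<Q8Block>& quantized,
                                             std::vector<float>& f32_cache) {
  if (!f32_cache.empty()) return f32_cache;  // fast path: already materialized
  dequantize_q8(quantized, f32_cache);
  return f32_cache;
}
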
Logger::info("Clearing dequantized weights for layer " + std::to_string(layer_idx) + " to save memory.");
layer.q_proj_f32.clear();
layer.q_proj_f32.shrink_to_fit();
layer.k_proj_f32.clear();
layer.k_proj_f32.shrink_to_fit();
layer.v_proj_f32.clear();
layer.v_proj_f32.shrink_to_fit();
layer.o_proj_f32.clear();
layer.o_proj_f32.shrink_to_fit();
layer.gate_proj_f32.clear();
layer.gate_proj_f32.shrink_to_fit();
layer.up_proj_f32.clear();
layer.up_proj_f32.shrink_to_fit();
layer.down_proj_f32.clear();
layer.down_proj_f32.shrink_to_fit();
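
The clear()/shrink_to_fit() pairs above release the dequantized copies once they are no longer needed: clear() only resets the size, so shrink_to_fit() is added as a (non-binding) request to return the capacity to the allocator. A short sketch of the same idiom, plus the swap variant that guarantees the storage is freed:

#include <vector>

void release_buffer(std::vector<float>& v) {
  v.clear();            // size becomes 0, capacity is untouched
  v.shrink_to_fit();    // non-binding request to drop the capacity as well
}

void release_buffer_guaranteed(std::vector<float>& v) {
  std::vector<float>().swap(v);   // swap with an empty vector: old storage is freed
}
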
if (!lw.k_proj_f32.empty()) return;
if (!lw.v_proj_f32.empty()) return;
if (!lw.o_proj_f32.empty()) return;
if (!lw.gate_proj_f32.empty()) return;
if (!lw.up_proj_f32.empty()) return;
if (!lw.down_proj_f32.empty()) return;
Logger::info("Loading concatenated F32 weights for optimal GPU performance");
Logger::info("Loading F32 concatenated weights on-demand for GPU inference");
} else if (!q8_vec.empty()) {
  for (size_t i = 0; i < q8_vec.size(); ++i) {
throw std::runtime_error("Layer " + std::to_string(l_model_idx) + ": No " + weight_name +
                         " weights found for GPU processing");
return lw.q_proj_f32;
[](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.q_proj; },
[](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.q_proj_q8_0; },
[](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.q_proj_q4k; },
[](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.q_proj_q6k; },
Logger::info("Cleared Q-proj CPU memory immediately after GPU upload");
return lw.k_proj_f32;
[](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.k_proj; },
[](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.k_proj_q8_0; },
[](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.k_proj_q4k; },
[](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.k_proj_q6k; },
Logger::info("Cleared K-proj CPU memory immediately after GPU upload");
return lw.v_proj_f32;
[](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.v_proj; },
[](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.v_proj_q8_0; },
[](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.v_proj_q4k; },
[](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.v_proj_q6k; },
return lw.o_proj_f32;
[](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.o_proj; },
[](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.o_proj_q8_0; },
[](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.o_proj_q4k; },
[](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.o_proj_q6k; },
return lw.gate_proj_f32;
[](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.gate_proj; },
[](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.gate_proj_q8_0; },
[](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.gate_proj_q4k; },
[](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.gate_proj_q6k; },
return lw.up_proj_f32;
[](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.up_proj; },
[](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.up_proj_q8_0; },
[](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.up_proj_q4k; },
[](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.up_proj_q6k; },
return lw.down_proj_f32;
[](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.down_proj; },
[](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.down_proj_q8_0; },
[](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.down_proj_q4k; },
[](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.down_proj_q6k; },
Logger::info("Successfully loaded all concatenated F32 weights for GPU layers");
Logger::info("Loading concatenated BF16 weights for Tensor Core acceleration");
Logger::info("Converting F32 weights to BF16 for Tensor Core acceleration");
Logger::info("Successfully loaded all concatenated BF16 weights for Tensor Core acceleration");
Logger::info("Successfully freed all BF16 concatenated weights");
if (lw.q_proj_f32_dev && lw.k_proj_f32_dev && lw.v_proj_f32_dev &&
    lw.o_proj_f32_dev && lw.gate_proj_f32_dev && lw.up_proj_f32_dev && lw.down_proj_f32_dev) {
Logger::info("JIT loading layer " + std::to_string(layer_idx) + " weights to GPU (with aggressive eviction)");
543 ". Attempting emergency cleanup...");
555 throw std::runtime_error(
"GPU OOM: Cannot allocate " + std::to_string(
f32_weights.size() *
sizeof(
float)) +
557 " even after emergency cleanup. Try reducing --n-gpu-layers.");
[&lw]() -> const std::vector<float>& { return lw.q_proj_f32; },
[&lw]() -> const std::vector<float>& { return lw.k_proj_f32; },
[&lw]() -> const std::vector<float>& { return lw.v_proj_f32; },
[&lw]() -> const std::vector<float>& { return lw.o_proj_f32; },
[&lw]() -> const std::vector<float>& { return lw.gate_proj_f32; },
lw.gate_proj_f32_dev,
[&lw]() -> const std::vector<float>& { return lw.up_proj_f32; },
[&lw]() -> const std::vector<float>& { return lw.down_proj_f32; },
lw.down_proj_f32_dev,
Logger::info("CPU-only build: ensure_layer_weights_on_gpu is a no-op for layer " + std::to_string(layer_idx));
if (lw.q_proj_f32_dev)    { cudaFree(lw.q_proj_f32_dev);    lw.q_proj_f32_dev = nullptr; }
if (lw.k_proj_f32_dev)    { cudaFree(lw.k_proj_f32_dev);    lw.k_proj_f32_dev = nullptr; }
if (lw.v_proj_f32_dev)    { cudaFree(lw.v_proj_f32_dev);    lw.v_proj_f32_dev = nullptr; }
if (lw.o_proj_f32_dev)    { cudaFree(lw.o_proj_f32_dev);    lw.o_proj_f32_dev = nullptr; }
if (lw.gate_proj_f32_dev) { cudaFree(lw.gate_proj_f32_dev); lw.gate_proj_f32_dev = nullptr; }
if (lw.up_proj_f32_dev)   { cudaFree(lw.up_proj_f32_dev);   lw.up_proj_f32_dev = nullptr; }
if (lw.down_proj_f32_dev) { cudaFree(lw.down_proj_f32_dev); lw.down_proj_f32_dev = nullptr; }
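
The seven statements above repeat the same free-and-null idiom for every projection. A tiny helper keeps that in one place and makes double-free or dangling-pointer mistakes harder; this is a refactoring sketch, not code from the repository.

#include <cuda_runtime.h>

// Free a device buffer if it is set, then null the pointer so a later
// ensure_layer_weights_on_gpu (or a second free) sees a clean state.
template <typename T>
void free_device(T*& dev_ptr) {
  if (dev_ptr) {
    cudaFree(dev_ptr);
    dev_ptr = nullptr;
  }
}

Used as free_device(lw.q_proj_f32_dev); and likewise for the other six pointers.
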
Logger::info("CPU-only build: free_layer_gpu_weights is a no-op for layer " + std::to_string(layer_idx));
Logger::info("Mapping GGUF weights to model fields (ULTRA-OPTIMIZED VERSION)...");
const uint8_t* actual_data_block_start = nullptr;
const uint8_t* mmap_buffer_start = static_cast<const uint8_t*>(gguf.mapped_tensor_data);
Logger::info("map_gguf_weights: Using mmap mode (ZERO-COPY). Size: " +
Logger::info("map_gguf_weights: Using non-mmap mode. Size: " +
             std::to_string(gguf.tensor_data.size()) + " bytes.");
Logger::error("GGUF tensor data is not available. Cannot map weights.");
Logger::info("Processing " + std::to_string(num_tensors) + " tensors with ultra-optimized parallel mapping...");
std::vector<std::pair<std::string, GGUFTensorInfo>> tensor_pairs;
tensor_pairs.reserve(num_tensors);
for (const auto& pair : gguf.tensor_infos_map) {
  tensor_pairs.emplace_back(pair.first, pair.second);
const size_t typical_blocks = 4096;
for (auto& layer : model.layers) {
  if (layer.q_proj_q8_0.capacity() == 0) layer.q_proj_q8_0.reserve(typical_blocks);
  if (layer.k_proj_q8_0.capacity() == 0) layer.k_proj_q8_0.reserve(typical_blocks);
  if (layer.v_proj_q8_0.capacity() == 0) layer.v_proj_q8_0.reserve(typical_blocks);
  if (layer.o_proj_q8_0.capacity() == 0) layer.o_proj_q8_0.reserve(typical_blocks);
  if (layer.gate_proj_q8_0.capacity() == 0) layer.gate_proj_q8_0.reserve(typical_blocks);
  if (layer.up_proj_q8_0.capacity() == 0) layer.up_proj_q8_0.reserve(typical_blocks);
  if (layer.down_proj_q8_0.capacity() == 0) layer.down_proj_q8_0.reserve(typical_blocks);
std::vector<size_t> global_tensor_indices;
std::vector<std::vector<size_t>> layer_tensor_indices(model.layers.size());
global_tensor_indices.reserve(10);
for (auto& layer_indices : layer_tensor_indices) {
  layer_indices.reserve(9);
for (size_t i = 0; i < tensor_pairs.size(); ++i) {
  const std::string& name = tensor_pairs[i].first;
  if (name[0] == 'o' || name[0] == 't') {
    global_tensor_indices.push_back(i);
  } else if (name.size() > 4 && name[0] == 'b' && name[1] == 'l' && name[2] == 'k' && name[3] == '.') {
    size_t layer_start = 4;
    size_t layer_end = name.find('.', layer_start);
    if (layer_end != std::string::npos) {
      for (size_t pos = layer_start; pos < layer_end; ++pos) {
        layer_idx = layer_idx * 10 + (name[pos] - '0');
      if (layer_idx >= 0 && static_cast<size_t>(layer_idx) < model.layers.size()) {
        layer_tensor_indices[layer_idx].push_back(i);
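
Per-layer tensors follow the GGUF naming convention blk.<layer>.<field>, while global tensors such as output.weight, output_norm.weight and token_embd.weight are matched by name directly; the digit loop above extracts the layer index without allocating a substring. An equivalent, slightly more defensive sketch:

#include <cstddef>
#include <string>

// Parse the layer index out of a "blk.<N>.<field>" tensor name, returning -1
// for global tensors or malformed names. Mirrors the manual digit loop above.
int parse_block_layer_index(const std::string& name) {
  if (name.size() <= 4 || name.compare(0, 4, "blk.") != 0) return -1;
  size_t pos = 4;
  const size_t end = name.find('.', pos);
  if (end == std::string::npos || end == pos) return -1;
  int layer_idx = 0;
  for (; pos < end; ++pos) {
    if (name[pos] < '0' || name[pos] > '9') return -1;   // reject non-digits
    layer_idx = layer_idx * 10 + (name[pos] - '0');
  }
  return layer_idx;   // caller still bounds-checks against model.layers.size()
}
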
std::atomic<int> processed_count{0};
std::atomic<int> error_count{0};
for (size_t idx : global_tensor_indices) {
  const std::string& target_field_key = tensor_pairs[idx].first;
  const uint8_t* tensor_data_ptr = actual_data_block_start + info.offset;
  if (target_field_key == "output.weight") {
  if (target_field_key == "token_embd.weight") {
  if (target_field_key == "output_norm.weight") {
} catch (const std::exception& e) {
#pragma omp parallel for schedule(static) if(model.layers.size() > 4)
for (int layer_idx = 0; layer_idx < (int)layer_tensor_indices.size(); ++layer_idx) {
  const auto& layer_indices = layer_tensor_indices[layer_idx];
  if (layer_indices.empty()) continue;
  for (size_t idx : layer_indices) {
    const std::string& name = tensor_pairs[idx].first;
    const uint8_t* tensor_data_ptr = actual_data_block_start + info.offset;
    const size_t last_dot = name.find_last_of('.');
    if (last_dot == std::string::npos) continue;
    const char* field = name.c_str() + name.find('.', 4) + 1;
#define FAST_COPY_WEIGHT(target_vec, block_type) \
  target_vec.resize(info.size_in_bytes / sizeof(block_type)); \
  std::memcpy(target_vec.data(), tensor_data_ptr, info.size_in_bytes);
    const char* name_cstr = name.c_str();
    const size_t name_len = name.length();
    if (name_len > 10 && name.find("attn_") != std::string::npos) {
      if (name.find("attn_q.weight") != std::string::npos) {
      } else if (name.find("attn_k.weight") != std::string::npos) {
      } else if (name.find("attn_v.weight") != std::string::npos) {
      } else if (name.find("attn_output.weight") != std::string::npos) {
    } else if (name_len > 10 && name.find("ffn_") != std::string::npos) {
      if (name.find("ffn_gate.weight") != std::string::npos) {
      } else if (name.find("ffn_up.weight") != std::string::npos) {
      } else if (name.find("ffn_down.weight") != std::string::npos) {
#undef FAST_COPY_WEIGHT
} catch (const std::exception& e) {
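
FAST_COPY_WEIGHT resizes the destination vector to size_in_bytes / sizeof(block) and memcpys the raw quantized blocks straight from the mapped file, and the attn_*/ffn_* suffix checks decide which LayerWeights member receives the copy. The same copy can be expressed as a small function template; this is an alternative sketch, not the repository's macro:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Reinterpret the tensor's raw bytes as an array of quantization blocks and
// copy them in one memcpy. Assumes size_in_bytes is a multiple of sizeof(Block),
// as is the case for block-quantized GGUF tensors.
template <typename Block>
void fast_copy_weight(std::vector<Block>& target, const uint8_t* tensor_data_ptr,
                      size_t size_in_bytes) {
  target.resize(size_in_bytes / sizeof(Block));
  std::memcpy(target.data(), tensor_data_ptr, size_in_bytes);
}

Inside the attn_q.weight branch this would be called as fast_copy_weight(layer.q_proj_q8_0, tensor_data_ptr, info.size_in_bytes), with the block type matching the tensor's quantization format.
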
Logger::info("Finished mapping GGUF weights: " + std::to_string(processed_count.load()) + "/" +
             std::to_string(num_tensors) + " tensors processed successfully (errors: " +
             std::to_string(error_count.load()) + ") with ultra-optimized parallel mapping");
Logger::info("CPU-only build: ensure_f32_concatenated_weights_loaded is a no-op");
Logger::info("CPU-only build: ensure_bf16_concatenated_weights_loaded is a no-op");
Logger::info("CPU-only build: free_bf16_concatenated_weights is a no-op");