TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
weight_management.cpp
#include "weight_management.h"
#include "ggml_types.h"
#include "logger.h"
#include "quantization.h"
#include "utils.h"
#ifdef HAS_CUDA
#include "cuda_kernels.h"
#endif

11 if (!this->embed_tokens_f32.empty()) return;
12
13 size_t total_elements_embed = static_cast<size_t>(config_.vocab_size) * config_.hidden_size;
14 if (!this->embed_tokens_q6k.empty()) {
15 dequantize_vector_q6k_to_f32(this->embed_tokens_q6k, this->embed_tokens_f32, total_elements_embed, 1);
16 } else if (!this->embed_tokens_q4k.empty()) {
17 dequantize_vector_q4k_to_f32(this->embed_tokens_q4k, this->embed_tokens_f32, total_elements_embed, 1);
18 } else if (!this->embed_tokens_q8k.empty()) {
19 dequantize_q8_k(this->embed_tokens_q8k, this->embed_tokens_f32, total_elements_embed, true);
20 } else if (!this->embed_tokens_q8_0.empty()) {
21 dequantize_vector_q8_0_to_f32(this->embed_tokens_q8_0, this->embed_tokens_f32, total_elements_embed, 1);
22 } else if (!this->embed_tokens.empty()) {
24 }
25}
26
28 if (!this->lm_head_f32.empty()) return;
29
30 size_t total_elements_lm_head = static_cast<size_t>(config_.vocab_size) * config_.hidden_size;
31 if (!this->lm_head_q6k.empty()) {
32 dequantize_vector_q6k_to_f32(this->lm_head_q6k, this->lm_head_f32, total_elements_lm_head, 1);
33 } else if (!this->lm_head_q4k.empty()) {
34 dequantize_vector_q4k_to_f32(this->lm_head_q4k, this->lm_head_f32, total_elements_lm_head, 1);
35 } else if (!this->lm_head_q8k.empty()) {
36 dequantize_q8_k(this->lm_head_q8k, this->lm_head_f32, total_elements_lm_head, true);
37 } else if (!this->lm_head_q8_0.empty()) {
38 dequantize_vector_q8_0_to_f32(this->lm_head_q8_0, this->lm_head_f32, total_elements_lm_head, 1);
39 } else if (!this->lm_head.empty()) {
41 }
42}
43
void TinyLlamaModel::ensure_q_proj_dequantized(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= layers.size()) return;
  auto& lw = layers[layer_idx];
  if (!lw.q_proj_f32.empty()) return;

  int hs = config_.hidden_size;
  size_t q_proj_elements = static_cast<size_t>(hs) * hs;
  Logger::info("[DEQUANT_MEM] Layer " + std::to_string(layer_idx) + ": Q-proj dequantization starting, elements=" + std::to_string(q_proj_elements));

  if (!lw.q_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.q_proj_q6k, lw.q_proj_f32, q_proj_elements, 0);
  else if (!lw.q_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.q_proj_q4k, lw.q_proj_f32, q_proj_elements, 0);
  else if (!lw.q_proj_q8k.empty()) dequantize_q8_k(lw.q_proj_q8k, lw.q_proj_f32, q_proj_elements, true);
  else if (!lw.q_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.q_proj_q8_0, lw.q_proj_f32, q_proj_elements, 0);
  else if (!lw.q_proj.empty()) lw.q_proj_f32 = bf16vec_to_float_vec(lw.q_proj);

  Logger::info("[DEQUANT_MEM] Layer " + std::to_string(layer_idx) + ": Q-proj dequantization completed, f32 size=" + std::to_string(lw.q_proj_f32.size()));
}

void TinyLlamaModel::clear_layer_dequantized_weights(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= static_cast<int>(layers.size())) {
    Logger::warning("clear_layer_dequantized_weights: Invalid layer index " + std::to_string(layer_idx));
    return;
  }

  Logger::info("Clearing dequantized weights for layer " + std::to_string(layer_idx) + " to save memory.");

  auto& layer = layers[layer_idx];
  layer.q_proj_f32.clear();
  layer.q_proj_f32.shrink_to_fit();
  layer.k_proj_f32.clear();
  layer.k_proj_f32.shrink_to_fit();
  layer.v_proj_f32.clear();
  layer.v_proj_f32.shrink_to_fit();
  layer.o_proj_f32.clear();
  layer.o_proj_f32.shrink_to_fit();
  layer.gate_proj_f32.clear();
  layer.gate_proj_f32.shrink_to_fit();
  layer.up_proj_f32.clear();
  layer.up_proj_f32.shrink_to_fit();
  layer.down_proj_f32.clear();
  layer.down_proj_f32.shrink_to_fit();
}
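
// The ensure_*_dequantized / clear_layer_dequantized_weights pair above implements lazy,
// per-layer F32 materialization. A typical caller pattern looks like the sketch below
// (illustrative only; the loop and call site are hypothetical, not code from this file):
//
//   for (int l = 0; l < config_.num_cpu_offload_layers; ++l) {
//     ensure_q_proj_dequantized(l);          // materialize F32 weights on demand
//     /* ... run this layer's CPU forward pass ... */
//     clear_layer_dequantized_weights(l);    // release the F32 copies to cap peak RAM
//   }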

void TinyLlamaModel::ensure_k_proj_dequantized(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= layers.size()) return;
  auto& lw = layers[layer_idx];
  if (!lw.k_proj_f32.empty()) return;

  int hs = config_.hidden_size;
  size_t k_proj_elements = static_cast<size_t>(config_.num_key_value_heads * (hs / config_.num_attention_heads)) * hs;

  if (!lw.k_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.k_proj_q6k, lw.k_proj_f32, k_proj_elements, 0);
  else if (!lw.k_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.k_proj_q4k, lw.k_proj_f32, k_proj_elements, 0);
  else if (!lw.k_proj_q8k.empty()) dequantize_q8_k(lw.k_proj_q8k, lw.k_proj_f32, k_proj_elements, false);
  else if (!lw.k_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.k_proj_q8_0, lw.k_proj_f32, k_proj_elements, 0);
  else if (!lw.k_proj.empty()) lw.k_proj_f32 = bf16vec_to_float_vec(lw.k_proj);
}

void TinyLlamaModel::ensure_v_proj_dequantized(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= layers.size()) return;
  auto& lw = layers[layer_idx];
  if (!lw.v_proj_f32.empty()) return;

  int hs = config_.hidden_size;
  size_t v_proj_elements = static_cast<size_t>(config_.num_key_value_heads * (hs / config_.num_attention_heads)) * hs;

  if (!lw.v_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.v_proj_q6k, lw.v_proj_f32, v_proj_elements, 0);
  else if (!lw.v_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.v_proj_q4k, lw.v_proj_f32, v_proj_elements, 0);
  else if (!lw.v_proj_q8k.empty()) dequantize_q8_k(lw.v_proj_q8k, lw.v_proj_f32, v_proj_elements, false);
  else if (!lw.v_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.v_proj_q8_0, lw.v_proj_f32, v_proj_elements, 0);
  else if (!lw.v_proj.empty()) lw.v_proj_f32 = bf16vec_to_float_vec(lw.v_proj);
}
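
// Sizing note (illustrative, assuming the TinyLlama-1.1B configuration: hidden_size = 2048,
// num_attention_heads = 32, num_key_value_heads = 4): head_dim = 2048 / 32 = 64, so the
// grouped-query K and V projections hold (4 * 64) * 2048 = 524,288 elements each, versus
// 2048 * 2048 = 4,194,304 elements for the full-width Q and O projections.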

void TinyLlamaModel::ensure_o_proj_dequantized(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= layers.size()) return;
  auto& lw = layers[layer_idx];
  if (!lw.o_proj_f32.empty()) return;

  int hs = config_.hidden_size;
  size_t o_proj_elements = static_cast<size_t>(hs) * hs;

  if (!lw.o_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.o_proj_q6k, lw.o_proj_f32, o_proj_elements, 0);
  else if (!lw.o_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.o_proj_q4k, lw.o_proj_f32, o_proj_elements, 0);
  else if (!lw.o_proj_q8k.empty()) dequantize_q8_k(lw.o_proj_q8k, lw.o_proj_f32, o_proj_elements, false);
  else if (!lw.o_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.o_proj_q8_0, lw.o_proj_f32, o_proj_elements, 0);
  else if (!lw.o_proj.empty()) lw.o_proj_f32 = bf16vec_to_float_vec(lw.o_proj);
}

void TinyLlamaModel::ensure_gate_proj_dequantized(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= layers.size()) return;
  auto& lw = layers[layer_idx];
  if (!lw.gate_proj_f32.empty()) return;

  int hs = config_.hidden_size;
  int is = config_.intermediate_size;
  size_t gate_proj_elements = static_cast<size_t>(is) * hs;

  if (!lw.gate_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.gate_proj_q6k, lw.gate_proj_f32, gate_proj_elements, 0);
  else if (!lw.gate_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.gate_proj_q4k, lw.gate_proj_f32, gate_proj_elements, 0);
  else if (!lw.gate_proj_q8k.empty()) dequantize_q8_k(lw.gate_proj_q8k, lw.gate_proj_f32, gate_proj_elements, false);
  else if (!lw.gate_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.gate_proj_q8_0, lw.gate_proj_f32, gate_proj_elements, 0);
  else if (!lw.gate_proj.empty()) lw.gate_proj_f32 = bf16vec_to_float_vec(lw.gate_proj);
}

void TinyLlamaModel::ensure_up_proj_dequantized(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= layers.size()) return;
  auto& lw = layers[layer_idx];
  if (!lw.up_proj_f32.empty()) return;

  int hs = config_.hidden_size;
  int is = config_.intermediate_size;
  size_t up_proj_elements = static_cast<size_t>(is) * hs;

  if (!lw.up_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.up_proj_q6k, lw.up_proj_f32, up_proj_elements, 0);
  else if (!lw.up_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.up_proj_q4k, lw.up_proj_f32, up_proj_elements, 0);
  else if (!lw.up_proj_q8k.empty()) dequantize_q8_k(lw.up_proj_q8k, lw.up_proj_f32, up_proj_elements, false);
  else if (!lw.up_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.up_proj_q8_0, lw.up_proj_f32, up_proj_elements, 0);
  else if (!lw.up_proj.empty()) lw.up_proj_f32 = bf16vec_to_float_vec(lw.up_proj);
}

void TinyLlamaModel::ensure_down_proj_dequantized(int layer_idx) {
  if (layer_idx < 0 || layer_idx >= layers.size()) return;
  auto& lw = layers[layer_idx];
  if (!lw.down_proj_f32.empty()) return;

  int hs = config_.hidden_size;
  int is = config_.intermediate_size;
  size_t down_proj_elements = static_cast<size_t>(hs) * is;

  if (!lw.down_proj_q6k.empty()) dequantize_vector_q6k_to_f32(lw.down_proj_q6k, lw.down_proj_f32, down_proj_elements, 0);
  else if (!lw.down_proj_q4k.empty()) dequantize_vector_q4k_to_f32(lw.down_proj_q4k, lw.down_proj_f32, down_proj_elements, 0);
  else if (!lw.down_proj_q8k.empty()) dequantize_q8_k(lw.down_proj_q8k, lw.down_proj_f32, down_proj_elements, false);
  else if (!lw.down_proj_q8_0.empty()) dequantize_vector_q8_0_to_f32(lw.down_proj_q8_0, lw.down_proj_f32, down_proj_elements, 0);
  else if (!lw.down_proj.empty()) lw.down_proj_f32 = bf16vec_to_float_vec(lw.down_proj);
}

#ifdef HAS_CUDA
void TinyLlamaModel::ensure_f32_concatenated_weights_loaded() {
  // OPTIMIZED: Use concatenated weights for maximum GPU performance
  Logger::info("Loading concatenated F32 weights for optimal GPU performance");

  if (f32_concatenated_weights_loaded_) {
    return;
  }

  Logger::info("Loading F32 concatenated weights on-demand for GPU inference");

  // Concatenated FP32 / Dequantized Layer Weights for GPU layers (w_..._f32_dev_ pointers)
  int hs = config_.hidden_size;
  // ... (the intermediate size, kv_dim, active_num_gpu_layers and the per-projection
  //      layer_*_size_f32 element counts are computed here)

  // Helper that concatenates one projection's per-layer weights (dequantizing as needed) and
  // uploads the result to a single device buffer. The helper's original name is not preserved
  // in this listing; "load_concatenated" is a stand-in.
  auto load_concatenated = [&](
      const std::function<const std::vector<float>&(const LayerWeights&)>& f32_accessor,
      const std::function<const std::vector<uint16_t>&(const LayerWeights&)>& bf16_accessor,
      const std::function<const std::vector<block_q8_0>&(const LayerWeights&)>& q8_accessor,
      const std::function<const std::vector<block_q4_K>&(const LayerWeights&)>& q4k_accessor,
      const std::function<const std::vector<block_q6_K>&(const LayerWeights&)>& q6k_accessor,
      float*& dev_ptr, size_t single_layer_elem_size, const std::string& weight_name) {

    std::vector<float> concatenated_f32;
    // ... (reserve space, then loop over the GPU layers; inside the loop, l_model_idx is the
    //      model layer index and lw refers to layers[l_model_idx])
      // Individual weight dequantization will be handled by the lambda accessors below

      const auto& f32_vec = f32_accessor(lw);
      const auto& bf16_vec = bf16_accessor(lw);
      const auto& q8_vec = q8_accessor(lw);
      const auto& q4k_vec = q4k_accessor(lw);
      const auto& q6k_vec = q6k_accessor(lw);

      if (!f32_vec.empty()) {
        concatenated_f32.insert(concatenated_f32.end(), f32_vec.begin(), f32_vec.end());
      } else if (!bf16_vec.empty()) {
        for (uint16_t bf16_val : bf16_vec) {
          concatenated_f32.push_back(bfloat16_to_float32(bf16_val));
        }
      } else if (!q8_vec.empty()) {
        std::vector<float> temp_f32(q8_vec.size() * GGML_QK8_0);
        for (size_t i = 0; i < q8_vec.size(); ++i) {
          dequantize_q8_0_block(&q8_vec[i], temp_f32.data() + i * GGML_QK8_0);
        }
        concatenated_f32.insert(concatenated_f32.end(), temp_f32.begin(), temp_f32.end());
      } else if (!q4k_vec.empty()) {
        std::vector<float> temp_f32(q4k_vec.size() * GGML_QK_K);
        for (size_t i = 0; i < q4k_vec.size(); ++i) {
          dequantize_q4_k_m(&q4k_vec[i], temp_f32.data() + i * GGML_QK_K, GGML_QK_K, false);
        }
        concatenated_f32.insert(concatenated_f32.end(), temp_f32.begin(), temp_f32.end());
      } else if (!q6k_vec.empty()) {
        std::vector<float> temp_f32(q6k_vec.size() * GGML_QK_K);
        for (size_t i = 0; i < q6k_vec.size(); ++i) {
          dequantize_q6_k(&q6k_vec[i], temp_f32.data() + i * GGML_QK_K, GGML_QK_K, false);
        }
        concatenated_f32.insert(concatenated_f32.end(), temp_f32.begin(), temp_f32.end());
      } else {
        throw std::runtime_error("Layer " + std::to_string(l_model_idx) + ": No " + weight_name + " weights found for GPU processing");
      }
    }

    if (!concatenated_f32.empty()) {
      if (dev_ptr) { cudaFree(dev_ptr); dev_ptr = nullptr; }
      gpuErrchk(cudaMalloc(&dev_ptr, concatenated_f32.size() * sizeof(float)));
      gpuErrchk(cudaMemcpy(dev_ptr, concatenated_f32.data(), concatenated_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
      Logger::info("Loaded concatenated " + weight_name + " (F32) to GPU for " + std::to_string(active_num_gpu_layers) + " layers");
    }
  };
  load_concatenated(
      [this](const LayerWeights& lw) -> const std::vector<float>& {
        int layer_idx = &lw - &layers[0];
        ensure_q_proj_dequantized(layer_idx);
        return lw.q_proj_f32;
      },
      [](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.q_proj; },
      [](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.q_proj_q8_0; },
      [](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.q_proj_q4k; },
      [](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.q_proj_q6k; },
      w_q_f32_dev_, layer_q_size_f32, "Q Proj");

  // CRITICAL: Immediately clear Q-proj CPU memory after GPU upload to prevent OOM
  // ... (guarded loop over the GPU layers)
      layers[layer_idx].q_proj_f32.clear();
      layers[layer_idx].q_proj_f32.shrink_to_fit();
    }
    Logger::info("Cleared Q-proj CPU memory immediately after GPU upload");
  }

  load_concatenated(
      [this](const LayerWeights& lw) -> const std::vector<float>& {
        int layer_idx = &lw - &layers[0];
        ensure_k_proj_dequantized(layer_idx);
        return lw.k_proj_f32;
      },
      [](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.k_proj; },
      [](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.k_proj_q8_0; },
      [](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.k_proj_q4k; },
      [](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.k_proj_q6k; },
      w_k_f32_dev_, layer_k_size_f32, "K Proj");

  // ... (guarded loop over the GPU layers)
      layers[layer_idx].k_proj_f32.clear();
      layers[layer_idx].k_proj_f32.shrink_to_fit();
    }
    Logger::info("Cleared K-proj CPU memory immediately after GPU upload");
  }

  load_concatenated(
      [this](const LayerWeights& lw) -> const std::vector<float>& {
        int layer_idx = &lw - &layers[0];
        ensure_v_proj_dequantized(layer_idx);
        return lw.v_proj_f32;
      },
      [](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.v_proj; },
      [](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.v_proj_q8_0; },
      [](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.v_proj_q4k; },
      [](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.v_proj_q6k; },
      w_v_f32_dev_, layer_v_size_f32, "V Proj");

  // Clear V-proj CPU memory immediately
  // ... (guarded loop over the GPU layers)
      layers[layer_idx].v_proj_f32.clear();
      layers[layer_idx].v_proj_f32.shrink_to_fit();
    }
  }

  load_concatenated(
      [this](const LayerWeights& lw) -> const std::vector<float>& {
        int layer_idx = &lw - &layers[0];
        ensure_o_proj_dequantized(layer_idx);
        return lw.o_proj_f32;
      },
      [](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.o_proj; },
      [](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.o_proj_q8_0; },
      [](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.o_proj_q4k; },
      [](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.o_proj_q6k; },
      w_o_f32_dev_, layer_o_size_f32, "O Proj");

  // Clear O-proj CPU memory immediately
  // ... (guarded loop over the GPU layers)
      layers[layer_idx].o_proj_f32.clear();
      layers[layer_idx].o_proj_f32.shrink_to_fit();
    }
  }

  load_concatenated(
      [this](const LayerWeights& lw) -> const std::vector<float>& {
        int layer_idx = &lw - &layers[0];
        ensure_gate_proj_dequantized(layer_idx);
        return lw.gate_proj_f32;
      },
      [](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.gate_proj; },
      [](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.gate_proj_q8_0; },
      [](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.gate_proj_q4k; },
      [](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.gate_proj_q6k; },
      w_gate_f32_dev_, layer_gate_size_f32, "Gate Proj");  // (final argument line inferred from the Q/K/V/O calls above)

  // Clear Gate-proj CPU memory immediately
  // ... (guarded loop over the GPU layers)
      layers[layer_idx].gate_proj_f32.clear();
      layers[layer_idx].gate_proj_f32.shrink_to_fit();
    }
  }

  load_concatenated(
      [this](const LayerWeights& lw) -> const std::vector<float>& {
        int layer_idx = &lw - &layers[0];
        ensure_up_proj_dequantized(layer_idx);
        return lw.up_proj_f32;
      },
      [](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.up_proj; },
      [](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.up_proj_q8_0; },
      [](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.up_proj_q4k; },
      [](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.up_proj_q6k; },
      w_up_f32_dev_, layer_up_size_f32, "Up Proj");

  // Clear Up-proj CPU memory immediately
  // ... (guarded loop over the GPU layers)
      layers[layer_idx].up_proj_f32.clear();
      layers[layer_idx].up_proj_f32.shrink_to_fit();
    }
  }

  load_concatenated(
      [this](const LayerWeights& lw) -> const std::vector<float>& {
        int layer_idx = &lw - &layers[0];
        ensure_down_proj_dequantized(layer_idx);
        return lw.down_proj_f32;
      },
      [](const LayerWeights& lw) -> const std::vector<uint16_t>& { return lw.down_proj; },
      [](const LayerWeights& lw) -> const std::vector<block_q8_0>& { return lw.down_proj_q8_0; },
      [](const LayerWeights& lw) -> const std::vector<block_q4_K>& { return lw.down_proj_q4k; },
      [](const LayerWeights& lw) -> const std::vector<block_q6_K>& { return lw.down_proj_q6k; },
      w_down_f32_dev_, layer_down_size_f32, "Down Proj");  // (final argument line inferred from the Q/K/V/O calls above)

  // Clear Down-proj CPU memory immediately
  // ... (guarded loop over the GPU layers)
      layers[layer_idx].down_proj_f32.clear();
      layers[layer_idx].down_proj_f32.shrink_to_fit();
    }
  }

  f32_concatenated_weights_loaded_ = true;
  Logger::info("Successfully loaded all concatenated F32 weights for GPU layers");
}
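
// Layout note (an inference from the loading code above, not a statement from the original
// sources): each w_*_f32_dev_ buffer stores the per-layer matrices back to back in the order
// they were concatenated, so layer l's slice of, e.g., the Q projection would start at
//   w_q_f32_dev_ + static_cast<size_t>(l) * layer_q_size_f32
// with l counting GPU layers in concatenation order.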

// BF16 Tensor Core weight management functions
void TinyLlamaModel::ensure_bf16_concatenated_weights_loaded() {
  Logger::info("Loading concatenated BF16 weights for Tensor Core acceleration");

  // ... (guard: skip the reload if the BF16 weights are already resident)
    return;
  }

  Logger::info("Converting F32 weights to BF16 for Tensor Core acceleration");

  // Ensure F32 weights are loaded first
  ensure_f32_concatenated_weights_loaded();

  int hs = config_.hidden_size;
  int is = config_.intermediate_size;
  int kv_dim = (hs / config_.num_attention_heads) * config_.num_key_value_heads;

  size_t layer_q_size = (size_t)hs*hs;
  size_t layer_k_size = (size_t)kv_dim*hs;
  size_t layer_v_size = (size_t)kv_dim*hs;
  size_t layer_o_size = (size_t)hs*hs;
  size_t layer_gate_size = (size_t)is*hs;
  size_t layer_up_size = (size_t)is*hs;
  size_t layer_down_size = (size_t)hs*is;

  // Convert F32 weights to BF16 on GPU
  auto convert_f32_to_bf16 = [&](float* f32_dev_ptr, uint16_t*& bf16_dev_ptr, size_t total_elements, const std::string& weight_name) {
    if (f32_dev_ptr && total_elements > 0) {
      if (bf16_dev_ptr) { cudaFree(bf16_dev_ptr); bf16_dev_ptr = nullptr; }
      // ... (allocate the BF16 buffer and launch the F32-to-BF16 conversion kernel)
      Logger::info("Converted " + weight_name + " from F32 to BF16 on GPU for Tensor Cores");
    }
  };

  // ... (convert_f32_to_bf16 is invoked for each of the seven concatenated projection buffers:
  //      Q, K, V, O, gate, up and down)

  Logger::info("Successfully loaded all concatenated BF16 weights for Tensor Core acceleration");
}
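
// For reference: BF16 keeps the sign bit, the full 8-bit exponent and the top 7 mantissa bits of
// an IEEE-754 float32, so the conversion performed on the GPU above amounts to taking the upper
// 16 bits, ideally with round-to-nearest-even. A host-side scalar sketch of that rounding, shown
// only for illustration (this is not the CUDA kernel used above):
//
//   static inline uint16_t f32_to_bf16(float f) {
//     uint32_t bits;
//     std::memcpy(&bits, &f, sizeof(bits));
//     bits += 0x7FFFu + ((bits >> 16) & 1u);   // round to nearest, ties to even
//     return static_cast<uint16_t>(bits >> 16);
//   }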

void TinyLlamaModel::free_bf16_concatenated_weights() {

  Logger::info("Freeing BF16 concatenated weights");
  // ... (cudaFree each of the BF16 concatenated device buffers and reset the pointers)

  Logger::info("Successfully freed all BF16 concatenated weights");
}

void TinyLlamaModel::ensure_layer_weights_on_gpu(int layer_idx) {
#ifdef HAS_CUDA
  if (layer_idx < 0 || layer_idx >= layers.size()) return;

  // Check if this layer is supposed to be on GPU
  int first_gpu_layer = config_.num_cpu_offload_layers;  // layers before this index stay on the CPU
  if (layer_idx < first_gpu_layer) return;

  auto& lw = layers[layer_idx];

  // Check if weights are already loaded on GPU for this layer
  if (lw.q_proj_f32_dev && lw.k_proj_f32_dev && lw.v_proj_f32_dev &&
      lw.o_proj_f32_dev && lw.gate_proj_f32_dev && lw.up_proj_f32_dev && lw.down_proj_f32_dev) {
    return; // Already loaded
  }

  // AGGRESSIVE MEMORY MANAGEMENT: Free previous layer weights to make room
  // Keep only the current layer and maybe the next one
  // ... (eviction guard)
    int prev_layer = layer_idx - 1;
    if (prev_layer >= first_gpu_layer && prev_layer < layers.size()) {
      free_layer_gpu_weights(prev_layer);
    }
  }

  // If still hitting memory limits, free ALL other GPU layers except current
  if (layer_idx > first_gpu_layer + 1) {
    for (int i = first_gpu_layer; i < layer_idx - 1; ++i) {
      free_layer_gpu_weights(i);
    }
  }

  Logger::info("JIT loading layer " + std::to_string(layer_idx) + " weights to GPU (with aggressive eviction)");

  // Dequantize and load each weight matrix individually
  auto load_single_weight = [&](
      const std::function<void()>& ensure_dequantized,
      const std::function<const std::vector<float>&()>& get_f32_weights,
      float*& dev_ptr,
      const std::string& weight_name
  ) {
    ensure_dequantized();
    const auto& f32_weights = get_f32_weights();
    if (!f32_weights.empty()) {
      if (dev_ptr) { cudaFree(dev_ptr); dev_ptr = nullptr; }

      // Try allocation with error handling
      cudaError_t malloc_result = cudaMalloc(&dev_ptr, f32_weights.size() * sizeof(float));
      if (malloc_result != cudaSuccess) {
        Logger::warning("GPU memory allocation failed for " + weight_name + " in layer " + std::to_string(layer_idx) +
                        ". Attempting emergency cleanup...");

        // Emergency cleanup: free ALL other layer weights
        for (int emergency_idx = 0; emergency_idx < static_cast<int>(layers.size()); ++emergency_idx) {
          if (emergency_idx != layer_idx) {
            free_layer_gpu_weights(emergency_idx);
          }
        }

        // Try allocation again after cleanup
        malloc_result = cudaMalloc(&dev_ptr, f32_weights.size() * sizeof(float));
        if (malloc_result != cudaSuccess) {
          throw std::runtime_error("GPU OOM: Cannot allocate " + std::to_string(f32_weights.size() * sizeof(float)) +
                                   " bytes for " + weight_name + " in layer " + std::to_string(layer_idx) +
                                   " even after emergency cleanup. Try reducing --n-gpu-layers.");
        }
        Logger::info("Emergency cleanup successful, allocated " + weight_name);
      }

      gpuErrchk(cudaMemcpy(dev_ptr, f32_weights.data(), f32_weights.size() * sizeof(float), cudaMemcpyHostToDevice));

      // Immediately clear CPU memory to save RAM
      // ... (guard)
        const_cast<std::vector<float>&>(f32_weights).clear();
        const_cast<std::vector<float>&>(f32_weights).shrink_to_fit();
      }
    }
  };

  // Load Q projection
  load_single_weight(
      [&]() { ensure_q_proj_dequantized(layer_idx); },
      [&lw]() -> const std::vector<float>& { return lw.q_proj_f32; },
      lw.q_proj_f32_dev,
      "Q Proj"
  );

  // Load K projection
  load_single_weight(
      [&]() { ensure_k_proj_dequantized(layer_idx); },
      [&lw]() -> const std::vector<float>& { return lw.k_proj_f32; },
      lw.k_proj_f32_dev,
      "K Proj"
  );

  // Load V projection
  load_single_weight(
      [&]() { ensure_v_proj_dequantized(layer_idx); },
      [&lw]() -> const std::vector<float>& { return lw.v_proj_f32; },
      lw.v_proj_f32_dev,
      "V Proj"
  );

  // Load O projection
  load_single_weight(
      [&]() { ensure_o_proj_dequantized(layer_idx); },
      [&lw]() -> const std::vector<float>& { return lw.o_proj_f32; },
      lw.o_proj_f32_dev,
      "O Proj"
  );

  // Load Gate projection
  load_single_weight(
      [&]() { ensure_gate_proj_dequantized(layer_idx); },
      [&lw]() -> const std::vector<float>& { return lw.gate_proj_f32; },
      lw.gate_proj_f32_dev,
      "Gate Proj"
  );

  // Load Up projection
  load_single_weight(
      [&]() { ensure_up_proj_dequantized(layer_idx); },
      [&lw]() -> const std::vector<float>& { return lw.up_proj_f32; },
      lw.up_proj_f32_dev,
      "Up Proj"
  );

  // Load Down projection
  load_single_weight(
      [&]() { ensure_down_proj_dequantized(layer_idx); },
      [&lw]() -> const std::vector<float>& { return lw.down_proj_f32; },
      lw.down_proj_f32_dev,
      "Down Proj"
  );
#else
  // CPU-only build - this function is a no-op
  Logger::info("CPU-only build: ensure_layer_weights_on_gpu is a no-op for layer " + std::to_string(layer_idx));
#endif
}

void TinyLlamaModel::free_layer_gpu_weights(int layer_idx) {
#ifdef HAS_CUDA
  if (layer_idx < 0 || layer_idx >= layers.size()) return;

  auto& lw = layers[layer_idx];

  if (lw.q_proj_f32_dev) { cudaFree(lw.q_proj_f32_dev); lw.q_proj_f32_dev = nullptr; }
  if (lw.k_proj_f32_dev) { cudaFree(lw.k_proj_f32_dev); lw.k_proj_f32_dev = nullptr; }
  if (lw.v_proj_f32_dev) { cudaFree(lw.v_proj_f32_dev); lw.v_proj_f32_dev = nullptr; }
  if (lw.o_proj_f32_dev) { cudaFree(lw.o_proj_f32_dev); lw.o_proj_f32_dev = nullptr; }
  if (lw.gate_proj_f32_dev) { cudaFree(lw.gate_proj_f32_dev); lw.gate_proj_f32_dev = nullptr; }
  if (lw.up_proj_f32_dev) { cudaFree(lw.up_proj_f32_dev); lw.up_proj_f32_dev = nullptr; }
  if (lw.down_proj_f32_dev) { cudaFree(lw.down_proj_f32_dev); lw.down_proj_f32_dev = nullptr; }

  Logger::info("Freed GPU weights for layer " + std::to_string(layer_idx) + " (~200MB freed)");
#else
  // CPU-only build - this function is a no-op
  Logger::info("CPU-only build: free_layer_gpu_weights is a no-op for layer " + std::to_string(layer_idx));
#endif
}
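
// Together, ensure_layer_weights_on_gpu and free_layer_gpu_weights implement just-in-time
// weight streaming for GPU layers. An illustrative call pattern (hypothetical call site, not
// code from this file):
//
//   for (int l = config_.num_cpu_offload_layers; l < config_.num_hidden_layers; ++l) {
//     ensure_layer_weights_on_gpu(l);   // uploads layer l, evicting earlier layers if needed
//     /* ... run layer l's attention and MLP on the GPU ... */
//   }
//   free_layer_gpu_weights(config_.num_hidden_layers - 1);  // release the last resident layer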

void map_gguf_weights(const GGUFData& gguf, TinyLlamaModel& model) {
  Logger::info("Mapping GGUF weights to model fields (ULTRA-OPTIMIZED VERSION)...");

  const uint8_t* actual_data_block_start = nullptr;

  // Determine which data source to use
  if (gguf.mapped_tensor_data != nullptr && gguf.mapped_tensor_data_size > 0) {
    const uint8_t* mmap_buffer_start = static_cast<const uint8_t*>(gguf.mapped_tensor_data);
    actual_data_block_start = mmap_buffer_start + gguf.offset_diff_for_mmap;
    Logger::info("map_gguf_weights: Using mmap mode (ZERO-COPY). Size: " +
                 std::to_string(gguf.mapped_tensor_data_size) + " bytes.");
  } else if (!gguf.tensor_data.empty()) {
    actual_data_block_start = gguf.tensor_data.data();
    Logger::info("map_gguf_weights: Using non-mmap mode. Size: " +
                 std::to_string(gguf.tensor_data.size()) + " bytes.");
  } else {
    Logger::error("GGUF tensor data is not available. Cannot map weights.");
    return;
  }

  const size_t num_tensors = gguf.tensor_infos_map.size();
  Logger::info("Processing " + std::to_string(num_tensors) + " tensors with ultra-optimized parallel mapping...");

  // Pre-allocate containers to avoid reallocations during parallel processing
  std::vector<std::pair<std::string, GGUFTensorInfo>> tensor_pairs;
  tensor_pairs.reserve(num_tensors);
  for (const auto& pair : gguf.tensor_infos_map) {
    tensor_pairs.emplace_back(pair.first, pair.second);
  }

  // Reserve capacity for major model containers to reduce allocations
  const size_t typical_blocks = 4096;
  if (model.lm_head_q8_0.capacity() == 0) model.lm_head_q8_0.reserve(32768);
  if (model.embed_tokens_q8_0.capacity() == 0) model.embed_tokens_q8_0.reserve(32768);

  for (auto& layer : model.layers) {
    if (layer.q_proj_q8_0.capacity() == 0) layer.q_proj_q8_0.reserve(typical_blocks);
    if (layer.k_proj_q8_0.capacity() == 0) layer.k_proj_q8_0.reserve(typical_blocks);
    if (layer.v_proj_q8_0.capacity() == 0) layer.v_proj_q8_0.reserve(typical_blocks);
    if (layer.o_proj_q8_0.capacity() == 0) layer.o_proj_q8_0.reserve(typical_blocks);
    if (layer.gate_proj_q8_0.capacity() == 0) layer.gate_proj_q8_0.reserve(typical_blocks);
    if (layer.up_proj_q8_0.capacity() == 0) layer.up_proj_q8_0.reserve(typical_blocks);
    if (layer.down_proj_q8_0.capacity() == 0) layer.down_proj_q8_0.reserve(typical_blocks);
  }

  // BLAZING FAST: Sort tensors by type and process in bulk
  std::vector<size_t> global_tensor_indices;
  std::vector<std::vector<size_t>> layer_tensor_indices(model.layers.size());

  global_tensor_indices.reserve(10); // output.weight, token_embd.weight, output_norm.weight, etc.
  for (auto& layer_indices : layer_tensor_indices) {
    layer_indices.reserve(9); // 7 weights + 2 norms per layer
  }

  // ULTRA-FAST categorization without string operations
  for (size_t i = 0; i < tensor_pairs.size(); ++i) {
    const std::string& name = tensor_pairs[i].first;
    if (name[0] == 'o' || name[0] == 't') { // output.weight, token_embd.weight, output_norm.weight
      global_tensor_indices.push_back(i);
    } else if (name.size() > 4 && name[0] == 'b' && name[1] == 'l' && name[2] == 'k' && name[3] == '.') {
      // Extract layer index without substr - MUCH faster
      size_t layer_start = 4;
      size_t layer_end = name.find('.', layer_start);
      if (layer_end != std::string::npos) {
        int layer_idx = 0;
        for (size_t pos = layer_start; pos < layer_end; ++pos) {
          layer_idx = layer_idx * 10 + (name[pos] - '0');
        }
        if (layer_idx >= 0 && static_cast<size_t>(layer_idx) < model.layers.size()) {
          layer_tensor_indices[layer_idx].push_back(i);
        }
      }
    }
  }

  std::atomic<int> processed_count{0};
  std::atomic<int> error_count{0};

  // Process global tensors sequentially (small count, avoid overhead)
  for (size_t idx : global_tensor_indices) {
    try {
      const std::string& target_field_key = tensor_pairs[idx].first;
      const GGUFTensorInfo& info = tensor_pairs[idx].second;
      const uint8_t* tensor_data_ptr = actual_data_block_start + info.offset;

      // Global tensors with optimized type dispatch
      if (target_field_key == "output.weight") {
        switch (info.type) {
          case GGMLType::GGML_TYPE_Q6_K: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q6_K);
            model.lm_head_q6k.resize(num_blocks);
            std::memcpy(model.lm_head_q6k.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_Q4_K: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q4_K);
            model.lm_head_q4k.resize(num_blocks);
            std::memcpy(model.lm_head_q4k.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_Q8_0: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q8_0);
            model.lm_head_q8_0.resize(num_blocks);
            std::memcpy(model.lm_head_q8_0.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_Q8_K: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q8_K);
            model.lm_head_q8k.resize(num_blocks);
            std::memcpy(model.lm_head_q8k.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_F32: {
            size_t num_elements = info.size_in_bytes / sizeof(float);
            model.lm_head_f32.resize(num_elements);
            std::memcpy(model.lm_head_f32.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
        }
        processed_count++;
        continue;
      }

      if (target_field_key == "token_embd.weight") {
        switch (info.type) {
          case GGMLType::GGML_TYPE_Q4_K: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q4_K);
            model.embed_tokens_q4k.resize(num_blocks);
            std::memcpy(model.embed_tokens_q4k.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_Q8_0: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q8_0);
            model.embed_tokens_q8_0.resize(num_blocks);
            std::memcpy(model.embed_tokens_q8_0.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_Q8_K: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q8_K);
            model.embed_tokens_q8k.resize(num_blocks);
            std::memcpy(model.embed_tokens_q8k.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_Q6_K: {
            size_t num_blocks = info.size_in_bytes / sizeof(block_q6_K);
            model.embed_tokens_q6k.resize(num_blocks);
            std::memcpy(model.embed_tokens_q6k.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
          case GGMLType::GGML_TYPE_F32: {
            size_t num_elements = info.size_in_bytes / sizeof(float);
            model.embed_tokens_f32.resize(num_elements);
            std::memcpy(model.embed_tokens_f32.data(), tensor_data_ptr, info.size_in_bytes);
            break;
          }
        }
        processed_count++;
        continue;
      }

      if (target_field_key == "output_norm.weight") {
        if (info.type == GGMLType::GGML_TYPE_F32) {
          size_t num_elements = info.size_in_bytes / sizeof(float);
          model.final_norm_f32.resize(num_elements);
          std::memcpy(model.final_norm_f32.data(), tensor_data_ptr, info.size_in_bytes);
        }
        processed_count++;
        continue;
      }

    } catch (const std::exception& e) {
      error_count++;
    }
  }

  // ULTRA-FAST: Process layers in parallel with pre-sorted tensors
  #pragma omp parallel for schedule(static) if(model.layers.size() > 4)
  for (int layer_idx = 0; layer_idx < (int)layer_tensor_indices.size(); ++layer_idx) {
    const auto& layer_indices = layer_tensor_indices[layer_idx];
    if (layer_indices.empty()) continue;

    LayerWeights& layer = model.layers[layer_idx];

    try {
      for (size_t idx : layer_indices) {
        const std::string& name = tensor_pairs[idx].first;
        const GGUFTensorInfo& info = tensor_pairs[idx].second;
        const uint8_t* tensor_data_ptr = actual_data_block_start + info.offset;

        // BLAZING FAST: Direct character matching (faster than hashing)
        const size_t last_dot = name.find_last_of('.');
        if (last_dot == std::string::npos) continue;

        const char* field = name.c_str() + name.find('.', 4) + 1;

        // BLAZING FAST: Direct character-based dispatch without string operations
        #define FAST_COPY_WEIGHT(target_vec, block_type) \
          target_vec.resize(info.size_in_bytes / sizeof(block_type)); \
          std::memcpy(target_vec.data(), tensor_data_ptr, info.size_in_bytes);

        // IMPROVED: Pattern matching based on tensor name structure
        const char* name_cstr = name.c_str();
        const size_t name_len = name.length();

        if (name_len > 10 && name.find("attn_") != std::string::npos) {
          if (name.find("attn_q.weight") != std::string::npos) {
            switch (info.type) {
              case GGMLType::GGML_TYPE_Q8_0: FAST_COPY_WEIGHT(layer.q_proj_q8_0, block_q8_0); break;
              case GGMLType::GGML_TYPE_Q4_K: FAST_COPY_WEIGHT(layer.q_proj_q4k, block_q4_K); break;
              case GGMLType::GGML_TYPE_Q6_K: FAST_COPY_WEIGHT(layer.q_proj_q6k, block_q6_K); break;
              case GGMLType::GGML_TYPE_Q8_K: FAST_COPY_WEIGHT(layer.q_proj_q8k, block_q8_K); break;
              case GGMLType::GGML_TYPE_BF16: FAST_COPY_WEIGHT(layer.q_proj, uint16_t); break;
            }
          } else if (name.find("attn_k.weight") != std::string::npos) {
            switch (info.type) {
              case GGMLType::GGML_TYPE_Q8_0: FAST_COPY_WEIGHT(layer.k_proj_q8_0, block_q8_0); break;
              case GGMLType::GGML_TYPE_Q4_K: FAST_COPY_WEIGHT(layer.k_proj_q4k, block_q4_K); break;
              case GGMLType::GGML_TYPE_Q6_K: FAST_COPY_WEIGHT(layer.k_proj_q6k, block_q6_K); break;
              case GGMLType::GGML_TYPE_Q8_K: FAST_COPY_WEIGHT(layer.k_proj_q8k, block_q8_K); break;
              case GGMLType::GGML_TYPE_BF16: FAST_COPY_WEIGHT(layer.k_proj, uint16_t); break;
            }
          } else if (name.find("attn_v.weight") != std::string::npos) {
            switch (info.type) {
              case GGMLType::GGML_TYPE_Q8_0: FAST_COPY_WEIGHT(layer.v_proj_q8_0, block_q8_0); break;
              case GGMLType::GGML_TYPE_Q4_K: FAST_COPY_WEIGHT(layer.v_proj_q4k, block_q4_K); break;
              case GGMLType::GGML_TYPE_Q6_K: FAST_COPY_WEIGHT(layer.v_proj_q6k, block_q6_K); break;
              case GGMLType::GGML_TYPE_Q8_K: FAST_COPY_WEIGHT(layer.v_proj_q8k, block_q8_K); break;
              case GGMLType::GGML_TYPE_BF16: FAST_COPY_WEIGHT(layer.v_proj, uint16_t); break;
            }
          } else if (name.find("attn_output.weight") != std::string::npos) {
            switch (info.type) {
              case GGMLType::GGML_TYPE_Q8_0: FAST_COPY_WEIGHT(layer.o_proj_q8_0, block_q8_0); break;
              case GGMLType::GGML_TYPE_Q4_K: FAST_COPY_WEIGHT(layer.o_proj_q4k, block_q4_K); break;
              case GGMLType::GGML_TYPE_Q6_K: FAST_COPY_WEIGHT(layer.o_proj_q6k, block_q6_K); break;
              case GGMLType::GGML_TYPE_Q8_K: FAST_COPY_WEIGHT(layer.o_proj_q8k, block_q8_K); break;
              case GGMLType::GGML_TYPE_BF16: FAST_COPY_WEIGHT(layer.o_proj, uint16_t); break;
            }
          } else if (name.find("attn_norm.weight") != std::string::npos && info.type == GGMLType::GGML_TYPE_F32) {
            FAST_COPY_WEIGHT(layer.input_layernorm_f32, float);
          }
        } else if (name_len > 10 && name.find("ffn_") != std::string::npos) {
          if (name.find("ffn_gate.weight") != std::string::npos) {
            switch (info.type) {
              case GGMLType::GGML_TYPE_Q8_0: FAST_COPY_WEIGHT(layer.gate_proj_q8_0, block_q8_0); break;
              case GGMLType::GGML_TYPE_Q4_K: FAST_COPY_WEIGHT(layer.gate_proj_q4k, block_q4_K); break;
              case GGMLType::GGML_TYPE_Q6_K: FAST_COPY_WEIGHT(layer.gate_proj_q6k, block_q6_K); break;
              case GGMLType::GGML_TYPE_Q8_K: FAST_COPY_WEIGHT(layer.gate_proj_q8k, block_q8_K); break;
              case GGMLType::GGML_TYPE_BF16: FAST_COPY_WEIGHT(layer.gate_proj, uint16_t); break;
            }
          } else if (name.find("ffn_up.weight") != std::string::npos) {
            switch (info.type) {
              case GGMLType::GGML_TYPE_Q8_0: FAST_COPY_WEIGHT(layer.up_proj_q8_0, block_q8_0); break;
              case GGMLType::GGML_TYPE_Q4_K: FAST_COPY_WEIGHT(layer.up_proj_q4k, block_q4_K); break;
              case GGMLType::GGML_TYPE_Q6_K: FAST_COPY_WEIGHT(layer.up_proj_q6k, block_q6_K); break;
              case GGMLType::GGML_TYPE_Q8_K: FAST_COPY_WEIGHT(layer.up_proj_q8k, block_q8_K); break;
              case GGMLType::GGML_TYPE_BF16: FAST_COPY_WEIGHT(layer.up_proj, uint16_t); break;
            }
          } else if (name.find("ffn_down.weight") != std::string::npos) {
            switch (info.type) {
              case GGMLType::GGML_TYPE_Q8_0: FAST_COPY_WEIGHT(layer.down_proj_q8_0, block_q8_0); break;
              case GGMLType::GGML_TYPE_Q4_K: FAST_COPY_WEIGHT(layer.down_proj_q4k, block_q4_K); break;
              case GGMLType::GGML_TYPE_Q6_K: FAST_COPY_WEIGHT(layer.down_proj_q6k, block_q6_K); break;
              case GGMLType::GGML_TYPE_Q8_K: FAST_COPY_WEIGHT(layer.down_proj_q8k, block_q8_K); break;
              case GGMLType::GGML_TYPE_BF16: FAST_COPY_WEIGHT(layer.down_proj, uint16_t); break;
            }
          } else if (name.find("ffn_norm.weight") != std::string::npos && info.type == GGMLType::GGML_TYPE_F32) {
            FAST_COPY_WEIGHT(layer.post_attention_layernorm_f32, float);
          }
        }

        #undef FAST_COPY_WEIGHT
      }
      processed_count++;
    } catch (const std::exception& e) {
      error_count++;
    }
  }

  Logger::info("Finished mapping GGUF weights: " + std::to_string(processed_count.load()) + "/" +
               std::to_string(num_tensors) + " tensors processed successfully (errors: " +
               std::to_string(error_count.load()) + ") with ultra-optimized parallel mapping");
}
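
// Tensor names handled by map_gguf_weights: the global tensors "output.weight",
// "token_embd.weight" and "output_norm.weight", plus per-layer tensors named
// "blk.<layer>.<field>.weight" where <field> is one of attn_q, attn_k, attn_v, attn_output,
// attn_norm, ffn_gate, ffn_up, ffn_down or ffn_norm. The quantization types dispatched on are
// Q8_0, Q4_K, Q6_K, Q8_K, BF16 and F32 (see the switch statements above for which types each
// tensor accepts).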

#else // HAS_CUDA

void TinyLlamaModel::ensure_f32_concatenated_weights_loaded() {
  Logger::info("CPU-only build: ensure_f32_concatenated_weights_loaded is a no-op");
}

void TinyLlamaModel::ensure_bf16_concatenated_weights_loaded() {
  Logger::info("CPU-only build: ensure_bf16_concatenated_weights_loaded is a no-op");
}

void TinyLlamaModel::free_bf16_concatenated_weights() {
  Logger::info("CPU-only build: free_bf16_concatenated_weights is a no-op");
}

#endif // HAS_CUDA