TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
gpu_initialization.cpp
2#include "logger.h"
3#include "utils.h"
4#include "quantization.h"
5#include "model_constants.h"
6#include "model_macros.h"
7#include "weight_management.h"
8#ifdef HAS_CUDA
9#include "cuda_kernels.h"
10#endif
11#include <algorithm>
12#include <cmath>
13#include <cstring>
14
15 void TinyLlamaModel::initialize_gpu_and_rope() { // NOTE: enclosing class name assumed; the member fields used below are declared in model.h
16 Logger::info("[INIT_GPU_ROPE_DEBUG_L1113] Absolute Start of initialize_gpu_and_rope: config_.num_cpu_offload_layers = " + std::to_string(config_.num_cpu_offload_layers) +
17 ", config_.num_hidden_layers = " + std::to_string(config_.num_hidden_layers));
18 Logger::info("[GPU_ROPE_INIT_ENTRY] Entered initialize_gpu_and_rope. Requested CPU Offload Layers: " + std::to_string(config_.num_cpu_offload_layers) + ", Total Hidden Layers: " + std::to_string(config_.num_hidden_layers));
19 int hs = config_.hidden_size;
20 int is = config_.intermediate_size;
21 int nhl = config_.num_hidden_layers;
22 int vs = config_.vocab_size;
23 int n_heads = config_.num_attention_heads;
24 int n_kv_heads = config_.num_key_value_heads;
25
26 int num_cpu_layers_clamped = config_.num_cpu_offload_layers;
27 if (num_cpu_layers_clamped < 0) num_cpu_layers_clamped = 0;
28 if (num_cpu_layers_clamped > nhl) {
29 Logger::warning("Requested CPU offload layers (" + std::to_string(config_.num_cpu_offload_layers) +
30 ") exceeds total hidden layers (" + std::to_string(nhl) +
31 "). Clamping to " + std::to_string(nhl) + " layers on CPU.");
32 num_cpu_layers_clamped = nhl;
33 }
34 int active_num_cpu_layers = num_cpu_layers_clamped;
35 int active_num_gpu_layers = nhl - active_num_cpu_layers;
36
37 Logger::info("Effective CPU layers for this init: " + std::to_string(active_num_cpu_layers) + ", Effective GPU layers for this init: " + std::to_string(active_num_gpu_layers));
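// Layer placement convention used throughout this function: model layers
// [0, active_num_cpu_layers) stay on the CPU, and layers
// [active_num_cpu_layers, num_hidden_layers) are offloaded to the GPU.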
38
39 if (hs <= 0) throw std::runtime_error("Invalid model config: hidden_size must be positive.");
40 if (vs <= 0) throw std::runtime_error("Invalid model config: vocab_size must be positive.");
41 if (n_heads <= 0) throw std::runtime_error("Invalid model config: num_attention_heads must be positive.");
42 if (n_kv_heads <= 0) throw std::runtime_error("Invalid model config: num_key_value_heads must be positive.");
43 if (hs % n_heads != 0) throw std::runtime_error("Invalid model config: hidden_size not divisible by num_attention_heads.");
44
45 int kv_dim = (hs / n_heads) * n_kv_heads;
46 int head_dim = hs / n_heads;
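// head_dim is the per-head width; kv_dim is the total K/V projection width under
// grouped-query attention (KV heads are shared). Illustrative numbers only, for a
// TinyLlama-1.1B-style config: hs = 2048, n_heads = 32, n_kv_heads = 4
// => head_dim = 64 and kv_dim = 64 * 4 = 256.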
47
48 Logger::info("Precomputing RoPE frequencies on CPU (always done).");
49 int max_seq_len = config_.max_position_embeddings;
50 precomputed_freqs_cis_.resize((max_seq_len * head_dim) / 2);
51 float theta = config_.rope_theta;
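// RoPE precompute: for each position pos and even rotary index i_rope, the pair
// stored at [pos * head_dim/2 + i_rope/2] is
//   ( cos(pos * theta^(-i_rope/head_dim)), sin(pos * theta^(-i_rope/head_dim)) ),
// i.e. one cos/sin pair per 2-element rotation later applied to q/k.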
52 for (int pos = 0; pos < max_seq_len; ++pos) {
53 for (int i_rope = 0; i_rope < head_dim; i_rope += 2) {
54 float freq = std::pow(theta, -((float)i_rope) / head_dim);
55 float angle = pos * freq;
56 precomputed_freqs_cis_[(pos * head_dim / 2) + (i_rope / 2)] = {std::cos(angle), std::sin(angle)};
57 }
58 }
59 Logger::info("Finished precomputing RoPE cos/sin frequencies on CPU.");
60
61#ifdef HAS_CUDA
62#define SAFE_CUDA_FREE(ptr) if(ptr) { cudaFree(ptr); ptr = nullptr; }
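// Frees a device pointer only if it is non-null and resets it to nullptr, so the
// cleanup/reallocation code below stays safe across repeated initialization.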
63
64 if (active_num_gpu_layers == 0) {
65 Logger::info("No layers assigned to GPU (active_num_gpu_layers = 0). Cleaning up existing CUDA resources and skipping GPU initialization.");
66
67 SAFE_CUDA_FREE(final_norm_dev);
68 for (int i = 0; i < nhl; ++i) {
69 SAFE_CUDA_FREE(layers[i].input_layernorm_dev);
70 SAFE_CUDA_FREE(layers[i].post_attention_layernorm_dev);
71 }
72 SAFE_CUDA_FREE(token_embedding_table_dev_);
73 SAFE_CUDA_FREE(lm_head_dev_);
74 SAFE_CUDA_FREE(w_q_dev_); SAFE_CUDA_FREE(w_k_dev_); SAFE_CUDA_FREE(w_v_dev_); SAFE_CUDA_FREE(w_o_dev_);
75 SAFE_CUDA_FREE(w_gate_dev_); SAFE_CUDA_FREE(w_up_dev_); SAFE_CUDA_FREE(w_down_dev_);
76 SAFE_CUDA_FREE(all_freqs_cis_dev);
77 SAFE_CUDA_FREE(x_dev_); SAFE_CUDA_FREE(x_norm_dev_); SAFE_CUDA_FREE(x_resid1_dev_); SAFE_CUDA_FREE(x_resid2_dev_);
78 SAFE_CUDA_FREE(q_dev_); SAFE_CUDA_FREE(k_dev_); SAFE_CUDA_FREE(v_dev_); SAFE_CUDA_FREE(attn_out_dev_);
79 SAFE_CUDA_FREE(attn_proj_dev_); SAFE_CUDA_FREE(gate_vec_dev_); SAFE_CUDA_FREE(up_vec_dev_);
80 SAFE_CUDA_FREE(swiglu_vec_dev_); SAFE_CUDA_FREE(mlp_down_dev_); SAFE_CUDA_FREE(logits_dev_);
81 SAFE_CUDA_FREE(token_embedding_table_f32_dev_);
82 SAFE_CUDA_FREE(lm_head_f32_dev_);
83 SAFE_CUDA_FREE(w_q_f32_dev_); SAFE_CUDA_FREE(w_k_f32_dev_); SAFE_CUDA_FREE(w_v_f32_dev_); SAFE_CUDA_FREE(w_o_f32_dev_);
84 SAFE_CUDA_FREE(w_gate_f32_dev_); SAFE_CUDA_FREE(w_up_f32_dev_); SAFE_CUDA_FREE(w_down_f32_dev_);
85
86 if (cublas_handle_) { cublasDestroy(cublas_handle_); cublas_handle_ = nullptr; }
87 return;
88 }
89
90 Logger::info("Initializing CUDA resources for " + std::to_string(active_num_gpu_layers) + " GPU layers.");
91 if (!cublas_handle_) {
92 cublasStatus_t cublas_status = cublasCreate(&cublas_handle_);
93 if (cublas_status != CUBLAS_STATUS_SUCCESS) {
94 throw std::runtime_error("Failed to initialize cuBLAS: " + std::to_string(cublas_status));
95 }
96 Logger::info("cuBLAS handle created successfully.");
97
98 // Check for BF16 Tensor Core support
99 this->use_bf16_tensor_cores_ = false; // Default to false
100 cudaDeviceProp props;
101 int current_device;
102 gpuErrchk(cudaGetDevice(&current_device));
103 gpuErrchk(cudaGetDeviceProperties(&props, current_device));
104
105 bool has_bf16_tensor_core_hw = ((props.major == 7 && props.minor == 5) || props.major >= 8);
106 bool dimensions_ok_for_tensor_cores = (config_.hidden_size % 8 == 0) && (config_.vocab_size % 4 == 0);
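// Tensor-core path gating: the BF16 matvec path is enabled only when (a) the GPU
// reports a tensor-core-capable compute capability and (b) hidden_size % 8 == 0 and
// vocab_size % 4 == 0, the divisibility the matvec_bf16_f32 path expects per the log
// messages below. Note: the hardware check also accepts CC 7.5 (Turing), although
// BF16 tensor cores were introduced with CC 8.0 (Ampere).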
107
108 if (has_bf16_tensor_core_hw) {
109 if (dimensions_ok_for_tensor_cores) {
110 Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) + ") supports BF16 Tensor Cores AND model dimensions (hs: " + std::to_string(config_.hidden_size) + ", vs: " + std::to_string(config_.vocab_size) + ") are compatible. Enabling Tensor Core path for matvec_bf16_f32.");
111 this->use_bf16_tensor_cores_ = true;
112 } else {
113 Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) + ") supports BF16 Tensor Cores, BUT model dimensions (hs: " + std::to_string(config_.hidden_size) + " (must be div by 8), vs: " + std::to_string(config_.vocab_size) + " (must be div by 4)) are NOT compatible. Disabling Tensor Core path for matvec_bf16_f32.");
114 }
115 } else {
116 Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) + ") does not meet criteria for BF16 Tensor Core use (requires CC >= 7.5). Disabling Tensor Core path for matvec_bf16_f32.");
117 }
118 }
119
120 if (final_norm_f32.empty() && !final_norm.empty()) {
121 Logger::info("Converting final_norm (BF16) to FP32 for GPU.");
122 final_norm_f32 = bf16vec_to_float_vec(final_norm);
123 }
124 if (!final_norm_f32.empty()) {
125 SAFE_CUDA_FREE(final_norm_dev);
126 gpuErrchk(cudaMalloc(&final_norm_dev, final_norm_f32.size() * sizeof(float)));
127 gpuErrchk(cudaMemcpy(final_norm_dev, final_norm_f32.data(), final_norm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
128 Logger::info("Copied final_norm weights (FP32) to GPU.");
129 } else {
130 Logger::warning("Final norm weights (FP32) are empty, skipping GPU copy. This might be an issue if GPU layers are expected to use it.");
131 }
132
133 for (int i = 0; i < active_num_cpu_layers; ++i) {
134 if (static_cast<size_t>(i) < layers.size()) {
135 SAFE_CUDA_FREE(layers[i].input_layernorm_dev);
136 SAFE_CUDA_FREE(layers[i].post_attention_layernorm_dev);
137 }
138 }
139 Logger::info("Copying layer norm weights (FP32) to GPU for layers " + std::to_string(active_num_cpu_layers) + " to " + std::to_string(nhl - 1));
140 Logger::info("[INIT_DEBUG_PRE_LOOP] Active CPU layers: " + std::to_string(active_num_cpu_layers));
141 if (nhl > 0 && layers.size() > 0) {
142 Logger::info("[INIT_DEBUG_PRE_LOOP] layers[0].input_layernorm_f32.empty(): " + std::string(layers[0].input_layernorm_f32.empty() ? "YES" : "NO") +
143 ", Size: " + std::to_string(layers[0].input_layernorm_f32.size()));
144 }
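// For every GPU-resident layer, copy the two per-layer norm weight vectors
// (input_layernorm and post_attention_layernorm) to the device, converting the host
// copy from BF16 to FP32 first when only the BF16 tensor is present.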
145 for (int i = active_num_cpu_layers; i < nhl; ++i) {
146 if (static_cast<size_t>(i) >= layers.size()) {
147 Logger::error("Layer index " + std::to_string(i) + " out of bounds for layers vector (size: " + std::to_string(layers.size()) + ")");
148 continue;
149 }
150 SAFE_CUDA_FREE(layers[i].input_layernorm_dev);
151 SAFE_CUDA_FREE(layers[i].post_attention_layernorm_dev);
152
153 if (layers[i].input_layernorm_f32.empty() && !layers[i].input_layernorm.empty()) {
154 layers[i].input_layernorm_f32 = bf16vec_to_float_vec(layers[i].input_layernorm);
155 }
156 if (layers[i].post_attention_layernorm_f32.empty() && !layers[i].post_attention_layernorm.empty()) {
157 layers[i].post_attention_layernorm_f32 = bf16vec_to_float_vec(layers[i].post_attention_layernorm);
158 }
159
160 if (!layers[i].input_layernorm_f32.empty()) {
161 gpuErrchk(cudaMalloc(&layers[i].input_layernorm_dev, layers[i].input_layernorm_f32.size() * sizeof(float)));
162 gpuErrchk(cudaMemcpy(layers[i].input_layernorm_dev, layers[i].input_layernorm_f32.data(), layers[i].input_layernorm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
163 if (i == active_num_cpu_layers) {
164 Logger::info("[INIT_DEBUG] layers[" + std::to_string(i) + "].input_layernorm_dev allocated. Pointer: " + Logger::ptrToString(layers[i].input_layernorm_dev) +
165 ", Size used for malloc: " + std::to_string(layers[i].input_layernorm_f32.size() * sizeof(float)) + " bytes (" +
166 std::to_string(layers[i].input_layernorm_f32.size()) + " elements). Host vector empty: " + (layers[i].input_layernorm_f32.empty() ? "YES" : "NO"));
167 }
168 } else {
169 throw std::runtime_error("GPU Layer " + std::to_string(i) + ": input_layernorm_f32 weights are empty. Cannot offload to GPU without them.");
170 }
171
172 if (!layers[i].post_attention_layernorm_f32.empty()) {
173 gpuErrchk(cudaMalloc(&layers[i].post_attention_layernorm_dev, layers[i].post_attention_layernorm_f32.size() * sizeof(float)));
174 gpuErrchk(cudaMemcpy(layers[i].post_attention_layernorm_dev, layers[i].post_attention_layernorm_f32.data(), layers[i].post_attention_layernorm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
175 } else {
176 throw std::runtime_error("GPU Layer " + std::to_string(i) + ": post_attention_layernorm_f32 weights are empty. Cannot offload to GPU without them.");
177 }
178 }
179 Logger::info("Finished processing layer norm weights for GPU layers.");
180
181
182 SAFE_CUDA_FREE(token_embedding_table_dev_);
183 SAFE_CUDA_FREE(token_embedding_table_f32_dev_);
185 bool token_embeddings_processed_to_gpu_bf16 = false;
186
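// Token embedding table: whichever host representation is present is turned into a
// single BF16 copy on the GPU. Preference order below: native BF16 -> FP32 (cast to
// BF16) -> Q8_0 -> Q4_K -> Q6_K; the quantized forms are dequantized to FP32 on the
// host and then cast to BF16 before upload.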
187 if (active_num_gpu_layers > 0) {
188 if (!embed_tokens.empty()) {
189 gpuErrchk(cudaMalloc(&token_embedding_table_dev_, embed_tokens.size() * sizeof(uint16_t)));
190 gpuErrchk(cudaMemcpy(token_embedding_table_dev_, embed_tokens.data(), embed_tokens.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
191 Logger::info("Copied token_embedding_table (bf16 direct from model.embed_tokens) to GPU.");
192 token_embeddings_processed_to_gpu_bf16 = true;
193 }
194 else if (!embed_tokens_f32.empty()) {
195 std::vector<uint16_t> bf16_data(embed_tokens_f32.size());
196 #pragma omp parallel for
197 for (int i = 0; i < (int)embed_tokens_f32.size(); ++i) {
198 bf16_data[i] = float32_to_bfloat16(embed_tokens_f32[i]);
199 }
200 gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
201 gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
202 Logger::info("Converted token_embedding_table (fp32 source -> bf16) to GPU.");
203 token_embeddings_processed_to_gpu_bf16 = true;
204 }
205 else if (!embed_tokens_q8_0.empty()) {
206 std::vector<float> temp_f32_data(embed_tokens_q8_0.size() * GGML_QK8_0);
207 #pragma omp parallel for
208 for (int i = 0; i < (int)embed_tokens_q8_0.size(); ++i) {
209 dequantize_q8_0_block(&embed_tokens_q8_0[i], &temp_f32_data[i * GGML_QK8_0]);
210 }
211 std::vector<uint16_t> bf16_data(temp_f32_data.size());
212 #pragma omp parallel for
213 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
214 bf16_data[i] = float32_to_bfloat16(temp_f32_data[i]);
215 }
216 gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
217 gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
218 Logger::info("Dequantized token_embedding_table (Q8_0 -> fp32 -> bf16) to GPU.");
219 token_embeddings_processed_to_gpu_bf16 = true;
220 }
221 else if (!embed_tokens_q4k.empty()) {
222 std::vector<float> temp_f32_data(embed_tokens_q4k.size() * GGML_QK_K);
223 #pragma omp parallel for
224 for (int i = 0; i < (int)embed_tokens_q4k.size(); ++i) {
225 dequantize_q4_k_m(&embed_tokens_q4k[i], &temp_f32_data[i * GGML_QK_K], GGML_QK_K);
226 }
227 std::vector<uint16_t> bf16_data(temp_f32_data.size());
228 #pragma omp parallel for
229 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
230 bf16_data[i] = float32_to_bfloat16(temp_f32_data[i]);
231 }
232 gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
233 gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
234 Logger::info("Dequantized token_embedding_table (Q4_K -> fp32 -> bf16) to GPU.");
235 token_embeddings_processed_to_gpu_bf16 = true;
236 }
237 else if (!embed_tokens_q6k.empty()) {
238 std::vector<float> temp_f32_data(embed_tokens_q6k.size() * GGML_QK_K);
239 #pragma omp parallel for
240 for (int i = 0; i < (int)embed_tokens_q6k.size(); ++i) {
241 dequantize_q6_k(&embed_tokens_q6k[i], &temp_f32_data[i * GGML_QK_K], GGML_QK_K);
242 }
243 std::vector<uint16_t> bf16_data(temp_f32_data.size());
244 #pragma omp parallel for
245 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
246 bf16_data[i] = float32_to_bfloat16(temp_f32_data[i]);
247 }
248 gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
249 gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
250 Logger::info("Dequantized token_embedding_table (Q6_K -> fp32 -> bf16) to GPU.");
251 token_embeddings_processed_to_gpu_bf16 = true;
252 }
253
254 if (token_embeddings_processed_to_gpu_bf16) {
255 Logger::info("[INIT_DEBUG] token_embedding_table_dev_ (BF16 on GPU) processed. Pointer: " + Logger::ptrToString(token_embedding_table_dev_) +
256 ". Flag token_embeddings_processed_to_gpu_bf16: YES");
257 }
258 if (!token_embeddings_processed_to_gpu_bf16 && active_num_gpu_layers > 0) {
259 Logger::warning("Token embeddings were not processed to GPU as BF16, despite GPU layers being active. This might indicate missing source embedding data in the model structure or an unhandled GGUF type for embeddings.");
260 }
261 } else {
262 Logger::info("No GPU layers active, skipping token embedding table processing for GPU.");
263 }
264
265 SAFE_CUDA_FREE(lm_head_dev_);
266 SAFE_CUDA_FREE(lm_head_f32_dev_);
268 bool lm_head_processed_to_gpu_bf16 = false;
269
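// The LM head follows the same ladder as the embedding table above: the first
// available host representation (BF16, FP32, Q8_0, Q4_K, or Q6_K) ends up as a BF16
// copy on the GPU.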
270 if (active_num_gpu_layers > 0) {
271 if (!lm_head.empty()) {
272 gpuErrchk(cudaMalloc(&lm_head_dev_, lm_head.size() * sizeof(uint16_t)));
273 gpuErrchk(cudaMemcpy(lm_head_dev_, lm_head.data(), lm_head.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
274 Logger::info("Copied lm_head (bf16 direct from model.lm_head) to GPU.");
275 lm_head_processed_to_gpu_bf16 = true;
276 }
277 else if (!lm_head_f32.empty()) {
278 std::vector<uint16_t> bf16_data(lm_head_f32.size());
279 #pragma omp parallel for
280 for (int i = 0; i < (int)lm_head_f32.size(); ++i) {
281 bf16_data[i] = float32_to_bfloat16(lm_head_f32[i]);
282 }
283 gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
284 gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
285 Logger::info("Converted lm_head (fp32 source -> bf16) to GPU.");
286 lm_head_processed_to_gpu_bf16 = true;
287 }
288 else if (!lm_head_q8_0.empty()) {
289 std::vector<float> temp_f32_data(lm_head_q8_0.size() * GGML_QK8_0);
290 #pragma omp parallel for
291 for (int i = 0; i < (int)lm_head_q8_0.size(); ++i) {
292 dequantize_q8_0_block(&lm_head_q8_0[i], &temp_f32_data[i * GGML_QK8_0]);
293 }
294 std::vector<uint16_t> bf16_data(temp_f32_data.size());
295 #pragma omp parallel for
296 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
297 bf16_data[i] = float32_to_bfloat16(temp_f32_data[i]);
298 }
299 gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
300 gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
301 Logger::info("Dequantized lm_head (Q8_0 -> fp32 -> bf16) to GPU.");
302 lm_head_processed_to_gpu_bf16 = true;
303 }
304 else if (!lm_head_q4k.empty()) {
305 std::vector<float> temp_f32_data(lm_head_q4k.size() * GGML_QK_K);
306 #pragma omp parallel for
307 for (int i = 0; i < (int)lm_head_q4k.size(); ++i) {
308 dequantize_q4_k_m(&lm_head_q4k[i], &temp_f32_data[i * GGML_QK_K], GGML_QK_K);
309 }
310 std::vector<uint16_t> bf16_data(temp_f32_data.size());
311 #pragma omp parallel for
312 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
313 bf16_data[i] = float32_to_bfloat16(temp_f32_data[i]);
314 }
315 gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
316 gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
317 Logger::info("Dequantized lm_head (Q4_K -> fp32 -> bf16) to GPU.");
318 lm_head_processed_to_gpu_bf16 = true;
319 }
320 else if (!lm_head_q6k.empty()) {
321 std::vector<float> temp_f32_data(lm_head_q6k.size() * GGML_QK_K);
322 #pragma omp parallel for
323 for (int i = 0; i < (int)lm_head_q6k.size(); ++i) {
324 dequantize_q6_k(&lm_head_q6k[i], &temp_f32_data[i * GGML_QK_K], GGML_QK_K);
325 }
326 std::vector<uint16_t> bf16_data(temp_f32_data.size());
327 #pragma omp parallel for
328 for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
329 bf16_data[i] = float32_to_bfloat16(temp_f32_data[i]);
330 }
331 gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
332 gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(), bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
333 Logger::info("Dequantized lm_head (Q6_K -> fp32 -> bf16) to GPU.");
334 lm_head_processed_to_gpu_bf16 = true;
335 }
336
337 if (!lm_head_processed_to_gpu_bf16) {
338 Logger::warning("LM head was not processed to GPU as BF16, despite GPU layers being active. This might indicate missing source LM head data in the model structure or an unhandled GGUF type for LM head.");
339 }
340 } else {
341 Logger::info("No GPU layers active, skipping LM head processing for GPU.");
342 }
343
344 SAFE_CUDA_FREE(lm_head_f32_dev_);
345
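// In addition to the BF16 copy above, an FP32 copy of the LM head is kept on the GPU
// (lm_head_f32_dev_); the error message below indicates it feeds the final
// cublasSgemm-based logits matvec.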
346 if (active_num_gpu_layers > 0) {
347 if (!lm_head_f32.empty()) {
348 gpuErrchk(cudaMalloc(&lm_head_f32_dev_, lm_head_f32.size() * sizeof(float)));
349 gpuErrchk(cudaMemcpy(lm_head_f32_dev_, lm_head_f32.data(), lm_head_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
350 Logger::info("[INIT_GPU_ROPE] Copied lm_head_f32 (host FP32) to GPU for lm_head_f32_dev_. Pointer: " + Logger::ptrToString(lm_head_f32_dev_));
351 } else {
352 Logger::error("[INIT_GPU_ROPE] Host lm_head_f32 is EMPTY. Cannot populate lm_head_f32_dev_. This WILL CAUSE a cublasSgemm error in the final matvec. Check model loading and initialize_weights logic for lm_head_f32 population.");
353 lm_head_f32_dev_ = nullptr;
354 }
355 } else {
356 lm_head_f32_dev_ = nullptr;
357 }
358
359
360 Logger::info("Finished processing embedding and LM head tables for GPU.");
361
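// The host-side (cos, sin) pairs are flattened into an interleaved float buffer
// [cos0, sin0, cos1, sin1, ...] before being copied to the persistent device buffer.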
362 SAFE_CUDA_FREE(all_freqs_cis_dev);
363 if (active_num_gpu_layers > 0) {
364 if (!precomputed_freqs_cis_.empty()) {
365 size_t total_freq_elements = precomputed_freqs_cis_.size() * 2;
366 gpuErrchk(cudaMalloc(&all_freqs_cis_dev, total_freq_elements * sizeof(float)));
367 std::vector<float> flat_host_freqs; flat_host_freqs.reserve(total_freq_elements);
368 for (const auto& p : precomputed_freqs_cis_) { flat_host_freqs.push_back(p.first); flat_host_freqs.push_back(p.second); }
369 gpuErrchk(cudaMemcpy(all_freqs_cis_dev, flat_host_freqs.data(), total_freq_elements * sizeof(float), cudaMemcpyHostToDevice));
370 Logger::info("Copied all precomputed RoPE frequencies to persistent GPU buffer.");
371 } else {
372 Logger::warning("Host precomputed_freqs_cis_ is empty. Skipping GPU RoPE buffer allocation. This WILL cause issues if GPU layers use RoPE.");
373 }
374 Logger::info("Finished processing RoPE frequencies for GPU.");
375 } else {
376 Logger::info("No GPU layers active, skipping RoPE GPU buffer allocation.");
377 }
378
379 if (active_num_gpu_layers > 0) {
380 Logger::info("Allocating/Reallocating persistent GPU workspace buffers for " + std::to_string(active_num_gpu_layers) + " GPU layers.");
381 size_t hs_bytes = (size_t)hs * sizeof(float);
382 size_t is_bytes = (size_t)is * sizeof(float);
383 size_t vs_bytes = (size_t)vs * sizeof(float);
384 size_t k_dev_size_bytes = (size_t)n_kv_heads * head_dim * sizeof(float);
385 size_t v_dev_size_bytes = (size_t)n_kv_heads * head_dim * sizeof(float);
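// Per-token workspace sizes, in bytes. Illustrative numbers only, assuming a
// TinyLlama-1.1B-style config (hs = 2048, is = 5632, vs = 32000, n_kv_heads = 4,
// head_dim = 64): hs_bytes = 8 KiB, is_bytes = 22 KiB, vs_bytes = 125 KiB, and the
// k/v buffers = 1 KiB each.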
386
387#define REALLOC_GPU_WORKSPACE(ptr, sz) SAFE_CUDA_FREE(ptr); gpuErrchk(cudaMalloc(&ptr, sz));
388 REALLOC_GPU_WORKSPACE(x_dev_, hs_bytes);
389 REALLOC_GPU_WORKSPACE(x_norm_dev_, hs_bytes);
390 REALLOC_GPU_WORKSPACE(x_resid1_dev_, hs_bytes);
391 REALLOC_GPU_WORKSPACE(x_resid2_dev_, hs_bytes);
392 REALLOC_GPU_WORKSPACE(q_dev_, hs_bytes);
393 REALLOC_GPU_WORKSPACE(k_dev_, k_dev_size_bytes);
394 REALLOC_GPU_WORKSPACE(v_dev_, v_dev_size_bytes);
395 REALLOC_GPU_WORKSPACE(attn_out_dev_, hs_bytes);
396 REALLOC_GPU_WORKSPACE(attn_proj_dev_, hs_bytes);
397 REALLOC_GPU_WORKSPACE(gate_vec_dev_, is_bytes);
398 REALLOC_GPU_WORKSPACE(up_vec_dev_, is_bytes);
399 REALLOC_GPU_WORKSPACE(swiglu_vec_dev_, is_bytes);
400 REALLOC_GPU_WORKSPACE(mlp_down_dev_, hs_bytes);
401 REALLOC_GPU_WORKSPACE(logits_dev_, vs_bytes);
402 Logger::info("Finished allocating/reallocating GPU workspace buffers.");
403 } else {
404 Logger::info("No GPU layers active, skipping GPU workspace buffer allocation.");
405 SAFE_CUDA_FREE(x_dev_); SAFE_CUDA_FREE(x_norm_dev_); SAFE_CUDA_FREE(x_resid1_dev_); SAFE_CUDA_FREE(x_resid2_dev_);
406 SAFE_CUDA_FREE(q_dev_); SAFE_CUDA_FREE(k_dev_); SAFE_CUDA_FREE(v_dev_); SAFE_CUDA_FREE(attn_out_dev_); SAFE_CUDA_FREE(attn_proj_dev_);
407 SAFE_CUDA_FREE(gate_vec_dev_); SAFE_CUDA_FREE(up_vec_dev_); SAFE_CUDA_FREE(swiglu_vec_dev_); SAFE_CUDA_FREE(mlp_down_dev_); SAFE_CUDA_FREE(logits_dev_);
408 }
409
410 if (active_num_gpu_layers > 0) {
411 selective_dequant_buffer_size_ = static_cast<size_t>(config_.max_position_embeddings) * head_dim;
412 size_t selective_buffer_bytes = selective_dequant_buffer_size_ * sizeof(float);
413
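// The "selective" dequantization buffers hold max_position_embeddings * head_dim
// floats each, i.e. one KV head's worth of cache rather than all n_kv_heads heads,
// which is the saving the log message below quantifies.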
414 if (selective_dequant_buffer_size_ > 0) {
415 SAFE_CUDA_FREE(selective_k_dequant_buffer_dev_);
416 gpuErrchk(cudaMalloc(&selective_k_dequant_buffer_dev_, selective_buffer_bytes));
417 SAFE_CUDA_FREE(selective_v_dequant_buffer_dev_);
418 gpuErrchk(cudaMalloc(&selective_v_dequant_buffer_dev_, selective_buffer_bytes));
419 Logger::info("Allocated SELECTIVE KVCache dequantization buffers (K and V) on GPU. Size per buffer: " +
420 std::to_string(selective_buffer_bytes / (1024.0 * 1024.0)) + " MB (vs " +
421 std::to_string((static_cast<size_t>(config_.max_position_embeddings) * n_kv_heads * head_dim * sizeof(float)) / (1024.0 * 1024.0)) + " MB for full buffers)");
422 } else {
423 Logger::warning("Selective KVCache dequantization buffer size is 0. Skipping allocation.");
424 SAFE_CUDA_FREE(selective_k_dequant_buffer_dev_);
425 SAFE_CUDA_FREE(selective_v_dequant_buffer_dev_);
426 }
427
428 SAFE_CUDA_FREE(dequant_k_cache_buffer_dev_);
429 SAFE_CUDA_FREE(dequant_v_cache_buffer_dev_);
430 } else {
431 SAFE_CUDA_FREE(dequant_k_cache_buffer_dev_);
432 SAFE_CUDA_FREE(dequant_v_cache_buffer_dev_);
433 SAFE_CUDA_FREE(selective_k_dequant_buffer_dev_);
434 SAFE_CUDA_FREE(selective_v_dequant_buffer_dev_);
435 }
436
437 bool process_bf16_concat_weights = active_num_gpu_layers > 0 && !layers[active_num_cpu_layers].q_proj.empty();
438 if (process_bf16_concat_weights) {
439 size_t layer_q_size = (size_t)hs*hs, layer_k_size = (size_t)kv_dim*hs, layer_v_size = (size_t)kv_dim*hs, layer_o_size = (size_t)hs*hs;
440 size_t layer_gate_size = (size_t)is*hs, layer_up_size = (size_t)is*hs, layer_down_size = (size_t)hs*is;
441
442 std::vector<uint16_t> h_q, h_k, h_v, h_o, h_gate, h_up, h_down;
443 h_q.reserve(active_num_gpu_layers * layer_q_size); h_k.reserve(active_num_gpu_layers * layer_k_size);
444 h_v.reserve(active_num_gpu_layers * layer_v_size); h_o.reserve(active_num_gpu_layers * layer_o_size);
445 h_gate.reserve(active_num_gpu_layers * layer_gate_size); h_up.reserve(active_num_gpu_layers * layer_up_size);
446 h_down.reserve(active_num_gpu_layers * layer_down_size);
447
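// Concatenated BF16 layout: for each projection (Q, K, V, O, gate, up, down), the
// matrices of the GPU-resident layers are packed back to back in layer order, with an
// all-zero block substituted when a layer is missing that tensor, so each GPU layer's
// weights sit at a fixed offset within one device allocation.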
448 Logger::info("Concatenating BF16 weights for GPU layers on host (zero-padding if missing for a layer)...");
449 for (int i = 0; i < active_num_gpu_layers; ++i) {
450 int model_layer_idx = active_num_cpu_layers + i;
451 const auto& lw = layers[model_layer_idx];
452
453 if (!lw.q_proj.empty()) {
454 h_q.insert(h_q.end(), lw.q_proj.begin(), lw.q_proj.end());
455 } else {
456 h_q.insert(h_q.end(), layer_q_size, bfloat16::ZERO);
457 }
458
459 if (!lw.k_proj.empty()) {
460 h_k.insert(h_k.end(), lw.k_proj.begin(), lw.k_proj.end());
461 } else {
462 h_k.insert(h_k.end(), layer_k_size, bfloat16::ZERO);
463 }
464
465 if (!lw.v_proj.empty()) {
466 h_v.insert(h_v.end(), lw.v_proj.begin(), lw.v_proj.end());
467 } else {
468 h_v.insert(h_v.end(), layer_v_size, bfloat16::ZERO);
469 }
470
471 if (!lw.o_proj.empty()) {
472 h_o.insert(h_o.end(), lw.o_proj.begin(), lw.o_proj.end());
473 } else {
474 h_o.insert(h_o.end(), layer_o_size, bfloat16::ZERO);
475 }
476
477 if (!lw.gate_proj.empty()) {
478 h_gate.insert(h_gate.end(), lw.gate_proj.begin(), lw.gate_proj.end());
479 } else {
480 h_gate.insert(h_gate.end(), layer_gate_size, bfloat16::ZERO);
481 }
482
483 if (!lw.up_proj.empty()) {
484 h_up.insert(h_up.end(), lw.up_proj.begin(), lw.up_proj.end());
485 } else {
486 h_up.insert(h_up.end(), layer_up_size, bfloat16::ZERO);
487 }
488
489 if (!lw.down_proj.empty()) {
490 h_down.insert(h_down.end(), lw.down_proj.begin(), lw.down_proj.end());
491 } else {
492 h_down.insert(h_down.end(), layer_down_size, bfloat16::ZERO);
493 }
494 }
495
496#define ALLOC_COPY_CONCAT_BF16(dev_ptr, host_vec, weight_name_str) \
497 SAFE_CUDA_FREE(dev_ptr); \
498 if (!host_vec.empty()) { \
499 gpuErrchk(cudaMalloc(&dev_ptr, host_vec.size() * sizeof(uint16_t))); \
500 gpuErrchk(cudaMemcpy(dev_ptr, host_vec.data(), host_vec.size() * sizeof(uint16_t), cudaMemcpyHostToDevice)); \
501 Logger::info("Copied concatenated " weight_name_str " (BF16) to GPU for GPU layers."); \
502 } else if (active_num_gpu_layers > 0) { \
503 Logger::info("Host vector for concatenated " weight_name_str " (BF16) is empty. Skipping GPU copy."); \
504 }
505
506 ALLOC_COPY_CONCAT_BF16(w_q_dev_, h_q, "Q Proj"); ALLOC_COPY_CONCAT_BF16(w_k_dev_, h_k, "K Proj"); ALLOC_COPY_CONCAT_BF16(w_v_dev_, h_v, "V Proj");
507 ALLOC_COPY_CONCAT_BF16(w_o_dev_, h_o, "O Proj"); ALLOC_COPY_CONCAT_BF16(w_gate_dev_, h_gate, "Gate Proj");
508 ALLOC_COPY_CONCAT_BF16(w_up_dev_, h_up, "Up Proj"); ALLOC_COPY_CONCAT_BF16(w_down_dev_, h_down, "Down Proj");
509#undef ALLOC_COPY_CONCAT_BF16
510
511 } else {
512 Logger::info("Skipping BF16 concatenated layer weight processing (first GPU layer appears not to use BF16 q_proj, or no GPU layers).");
513 SAFE_CUDA_FREE(w_q_dev_); SAFE_CUDA_FREE(w_k_dev_); SAFE_CUDA_FREE(w_v_dev_); SAFE_CUDA_FREE(w_o_dev_);
514 SAFE_CUDA_FREE(w_gate_dev_); SAFE_CUDA_FREE(w_up_dev_); SAFE_CUDA_FREE(w_down_dev_);
515 }
516
517 Logger::info("DEFERRING concatenated F32 weight processing for GPU layers to save memory during initialization");
518 Logger::info("Concatenated F32 weights will be processed on-demand during first inference");
519
520 SAFE_CUDA_FREE(w_q_f32_dev_); SAFE_CUDA_FREE(w_k_f32_dev_); SAFE_CUDA_FREE(w_v_f32_dev_); SAFE_CUDA_FREE(w_o_f32_dev_);
521 SAFE_CUDA_FREE(w_gate_f32_dev_); SAFE_CUDA_FREE(w_up_f32_dev_); SAFE_CUDA_FREE(w_down_f32_dev_);
522
523 // Free BF16 concatenated weights
524 SAFE_CUDA_FREE(w_q_bf16_dev_); SAFE_CUDA_FREE(w_k_bf16_dev_); SAFE_CUDA_FREE(w_v_bf16_dev_); SAFE_CUDA_FREE(w_o_bf16_dev_);
525 SAFE_CUDA_FREE(w_gate_bf16_dev_); SAFE_CUDA_FREE(w_up_bf16_dev_); SAFE_CUDA_FREE(w_down_bf16_dev_);
526
527 Logger::info("Finished deferring concatenated F32 weight processing for GPU layers.");
528
529 // Allocate persistent batch processing buffers for GPU memory optimization
530 if (active_num_gpu_layers > 0) {
531 allocate_persistent_batch_buffers();
532 }
533
534#undef SAFE_CUDA_FREE
535#else
536 if (active_num_gpu_layers > 0 && nhl > 0) {
537 Logger::warning("CUDA not available, but " + std::to_string(active_num_gpu_layers) + " layer(s) were configured for GPU. Model will run entirely on CPU.");
538 } else {
539 Logger::info("CUDA not available or no GPU layers configured. Model will run entirely on CPU.");
540 }
541#endif
542}
static void warning(const std::string &message)
Definition logger.cpp:139
static std::string ptrToString(const void *ptr)
Definition logger.cpp:225
static void info(const std::string &message)
Definition logger.cpp:135
static void error(const std::string &message)
Definition logger.cpp:143
bool use_bf16_tensor_cores_
Definition model.h:481
std::vector< block_q6_K > embed_tokens_q6k
Definition model.h:488
std::vector< float > final_norm_f32
Definition model.h:486
std::vector< uint16_t > final_norm
Definition model.h:485
std::vector< block_q4_K > lm_head_q4k
Definition model.h:487
std::vector< block_q6_K > lm_head_q6k
Definition model.h:488
std::vector< std::pair< float, float > > precomputed_freqs_cis_
Definition model.h:554
void ensure_embed_tokens_dequantized()
std::vector< LayerWeights > layers
Definition model.h:491
std::vector< block_q8_0 > embed_tokens_q8_0
Definition model.h:489
ModelConfig config_
Definition model.h:480
std::vector< block_q4_K > embed_tokens_q4k
Definition model.h:487
std::vector< block_q8_0 > lm_head_q8_0
Definition model.h:489
std::vector< uint16_t > lm_head
Definition model.h:484
std::vector< uint16_t > embed_tokens
Definition model.h:483
std::vector< float > embed_tokens_f32
Definition model.h:486
void ensure_lm_head_dequantized()
std::vector< float > lm_head_f32
Definition model.h:486
constexpr size_t GGML_QK8_0
Definition gguf_parser.h:43
constexpr size_t GGML_QK_K
Block size constants for different quantization formats.
Definition gguf_parser.h:42
Logging utilities for the TinyLlama implementation.
Constants used throughout the TinyLlama model implementation.
constexpr uint16_t ZERO
void dequantize_q4_k_m(const block_q4_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
void dequantize_q8_0_block(const block_q8_0 *qblock, float *output)
Dequantizes a Q8_0 block to float32.
void dequantize_q6_k(const block_q6_K *qblock, float *output, int num_weights_in_block, bool log_this_block)
Weight quantization structures and functions for model compression.
int hidden_size
Definition model.h:81
int vocab_size
Definition model.h:86
int num_attention_heads
Definition model.h:83
int intermediate_size
Definition model.h:82
int num_cpu_offload_layers
Definition model.h:104
float rope_theta
Definition model.h:89
int num_hidden_layers
Definition model.h:85
int num_key_value_heads
Definition model.h:84
int max_position_embeddings
Definition model.h:87
std::vector< float > bf16vec_to_float_vec(const std::vector< uint16_t > &v_bf16)
Definition utils.cpp:198
uint16_t float32_to_bfloat16(float val)
Definition utils.cpp:136