  if (num_cpu_layers_clamped < 0) num_cpu_layers_clamped = 0;
  if (num_cpu_layers_clamped > nhl) {
    Logger::warning("Requested CPU layer count (" + std::to_string(num_cpu_layers_clamped) +
                    ") exceeds total hidden layers (" + std::to_string(nhl) +
                    "). Clamping to " + std::to_string(nhl) + " layers on CPU.");
    num_cpu_layers_clamped = nhl;
  }

  int active_num_cpu_layers = num_cpu_layers_clamped;
  int active_num_gpu_layers = nhl - active_num_cpu_layers;
  Logger::info("Effective CPU layers for this init: " + std::to_string(active_num_cpu_layers) +
               ", Effective GPU layers for this init: " + std::to_string(active_num_gpu_layers));
  if (hs <= 0)
    throw std::runtime_error("Invalid model config: hidden_size must be positive.");
  if (vs <= 0)
    throw std::runtime_error("Invalid model config: vocab_size must be positive.");
  if (n_heads <= 0)
    throw std::runtime_error("Invalid model config: num_attention_heads must be positive.");
  if (n_kv_heads <= 0)
    throw std::runtime_error("Invalid model config: num_key_value_heads must be positive.");
  if (hs % n_heads != 0)
    throw std::runtime_error("Invalid model config: hidden_size not divisible by num_attention_heads.");
  int kv_dim = (hs / n_heads) * n_kv_heads;
  int head_dim = hs / n_heads;
  Logger::info("Precomputing RoPE frequencies on CPU (always done).");
  for (int pos = 0; pos < max_seq_len; ++pos) {
    for (int i_rope = 0; i_rope < head_dim; i_rope += 2) {
      float freq = std::pow(theta, -((float)i_rope) / head_dim);
      float angle = pos * freq;
      // ... (cos/sin of angle are stored into precomputed_freqs_cis_ here)
    }
  }
  Logger::info("Finished precomputing RoPE cos/sin frequencies on CPU.");
#define SAFE_CUDA_FREE(ptr) do { if (ptr) { cudaFree(ptr); (ptr) = nullptr; } } while (0)
  if (active_num_gpu_layers == 0) {
    Logger::info("No layers assigned to GPU (active_num_gpu_layers = 0). Cleaning up existing CUDA resources and skipping GPU initialization.");
    SAFE_CUDA_FREE(final_norm_dev);
    for (int i = 0; i < nhl; ++i) {
      SAFE_CUDA_FREE(layers[i].input_layernorm_dev);
      SAFE_CUDA_FREE(layers[i].post_attention_layernorm_dev);
    }
    SAFE_CUDA_FREE(token_embedding_table_dev_);
    SAFE_CUDA_FREE(lm_head_dev_);
    SAFE_CUDA_FREE(w_q_dev_); SAFE_CUDA_FREE(w_k_dev_); SAFE_CUDA_FREE(w_v_dev_); SAFE_CUDA_FREE(w_o_dev_);
    SAFE_CUDA_FREE(w_gate_dev_); SAFE_CUDA_FREE(w_up_dev_); SAFE_CUDA_FREE(w_down_dev_);
    SAFE_CUDA_FREE(all_freqs_cis_dev);
    SAFE_CUDA_FREE(x_dev_); SAFE_CUDA_FREE(x_norm_dev_); SAFE_CUDA_FREE(x_resid1_dev_); SAFE_CUDA_FREE(x_resid2_dev_);
    SAFE_CUDA_FREE(q_dev_); SAFE_CUDA_FREE(k_dev_); SAFE_CUDA_FREE(v_dev_); SAFE_CUDA_FREE(attn_out_dev_);
    SAFE_CUDA_FREE(attn_proj_dev_); SAFE_CUDA_FREE(gate_vec_dev_); SAFE_CUDA_FREE(up_vec_dev_);
    SAFE_CUDA_FREE(swiglu_vec_dev_); SAFE_CUDA_FREE(mlp_down_dev_); SAFE_CUDA_FREE(logits_dev_);
    SAFE_CUDA_FREE(token_embedding_table_f32_dev_);
    SAFE_CUDA_FREE(lm_head_f32_dev_);
    SAFE_CUDA_FREE(w_q_f32_dev_); SAFE_CUDA_FREE(w_k_f32_dev_); SAFE_CUDA_FREE(w_v_f32_dev_); SAFE_CUDA_FREE(w_o_f32_dev_);
    SAFE_CUDA_FREE(w_gate_f32_dev_); SAFE_CUDA_FREE(w_up_f32_dev_); SAFE_CUDA_FREE(w_down_f32_dev_);
    if (cublas_handle_) { cublasDestroy(cublas_handle_); cublas_handle_ = nullptr; }
    return;  // CPU-only: nothing further to set up on the GPU.
  }
  Logger::info("Initializing CUDA resources for " + std::to_string(active_num_gpu_layers) + " GPU layers.");
  if (!cublas_handle_) {
    cublasStatus_t cublas_status = cublasCreate(&cublas_handle_);
    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
      throw std::runtime_error("Failed to initialize cuBLAS: " + std::to_string(cublas_status));
    }
  }
  cudaDeviceProp props;
  int current_device = 0;
  gpuErrchk(cudaGetDevice(&current_device));
  gpuErrchk(cudaGetDeviceProperties(&props, current_device));

  bool has_bf16_tensor_core_hw = ((props.major == 7 && props.minor == 5) || props.major >= 8);
  // Assumed definition, inferred from the divisibility requirements stated in the logs below.
  bool dimensions_ok_for_tensor_cores = (config_.hidden_size % 8 == 0) && (config_.vocab_size % 4 == 0);
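  // Capability gate for the BF16 tensor-core matvec path: compute capability 7.5 or
  // newer is accepted here, and the model dimensions must additionally satisfy the
  // alignment stated in the logs below (hs divisible by 8, vs divisible by 4) before
  // the tensor-core path is enabled.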
  if (has_bf16_tensor_core_hw) {
    if (dimensions_ok_for_tensor_cores) {
      Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) +
                   ") supports BF16 Tensor Cores AND model dimensions (hs: " + std::to_string(config_.hidden_size) +
                   ", vs: " + std::to_string(config_.vocab_size) +
                   ") are compatible. Enabling Tensor Core path for matvec_bf16_f32.");
      // ... (tensor-core path flag is enabled here)
    } else {
      Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) +
                   ") supports BF16 Tensor Cores, BUT model dimensions (hs: " + std::to_string(config_.hidden_size) +
                   " (must be div by 8), vs: " + std::to_string(config_.vocab_size) +
                   " (must be div by 4)) are NOT compatible. Disabling Tensor Core path for matvec_bf16_f32.");
      // ... (tensor-core path flag is disabled here)
    }
  } else {
    Logger::info("GPU " + std::string(props.name) + " (CC " + std::to_string(props.major) + "." + std::to_string(props.minor) +
                 ") does not meet criteria for BF16 Tensor Core use (requires CC >= 7.5). Disabling Tensor Core path for matvec_bf16_f32.");
    // ... (tensor-core path flag is disabled here)
  }
  Logger::info("Converting final_norm (BF16) to FP32 for GPU.");
  // ... (BF16 final_norm weights are converted into the host vector final_norm_f32 here)
  if (!final_norm_f32.empty()) {
    SAFE_CUDA_FREE(final_norm_dev);
    gpuErrchk(cudaMalloc(&final_norm_dev, final_norm_f32.size() * sizeof(float)));
    gpuErrchk(cudaMemcpy(final_norm_dev, final_norm_f32.data(), final_norm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
    Logger::info("Copied final_norm weights (FP32) to GPU.");
  } else {
    Logger::warning("Final norm weights (FP32) are empty, skipping GPU copy. This might be an issue if GPU layers are expected to use it.");
  }
  for (int i = 0; i < active_num_cpu_layers; ++i) {
    if (static_cast<size_t>(i) < layers.size()) {
      SAFE_CUDA_FREE(layers[i].input_layernorm_dev);
      SAFE_CUDA_FREE(layers[i].post_attention_layernorm_dev);
    }
  }

  Logger::info("Copying layer norm weights (FP32) to GPU for layers " + std::to_string(active_num_cpu_layers) +
               " to " + std::to_string(nhl - 1));
  Logger::info("[INIT_DEBUG_PRE_LOOP] Active CPU layers: " + std::to_string(active_num_cpu_layers));
  if (nhl > 0 && layers.size() > 0) {
    Logger::info("[INIT_DEBUG_PRE_LOOP] layers[0].input_layernorm_f32.empty(): " +
                 std::string(layers[0].input_layernorm_f32.empty() ? "YES" : "NO") +
                 ", Size: " + std::to_string(layers[0].input_layernorm_f32.size()));
  }
  for (int i = active_num_cpu_layers; i < nhl; ++i) {
    if (static_cast<size_t>(i) >= layers.size()) {
      Logger::error("Layer index " + std::to_string(i) + " out of bounds for layers vector (size: " +
                    std::to_string(layers.size()) + ")");
      continue;
    }
    SAFE_CUDA_FREE(layers[i].input_layernorm_dev);
    SAFE_CUDA_FREE(layers[i].post_attention_layernorm_dev);

    // Lazily convert BF16 layer-norm weights to FP32 on the host if that has not happened yet.
    if (layers[i].input_layernorm_f32.empty() && !layers[i].input_layernorm.empty()) {
      // ... (BF16 -> FP32 conversion of input_layernorm into input_layernorm_f32)
    }
    if (layers[i].post_attention_layernorm_f32.empty() && !layers[i].post_attention_layernorm.empty()) {
      // ... (BF16 -> FP32 conversion of post_attention_layernorm into post_attention_layernorm_f32)
    }

    if (!layers[i].input_layernorm_f32.empty()) {
      gpuErrchk(cudaMalloc(&layers[i].input_layernorm_dev, layers[i].input_layernorm_f32.size() * sizeof(float)));
      gpuErrchk(cudaMemcpy(layers[i].input_layernorm_dev, layers[i].input_layernorm_f32.data(),
                           layers[i].input_layernorm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
      if (i == active_num_cpu_layers) {
        // (the prefix of this debug log is assumed; the original tag was elided)
        Logger::info("[INIT_DEBUG] First GPU layer " + std::to_string(i) +
                     ", Size used for malloc: " + std::to_string(layers[i].input_layernorm_f32.size() * sizeof(float)) + " bytes (" +
                     std::to_string(layers[i].input_layernorm_f32.size()) + " elements). Host vector empty: " +
                     (layers[i].input_layernorm_f32.empty() ? "YES" : "NO"));
      }
    } else {
      throw std::runtime_error("GPU Layer " + std::to_string(i) +
                               ": input_layernorm_f32 weights are empty. Cannot offload to GPU without them.");
    }

    if (!layers[i].post_attention_layernorm_f32.empty()) {
      gpuErrchk(cudaMalloc(&layers[i].post_attention_layernorm_dev, layers[i].post_attention_layernorm_f32.size() * sizeof(float)));
      gpuErrchk(cudaMemcpy(layers[i].post_attention_layernorm_dev, layers[i].post_attention_layernorm_f32.data(),
                           layers[i].post_attention_layernorm_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
    } else {
      throw std::runtime_error("GPU Layer " + std::to_string(i) +
                               ": post_attention_layernorm_f32 weights are empty. Cannot offload to GPU without them.");
    }
  }
  Logger::info("Finished processing layer norm weights for GPU layers.");
  SAFE_CUDA_FREE(token_embedding_table_dev_);
  SAFE_CUDA_FREE(token_embedding_table_f32_dev_);

  bool token_embeddings_processed_to_gpu_bf16 = false;
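  // The embedding table is uploaded to the device as BF16 regardless of its source
  // dtype: BF16 halves device memory relative to FP32 and matches the BF16 matvec
  // path mentioned above. Quantized sources are dequantized to FP32 on the host
  // first. A minimal sketch of the fp32 -> bf16 conversion assumed in the loops
  // below (round-to-nearest-even on the upper 16 bits; the project's actual helper
  // may differ):
  //
  //   static inline uint16_t f32_to_bf16(float f) {
  //     uint32_t bits;
  //     std::memcpy(&bits, &f, sizeof(bits));          // type-pun safely
  //     bits += 0x7FFFu + ((bits >> 16) & 1u);         // round to nearest even
  //     return static_cast<uint16_t>(bits >> 16);      // keep the high 16 bits
  //   }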
  if (active_num_gpu_layers > 0) {
    // The blocks below are arms of an if/else-if chain (conditions elided) keyed on
    // which representation of the embedding table the loaded model provides:
    // BF16, FP32, Q8_0, Q4_K, or Q6_K.

    // Case 1: BF16 source, copied to the device as-is.
    gpuErrchk(cudaMalloc(&token_embedding_table_dev_, embed_tokens.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(token_embedding_table_dev_, embed_tokens.data(),
                         embed_tokens.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Copied token_embedding_table (bf16 direct from model.embed_tokens) to GPU.");
    token_embeddings_processed_to_gpu_bf16 = true;

    // Case 2: FP32 source, converted to BF16 on the host and then uploaded.
    #pragma omp parallel for
    // ... (fp32 -> bf16 conversion loop into bf16_data elided)
    gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Converted token_embedding_table (fp32 source -> bf16) to GPU.");
    token_embeddings_processed_to_gpu_bf16 = true;

    // Case 3: Q8_0 source, dequantized to FP32, converted to BF16, then uploaded.
    #pragma omp parallel for
    // ... (Q8_0 -> fp32 dequantization loop into temp_f32_data elided)
    std::vector<uint16_t> bf16_data(temp_f32_data.size());
    #pragma omp parallel for
    for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
      // ... (fp32 -> bf16 conversion of temp_f32_data[i] into bf16_data[i])
    }
    gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Dequantized token_embedding_table (Q8_0 -> fp32 -> bf16) to GPU.");
    token_embeddings_processed_to_gpu_bf16 = true;

    // Case 4: Q4_K source, same dequantize -> convert -> upload pattern.
    #pragma omp parallel for
    // ... (Q4_K -> fp32 dequantization loop into temp_f32_data elided)
    std::vector<uint16_t> bf16_data(temp_f32_data.size());
    #pragma omp parallel for
    for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
      // ... (fp32 -> bf16 conversion of temp_f32_data[i] into bf16_data[i])
    }
    gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Dequantized token_embedding_table (Q4_K -> fp32 -> bf16) to GPU.");
    token_embeddings_processed_to_gpu_bf16 = true;

    // Case 5: Q6_K source, same dequantize -> convert -> upload pattern.
    #pragma omp parallel for
    // ... (Q6_K -> fp32 dequantization loop into temp_f32_data elided)
    std::vector<uint16_t> bf16_data(temp_f32_data.size());
    #pragma omp parallel for
    for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
      // ... (fp32 -> bf16 conversion of temp_f32_data[i] into bf16_data[i])
    }
    gpuErrchk(cudaMalloc(&token_embedding_table_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(token_embedding_table_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Dequantized token_embedding_table (Q6_K -> fp32 -> bf16) to GPU.");
    token_embeddings_processed_to_gpu_bf16 = true;

    if (token_embeddings_processed_to_gpu_bf16) {
      // ... (summary log elided; it ends with ". Flag token_embeddings_processed_to_gpu_bf16: YES")
    }
    if (!token_embeddings_processed_to_gpu_bf16 && active_num_gpu_layers > 0) {
      Logger::warning("Token embeddings were not processed to GPU as BF16, despite GPU layers being active. This might indicate missing source embedding data in the model structure or an unhandled GGUF type for embeddings.");
    }
  } else {
    Logger::info("No GPU layers active, skipping token embedding table processing for GPU.");
  }
  SAFE_CUDA_FREE(lm_head_dev_);
  SAFE_CUDA_FREE(lm_head_f32_dev_);

  bool lm_head_processed_to_gpu_bf16 = false;
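  // The LM head follows the same per-source-dtype chain as the embedding table above:
  // a BF16 copy is always produced for the device, with quantized sources (Q8_0,
  // Q4_K, Q6_K) dequantized to FP32 on the host first.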
  if (active_num_gpu_layers > 0) {
    // As above, the cases below are arms of an if/else-if chain (conditions elided)
    // keyed on the LM head's source dtype.

    // Case 1: BF16 source, copied to the device as-is.
    gpuErrchk(cudaMalloc(&lm_head_dev_, lm_head.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(lm_head_dev_, lm_head.data(),
                         lm_head.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Copied lm_head (bf16 direct from model.lm_head) to GPU.");
    lm_head_processed_to_gpu_bf16 = true;

    // Case 2: FP32 source, converted to BF16 on the host and then uploaded.
    std::vector<uint16_t> bf16_data(lm_head_f32.size());
    #pragma omp parallel for
    for (int i = 0; i < (int)lm_head_f32.size(); ++i) {
      // ... (fp32 -> bf16 conversion of lm_head_f32[i] into bf16_data[i])
    }
    gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Converted lm_head (fp32 source -> bf16) to GPU.");
    lm_head_processed_to_gpu_bf16 = true;

    // Case 3: Q8_0 source, dequantized to FP32, converted to BF16, then uploaded.
    #pragma omp parallel for
    // ... (Q8_0 -> fp32 dequantization loop into temp_f32_data elided)
    std::vector<uint16_t> bf16_data(temp_f32_data.size());
    #pragma omp parallel for
    for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
      // ... (fp32 -> bf16 conversion of temp_f32_data[i] into bf16_data[i])
    }
    gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Dequantized lm_head (Q8_0 -> fp32 -> bf16) to GPU.");
    lm_head_processed_to_gpu_bf16 = true;

    // Case 4: Q4_K source, dequantized block-by-block, converted, then uploaded.
    #pragma omp parallel for
    for (int i = 0; i < (int)lm_head_q4k.size(); ++i) {
      // ... (Q4_K block i dequantized into temp_f32_data)
    }
    std::vector<uint16_t> bf16_data(temp_f32_data.size());
    #pragma omp parallel for
    for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
      // ... (fp32 -> bf16 conversion of temp_f32_data[i] into bf16_data[i])
    }
    gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Dequantized lm_head (Q4_K -> fp32 -> bf16) to GPU.");
    lm_head_processed_to_gpu_bf16 = true;

    // Case 5: Q6_K source, dequantized block-by-block, converted, then uploaded.
    #pragma omp parallel for
    for (int i = 0; i < (int)lm_head_q6k.size(); ++i) {
      // ... (Q6_K block i dequantized into temp_f32_data)
    }
    std::vector<uint16_t> bf16_data(temp_f32_data.size());
    #pragma omp parallel for
    for (int i = 0; i < (int)temp_f32_data.size(); ++i) {
      // ... (fp32 -> bf16 conversion of temp_f32_data[i] into bf16_data[i])
    }
    gpuErrchk(cudaMalloc(&lm_head_dev_, bf16_data.size() * sizeof(uint16_t)));
    gpuErrchk(cudaMemcpy(lm_head_dev_, bf16_data.data(),
                         bf16_data.size() * sizeof(uint16_t), cudaMemcpyHostToDevice));
    Logger::info("Dequantized lm_head (Q6_K -> fp32 -> bf16) to GPU.");
    lm_head_processed_to_gpu_bf16 = true;

    if (!lm_head_processed_to_gpu_bf16) {
      Logger::warning("LM head was not processed to GPU as BF16, despite GPU layers being active. This might indicate missing source LM head data in the model structure or an unhandled GGUF type for LM head.");
    }
  } else {
    Logger::info("No GPU layers active, skipping LM head processing for GPU.");
  }
  SAFE_CUDA_FREE(lm_head_f32_dev_);

  if (active_num_gpu_layers > 0) {
    if (!lm_head_f32.empty()) {
      gpuErrchk(cudaMalloc(&lm_head_f32_dev_, lm_head_f32.size() * sizeof(float)));
      gpuErrchk(cudaMemcpy(lm_head_f32_dev_, lm_head_f32.data(), lm_head_f32.size() * sizeof(float), cudaMemcpyHostToDevice));
    } else {
      Logger::error("[INIT_GPU_ROPE] Host lm_head_f32 is EMPTY. Cannot populate lm_head_f32_dev_. This WILL CAUSE a cublasSgemm error in the final matvec. Check model loading and initialize_weights logic for lm_head_f32 population.");
      lm_head_f32_dev_ = nullptr;
    }
  } else {
    lm_head_f32_dev_ = nullptr;
  }

  Logger::info("Finished processing embedding and LM head tables for GPU.");
  SAFE_CUDA_FREE(all_freqs_cis_dev);
  if (active_num_gpu_layers > 0) {
    if (!precomputed_freqs_cis_.empty()) {
      size_t total_freq_elements = precomputed_freqs_cis_.size() * 2;  // one (cos, sin) pair per entry
      gpuErrchk(cudaMalloc(&all_freqs_cis_dev, total_freq_elements * sizeof(float)));
      std::vector<float> flat_host_freqs;
      flat_host_freqs.reserve(total_freq_elements);
      for (const auto& p : precomputed_freqs_cis_) {
        flat_host_freqs.push_back(p.first);
        flat_host_freqs.push_back(p.second);
      }
      gpuErrchk(cudaMemcpy(all_freqs_cis_dev, flat_host_freqs.data(), total_freq_elements * sizeof(float), cudaMemcpyHostToDevice));
      Logger::info("Copied all precomputed RoPE frequencies to persistent GPU buffer.");
    } else {
      Logger::warning("Host precomputed_freqs_cis_ is empty. Skipping GPU RoPE buffer allocation. This WILL cause issues if GPU layers use RoPE.");
    }
    Logger::info("Finished processing RoPE frequencies for GPU.");
  } else {
    Logger::info("No GPU layers active, skipping RoPE GPU buffer allocation.");
  }
  if (active_num_gpu_layers > 0) {
    Logger::info("Allocating/Reallocating persistent GPU workspace buffers for " + std::to_string(active_num_gpu_layers) + " GPU layers.");
    size_t hs_bytes = (size_t)hs * sizeof(float);
    size_t is_bytes = (size_t)is * sizeof(float);
    size_t vs_bytes = (size_t)vs * sizeof(float);
    size_t k_dev_size_bytes = (size_t)n_kv_heads * head_dim * sizeof(float);
    size_t v_dev_size_bytes = (size_t)n_kv_heads * head_dim * sizeof(float);

#define REALLOC_GPU_WORKSPACE(ptr, sz) do { SAFE_CUDA_FREE(ptr); gpuErrchk(cudaMalloc(&(ptr), (sz))); } while (0)
    REALLOC_GPU_WORKSPACE(x_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(x_norm_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(x_resid1_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(x_resid2_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(q_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(k_dev_, k_dev_size_bytes);
    REALLOC_GPU_WORKSPACE(v_dev_, v_dev_size_bytes);
    REALLOC_GPU_WORKSPACE(attn_out_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(attn_proj_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(gate_vec_dev_, is_bytes);
    REALLOC_GPU_WORKSPACE(up_vec_dev_, is_bytes);
    REALLOC_GPU_WORKSPACE(swiglu_vec_dev_, is_bytes);
    REALLOC_GPU_WORKSPACE(mlp_down_dev_, hs_bytes);
    REALLOC_GPU_WORKSPACE(logits_dev_, vs_bytes);
    Logger::info("Finished allocating/reallocating GPU workspace buffers.");
  } else {
    Logger::info("No GPU layers active, skipping GPU workspace buffer allocation.");
    SAFE_CUDA_FREE(x_dev_); SAFE_CUDA_FREE(x_norm_dev_); SAFE_CUDA_FREE(x_resid1_dev_); SAFE_CUDA_FREE(x_resid2_dev_);
    SAFE_CUDA_FREE(q_dev_); SAFE_CUDA_FREE(k_dev_); SAFE_CUDA_FREE(v_dev_); SAFE_CUDA_FREE(attn_out_dev_); SAFE_CUDA_FREE(attn_proj_dev_);
    SAFE_CUDA_FREE(gate_vec_dev_); SAFE_CUDA_FREE(up_vec_dev_); SAFE_CUDA_FREE(swiglu_vec_dev_); SAFE_CUDA_FREE(mlp_down_dev_); SAFE_CUDA_FREE(logits_dev_);
  }
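  // KV-cache dequantization scratch: the "selective" buffers below are sized for only
  // part of the dequantized K/V cache, which is why the log contrasts their size with
  // the full-cache buffers (max_position_embeddings * n_kv_heads * head_dim floats).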
  if (active_num_gpu_layers > 0) {
    // ... (selective_dequant_buffer_size_ is computed here)
    size_t selective_buffer_bytes = selective_dequant_buffer_size_ * sizeof(float);

    if (selective_dequant_buffer_size_ > 0) {
      SAFE_CUDA_FREE(selective_k_dequant_buffer_dev_);
      gpuErrchk(cudaMalloc(&selective_k_dequant_buffer_dev_, selective_buffer_bytes));
      SAFE_CUDA_FREE(selective_v_dequant_buffer_dev_);
      gpuErrchk(cudaMalloc(&selective_v_dequant_buffer_dev_, selective_buffer_bytes));
      Logger::info("Allocated SELECTIVE KVCache dequantization buffers (K and V) on GPU. Size per buffer: " +
                   std::to_string(selective_buffer_bytes / (1024.0 * 1024.0)) + " MB (vs " +
                   std::to_string((static_cast<size_t>(config_.max_position_embeddings) * n_kv_heads * head_dim * sizeof(float)) / (1024.0 * 1024.0)) +
                   " MB for full buffers)");
    } else {
      Logger::warning("Selective KVCache dequantization buffer size is 0. Skipping allocation.");
      SAFE_CUDA_FREE(selective_k_dequant_buffer_dev_);
      SAFE_CUDA_FREE(selective_v_dequant_buffer_dev_);
    }
    // The full-size dequantization buffers are released; the selective buffers above are used instead.
    SAFE_CUDA_FREE(dequant_k_cache_buffer_dev_);
    SAFE_CUDA_FREE(dequant_v_cache_buffer_dev_);
  } else {
    SAFE_CUDA_FREE(dequant_k_cache_buffer_dev_);
    SAFE_CUDA_FREE(dequant_v_cache_buffer_dev_);
    SAFE_CUDA_FREE(selective_k_dequant_buffer_dev_);
    SAFE_CUDA_FREE(selective_v_dequant_buffer_dev_);
  }
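  // BF16 layer weights for all GPU layers are concatenated on the host into one
  // contiguous buffer per projection (Q/K/V/O and gate/up/down) before a single
  // upload. The fixed per-layer sizes below suggest each layer's slice is addressed
  // by a constant offset on the device; layers with a missing projection are
  // zero-padded so the stride stays uniform.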
  bool process_bf16_concat_weights = active_num_gpu_layers > 0 && !layers[active_num_cpu_layers].q_proj.empty();
  if (process_bf16_concat_weights) {
    size_t layer_q_size = (size_t)hs * hs, layer_k_size = (size_t)kv_dim * hs, layer_v_size = (size_t)kv_dim * hs, layer_o_size = (size_t)hs * hs;
    size_t layer_gate_size = (size_t)is * hs, layer_up_size = (size_t)is * hs, layer_down_size = (size_t)hs * is;

    std::vector<uint16_t> h_q, h_k, h_v, h_o, h_gate, h_up, h_down;
    h_q.reserve(active_num_gpu_layers * layer_q_size); h_k.reserve(active_num_gpu_layers * layer_k_size);
    h_v.reserve(active_num_gpu_layers * layer_v_size); h_o.reserve(active_num_gpu_layers * layer_o_size);
    h_gate.reserve(active_num_gpu_layers * layer_gate_size); h_up.reserve(active_num_gpu_layers * layer_up_size);
    h_down.reserve(active_num_gpu_layers * layer_down_size);

    Logger::info("Concatenating BF16 weights for GPU layers on host (zero-padding if missing for a layer)...");
    for (int i = 0; i < active_num_gpu_layers; ++i) {
      int model_layer_idx = active_num_cpu_layers + i;
      const auto& lw = layers[model_layer_idx];

      if (!lw.q_proj.empty()) {
        h_q.insert(h_q.end(), lw.q_proj.begin(), lw.q_proj.end());
      } else {
        h_q.insert(h_q.end(), layer_q_size, (uint16_t)0);  // zero-pad missing projection
      }
      if (!lw.k_proj.empty()) {
        h_k.insert(h_k.end(), lw.k_proj.begin(), lw.k_proj.end());
      } else {
        h_k.insert(h_k.end(), layer_k_size, (uint16_t)0);
      }
      if (!lw.v_proj.empty()) {
        h_v.insert(h_v.end(), lw.v_proj.begin(), lw.v_proj.end());
      } else {
        h_v.insert(h_v.end(), layer_v_size, (uint16_t)0);
      }
      if (!lw.o_proj.empty()) {
        h_o.insert(h_o.end(), lw.o_proj.begin(), lw.o_proj.end());
      } else {
        h_o.insert(h_o.end(), layer_o_size, (uint16_t)0);
      }
      if (!lw.gate_proj.empty()) {
        h_gate.insert(h_gate.end(), lw.gate_proj.begin(), lw.gate_proj.end());
      } else {
        h_gate.insert(h_gate.end(), layer_gate_size, (uint16_t)0);
      }
      if (!lw.up_proj.empty()) {
        h_up.insert(h_up.end(), lw.up_proj.begin(), lw.up_proj.end());
      } else {
        h_up.insert(h_up.end(), layer_up_size, (uint16_t)0);
      }
      if (!lw.down_proj.empty()) {
        h_down.insert(h_down.end(), lw.down_proj.begin(), lw.down_proj.end());
      } else {
        h_down.insert(h_down.end(), layer_down_size, (uint16_t)0);
      }
    }

#define ALLOC_COPY_CONCAT_BF16(dev_ptr, host_vec, weight_name_str) \
    SAFE_CUDA_FREE(dev_ptr); \
    if (!host_vec.empty()) { \
      gpuErrchk(cudaMalloc(&dev_ptr, host_vec.size() * sizeof(uint16_t))); \
      gpuErrchk(cudaMemcpy(dev_ptr, host_vec.data(), host_vec.size() * sizeof(uint16_t), cudaMemcpyHostToDevice)); \
      Logger::info("Copied concatenated " weight_name_str " (BF16) to GPU for GPU layers."); \
    } else if (active_num_gpu_layers > 0) { \
      Logger::info("Host vector for concatenated " weight_name_str " (BF16) is empty. Skipping GPU copy."); \
    }

    ALLOC_COPY_CONCAT_BF16(w_q_dev_, h_q, "Q Proj"); ALLOC_COPY_CONCAT_BF16(w_k_dev_, h_k, "K Proj"); ALLOC_COPY_CONCAT_BF16(w_v_dev_, h_v, "V Proj");
    ALLOC_COPY_CONCAT_BF16(w_o_dev_, h_o, "O Proj"); ALLOC_COPY_CONCAT_BF16(w_gate_dev_, h_gate, "Gate Proj");
    ALLOC_COPY_CONCAT_BF16(w_up_dev_, h_up, "Up Proj"); ALLOC_COPY_CONCAT_BF16(w_down_dev_, h_down, "Down Proj");
#undef ALLOC_COPY_CONCAT_BF16
  } else {
    Logger::info("Skipping BF16 concatenated layer weight processing (first GPU layer appears not to use BF16 q_proj, or no GPU layers).");
    SAFE_CUDA_FREE(w_q_dev_); SAFE_CUDA_FREE(w_k_dev_); SAFE_CUDA_FREE(w_v_dev_); SAFE_CUDA_FREE(w_o_dev_);
    SAFE_CUDA_FREE(w_gate_dev_); SAFE_CUDA_FREE(w_up_dev_); SAFE_CUDA_FREE(w_down_dev_);
  }
  Logger::info("DEFERRING concatenated F32 weight processing for GPU layers to save memory during initialization");
  Logger::info("Concatenated F32 weights will be processed on-demand during first inference");

  SAFE_CUDA_FREE(w_q_f32_dev_); SAFE_CUDA_FREE(w_k_f32_dev_); SAFE_CUDA_FREE(w_v_f32_dev_); SAFE_CUDA_FREE(w_o_f32_dev_);
  SAFE_CUDA_FREE(w_gate_f32_dev_); SAFE_CUDA_FREE(w_up_f32_dev_); SAFE_CUDA_FREE(w_down_f32_dev_);

  SAFE_CUDA_FREE(w_q_bf16_dev_); SAFE_CUDA_FREE(w_k_bf16_dev_); SAFE_CUDA_FREE(w_v_bf16_dev_); SAFE_CUDA_FREE(w_o_bf16_dev_);
  SAFE_CUDA_FREE(w_gate_bf16_dev_); SAFE_CUDA_FREE(w_up_bf16_dev_); SAFE_CUDA_FREE(w_down_bf16_dev_);

  Logger::info("Finished deferring concatenated F32 weight processing for GPU layers.");
  if (active_num_gpu_layers > 0) {
    allocate_persistent_batch_buffers();
  }
  // ... (end of the CUDA-enabled build path; the branch below belongs to builds/hosts without CUDA)
  if (active_num_gpu_layers > 0 && nhl > 0) {
    Logger::warning("CUDA not available, but " + std::to_string(active_num_gpu_layers) +
                    " layer(s) were configured for GPU. Model will run entirely on CPU.");
  } else {
    Logger::info("CUDA not available or no GPU layers configured. Model will run entirely on CPU.");
  }