void KVCache::initialize(int total_num_model_layers,  // method name taken from the KVCache::initialize log strings below; void return assumed
                         int num_gpu_layers_to_allocate,
                         int max_seq_len_arg,
                         int num_kv_heads,
                         int head_dim,
                         int max_batch_size_arg) {
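  // Host-side cache: one K and one V buffer per model layer, each sized
  // max_seq_len * max_batch_size * num_kv_heads * head_dim floats.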
  layers.resize(total_num_model_layers);

  size_t cache_size_per_layer = static_cast<size_t>(max_seq_len_arg) *
                                static_cast<size_t>(max_batch_size_arg) *
                                static_cast<size_t>(num_kv_heads) *
                                static_cast<size_t>(head_dim);
  if (cache_size_per_layer == 0 && max_seq_len_arg > 0 && total_num_model_layers > 0) {
    throw std::runtime_error(
        "KVCache (CPU): Calculated cache size is zero for non-empty model. Check parameters.");
  }
  for (int l = 0; l < total_num_model_layers; ++l) {
    try {
      layers[l].k.assign(cache_size_per_layer, 0.0f);
      layers[l].v.assign(cache_size_per_layer, 0.0f);
    } catch (const std::bad_alloc& e) {
      // try/catch placement and message prefix reconstructed around the surviving lines of this fragment.
      throw std::runtime_error("KVCache (CPU): Failed to allocate K/V buffers for layer " +
                               std::to_string(l) + ": " + e.what());
    }
  }
  // Log call and message prefix reconstructed; only the trailing pieces survive in this fragment.
  Logger::info("KVCache (CPU): Host K/V buffers allocated for " +
               std::to_string(total_num_model_layers) + " layers.");
  this->allocated_num_layers = num_gpu_layers_to_allocate;
  this->allocated_max_seq_len = max_seq_len_arg;
  this->allocated_num_kv_heads = num_kv_heads;
  this->allocated_head_dim = head_dim;
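#ifdef HAS_CUDA  // guard macro name assumed; a CUDA/CPU-only split is implied by the "(CPU-only build)" log below
  // CUDA builds additionally allocate per-layer device buffers for the layers
  // that run on the GPU.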
  if (num_gpu_layers_to_allocate > 0) {
    if (num_gpu_layers_to_allocate > total_num_model_layers) {
      Logger::warning("KVCache::initialize: num_gpu_layers_to_allocate (" +
                      std::to_string(num_gpu_layers_to_allocate) +
                      ") > total_num_model_layers (" + std::to_string(total_num_model_layers) +
                      "). Clamping to total_num_model_layers.");
      this->allocated_num_layers = total_num_model_layers;
      num_gpu_layers_to_allocate = total_num_model_layers;
    }
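    // Per-layer device sizes. Unlike the host cache above, the GPU cache is
    // sized without the batch dimension; the INT8 path also needs one FP32
    // scale per (position, KV head).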
    size_t cache_elems_per_layer_gpu = static_cast<size_t>(max_seq_len_arg) *
                                       static_cast<size_t>(num_kv_heads) *
                                       static_cast<size_t>(head_dim);

    size_t fp32_cache_bytes_per_layer_gpu = cache_elems_per_layer_gpu * sizeof(float);
    size_t int8_cache_bytes_per_layer_gpu = cache_elems_per_layer_gpu * sizeof(int8_t);
    size_t num_scales_per_layer_gpu = static_cast<size_t>(max_seq_len_arg) *
                                      static_cast<size_t>(num_kv_heads);
    size_t scales_bytes_per_layer_gpu = num_scales_per_layer_gpu * sizeof(float);
    if (use_int8_kv_cache && cache_elems_per_layer_gpu == 0) {  // "use_int8_kv_cache" is an assumed name; the original quantization flag is elided from this fragment
      throw std::runtime_error(
          "KVCache (CUDA INT8): Calculated cache elements per layer is zero. Check parameters.");
    } else if (cache_elems_per_layer_gpu == 0) {
      throw std::runtime_error(
          "KVCache (CUDA FP32): Calculated cache elements per layer is zero. Check parameters.");
    }
    if (use_int8_kv_cache) {  // assumed flag, see above
      Logger::info("Allocating INT8 KVCache + FP32 Scales on GPU for " +
                   std::to_string(num_gpu_layers_to_allocate) +
                   " layers. Data size per layer: " +
                   std::to_string(int8_cache_bytes_per_layer_gpu / (1024.0 * 1024.0)) +
                   " MB. Scales size per layer: " +
                   std::to_string(scales_bytes_per_layer_gpu / (1024.0 * 1024.0)) + " MB");
    } else {
      // Trailing " MB" and this branch's closing brace are reconstructed by analogy with the INT8 branch.
      Logger::info("Allocating FP32 KVCache on GPU for " +
                   std::to_string(num_gpu_layers_to_allocate) +
                   " layers, size per layer: " +
                   std::to_string(fp32_cache_bytes_per_layer_gpu / (1024.0 * 1024.0)) + " MB");
    }
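    // gpu_layer_start_model_idx is used below but defined outside this
    // fragment; a minimal sketch, assuming the GPU layers are the last
    // num_gpu_layers_to_allocate layers of the model:
    int gpu_layer_start_model_idx = total_num_model_layers - num_gpu_layers_to_allocate;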
    Logger::info("KVCache GPU allocation will target model layers from index " +
                 std::to_string(gpu_layer_start_model_idx) + " to " +
                 std::to_string(gpu_layer_start_model_idx + num_gpu_layers_to_allocate - 1));
    for (int i = 0; i < num_gpu_layers_to_allocate; ++i) {
      int current_model_idx_for_gpu = gpu_layer_start_model_idx + i;

      if (current_model_idx_for_gpu < 0 ||
          static_cast<size_t>(current_model_idx_for_gpu) >= layers.size()) {
        Logger::error("KVCache::initialize: Calculated current_model_idx_for_gpu (" +
                      std::to_string(current_model_idx_for_gpu) +
                      ") is out of bounds for layers vector (size " +
                      std::to_string(layers.size()) + "). Skipping this layer.");
        continue;  // "Skipping this layer" implies the loop moves on
      }
      // Logger::warning openers below are reconstructed; only the message bodies survive in this fragment.
      if (layers[current_model_idx_for_gpu].k_dev_fp32) {
        Logger::warning("KVCache::initialize: Re-initializing KVCache layer " +
                        std::to_string(current_model_idx_for_gpu) +
                        " K dev fp32 pointer without proper destruction?");
        gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_fp32));
        layers[current_model_idx_for_gpu].k_dev_fp32 = nullptr;
      }
      if (layers[current_model_idx_for_gpu].v_dev_fp32) {
        Logger::warning("KVCache::initialize: Re-initializing KVCache layer " +
                        std::to_string(current_model_idx_for_gpu) +
                        " V dev fp32 pointer without proper destruction?");
        gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_fp32));
        layers[current_model_idx_for_gpu].v_dev_fp32 = nullptr;
      }
      if (layers[current_model_idx_for_gpu].k_dev_quantized) {
        Logger::warning("KVCache::initialize: Re-initializing KVCache layer " +
                        std::to_string(current_model_idx_for_gpu) +
                        " K dev quantized pointer without proper destruction?");
        gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_quantized));
        layers[current_model_idx_for_gpu].k_dev_quantized = nullptr;
      }
      if (layers[current_model_idx_for_gpu].v_dev_quantized) {
        Logger::warning("KVCache::initialize: Re-initializing KVCache layer " +
                        std::to_string(current_model_idx_for_gpu) +
                        " V dev quantized pointer without proper destruction?");
        gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_quantized));
        layers[current_model_idx_for_gpu].v_dev_quantized = nullptr;
      }
      if (layers[current_model_idx_for_gpu].k_dev_scales) {
        Logger::warning("KVCache::initialize: Re-initializing KVCache layer " +
                        std::to_string(current_model_idx_for_gpu) +
                        " K dev scales pointer without proper destruction?");
        gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_scales));
        layers[current_model_idx_for_gpu].k_dev_scales = nullptr;
      }
      if (layers[current_model_idx_for_gpu].v_dev_scales) {
        Logger::warning("KVCache::initialize: Re-initializing KVCache layer " +
                        std::to_string(current_model_idx_for_gpu) +
                        " V dev scales pointer without proper destruction?");
        gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_scales));
        layers[current_model_idx_for_gpu].v_dev_scales = nullptr;
      }
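      // Fresh, zero-initialized allocations for this layer: INT8 data plus
      // FP32 scales when the cache is quantized, otherwise plain FP32 K/V.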
      if (use_int8_kv_cache) {  // assumed flag, see above
        gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].k_dev_quantized, int8_cache_bytes_per_layer_gpu));
        gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].v_dev_quantized, int8_cache_bytes_per_layer_gpu));
        gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].k_dev_scales, scales_bytes_per_layer_gpu));
        gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].v_dev_scales, scales_bytes_per_layer_gpu));

        gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].k_dev_quantized, 0, int8_cache_bytes_per_layer_gpu));
        gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].v_dev_quantized, 0, int8_cache_bytes_per_layer_gpu));
        gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].k_dev_scales, 0, scales_bytes_per_layer_gpu));
        gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].v_dev_scales, 0, scales_bytes_per_layer_gpu));
      } else {
        gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].k_dev_fp32, fp32_cache_bytes_per_layer_gpu));
        gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].v_dev_fp32, fp32_cache_bytes_per_layer_gpu));
        gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].k_dev_fp32, 0, fp32_cache_bytes_per_layer_gpu));
        gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].v_dev_fp32, 0, fp32_cache_bytes_per_layer_gpu));
      }
    }
    Logger::info("KVCache GPU allocation and zeroing complete for " +
                 std::to_string(num_gpu_layers_to_allocate) + " layers.");
  } else {
    Logger::info("KVCache: No GPU layers requested for allocation (num_gpu_layers_to_allocate is 0). Skipping GPU KVCache allocation.");
    this->allocated_num_layers = 0;
  }
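#else
  // CPU-only builds skip device allocation and just report the host-side
  // dimensions (split assumed, matching the #ifdef above).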
  Logger::info("KVCache (CPU-only build) initialized with dimensions for " +
               std::to_string(total_num_model_layers) + " layers, " +
               std::to_string(max_seq_len_arg) + " seq len, " +
               std::to_string(num_kv_heads) + " KV heads, " +
               std::to_string(head_dim) + " head dim");