TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
kv_cache.cpp
1#include "kv_cache.h"
2#include "logger.h"
3
4#ifdef HAS_CUDA
5#include "cuda_kernels.h"
6#include <cuda_runtime.h>
7#include <cublas_v2.h>
8#endif
9
10void KVCache::initialize(const ModelConfig& config,
11 int total_num_model_layers, int num_gpu_layers_to_allocate,
12 int max_seq_len_arg, int num_kv_heads,
13 int head_dim, int max_batch_size_arg) {
14 this->total_model_layers_ = total_num_model_layers;
15 this->max_seq_len_config_ = max_seq_len_arg;
16 this->max_batch_size = max_batch_size_arg;
17 this->current_batch_size = 0;
18 this->batch_seq_lens.clear();
19 this->batch_seq_lens.resize(max_batch_size_arg, 0);
20 layers.resize(total_num_model_layers);
21 seq_len = 0;
22 Logger::info("Allocating KVCache host vectors...");
23 size_t cache_size_per_layer = static_cast<size_t>(max_seq_len_arg) *
24 static_cast<size_t>(max_batch_size_arg) *
25 static_cast<size_t>(num_kv_heads) *
26 static_cast<size_t>(head_dim);
27 if (cache_size_per_layer == 0 && max_seq_len_arg > 0 && total_num_model_layers > 0) {
28 throw std::runtime_error(
29 "KVCache (CPU): Calculated cache size is zero for non-empty model. Check parameters.");
30 }
31
32 for (int l = 0; l < total_num_model_layers; ++l) {
33 try {
34 layers[l].k.assign(cache_size_per_layer, 0.0f);
35 layers[l].v.assign(cache_size_per_layer, 0.0f);
36 } catch (const std::bad_alloc& e) {
37 Logger::error("Failed to allocate CPU KVCache for layer " +
38 std::to_string(l) + ": " + e.what());
39 throw;
40 }
41 }
42 Logger::info("KVCache (CPU) vectors allocated for " +
43 std::to_string(total_num_model_layers) + " layers.");
44
45#ifdef HAS_CUDA
46 this->allocated_num_layers = num_gpu_layers_to_allocate;
47 this->allocated_max_seq_len = max_seq_len_arg;
48 this->allocated_num_kv_heads = num_kv_heads;
49 this->allocated_head_dim = head_dim;
50
51 if (num_gpu_layers_to_allocate > 0) {
52 if (num_gpu_layers_to_allocate > total_num_model_layers) {
53 Logger::warning("KVCache::initialize: num_gpu_layers_to_allocate (" + std::to_string(num_gpu_layers_to_allocate) +
54 ") > total_num_model_layers (" + std::to_string(total_num_model_layers) +
55 "). Clamping to total_num_model_layers.");
56 this->allocated_num_layers = total_num_model_layers;
57 num_gpu_layers_to_allocate = total_num_model_layers;
58 }
59
60 size_t cache_elems_per_layer_gpu = static_cast<size_t>(max_seq_len_arg) *
61 static_cast<size_t>(num_kv_heads) *
62 static_cast<size_t>(head_dim);
63
64 size_t fp32_cache_bytes_per_layer_gpu = cache_elems_per_layer_gpu * sizeof(float);
65 size_t int8_cache_bytes_per_layer_gpu = cache_elems_per_layer_gpu * sizeof(int8_t);
66 size_t num_scales_per_layer_gpu = static_cast<size_t>(max_seq_len_arg) * static_cast<size_t>(num_kv_heads);
67 size_t scales_bytes_per_layer_gpu = num_scales_per_layer_gpu * sizeof(float);
68
69 if (cache_elems_per_layer_gpu == 0 && config.use_kvcache_quantization) {
70 throw std::runtime_error(
71 "KVCache (CUDA INT8): Calculated cache elements per layer is zero. Check parameters.");
72 } else if (cache_elems_per_layer_gpu == 0) {
73 throw std::runtime_error(
74 "KVCache (CUDA FP32): Calculated cache elements per layer is zero. Check parameters.");
75 }
76
77 if (config.use_kvcache_quantization) {
78 Logger::info("Allocating INT8 KVCache + FP32 Scales on GPU for " + std::to_string(num_gpu_layers_to_allocate) +
79 " layers. Data size per layer: " +
80 std::to_string(int8_cache_bytes_per_layer_gpu / (1024.0 * 1024.0)) +
81 " MB. Scales size per layer: " +
82 std::to_string(scales_bytes_per_layer_gpu / (1024.0 * 1024.0)) + " MB");
83 } else {
84 Logger::info("Allocating FP32 KVCache on GPU for " + std::to_string(num_gpu_layers_to_allocate) +
85 " layers, size per layer: " +
86 std::to_string(fp32_cache_bytes_per_layer_gpu / (1024.0 * 1024.0)) +
87 " MB");
88 }
89
90 int gpu_layer_start_model_idx = this->total_model_layers_ - num_gpu_layers_to_allocate;
91 Logger::info("KVCache GPU allocation will target model layers from index " + std::to_string(gpu_layer_start_model_idx) +
92 " to " + std::to_string(gpu_layer_start_model_idx + num_gpu_layers_to_allocate - 1));
93
94 for (int i = 0; i < num_gpu_layers_to_allocate; ++i) {
95 int current_model_idx_for_gpu = gpu_layer_start_model_idx + i;
96
97 if (current_model_idx_for_gpu < 0 || static_cast<size_t>(current_model_idx_for_gpu) >= layers.size()) {
98 Logger::error("KVCache::initialize: Calculated current_model_idx_for_gpu (" + std::to_string(current_model_idx_for_gpu) + ") is out of bounds for layers vector (size " + std::to_string(layers.size()) + "). Skipping this layer.");
99 continue;
100 }
101
102 if (layers[current_model_idx_for_gpu].k_dev_fp32) {
104 "KVCache::initialize: Re-initializing KVCache layer " + std::to_string(current_model_idx_for_gpu) + " K dev fp32 pointer without proper destruction?");
105 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_fp32));
106 layers[current_model_idx_for_gpu].k_dev_fp32 = nullptr;
107 }
108 if (layers[current_model_idx_for_gpu].v_dev_fp32) {
110 "KVCache::initialize: Re-initializing KVCache layer " + std::to_string(current_model_idx_for_gpu) + " V dev fp32 pointer without proper destruction?");
111 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_fp32));
112 layers[current_model_idx_for_gpu].v_dev_fp32 = nullptr;
113 }
114 if (layers[current_model_idx_for_gpu].k_dev_quantized) {
116 "KVCache::initialize: Re-initializing KVCache layer " + std::to_string(current_model_idx_for_gpu) + " K dev quantized pointer without proper destruction?");
117 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_quantized));
118 layers[current_model_idx_for_gpu].k_dev_quantized = nullptr;
119 }
120 if (layers[current_model_idx_for_gpu].v_dev_quantized) {
122 "KVCache::initialize: Re-initializing KVCache layer " + std::to_string(current_model_idx_for_gpu) + " V dev quantized pointer without proper destruction?");
123 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_quantized));
124 layers[current_model_idx_for_gpu].v_dev_quantized = nullptr;
125 }
126 if (layers[current_model_idx_for_gpu].k_dev_scales) {
128 "KVCache::initialize: Re-initializing KVCache layer " + std::to_string(current_model_idx_for_gpu) + " K dev scales pointer without proper destruction?");
129 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_scales));
130 layers[current_model_idx_for_gpu].k_dev_scales = nullptr;
131 }
132 if (layers[current_model_idx_for_gpu].v_dev_scales) {
134 "KVCache::initialize: Re-initializing KVCache layer " + std::to_string(current_model_idx_for_gpu) + " V dev scales pointer without proper destruction?");
135 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_scales));
136 layers[current_model_idx_for_gpu].v_dev_scales = nullptr;
137 }
138
139 if (config.use_kvcache_quantization) {
140 gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].k_dev_quantized, int8_cache_bytes_per_layer_gpu));
141 gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].v_dev_quantized, int8_cache_bytes_per_layer_gpu));
142 gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].k_dev_scales, scales_bytes_per_layer_gpu));
143 gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].v_dev_scales, scales_bytes_per_layer_gpu));
144
145 gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].k_dev_quantized, 0, int8_cache_bytes_per_layer_gpu));
146 gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].v_dev_quantized, 0, int8_cache_bytes_per_layer_gpu));
147 gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].k_dev_scales, 0, scales_bytes_per_layer_gpu));
148 gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].v_dev_scales, 0, scales_bytes_per_layer_gpu));
149 } else {
150 gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].k_dev_fp32, fp32_cache_bytes_per_layer_gpu));
151 gpuErrchk(cudaMalloc(&layers[current_model_idx_for_gpu].v_dev_fp32, fp32_cache_bytes_per_layer_gpu));
152 gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].k_dev_fp32, 0, fp32_cache_bytes_per_layer_gpu));
153 gpuErrchk(cudaMemset(layers[current_model_idx_for_gpu].v_dev_fp32, 0, fp32_cache_bytes_per_layer_gpu));
154 }
155 }
156 Logger::info("KVCache GPU allocation and zeroing complete for " + std::to_string(num_gpu_layers_to_allocate) + " layers.");
157 } else {
158 Logger::info("KVCache: No GPU layers requested for allocation (num_gpu_layers_to_allocate is 0). Skipping GPU KVCache allocation.");
159 this->allocated_num_layers = 0;
160 }
161
162#else
163 Logger::info("KVCache (CPU-only build) initialized with dimensions for " +
164 std::to_string(total_num_model_layers) + " layers, " +
165 std::to_string(max_seq_len_arg) + " seq len, " +
166 std::to_string(num_kv_heads) + " KV heads, " +
167 std::to_string(head_dim) + " head dim");
168#endif
169}
170
171#ifdef HAS_CUDA
172void KVCache::destroy_gpu_resources() {
173 if (allocated_num_layers > 0) {
174 Logger::info("KVCache::destroy_gpu_resources: Freeing KVCache CUDA memory for " +
175 std::to_string(allocated_num_layers) + " allocated layers.");
176 }
177 if (allocated_num_layers > 0 && total_model_layers_ > 0) {
178 int gpu_layer_start_model_idx = total_model_layers_ - allocated_num_layers;
179 if (gpu_layer_start_model_idx < 0) {
180 Logger::warning("KVCache::destroy_gpu_resources: gpu_layer_start_model_idx (" +
181 std::to_string(gpu_layer_start_model_idx) + ") is negative. Clamping to 0.");
182 gpu_layer_start_model_idx = 0;
183 }
184
185 for (int i = 0; i < allocated_num_layers; ++i) {
186 int current_model_idx_for_gpu = gpu_layer_start_model_idx + i;
187 if (static_cast<size_t>(current_model_idx_for_gpu) < layers.size()) {
188 if (layers[current_model_idx_for_gpu].k_dev_quantized) {
189 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_quantized));
190 layers[current_model_idx_for_gpu].k_dev_quantized = nullptr;
191 }
192 if (layers[current_model_idx_for_gpu].v_dev_quantized) {
193 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_quantized));
194 layers[current_model_idx_for_gpu].v_dev_quantized = nullptr;
195 }
196 if (layers[current_model_idx_for_gpu].k_dev_scales) {
197 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].k_dev_scales));
198 layers[current_model_idx_for_gpu].k_dev_scales = nullptr;
199 }
200 if (layers[current_model_idx_for_gpu].v_dev_scales) {
201 gpuErrchk(cudaFree(layers[current_model_idx_for_gpu].v_dev_scales));
202 layers[current_model_idx_for_gpu].v_dev_scales = nullptr;
203 }
204 } else {
205 Logger::warning("KVCache::destroy_gpu_resources: current_model_idx_for_gpu (" +
206 std::to_string(current_model_idx_for_gpu) + ") out of bounds for layers vector (size " +
207 std::to_string(layers.size()) + "). Skipping free for this index.");
208 }
209 }
210 } else if (allocated_num_layers > 0) {
211 Logger::warning("KVCache::destroy_gpu_resources: allocated_num_layers is " + std::to_string(allocated_num_layers) +
212 " but total_model_layers_ is " + std::to_string(total_model_layers_) + ". Skipping GPU free to prevent errors.");
213 }
214 allocated_num_layers = 0;
215}
216#else
217void KVCache::destroy_gpu_resources() {
218 // No-op for CPU-only builds
219}
220#endif
static void warning(const std::string &message)
Definition logger.cpp:139
static void info(const std::string &message)
Definition logger.cpp:135
static void error(const std::string &message)
Definition logger.cpp:143
Logging utilities for the TinyLlama implementation.
int max_batch_size
Definition model.h:159
int max_seq_len_config_
Definition model.h:163
void initialize(const ModelConfig &config, int total_num_model_layers, int num_gpu_layers_to_allocate, int max_seq_len_arg, int num_kv_heads, int head_dim, int max_batch_size_arg=1)
Initializes the KV cache with given dimensions.
Definition kv_cache.cpp:10
int total_model_layers_
Definition model.h:162
std::vector< KVCacheLayer > layers
Definition model.h:152
int seq_len
Definition model.h:155
void destroy_gpu_resources()
Definition kv_cache.cpp:217
std::vector< int > batch_seq_lens
Definition model.h:158
int current_batch_size
Definition model.h:160
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
bool use_kvcache_quantization
Definition model.h:103