TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
api.cpp
1#include "api.h"
2#include "gguf_parser.h"
3#include "model_macros.h"
4
5#include <algorithm>
6#include <chrono>
7#include <cmath>
8#include <cstdint>
9#include <filesystem>
10#include <fstream>
11#include <functional>
12#include <iostream>
13#include <map>
14#include <memory>
15#include <nlohmann/json.hpp>
16#include <numeric>
17#include <random>
18#include <stdexcept>
19#include <string>
20#include <vector>
21#include <iomanip>
22#include <sstream>
23
24#include "logger.h"
25#include "model.h"
26#include "safetensors_loader.h"
27#include "tokenizer.h"
28
29namespace tinyllama {
30
31
32static void log_vector_summary_detailed(const std::string& name,
33 const std::vector<float>& v,
34 int current_pos, int current_layer,
35 int N) {
36 if (v.empty()) {
37 Logger::info(name + " (pos=" + std::to_string(current_pos) + ", layer=" +
38 std::to_string(current_layer) + "): EMPTY VECTOR");
39 return;
40 }
41 std::stringstream ss;
42 ss << name << " (pos=" << std::to_string(current_pos)
43 << ", layer=" << std::to_string(current_layer) << "): size=" << v.size();
44 ss << ", first " << N << ": [";
45 for (int i = 0; i < N && i < v.size(); ++i) {
46 ss << std::fixed << std::setprecision(4) << v[i]
47 << (i == N - 1 || i == v.size() - 1 ? "" : ", ");
48 }
49 ss << "]";
50 float min_val = v[0], max_val = v[0], sum = 0.0f;
51 bool all_finite = true;
52 for (float val : v) {
53 if (val < min_val) min_val = val;
54 if (val > max_val) max_val = val;
55 sum += val;
56 if (!std::isfinite(val)) all_finite = false;
57 }
58 ss << ", min=" << std::fixed << std::setprecision(4) << min_val;
59 ss << ", max=" << std::fixed << std::setprecision(4) << max_val;
60 ss << ", mean=" << std::fixed << std::setprecision(4) << (sum / v.size());
61 ss << ", finite=" << (all_finite ? "yes" : "no");
62 Logger::info(ss.str());
63}
64
65static std::string read_file_api(const std::string& path) {
66 std::filesystem::path fs_path(path);
67 std::ifstream file(fs_path, std::ios::binary);
68 if (!file) throw std::runtime_error("Failed to open file: " + path);
69 return std::string((std::istreambuf_iterator<char>(file)),
70 std::istreambuf_iterator<char>());
71}
72
73static int argmax(const std::vector<float>& v) {
74 if (v.empty()) {
75 Logger::error("Cannot perform argmax on empty vector");
76 return -1;
77 }
78
79 return std::distance(v.begin(), std::max_element(v.begin(), v.end()));
80}
81
82static int sample_top_k_top_p_temperature(const std::vector<float>& logits,
83 float temperature, int top_k,
84 float top_p, std::mt19937& rng) {
85 if (logits.empty()) {
86 throw std::runtime_error("Cannot sample from empty logits.");
87 }
88
89 // If temperature is very low, fall back to greedy sampling
90 if (temperature < 0.05f) {
91 return std::distance(logits.begin(), std::max_element(logits.begin(), logits.end()));
92 }
93
94 int vocab_size = logits.size();
95
96 top_k = (std::min)(top_k, vocab_size);
97 if (top_k <= 0) top_k = vocab_size;
98
99 std::vector<float> scaled_logits(vocab_size);
100 float max_logit = -std::numeric_limits<float>::infinity();
101 for (float logit : logits) max_logit = (std::max)(max_logit, logit);
102
103 // Scale logits to avoid numerical instability
104 const float scale = 1.0f / temperature;
105 for (int i = 0; i < vocab_size; ++i) {
106 scaled_logits[i] = (logits[i] - max_logit) * scale;
107 }
108
109 std::vector<double> probs_double(vocab_size);
110 double sum_exp = 0.0;
111 for (int i = 0; i < vocab_size; ++i) {
112 probs_double[i] = std::exp(static_cast<double>(scaled_logits[i]));
113 sum_exp += probs_double[i];
114 }
115
116 // Normalize probabilities
117 if (sum_exp > 0.0) {
118 for (int i = 0; i < vocab_size; ++i) {
119 probs_double[i] /= sum_exp;
120 }
121 } else {
122 // If all probabilities are zero, fall back to uniform distribution
123 for (int i = 0; i < vocab_size; ++i) {
124 probs_double[i] = 1.0 / vocab_size;
125 }
126 }
127
128 std::vector<std::pair<float, int>> prob_idx(vocab_size);
129 for (int i = 0; i < vocab_size; ++i) {
130 prob_idx[i] = {static_cast<float>(probs_double[i]), i};
131 }
132
133 std::sort(prob_idx.begin(), prob_idx.end(),
134 std::greater<std::pair<float, int>>());
135
136 if (top_k < vocab_size) {
137 prob_idx.resize(top_k);
138 }
139
140 float cumulative_prob = 0.0f;
141 int last_idx = 0;
142 for (int i = 0; i < prob_idx.size(); ++i) {
143 cumulative_prob += prob_idx[i].first;
144 last_idx = i;
145 if (cumulative_prob >= top_p) {
146 break;
147 }
148 }
149 prob_idx.resize(last_idx + 1);
150
151 float final_sum = 0.0f;
152 for (const auto& pi : prob_idx) {
153 final_sum += pi.first;
154 }
155
156 // Renormalize probabilities after top-k and top-p filtering
157 std::vector<float> final_probs(prob_idx.size());
158 if (final_sum > 0.0f) {
159 for (size_t i = 0; i < prob_idx.size(); ++i) {
160 final_probs[i] = prob_idx[i].first / final_sum;
161 }
162 } else {
163 // If all probabilities are zero after filtering, use uniform distribution
164 float uniform_prob = 1.0f / prob_idx.size();
165 std::fill(final_probs.begin(), final_probs.end(), uniform_prob);
166 }
167
168 std::discrete_distribution<int> dist(final_probs.begin(), final_probs.end());
169 int sampled_idx_in_filtered = dist(rng);
170
171 return prob_idx[sampled_idx_in_filtered].second;
172}
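// Worked example (illustrative values only, not taken from any model run):
//   logits = {2.0, 1.0, 0.1}, temperature = 1.0, top_k = 2, top_p = 0.9
//   1. subtract max, scale by 1/T:  {0.0, -1.0, -1.9}
//   2. softmax over all entries:    {0.659, 0.242, 0.099}
//   3. sort, keep top_k = 2:        {0.659, 0.242}
//   4. cumulative sum reaches 0.9 at the second entry, so both survive
//   5. renormalize:                 {0.731, 0.269}
// std::discrete_distribution then draws index 0 about 73% of the time, and the
// original vocabulary id is recovered through prob_idx[...].second.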
173
174
175TinyLlamaSession::TinyLlamaSession(const std::string& model_path_arg,
176 const std::string& tokenizer_path_arg,
177 int threads,
178 int num_gpu_layers_from_cli,
179 bool cli_use_mmap,
180 bool use_kv_quant,
181 bool use_batch_generation,
182 int max_batch_size)
183 : threads_(threads), use_batch_generation_(use_batch_generation),
184 max_batch_size_(max_batch_size), rng_(std::random_device{}()) {
185 Logger::info("TinyLlamaSession constructor entered. Model path: " + model_path_arg +
186 ", Tokenizer path: " + tokenizer_path_arg +
187 ", Threads: " + std::to_string(threads) +
188 ", Num GPU Layers (CLI): " + std::to_string(num_gpu_layers_from_cli) +
189 ", Use MMAP (CLI): " + (cli_use_mmap ? "true" : "false") +
190 ", Use KV Quant (CLI): " + (use_kv_quant ? "true" : "false"));
191
192 std::string effective_model_file_path = model_path_arg;
193 std::string path_for_config_json = model_path_arg;
194
195 ModelConfig initial_model_config_for_model_ctor;
196 initial_model_config_for_model_ctor.use_mmap_for_gguf = cli_use_mmap;
197 initial_model_config_for_model_ctor.use_kvcache_quantization = use_kv_quant;
198 if (num_gpu_layers_from_cli < 0) {
199 initial_model_config_for_model_ctor.num_cpu_offload_layers = 0;
200 } else {
201 initial_model_config_for_model_ctor.num_cpu_offload_layers = num_gpu_layers_from_cli;
202 }
203
204 std::filesystem::path fs_model_path(model_path_arg);
205 bool is_dir = std::filesystem::is_directory(fs_model_path);
206
207 if (is_dir) {
208 Logger::info("Model path is a directory. Assuming SafeTensors model directory: " + model_path_arg);
209 effective_model_file_path = (fs_model_path / "model.safetensors").string();
210 std::string config_json_path_in_dir = (fs_model_path / "config.json").string();
211
212 Logger::info("Derived SafeTensors model file path: " + effective_model_file_path);
213 Logger::info("Path for loading config.json: " + config_json_path_in_dir);
214
215 // Directly populate initial_model_config_for_model_ctor
216 // load_model_config_from_json returns bool and populates the passed ModelConfig&
217 bool st_config_loaded = SafeTensorsLoader::load_model_config_from_json(config_json_path_in_dir, initial_model_config_for_model_ctor);
218
219 if (st_config_loaded) {
220 Logger::info("Successfully loaded config.json directly into initial_model_config_for_model_ctor.");
221 // Log tokenizer_family IMMEDIATELY after loading from config.json
222 std::string family_after_json_load = "UNKNOWN_POST_JSON_LOAD_DIR_CASE";
223 if (initial_model_config_for_model_ctor.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE) family_after_json_load = "LLAMA_SENTENCEPIECE";
224 else if (initial_model_config_for_model_ctor.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) family_after_json_load = "LLAMA3_TIKTOKEN";
225 Logger::info("[API_CPP_POST_JSON_LOAD_DIR_CASE] Tokenizer family in initial_model_config_for_model_ctor: " + family_after_json_load);
226 // tokenizer_family should now be set in initial_model_config_for_model_ctor
227 // num_hidden_layers is also now set from config.json
228 } else {
229 Logger::warning("Failed to load config.json for SafeTensors. initial_model_config_for_model_ctor will have defaults/CLI overrides for some fields, tokenizer_family likely UNKNOWN.");
230 // If config.json fails, num_hidden_layers might be 0 or default, which will affect cpu offload calculation.
231 // It's crucial config.json loads for correct layer counts.
232 }
233
234 // Apply CLI overrides for mmap and GPU layers. GPU layer logic depends on total_hidden_layers from config.
235 initial_model_config_for_model_ctor.use_mmap_for_gguf = cli_use_mmap;
236
237 int total_layers_from_config = initial_model_config_for_model_ctor.num_hidden_layers;
238 if (total_layers_from_config <= 0 && st_config_loaded) {
239 Logger::warning("config.json loaded but num_hidden_layers is <= 0. GPU offload logic might be incorrect.");
240 } else if (total_layers_from_config <= 0 && !st_config_loaded) {
241 Logger::warning("config.json NOT loaded and num_hidden_layers is <= 0 (default). GPU offload may not work as expected. Model load will likely fail.");
242 // If config.json didn't load, total_layers_from_config is likely 0 from default ModelConfig.
243 // The TinyLlamaModel constructor will ultimately use its own parsed config, but this intermediate step needs care.
244 }
245
246 if (num_gpu_layers_from_cli < 0) { // -1 signifies all layers on GPU
247 initial_model_config_for_model_ctor.num_cpu_offload_layers = 0;
248 } else if (num_gpu_layers_from_cli == 0) { // 0 signifies all layers on CPU
249 initial_model_config_for_model_ctor.num_cpu_offload_layers = total_layers_from_config; // All layers offloaded to CPU
250 } else { // N > 0 signifies N layers on GPU
251 if (total_layers_from_config > 0) {
252 initial_model_config_for_model_ctor.num_cpu_offload_layers = total_layers_from_config - num_gpu_layers_from_cli;
253 } else {
254 // Cannot determine actual GPU layer count if total_layers_from_config is unknown.
255 // Pass the CLI hint, TinyLlamaModel ctor will deal with it against its own parsed config.
256 initial_model_config_for_model_ctor.num_cpu_offload_layers = num_gpu_layers_from_cli;
257 Logger::warning("Total hidden layers unknown from config.json before model load; passing num_gpu_layers_from_cli as num_cpu_offload_layers hint.");
258 }
259 }
260 // Clamp num_cpu_offload_layers
261 if (total_layers_from_config > 0) {
262 initial_model_config_for_model_ctor.num_cpu_offload_layers = std::max(0, std::min(initial_model_config_for_model_ctor.num_cpu_offload_layers, total_layers_from_config));
263 }
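// Illustrative mapping for a hypothetical 22-layer model (example numbers,
// not read from any config):
//   num_gpu_layers_from_cli = -1  -> num_cpu_offload_layers = 0   (all layers on GPU)
//   num_gpu_layers_from_cli =  0  -> num_cpu_offload_layers = 22  (all layers on CPU)
//   num_gpu_layers_from_cli = 16  -> num_cpu_offload_layers = 22 - 16 = 6
// The clamp above then keeps the value inside [0, total_layers_from_config].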
264
265 initial_model_config_for_model_ctor.is_gguf_file_loaded = false;
266
267 SafeTensorsLoader st_loader(effective_model_file_path);
268 model_ = std::make_unique<TinyLlamaModel>(initial_model_config_for_model_ctor, st_loader);
269 config_ = model_->get_config();
270 config_.is_gguf_file_loaded = false; // Ensure this is set for session's copy too
271
272 config_.use_kvcache_quantization = use_kv_quant; // Re-apply CLI/constructor preference
273
274 Logger::info("TinyLlamaSession: Finalizing ModelConfig for KVCache initialization. use_kvcache_quantization set to: " +
275 std::string(config_.use_kvcache_quantization ? "true" : "false"));
276
283
284 } else { // Not a directory, assume it's a file path and check extension
285 std::string extension = fs_model_path.extension().string();
286 std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower);
287
288 if (extension == ".gguf") {
289 Logger::info("GGUF model type detected by extension for Session constructor: " + model_path_arg);
290 model_ = std::make_unique<TinyLlamaModel>(initial_model_config_for_model_ctor, model_path_arg);
291 config_ = model_->get_config();
292 } else if (extension == ".safetensors") {
293 Logger::info("SafeTensors model type detected by extension for Session constructor (file case): " + model_path_arg);
294 effective_model_file_path = model_path_arg;
295
296 bool st_config_loaded = SafeTensorsLoader::load_model_config_from_json(effective_model_file_path, initial_model_config_for_model_ctor);
297 if (st_config_loaded) {
298 Logger::info("Successfully loaded config.json for SafeTensors in Session ctor (file case).");
299 // Log tokenizer_family IMMEDIATELY after loading from config.json (file case)
300 std::string family_after_json_load_file_case = "UNKNOWN_POST_JSON_LOAD_FILE_CASE";
301 if (initial_model_config_for_model_ctor.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE) family_after_json_load_file_case = "LLAMA_SENTENCEPIECE";
302 else if (initial_model_config_for_model_ctor.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) family_after_json_load_file_case = "LLAMA3_TIKTOKEN";
303 Logger::info("[API_CPP_POST_JSON_LOAD_FILE_CASE] Tokenizer family in initial_model_config_for_model_ctor: " + family_after_json_load_file_case);
304 // tokenizer_family and num_hidden_layers are now set in initial_model_config_for_model_ctor
305 } else {
306 Logger::warning("Failed to load config.json for SafeTensors in Session ctor (file case). Model will use defaults or CLI overrides.");
307 }
308
309 initial_model_config_for_model_ctor.use_mmap_for_gguf = cli_use_mmap;
310 // Correctly calculate num_cpu_offload_layers based on total_layers_from_config
311 int total_layers_from_config_file_case = initial_model_config_for_model_ctor.num_hidden_layers;
312 if (num_gpu_layers_from_cli < 0) {
313 initial_model_config_for_model_ctor.num_cpu_offload_layers = 0;
314 } else if (num_gpu_layers_from_cli == 0) {
315 initial_model_config_for_model_ctor.num_cpu_offload_layers = total_layers_from_config_file_case;
316 } else {
317 if (total_layers_from_config_file_case > 0) {
318 initial_model_config_for_model_ctor.num_cpu_offload_layers = total_layers_from_config_file_case - num_gpu_layers_from_cli;
319 } else {
320 initial_model_config_for_model_ctor.num_cpu_offload_layers = num_gpu_layers_from_cli;
321 Logger::warning("Total hidden layers unknown from config.json (file case); passing num_gpu_layers_from_cli as num_cpu_offload_layers hint.");
322 }
323 }
324 if (total_layers_from_config_file_case > 0) {
325 initial_model_config_for_model_ctor.num_cpu_offload_layers = std::max(0, std::min(initial_model_config_for_model_ctor.num_cpu_offload_layers, total_layers_from_config_file_case));
326 }
327
328 initial_model_config_for_model_ctor.is_gguf_file_loaded = false;
329
330 SafeTensorsLoader st_loader(effective_model_file_path);
331 model_ = std::make_unique<TinyLlamaModel>(initial_model_config_for_model_ctor, st_loader);
332 config_ = model_->get_config();
333 config_.is_gguf_file_loaded = false; // Ensure this is set for session's copy too
334
335 config_.use_kvcache_quantization = use_kv_quant; // Re-apply CLI/constructor preference
336
337 Logger::info("TinyLlamaSession: Finalizing ModelConfig for KVCache initialization. use_kvcache_quantization set to: " +
338 std::string(config_.use_kvcache_quantization ? "true" : "false"));
339
340 // Initialize KVCache with potentially updated config_ (from model load)
341 // and the now-set use_kvcache_quantization flag.
348 } else {
349 throw std::runtime_error("Unsupported model file type or extension in Session constructor: " + model_path_arg +
350 ". Please provide a directory for SafeTensors, a .gguf file, or a .safetensors file.");
351 }
352 }
353
354 if (!model_) {
355 throw std::runtime_error("Model pointer is null after instantiation attempt in Session constructor.");
356 }
357
358 try {
 359 if (config_.is_gguf_file_loaded) { // GGUF case: tokenizer comes from embedded metadata
 360 const GGUFData* gguf_data = model_->get_gguf_data();
361 if (!gguf_data) {
362 throw std::runtime_error("GGUF model loaded but GGUFData is null in Session constructor.");
363 }
364 tokenizer_ = std::make_unique<Tokenizer>(*gguf_data, config_);
365 Logger::info("Tokenizer initialized from GGUF metadata.");
366 } else { // SafeTensors (either from directory or direct .safetensors file)
367 std::filesystem::path p_tokenizer_arg(tokenizer_path_arg);
368 std::string tokenizer_dir = p_tokenizer_arg.parent_path().string();
369 if (tokenizer_dir.empty()) {
370 tokenizer_dir = ".";
371 }
372
373 std::string vocab_json_path = (std::filesystem::path(tokenizer_dir) / "tokenizer.json").string();
374 // The model_path for the tokenizer constructor should be the actual sentencepiece model file, e.g. data/tokenizer.model
375 // tokenizer_path_arg already holds this (e.g. data/tokenizer.model)
376 std::string sp_model_path = tokenizer_path_arg;
377
378 Logger::info("Initializing Tokenizer for SafeTensors. Vocab JSON path: " + vocab_json_path + ", SP Model path: " + sp_model_path);
379 // Log the tokenizer_family from the config_ that will be passed to the Tokenizer
380 std::string family_to_log = "UNKNOWN_IN_API_CPP";
381 if (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE) family_to_log = "LLAMA_SENTENCEPIECE";
382 else if (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) family_to_log = "LLAMA3_TIKTOKEN";
383 Logger::info("[API_CPP_TOKENIZER_INIT] Tokenizer family from session config for SafeTensors: " + family_to_log);
384
385 tokenizer_ = std::make_unique<Tokenizer>(vocab_json_path, sp_model_path, config_);
386 Logger::info("Tokenizer initialized from external files for SafeTensors model.");
387 }
388 } catch (const std::exception& e) {
389 throw std::runtime_error(std::string("Failed to initialize Tokenizer: ") + e.what());
390 }
391
392 if (!tokenizer_) {
393 throw std::runtime_error("Tokenizer pointer is null after instantiation attempt.");
394 }
395
396 eos_token_id_ = config_.eos_token_id; // Use the session's config_ which is now model's config
397
398 const ModelConfig& final_model_config = model_->get_config(); // Explicitly use model's config
399 int total_model_layers = final_model_config.num_hidden_layers;
400
401 int effective_cpu_offload_layers = final_model_config.num_cpu_offload_layers;
402 int gpu_layers_for_kvcache = total_model_layers - effective_cpu_offload_layers;
403 if (gpu_layers_for_kvcache < 0) gpu_layers_for_kvcache = 0; // Sanity check, should not happen if model ctor is correct
404 if (gpu_layers_for_kvcache > total_model_layers) gpu_layers_for_kvcache = total_model_layers; // Sanity
405
406 Logger::info("[Session KVCache Init] Total Layers: " + std::to_string(total_model_layers) +
407 ", Effective CPU Offload by Model: " + std::to_string(effective_cpu_offload_layers) +
408 ", GPU Layers for KVCache: " + std::to_string(gpu_layers_for_kvcache));
409
410 if (total_model_layers <= 0) {
411 throw std::runtime_error("Model config has zero or negative num_hidden_layers before KVCache init.");
412 }
413 if (final_model_config.num_attention_heads <= 0) {
414 throw std::runtime_error("Model config has zero or negative num_attention_heads before KVCache init.");
415 }
416
417 int head_dim = final_model_config.hidden_size / final_model_config.num_attention_heads;
418
419 kv_cache_.initialize(final_model_config, // Total layers for CPU part of KVCache
420 total_model_layers,
421 gpu_layers_for_kvcache, // Actual GPU layers for device memory
422 final_model_config.max_position_embeddings,
423 final_model_config.num_key_value_heads,
424 head_dim,
 425 config_.use_kvcache_quantization); // assumed final argument: KV-cache quantization flag
 426 Logger::info("TinyLlamaSession initialization complete (after KVCache init).");
427}
428
 429TinyLlamaSession::~TinyLlamaSession() {
 430 Logger::info("TinyLlamaSession: Destroyed.");
431}
432
433std::string TinyLlamaSession::generate(const std::string& user_prompt, int steps,
434 float temperature,
435 int top_k, float top_p,
436 const std::string& system_prompt_arg, // Renamed for clarity inside function
437 bool apply_q_a_format_cli_hint) { // Renamed for clarity
438 auto t_start = std::chrono::high_resolution_clock::now(); // Start timing
439
440 generated_text_for_api_return_.clear(); // Clear for new generation
441 generated_stream_.str(""); // Clear for new generation
442 generated_stream_.clear(); // Clear error flags
443
444 Logger::info("[Generate API] User prompt: \"" + user_prompt + "\", System prompt: \"" + system_prompt_arg + "\", Steps: " + std::to_string(steps));
445
446 if (!model_ || !tokenizer_) {
447 throw std::runtime_error("Model or tokenizer not loaded.");
448 }
449
450 std::string final_prompt_for_encoding;
451 bool used_chat_template = false;
452
453 // Log conditions for chat template application
454 if (tokenizer_) {
455 bool gguf_template_empty = tokenizer_->get_gguf_chat_template().empty();
456 Logger::info("[Generate API] GGUF chat template from tokenizer is empty: " + std::string(gguf_template_empty ? "true" : "false"));
457 if (!gguf_template_empty) {
458 Logger::info("[Generate API] GGUF Template Content (first 100 chars): " + tokenizer_->get_gguf_chat_template().substr(0, 100));
459 }
460 } else {
461 Logger::warning("[Generate API] Tokenizer is null before checking chat template!");
462 }
463 std::string family_log_str = "UNKNOWN";
464 if (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE) family_log_str = "LLAMA_SENTENCEPIECE";
465 else if (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) family_log_str = "LLAMA3_TIKTOKEN";
466 Logger::info("[Generate API] Configured tokenizer_family: " + family_log_str);
467
468 // New Priority Logic:
469 // Priority 1 (NEW): Legacy Q/A formatting if CLI hint is true
470 if (apply_q_a_format_cli_hint) {
471 Logger::info("[Generate API] Using legacy Q/A formatting (CLI Hint is true - Priority 1).");
472 std::string temp_prompt = user_prompt;
473 if (!system_prompt_arg.empty()) {
474 temp_prompt = system_prompt_arg + "\\n\\nQ: " + user_prompt + "\\nA:";
475 } else {
476 temp_prompt = "Q: " + user_prompt + "\\nA:";
477 }
478 final_prompt_for_encoding = temp_prompt;
479 used_chat_template = false; // Q/A is not a 'chat template' in the GGUF sense
480 }
481 // Priority 2 (WAS 1): GGUF Chat Template from Tokenizer (only if Q/A hint is false)
482 else if (tokenizer_ && !tokenizer_->get_gguf_chat_template().empty()) {
483 std::string gguf_template_content = tokenizer_->get_gguf_chat_template();
484 bool is_llama_sentencepiece_family = (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE);
485 bool looks_like_jinja = (gguf_template_content.find("{%") != std::string::npos);
486
487 if (is_llama_sentencepiece_family && looks_like_jinja) {
488 Logger::info("[Generate API] Detected LLAMA_SENTENCEPIECE model with a Jinja-like GGUF template. Forcing Q/A format to avoid C++ Jinja processing issues (Priority 2 Override).");
489 std::string temp_prompt = user_prompt;
490 if (!system_prompt_arg.empty()) {
491 temp_prompt = system_prompt_arg + "\\\\n\\\\nQ: " + user_prompt + "\\\\nA:";
492 } else {
493 temp_prompt = "Q: " + user_prompt + "\\\\nA:";
494 }
495 final_prompt_for_encoding = temp_prompt;
496 used_chat_template = false;
497 } else {
498 Logger::info("[Generate API] Using GGUF chat template from tokenizer (Q/A Hint false - Priority 2).");
499 final_prompt_for_encoding = tokenizer_->apply_chat_template(user_prompt, system_prompt_arg, config_);
500 used_chat_template = true;
501 }
502 }
503 // Priority 3 (WAS 2): Llama 3 family specific template (only if Q/A hint false and no GGUF template)
 504 else if (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) {
 505 Logger::info("[Generate API] Llama 3 tokenizer family detected, using apply_chat_template (Q/A Hint false, No GGUF template - Priority 3).");
506 final_prompt_for_encoding = tokenizer_->apply_chat_template(user_prompt, system_prompt_arg, config_);
507 used_chat_template = true;
508 }
509 // Priority 4 (WAS 3/4 depending on apply_q_a_format_cli_hint): Raw prompt (if all above are false)
510 else {
511 Logger::info("[Generate API] No applicable template/hint. Using user prompt as is (prepending system prompt if available - Priority 4).");
512 if (!system_prompt_arg.empty()) {
513 final_prompt_for_encoding = system_prompt_arg + "\\n\\n" + user_prompt;
514 } else {
515 final_prompt_for_encoding = user_prompt;
516 }
517 used_chat_template = false;
518 }
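// Recap of the priority chain above:
//   1. apply_q_a_format_cli_hint == true  -> legacy "Q: ... A:" formatting
//   2. non-empty GGUF chat template       -> tokenizer_->apply_chat_template(...),
//      except SentencePiece models whose template looks like Jinja, which are
//      forced back to the Q/A formatting
//   3. LLAMA3_TIKTOKEN tokenizer family   -> tokenizer_->apply_chat_template(...)
//   4. otherwise                          -> raw user prompt, with the system
//      prompt prepended when one is given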
519
520 Logger::debug("[Generate API] Final prompt for encoding (first 100 chars): \\\"" + final_prompt_for_encoding.substr(0, 100) + "\\\"");
521
522 std::vector<int> tokens = tokenizer_->encode(final_prompt_for_encoding, true, false, Tokenizer::PreTokenizeMethod::DEFAULT); // add_bos=true, add_eos=false (EOS handled by loop)
523
524 if (tokens.empty()) {
525 Logger::warning("Tokenization resulted in empty ID list for prompt: " +
526 final_prompt_for_encoding);
527 return "";
528 }
529
530 int num_prompt_tokens = tokens.size();
531 Logger::info("[Generate API] Number of prompt tokens: " + std::to_string(num_prompt_tokens));
532
533 int total_steps = num_prompt_tokens + steps -1; // Max total tokens including prompt
534 int generated_count = 0;
535 int next_token_id = -1;
536
537 std::vector<float> logits; // Declare logits here, to be populated by prefill or loop
538 std::vector<int> generated_token_ids; // Track generated tokens separately
539
540 kv_cache_.clear_data(); // Clear K/V vector data for all layers
541 kv_cache_.seq_len = 0; // Reset KVCache logical sequence length for new sequence
542
543 std::vector<float> current_data_host; // To hold embedding or output of CPU layers
544 int start_pos_for_loop = 0;
545
546 // Prefill logic: Use batch prefill for longer prompts to maintain coherence
547 // For single sequences, only use batch prefill for very long prompts (>= 32 tokens)
548 // to avoid the overhead of CUDA batch processing for short sequences
549 bool prefill_enabled = num_prompt_tokens >= 32;
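// Example: a 10-token prompt skips prefill and is processed token-by-token in the
// loop below; a 64-token prompt is embedded as one batch, pushed through the
// CPU/GPU prefill path, and only the logits of its last position are sampled.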
550
 551 if (prefill_enabled) {
552 Logger::info("[Generate API] Prefill enabled. num_prompt_tokens: " + std::to_string(num_prompt_tokens) +
553 ", num_cpu_offload_layers: " + std::to_string(config_.num_cpu_offload_layers) +
554 ", total_hidden_layers: " + std::to_string(config_.num_hidden_layers));
555
556 std::vector<float> batch_initial_embeddings(num_prompt_tokens * config_.hidden_size);
557 for (int i = 0; i < num_prompt_tokens; ++i) {
558 std::vector<float> token_embedding = model_->lookup_embedding(tokens[i]);
559 if (token_embedding.empty()) {
560 Logger::error("Prefill: Embedding lookup returned empty vector for token ID: " + std::to_string(tokens[i]) + " at prompt pos " + std::to_string(i));
561 return ""; // Or handle error appropriately
562 }
563 std::copy(token_embedding.begin(), token_embedding.end(), batch_initial_embeddings.begin() + i * config_.hidden_size);
564 }
565
566 // If there are CPU layers to process for prefill
567 std::vector<float> cpu_processed_embeddings;
 568 if (config_.num_cpu_offload_layers > 0) {
 569 Logger::info("[Generate API] Prefill: Processing " + std::to_string(config_.num_cpu_offload_layers) + " CPU layers for the batch.");
570 cpu_processed_embeddings = model_->forward_cpu_batch(batch_initial_embeddings, num_prompt_tokens, config_.num_cpu_offload_layers, 0, &kv_cache_);
571 if (cpu_processed_embeddings.empty()) {
572 Logger::error("Prefill: forward_cpu_batch returned empty or failed.");
573 return "";
574 }
575 } else {
576 cpu_processed_embeddings = batch_initial_embeddings; // No CPU layers, pass embeddings directly
577 }
578
579 // If all layers are CPU layers (i.e., num_gpu_layers == 0)
 580 if (config_.num_cpu_offload_layers >= config_.num_hidden_layers) {
 581 Logger::info("[Generate API] Prefill: All layers are on CPU. Getting logits from final CPU layer output.");
582 std::vector<float> batch_logits = model_->forward_cpu_logits_batch(cpu_processed_embeddings, num_prompt_tokens);
583 if (batch_logits.empty() || batch_logits.size() % config_.vocab_size != 0) {
584 Logger::error("Prefill: forward_cpu_logits_batch returned invalid logits.");
585 return "";
586 }
587 // Extract logits for the last token of the prompt
588 logits.assign(batch_logits.begin() + (num_prompt_tokens - 1) * config_.vocab_size,
589 batch_logits.begin() + num_prompt_tokens * config_.vocab_size);
590 } else { // GPU layers exist and need to be processed
591#ifdef HAS_CUDA
592 Logger::info("[Generate API] Prefill: Processing GPU layers for the batch.");
593 // Copy the (potentially CPU-processed) embeddings to the device
594 float* d_temp_batch_embeddings = nullptr;
595 size_t batch_embeddings_size_bytes = cpu_processed_embeddings.size() * sizeof(float);
596
597 if (batch_embeddings_size_bytes == 0) {
598 Logger::error("Prefill: cpu_processed_embeddings is empty, cannot proceed with GPU batch prefill.");
599 return ""; // Or handle error appropriately
600 }
601
602 gpuErrchk(cudaMalloc(&d_temp_batch_embeddings, batch_embeddings_size_bytes));
603 if (!d_temp_batch_embeddings) {
604 Logger::error("Prefill: cudaMalloc failed for d_temp_batch_embeddings.");
605 return ""; // Or handle error appropriately
606 }
607
608 gpuErrchk(cudaMemcpy(d_temp_batch_embeddings, cpu_processed_embeddings.data(),
609 batch_embeddings_size_bytes, cudaMemcpyHostToDevice));
610
611 logits = model_->forward_device_batch_prefill(d_temp_batch_embeddings, num_prompt_tokens, start_pos_for_loop, &kv_cache_, 0);
612
613 if (d_temp_batch_embeddings) {
614 gpuErrchk(cudaFree(d_temp_batch_embeddings));
615 }
616#else
617 Logger::error("[Generate API] GPU layers requested but CUDA not available. Cannot proceed.");
618 return "";
619#endif
620 }
621
622 if (logits.empty()) {
623 Logger::error("Prefill: Logits are empty after prefill processing.");
624 return ""; // Critical error
625 }
626 next_token_id = sample_top_k_top_p_temperature(logits, temperature, top_k, top_p, rng_);
627 generated_token_ids.push_back(next_token_id); // Track generated token
628 generated_count++;
629
630 // Stream the first generated token from prefill
631 generated_stream_ << tokenizer_->decode({next_token_id}, false);
632 generated_text_for_api_return_ += tokenizer_->decode({next_token_id}, false);
633
634 start_pos_for_loop = num_prompt_tokens; // Next token will be at num_prompt_tokens
635 kv_cache_.seq_len = num_prompt_tokens; // KVCache is now filled up to num_prompt_tokens
636
637 Logger::info("[Generate API] Prefill completed. next_token_id: " + std::to_string(next_token_id) +
638 ", Decoded: \"" + tokenizer_->decode({next_token_id}, false) + "\"" +
639 ", start_pos_for_loop set to: " + std::to_string(start_pos_for_loop));
640 }
641
642 for (int pos = start_pos_for_loop; pos < total_steps; ++pos) {
643 if (pos >= config_.max_position_embeddings) {
644 Logger::warning("Reached max sequence length (" +
645 std::to_string(config_.max_position_embeddings) +
646 "). Stopping.");
647 break;
648 }
649
650 int input_token_id;
651
652 if (pos == num_prompt_tokens && start_pos_for_loop == num_prompt_tokens) {
653 // This is the first token *after* a successful prefill.
654 // `next_token_id` was already sampled using prefill's logits from the *last prompt token*.
655 // This `next_token_id` is the actual input for the current position `pos`.
656 input_token_id = next_token_id;
657 Logger::debug("[Generate Loop] First token post-prefill. Using prefill's next_token_id: " + std::to_string(input_token_id) + " for pos " + std::to_string(pos));
658 } else {
659 // Standard iterative logic:
660 // If prefill didn't run (start_pos_for_loop == 0):
661 // For pos < num_prompt_tokens: use prompt token.
662 // For pos >= num_prompt_tokens: use previously sampled next_token_id.
663 // If prefill did run (start_pos_for_loop == num_prompt_tokens):
664 // This 'else' block is for pos > num_prompt_tokens, so use previously sampled next_token_id.
665 input_token_id = (pos < num_prompt_tokens && start_pos_for_loop == 0) ? tokens[pos] : next_token_id;
666 if (start_pos_for_loop == 0 && pos < num_prompt_tokens) {
667 Logger::debug("[Generate Loop] No prefill, prompt token. Using tokens[" + std::to_string(pos) + "]: " + std::to_string(input_token_id) + " for pos " + std::to_string(pos));
668 } else {
669 Logger::debug("[Generate Loop] Standard generation. Using previously sampled next_token_id: " + std::to_string(input_token_id) + " for pos " + std::to_string(pos));
670 }
671 }
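// Illustrative timeline for a 5-token prompt:
//   without prefill: pos 0..4 feed tokens[0..4]; the first sample is drawn at
//     pos 4 (the last prompt token), and pos 5+ feed the previously sampled id.
//   with prefill: the loop starts directly at pos 5, whose input is the token
//     already sampled from the prefill logits above.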
672
673 current_data_host = model_->lookup_embedding(input_token_id);
674 if (pos == 14 || pos == 15 || pos == 16) {
675 log_vector_summary_detailed("[API_CPP GenLoop] current_data_host after lookup_embedding for input_token_id=" + std::to_string(input_token_id),
676 current_data_host, pos, -100, 8);
677 }
678
679 if (current_data_host.empty()) {
680 Logger::error("Embedding lookup returned empty vector for token ID: " + std::to_string(input_token_id) + " at pos " + std::to_string(pos));
681 break;
682 }
683
684 // Mixed-mode forward pass logic
 685 if (config_.num_cpu_offload_layers > 0 && config_.num_cpu_offload_layers < config_.num_hidden_layers) {
 686#ifdef HAS_CUDA
687 // Mixed CPU/GPU mode: First process CPU layers, then GPU layers
688 Logger::debug("[Mixed Mode] Processing " + std::to_string(config_.num_cpu_offload_layers) + " CPU layers first");
689 std::vector<float> intermediate_activations = model_->forward(current_data_host, pos, &kv_cache_, nullptr);
690
691 Logger::debug("[Mixed Mode] CPU layers complete, transferring to GPU for remaining layers");
692 gpuErrchk(cudaMemcpy(model_->get_x_dev(), intermediate_activations.data(), intermediate_activations.size() * sizeof(float), cudaMemcpyHostToDevice));
693 logits = model_->forward_device(model_->get_x_dev(), pos, &kv_cache_, nullptr);
694#else
695 Logger::error("[Mixed Mode] Mixed CPU/GPU mode requested but CUDA not available. Cannot proceed.");
696 break;
697#endif
698 } else if (config_.num_cpu_offload_layers == 0) {
699#ifdef HAS_CUDA
700 // GPU-only mode
701 gpuErrchk(cudaMemcpy(model_->get_x_dev(), current_data_host.data(), current_data_host.size() * sizeof(float), cudaMemcpyHostToDevice));
702 logits = model_->forward_device(model_->get_x_dev(), pos, &kv_cache_, nullptr);
703#else
704 Logger::error("[GPU-only Mode] GPU-only mode requested but CUDA not available. Cannot proceed.");
705 break;
706#endif
707 } else {
708 // CPU-only mode
709 logits = model_->forward(current_data_host, pos, &kv_cache_, nullptr);
710 }
711
712 // Sampling logic: Only sample if we're at the last prompt token or generating
713 if (pos == num_prompt_tokens - 1 || pos >= num_prompt_tokens) {
714 next_token_id = sample_top_k_top_p_temperature(logits, temperature, top_k, top_p, rng_);
715
716 // Only add to generated tokens if we're actually generating (not just finishing prompt)
717 if (pos >= num_prompt_tokens) {
718 generated_token_ids.push_back(next_token_id);
719 generated_count++;
720
721 // Stream the generated token
722 generated_stream_ << tokenizer_->decode({next_token_id}, false);
723 generated_text_for_api_return_ += tokenizer_->decode({next_token_id}, false);
724 } else {
725 // This is the first token sampled from the last prompt position
726 generated_token_ids.push_back(next_token_id);
727 generated_count++;
728
729 // Stream the first generated token
730 generated_stream_ << tokenizer_->decode({next_token_id}, false);
731 generated_text_for_api_return_ += tokenizer_->decode({next_token_id}, false);
732
733 Logger::info("[Generate API] First token sampled from prompt: " + std::to_string(next_token_id) +
734 ", Decoded: \"" + tokenizer_->decode({next_token_id}, false) + "\"");
735 }
736 }
737
738 if (next_token_id == eos_token_id_ && pos >= num_prompt_tokens) { // EOS only if we are generating
739 Logger::info("EOS token (" + std::to_string(eos_token_id_) +
740 ") sampled at pos " + std::to_string(pos) + ". Stopping.");
741 break;
742 }
743
744 if (generated_count >= steps) { // steps is max new tokens
745 Logger::info("Reached max generation steps (" + std::to_string(steps) + "). Stopping.");
746 break;
747 }
748
749 // KVCache seq_len update for single token pass (already handled by batch prefill for its tokens)
750 // This needs to happen *after* the forward pass for the current 'pos' has updated the cache for 'pos'.
751 // So, after processing 'pos', the cache now contains information up to and including 'pos'.
752 // The length of the sequence in the cache is pos + 1.
753 if (!prefill_enabled || pos >= num_prompt_tokens) { // only update if not prefill, or if we are past prompt token processing in prefill case
754 kv_cache_.seq_len = pos + 1;
755 // Logger::debug("[Generate Loop] KVCache seq_len updated to: " + std::to_string(kv_cache_.seq_len) + " after processing pos " + std::to_string(pos));
756 }
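// Example: after the forward pass for pos == 7 the cache holds entries for
// positions 0..7, so kv_cache_.seq_len becomes 8.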
757 }
758
759 // Log all generated IDs before decoding
760 std::string generated_ids_str = "[Generated IDs Pre-Decode] ";
761 for(int gen_id : generated_token_ids) {
762 generated_ids_str += std::to_string(gen_id) + " ";
763 }
764 Logger::debug(generated_ids_str);
765
766 std::string result = tokenizer_->decode(generated_token_ids, true);
767 Logger::info("Generated response: " + result);
768
769 auto t_end = std::chrono::high_resolution_clock::now(); // End timing
770 double time_taken_ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
771
772 // Create a string with the desired precision for the time
773 std::ostringstream time_ss;
774 time_ss << std::fixed << std::setprecision(4) << time_taken_ms;
775 Logger::info("[INFO] Total generation processing time: " + time_ss.str() + " ms");
776
777 return result;
778}
779
780std::vector<std::string> TinyLlamaSession::generate_batch(const std::vector<std::string>& prompts,
781 int steps,
782 float temperature,
783 int top_k, float top_p,
784 const std::string& system_prompt_arg,
785 bool apply_q_a_format_cli_hint) {
786 auto t_start = std::chrono::high_resolution_clock::now();
787
788 if (prompts.empty()) {
789 throw std::runtime_error("Cannot process empty prompts vector for batch generation.");
790 }
791
792 if (static_cast<int>(prompts.size()) > max_batch_size_) {
793 throw std::runtime_error("Batch size " + std::to_string(prompts.size()) +
794 " exceeds maximum batch size " + std::to_string(max_batch_size_));
795 }
796
797 Logger::info("[Batch Generate API] Processing " + std::to_string(prompts.size()) +
798 " prompts in batch. Steps: " + std::to_string(steps));
799
800 if (!model_ || !tokenizer_) {
801 throw std::runtime_error("Model or tokenizer not loaded for batch generation.");
802 }
803
804 // Process each prompt to create final prompts and tokenize them
805 std::vector<std::string> final_prompts(prompts.size());
806 std::vector<std::vector<int>> all_tokens(prompts.size());
807 std::vector<int> prompt_lengths(prompts.size());
808 int max_prompt_length = 0;
809
810 for (size_t i = 0; i < prompts.size(); ++i) {
811 // Apply same prompt processing logic as single generate()
812 std::string final_prompt_for_encoding;
813 bool used_chat_template = false;
814
815 // Same priority logic as single generate()
816 if (apply_q_a_format_cli_hint) {
817 Logger::info("[Batch Generate API] Using legacy Q/A formatting for prompt " + std::to_string(i));
818 if (!system_prompt_arg.empty()) {
819 final_prompt_for_encoding = system_prompt_arg + "\\n\\nQ: " + prompts[i] + "\\nA:";
820 } else {
821 final_prompt_for_encoding = "Q: " + prompts[i] + "\\nA:";
822 }
823 } else if (tokenizer_ && !tokenizer_->get_gguf_chat_template().empty()) {
824 std::string gguf_template_content = tokenizer_->get_gguf_chat_template();
825 bool is_llama_sentencepiece_family = (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE);
826 bool looks_like_jinja = (gguf_template_content.find("{%") != std::string::npos);
827
828 if (is_llama_sentencepiece_family && looks_like_jinja) {
829 Logger::info("[Batch Generate API] Using Q/A format override for prompt " + std::to_string(i));
830 if (!system_prompt_arg.empty()) {
831 final_prompt_for_encoding = system_prompt_arg + "\\\\n\\\\nQ: " + prompts[i] + "\\\\nA:";
832 } else {
833 final_prompt_for_encoding = "Q: " + prompts[i] + "\\\\nA:";
834 }
835 } else {
836 Logger::info("[Batch Generate API] Using GGUF chat template for prompt " + std::to_string(i));
837 final_prompt_for_encoding = tokenizer_->apply_chat_template(prompts[i], system_prompt_arg, config_);
838 used_chat_template = true;
839 }
 839 }
 840 } else if (config_.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) {
 841 Logger::info("[Batch Generate API] Using Llama 3 chat template for prompt " + std::to_string(i));
842 final_prompt_for_encoding = tokenizer_->apply_chat_template(prompts[i], system_prompt_arg, config_);
843 used_chat_template = true;
844 } else {
845 Logger::info("[Batch Generate API] Using raw prompt for prompt " + std::to_string(i));
846 if (!system_prompt_arg.empty()) {
847 final_prompt_for_encoding = system_prompt_arg + "\\n\\n" + prompts[i];
848 } else {
849 final_prompt_for_encoding = prompts[i];
850 }
851 }
852
853 final_prompts[i] = final_prompt_for_encoding;
854 all_tokens[i] = tokenizer_->encode(final_prompt_for_encoding, true, false, Tokenizer::PreTokenizeMethod::DEFAULT);
855
856 if (all_tokens[i].empty()) {
857 Logger::warning("Batch tokenization resulted in empty ID list for prompt " + std::to_string(i));
858 all_tokens[i].push_back(tokenizer_->bos_token_id()); // Fallback to BOS token
859 }
860
861 prompt_lengths[i] = all_tokens[i].size();
862 max_prompt_length = std::max(max_prompt_length, prompt_lengths[i]);
863
864 Logger::info("[Batch Generate API] Prompt " + std::to_string(i) + ": " +
865 std::to_string(prompt_lengths[i]) + " tokens");
866 }
867
868 // Initialize batch mode
869 kv_cache_.initialize_batch(static_cast<int>(prompts.size()));
871
872 std::vector<std::string> results(prompts.size());
873
874 // Try parallel batch processing if enabled
 875 if (use_batch_generation_) {
 876 Logger::info("[Batch Generate API] Using parallel batch processing");
877
878 Logger::info("[DEBUG] Initializing KV cache for batch mode");
879 kv_cache_.initialize_batch(static_cast<int>(prompts.size()));
880 Logger::info("[DEBUG] KV cache batch initialization completed");
881
882 Logger::info("[DEBUG] Clearing KV cache data");
 883 kv_cache_.clear_data();
 884 Logger::info("[DEBUG] KV cache clear completed");
885
886 // Phase 1: Parallel Batch Prefill
887 Logger::info("[DEBUG] About to call batch_prefill_parallel");
888 Logger::info("[DEBUG] all_tokens.size()=" + std::to_string(all_tokens.size()) + ", prompt_lengths.size()=" + std::to_string(prompt_lengths.size()));
889
890 std::vector<std::vector<float>> batch_final_logits;
891
892 bool prefill_success = batch_prefill_parallel(all_tokens, prompt_lengths, batch_final_logits);
893
894 if (prefill_success && batch_final_logits.size() == prompts.size()) {
895 Logger::info("[Batch Generate API] Batch prefill successful, starting parallel generation");
896
897 // Add safety check for batch_final_logits
898 Logger::info("[DEBUG] Checking batch_final_logits integrity after prefill");
899 for (size_t i = 0; i < batch_final_logits.size(); ++i) {
900 if (batch_final_logits[i].empty()) {
901 Logger::error("[DEBUG] batch_final_logits[" + std::to_string(i) + "] is empty!");
902 goto fallback_sequential;
903 }
904 if (batch_final_logits[i].size() != static_cast<size_t>(config_.vocab_size)) {
905 Logger::error("[DEBUG] batch_final_logits[" + std::to_string(i) + "] has wrong size: " +
906 std::to_string(batch_final_logits[i].size()) + " vs expected " + std::to_string(config_.vocab_size));
907 goto fallback_sequential;
908 }
909 // Check for NaN/Inf values
910 for (size_t j = 0; j < std::min(static_cast<size_t>(10UL), batch_final_logits[i].size()); ++j) {
911 if (!std::isfinite(batch_final_logits[i][j])) {
912 Logger::error("[DEBUG] batch_final_logits[" + std::to_string(i) + "][" + std::to_string(j) + "] is not finite: " + std::to_string(batch_final_logits[i][j]));
913 goto fallback_sequential;
914 }
915 }
916 }
917 Logger::info("[DEBUG] batch_final_logits integrity check passed");
918
919 // Sample first tokens for all sequences
920 std::vector<int> current_tokens(prompts.size());
921 std::vector<std::vector<int>> all_generated_tokens(prompts.size());
922 std::vector<int> sequence_positions(prompts.size());
923 std::vector<bool> sequence_finished(prompts.size(), false);
924
925 Logger::info("[DEBUG] Starting token sampling for " + std::to_string(prompts.size()) + " sequences");
926
927 for (size_t i = 0; i < prompts.size(); ++i) {
928 Logger::info("[DEBUG] Sampling token for sequence " + std::to_string(i));
929
930 // Safety check before sampling
931 if (i >= batch_final_logits.size()) {
932 Logger::error("[DEBUG] Index " + std::to_string(i) + " out of bounds for batch_final_logits (size: " + std::to_string(batch_final_logits.size()) + ")");
933 goto fallback_sequential;
934 }
935
936 try {
937 current_tokens[i] = sample_top_k_top_p_temperature(batch_final_logits[i], temperature, top_k, top_p, rng_);
938 Logger::info("[DEBUG] Sampled token " + std::to_string(current_tokens[i]) + " for sequence " + std::to_string(i));
939 } catch (const std::exception& e) {
940 Logger::error("[DEBUG] Exception during sampling for sequence " + std::to_string(i) + ": " + std::string(e.what()));
941 goto fallback_sequential;
942 }
943
944 all_generated_tokens[i].push_back(current_tokens[i]);
945 sequence_positions[i] = prompt_lengths[i]; // Position for next token
946
947 // Check for EOS
948 if (current_tokens[i] == eos_token_id_) {
949 sequence_finished[i] = true;
950 Logger::info("[DEBUG] Sequence " + std::to_string(i) + " finished with EOS token");
951 }
952 }
953
954 Logger::info("[DEBUG] Token sampling completed, starting generation loop");
955
956 // Phase 2: Parallel Batch Generation
957 for (int step = 1; step < steps; ++step) {
958 Logger::info("[DEBUG] Starting generation step " + std::to_string(step));
959
960 // Check if all sequences are finished
961 bool all_finished = true;
962 for (bool finished : sequence_finished) {
963 if (!finished) {
964 all_finished = false;
965 break;
966 }
967 }
968 if (all_finished) {
969 Logger::info("[Batch Generate API] All sequences finished at step " + std::to_string(step));
970 break;
971 }
972
973 Logger::info("[DEBUG] Collecting active sequences for step " + std::to_string(step));
974
975 // Collect active sequences
976 std::vector<int> active_tokens;
977 std::vector<int> active_positions;
978 std::vector<int> active_sequence_indices;
979 std::vector<int> batch_to_original_seq_mapping; // Map batch index to original sequence index
980
981 for (size_t i = 0; i < prompts.size(); ++i) {
982 if (!sequence_finished[i]) {
983 active_tokens.push_back(current_tokens[i]);
984 active_positions.push_back(sequence_positions[i]);
985 active_sequence_indices.push_back(active_tokens.size() - 1); // Use contiguous 0-based index
986 batch_to_original_seq_mapping.push_back(i); // Remember original sequence index
987 Logger::info("[DEBUG] Active sequence " + std::to_string(i) + " mapped to batch index " + std::to_string(active_tokens.size() - 1) +
988 ": token=" + std::to_string(current_tokens[i]) + ", pos=" + std::to_string(sequence_positions[i]));
989 }
990 }
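// Example: with 3 sequences where sequence 1 already emitted EOS, the active
// batch holds the current tokens of sequences 0 and 2 at batch indices 0 and 1,
// and batch_to_original_seq_mapping == {0, 2} routes the results back.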
991
992 if (active_tokens.empty()) {
993 Logger::info("[DEBUG] No active tokens, breaking from generation loop");
994 break;
995 }
996
997 Logger::info("[DEBUG] About to call batch_generation_parallel with " + std::to_string(active_tokens.size()) + " active sequences");
998
999 // Process active sequences in parallel
1000 std::vector<std::vector<float>> step_logits;
1001 bool generation_success = batch_generation_parallel(active_tokens, active_positions, batch_to_original_seq_mapping, step_logits);
1002 Logger::info("[DEBUG] batch_generation_parallel returned: " + std::string(generation_success ? "success" : "failure"));
1003
1004 if (!generation_success || step_logits.size() != active_tokens.size()) {
1005 Logger::warning("[Batch Generate API] Parallel generation failed at step " + std::to_string(step) +
1006 ", falling back to sequential processing");
1007 goto fallback_sequential;
1008 }
1009
1010 Logger::info("[DEBUG] Starting token sampling for step " + std::to_string(step));
1011
1012 // Sample next tokens for active sequences
1013 for (size_t active_idx = 0; active_idx < active_tokens.size(); ++active_idx) {
1014 size_t original_seq_idx = batch_to_original_seq_mapping[active_idx]; // Use mapping to get original sequence index
1015 Logger::info("[DEBUG] Sampling for active_idx=" + std::to_string(active_idx) + ", original_seq_idx=" + std::to_string(original_seq_idx));
1016
1017 // Safety checks
1018 if (active_idx >= step_logits.size()) {
1019 Logger::error("[DEBUG] active_idx " + std::to_string(active_idx) + " out of bounds for step_logits (size: " + std::to_string(step_logits.size()) + ")");
1020 goto fallback_sequential;
1021 }
1022 if (original_seq_idx >= prompts.size()) {
1023 Logger::error("[DEBUG] original_seq_idx " + std::to_string(original_seq_idx) + " out of bounds for prompts (size: " + std::to_string(prompts.size()) + ")");
1024 goto fallback_sequential;
1025 }
1026
1027 try {
1028 int next_token = sample_top_k_top_p_temperature(step_logits[active_idx], temperature, top_k, top_p, rng_);
1029 Logger::info("[DEBUG] Sampled next token " + std::to_string(next_token) + " for original_seq_idx " + std::to_string(original_seq_idx));
1030
1031 current_tokens[original_seq_idx] = next_token;
1032 all_generated_tokens[original_seq_idx].push_back(next_token);
1033 sequence_positions[original_seq_idx]++;
1034
1035 // Check for EOS
1036 if (next_token == eos_token_id_) {
1037 sequence_finished[original_seq_idx] = true;
1038 Logger::info("[DEBUG] Sequence " + std::to_string(original_seq_idx) + " finished with EOS at step " + std::to_string(step));
1039 }
1040 } catch (const std::exception& e) {
1041 Logger::error("[DEBUG] Exception during sampling at step " + std::to_string(step) + " for original_seq_idx " + std::to_string(original_seq_idx) + ": " + std::string(e.what()));
1042 goto fallback_sequential;
1043 }
1044 }
1045
1046 Logger::info("[DEBUG] Completed generation step " + std::to_string(step));
1047 }
1048
1049 // Decode results for all sequences
1050 for (size_t i = 0; i < prompts.size(); ++i) {
1051 results[i] = tokenizer_->decode(all_generated_tokens[i], true);
1052 }
1053
1054 Logger::info("[Batch Generate API] Parallel batch processing completed successfully");
1055 } else {
1056 Logger::warning("[Batch Generate API] Batch prefill failed, falling back to sequential processing");
1057 goto fallback_sequential;
1058 }
1059 } else {
1060 fallback_sequential:
1061 Logger::info("[Batch Generate API] Using sequential processing");
1062
1063 for (size_t i = 0; i < prompts.size(); ++i) {
1064 Logger::info("[Batch Generate API] Processing prompt " + std::to_string(i + 1) +
1065 "/" + std::to_string(prompts.size()));
1066
1067 // Reset to single-sequence mode for this prompt
1068 kv_cache_.seq_len = 0;
1069
1070 // Use existing single-sequence generation logic
1071 std::string result = generate(prompts[i], steps, temperature, top_k, top_p,
1072 system_prompt_arg, apply_q_a_format_cli_hint);
1073 results[i] = result;
1074 }
1075 }
1076
1077 auto t_end = std::chrono::high_resolution_clock::now();
1078 double time_taken_ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
1079
1080 std::ostringstream time_ss;
1081 time_ss << std::fixed << std::setprecision(4) << time_taken_ms;
1082 Logger::info("[Batch Generate API] Total batch processing time: " + time_ss.str() + " ms for " +
1083 std::to_string(prompts.size()) + " prompts");
1084
1085 return results;
1086}
1087
1088bool TinyLlamaSession::batch_prefill_parallel(const std::vector<std::vector<int>>& all_tokens,
1089 const std::vector<int>& prompt_lengths,
1090 std::vector<std::vector<float>>& batch_final_logits) {
1091 Logger::info("[EMERGENCY_DEBUG] batch_prefill_parallel function entry - FIRST LINE");
1092 Logger::info("[DEBUG] Entering batch_prefill_parallel");
1093 // Calculate total tokens across all prompts for batch prefill
1094 int total_tokens_across_all_prompts = 0;
1095 for (int len : prompt_lengths) {
1096 total_tokens_across_all_prompts += len;
1097 }
1098
1099 if (total_tokens_across_all_prompts == 0) {
1100 Logger::error("[Batch Prefill] No tokens to process in batch prefill.");
1101 return false;
1102 }
1103
1104 Logger::info("[Batch Prefill] Processing " + std::to_string(all_tokens.size()) +
1105 " sequences with total " + std::to_string(total_tokens_across_all_prompts) + " tokens");
1106
1107 // Process all tokens for all sequences in batch
1108 Logger::info("[Batch Prefill] Preparing batch embeddings for " +
1109 std::to_string(total_tokens_across_all_prompts) + " tokens");
1110
1111 // Calculate required memory
1112 size_t required_memory_bytes = static_cast<size_t>(total_tokens_across_all_prompts) * config_.hidden_size * sizeof(float);
1113 Logger::info("[DEBUG] About to allocate " + std::to_string(required_memory_bytes) + " bytes (" +
1114 std::to_string(required_memory_bytes / (1024*1024)) + " MB) for batch embeddings");
1115
1116 std::vector<float> batch_embeddings(total_tokens_across_all_prompts * config_.hidden_size);
1117 Logger::info("[DEBUG] batch_embeddings allocation completed successfully");
1118
1119 int token_offset = 0;
1120
1121 // Add detailed logging for token processing
1122 Logger::info("[DEBUG] Starting token embedding processing for " + std::to_string(all_tokens.size()) + " sequences");
1123
1124 for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
1125 Logger::info("[DEBUG] Processing sequence " + std::to_string(seq_idx) + " with " + std::to_string(prompt_lengths[seq_idx]) + " tokens");
1126
1127 // Log first few tokens of this sequence
1128 std::string token_ids_str = "Token IDs: ";
1129 for (int i = 0; i < std::min(5, prompt_lengths[seq_idx]); ++i) {
1130 token_ids_str += std::to_string(all_tokens[seq_idx][i]) + " ";
1131 }
1132 if (prompt_lengths[seq_idx] > 5) token_ids_str += "...";
1133 Logger::info("[DEBUG] Sequence " + std::to_string(seq_idx) + " " + token_ids_str);
1134
1135 for (int token_idx = 0; token_idx < prompt_lengths[seq_idx]; ++token_idx) {
1136 int current_token_id = all_tokens[seq_idx][token_idx];
1137
1138 // Log token placement in batch
1139 if (seq_idx < 2 && token_idx < 3) { // Only log first few tokens of first two sequences
1140 Logger::info("[DEBUG] Placing token " + std::to_string(current_token_id) +
1141 " from seq " + std::to_string(seq_idx) + " pos " + std::to_string(token_idx) +
1142 " at batch offset " + std::to_string(token_offset));
1143 }
1144
1145 std::vector<float> token_embedding = model_->lookup_embedding(current_token_id);
1146 if (token_embedding.empty() || token_embedding.size() != static_cast<size_t>(config_.hidden_size)) {
1147 Logger::error("[Batch Prefill] Embedding lookup failed for token " +
1148 std::to_string(current_token_id) +
1149 " in sequence " + std::to_string(seq_idx));
1150 return false;
1151 }
1152
1153 // Ensure we don't write beyond bounds
1154 size_t target_offset = token_offset * config_.hidden_size;
1155 if (target_offset + config_.hidden_size > batch_embeddings.size()) {
1156 Logger::error("[Batch Prefill] Buffer overflow detected at token offset " + std::to_string(token_offset));
1157 return false;
1158 }
1159
1160 std::copy(token_embedding.begin(), token_embedding.end(),
1161 batch_embeddings.begin() + target_offset);
1162 token_offset++;
1163 }
1164
1165 Logger::info("[DEBUG] Sequence " + std::to_string(seq_idx) + " complete. Next token_offset: " + std::to_string(token_offset));
1166 }
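// Packing example: prompt_lengths = {3, 2} gives 5 token slots in total;
// sequence 0 fills batch offsets 0..2 and sequence 1 fills offsets 3..4, so
// batch_embeddings holds 5 * hidden_size floats laid out contiguously per token.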
1167
1168 // Process CPU layers if any
1169 std::vector<float> cpu_processed_embeddings;
1170 if (config_.num_cpu_offload_layers > 0) {
1171 Logger::info("[Batch Prefill] Processing " + std::to_string(config_.num_cpu_offload_layers) +
1172 " CPU layers for batch prefill");
1173 cpu_processed_embeddings = model_->forward_cpu_batch(batch_embeddings,
1174 total_tokens_across_all_prompts,
1175 config_.num_cpu_offload_layers,
1176 0, &kv_cache_,
1177 prompt_lengths);
1178
1179 if (cpu_processed_embeddings.empty()) {
1180 Logger::error("[Batch Prefill] CPU batch processing failed.");
1181 return false;
1182 }
1183 } else {
1184 cpu_processed_embeddings = batch_embeddings;
1185 }
1186
1187 // Process GPU layers if any
1188 std::vector<float> final_batch_logits;
1189
1190 if (config_.num_cpu_offload_layers >= config_.num_hidden_layers) {
1191 // All CPU - get logits from CPU
1192 Logger::info("[Batch Prefill] All layers on CPU, computing logits");
1193 final_batch_logits = model_->forward_cpu_logits_batch(cpu_processed_embeddings,
1194 total_tokens_across_all_prompts);
1195 } else {
1196 // GPU layers exist - transfer to GPU and process
1197#ifdef HAS_CUDA
1198 Logger::info("[Batch Prefill] Processing GPU layers for batch prefill");
1199
1200 Logger::info("[DEBUG] About to allocate GPU memory for batch prefill");
1201 float* d_batch_embeddings = nullptr;
1202 size_t batch_size_bytes = cpu_processed_embeddings.size() * sizeof(float);
1203 Logger::info("[DEBUG] GPU allocation size: " + std::to_string(batch_size_bytes) + " bytes (" +
1204 std::to_string(batch_size_bytes / (1024*1024)) + " MB)");
1205
1206 Logger::info("[DEBUG] Calling cudaMalloc...");
1207 gpuErrchk(cudaMalloc(&d_batch_embeddings, batch_size_bytes));
1208 Logger::info("[DEBUG] cudaMalloc completed successfully");
1209
1210 Logger::info("[DEBUG] Calling cudaMemcpy host to device...");
1211 gpuErrchk(cudaMemcpy(d_batch_embeddings, cpu_processed_embeddings.data(),
1212 batch_size_bytes, cudaMemcpyHostToDevice));
1213 Logger::info("[DEBUG] cudaMemcpy completed successfully");
1214
1215 // Call forward_device_batch_prefill ONCE with all the batch data
1216 Logger::info("[DEBUG] Calling forward_device_batch_prefill with " + std::to_string(total_tokens_across_all_prompts) + " total tokens");
1217 std::vector<float> all_batch_logits = model_->forward_device_batch_prefill(
1218 d_batch_embeddings, total_tokens_across_all_prompts, 0, &kv_cache_, 0);
1219
1220 Logger::info("[DEBUG] forward_device_batch_prefill completed, returned " + std::to_string(all_batch_logits.size()) + " total logits");
1221
1222 gpuErrchk(cudaFree(d_batch_embeddings));
1223
1224 final_batch_logits = all_batch_logits;
1225#else
1226 Logger::error("[Batch Prefill] GPU processing requested but CUDA not available.");
1227 return false;
1228#endif
1229 }
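  // final_batch_logits is a flat buffer whose layout depends on the path taken above: the CPU
  // path yields one logit row per token, while the GPU path may return either per-token or
  // last-token-only logits. Both layouts are detected and handled during extraction below.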
1230
1231 Logger::info("[Batch Prefill] Successfully processed batch prefill for " +
1232 std::to_string(all_tokens.size()) + " sequences");
1233
1234 Logger::info("[DEBUG] Forward passes complete; extracting per-sequence logits");
1239
1240 // Extract logits for the last token of each sequence
1241 batch_final_logits.clear();
1242 batch_final_logits.resize(all_tokens.size());
1243
1244 if (config_.num_cpu_offload_layers >= config_.num_hidden_layers) { // [assumed] CPU-only path
1245 // For CPU-only, extract last token logits from the flat array
1246 if (final_batch_logits.size() != static_cast<size_t>(total_tokens_across_all_prompts * config_.vocab_size)) {
1247 Logger::error("[Batch Prefill] CPU logits size mismatch. Expected: " +
1248 std::to_string(total_tokens_across_all_prompts * config_.vocab_size) +
1249 ", got: " + std::to_string(final_batch_logits.size()));
1250 return false;
1251 }
1252
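  // Logit rows are packed per token: row t occupies [t * vocab_size, (t + 1) * vocab_size).
  // The last token of sequence s sits at row token_offset_s + prompt_lengths[s] - 1, e.g. with
  // prompt lengths {3, 5} the rows of interest are 2 and 7.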
1253 int token_offset = 0;
1254 for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
1255 int last_token_pos = token_offset + prompt_lengths[seq_idx] - 1;
1256
1257 Logger::info("[DEBUG] Extracting logits for sequence " + std::to_string(seq_idx) +
1258 ": token_offset=" + std::to_string(token_offset) +
1259 ", prompt_length=" + std::to_string(prompt_lengths[seq_idx]) +
1260 ", last_token_pos=" + std::to_string(last_token_pos));
1261
1262 batch_final_logits[seq_idx].resize(config_.vocab_size);
1263
1264 // Bounds check before copying
1265 size_t src_start = last_token_pos * config_.vocab_size;
1266 size_t src_end = src_start + config_.vocab_size;
1267 if (src_end > final_batch_logits.size()) {
1268 Logger::error("[Batch Prefill] CPU logits bounds check failed for sequence " + std::to_string(seq_idx));
1269 return false;
1270 }
1271
1272 std::copy(final_batch_logits.begin() + src_start,
1273 final_batch_logits.begin() + src_end,
1274 batch_final_logits[seq_idx].begin());
1275
1276 // Log a few logit values for debugging
1277 if (seq_idx < 2) { // Only for first two sequences
1278 std::string logit_sample = "First 5 logits: ";
1279 for (int i = 0; i < 5 && i < config_.vocab_size; ++i) {
1280 logit_sample += std::to_string(batch_final_logits[seq_idx][i]) + " ";
1281 }
1282 Logger::info("[DEBUG] Sequence " + std::to_string(seq_idx) + " " + logit_sample);
1283 }
1284
1285 token_offset += prompt_lengths[seq_idx];
1286 }
1287 } else {
1288 // For GPU, check if logits are for all tokens or just last tokens
1289 Logger::info("[DEBUG] GPU batch logits size: " + std::to_string(final_batch_logits.size()) +
1290 ", expected for all tokens: " + std::to_string(total_tokens_across_all_prompts * config_.vocab_size) +
1291 ", expected for last tokens only: " + std::to_string(all_tokens.size() * config_.vocab_size));
1292
1293 if (final_batch_logits.size() == static_cast<size_t>(total_tokens_across_all_prompts * config_.vocab_size)) {
1294 // GPU returned logits for all tokens, extract last token for each sequence
1295 Logger::info("[DEBUG] GPU returned logits for all tokens, extracting last token logits");
1296 int token_offset = 0;
1297 for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
1298 int last_token_pos = token_offset + prompt_lengths[seq_idx] - 1;
1299
1300 Logger::info("[DEBUG] GPU: Extracting logits for sequence " + std::to_string(seq_idx) +
1301 ": token_offset=" + std::to_string(token_offset) +
1302 ", prompt_length=" + std::to_string(prompt_lengths[seq_idx]) +
1303 ", last_token_pos=" + std::to_string(last_token_pos));
1304
1305 batch_final_logits[seq_idx].resize(config_.vocab_size);
1306
1307 size_t src_start = last_token_pos * config_.vocab_size;
1308 size_t src_end = src_start + config_.vocab_size;
1309 if (src_end > final_batch_logits.size()) {
1310 Logger::error("[Batch Prefill] GPU logits bounds check failed for sequence " + std::to_string(seq_idx));
1311 return false;
1312 }
1313
1314 std::copy(final_batch_logits.begin() + src_start,
1315 final_batch_logits.begin() + src_end,
1316 batch_final_logits[seq_idx].begin());
1317
1318 // Log a few logit values for debugging
1319 if (seq_idx < 2) { // Only for first two sequences
1320 std::string logit_sample = "First 5 logits: ";
1321 for (int i = 0; i < 5 && i < config_.vocab_size; ++i) {
1322 logit_sample += std::to_string(batch_final_logits[seq_idx][i]) + " ";
1323 }
1324 Logger::info("[DEBUG] GPU Sequence " + std::to_string(seq_idx) + " " + logit_sample);
1325 }
1326
1327 token_offset += prompt_lengths[seq_idx];
1328 }
1329 } else if (final_batch_logits.size() == static_cast<size_t>(all_tokens.size() * config_.vocab_size)) {
1330 // GPU returned logits for last tokens only
1331 Logger::info("[DEBUG] GPU returned logits for last tokens only");
1332 for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
1333 Logger::info("[DEBUG] GPU Last-Token-Only: Processing sequence " + std::to_string(seq_idx) +
1334 " at logit offset " + std::to_string(seq_idx * config_.vocab_size));
1335
1336 batch_final_logits[seq_idx].resize(config_.vocab_size);
1337
1338 size_t src_start = seq_idx * config_.vocab_size;
1339 size_t src_end = src_start + config_.vocab_size;
1340 if (src_end > final_batch_logits.size()) {
1341 Logger::error("[Batch Prefill] GPU logits bounds check failed for sequence " + std::to_string(seq_idx));
1342 return false;
1343 }
1344
1345 std::copy(final_batch_logits.begin() + src_start,
1346 final_batch_logits.begin() + src_end,
1347 batch_final_logits[seq_idx].begin());
1348
1349 // Log a few logit values for debugging
1350 if (seq_idx < 2) { // Only for first two sequences
1351 std::string logit_sample = "First 5 logits: ";
1352 for (int i = 0; i < 5 && i < config_.vocab_size; ++i) {
1353 logit_sample += std::to_string(batch_final_logits[seq_idx][i]) + " ";
1354 }
1355 Logger::info("[DEBUG] GPU Last-Token Sequence " + std::to_string(seq_idx) + " " + logit_sample);
1356 }
1357 }
1358 } else {
1359 Logger::error("[Batch Prefill] GPU logits size " + std::to_string(final_batch_logits.size()) + " does not match either expected layout (all tokens or last tokens only)");
1360 return false;
1361 }
1362 }
1363
1364 return true;
1365}
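// On success, batch_final_logits contains one vocab_size-sized row per input sequence: the
// logits of each prompt's final token, from which the first generated token can be sampled.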
1366
1367bool TinyLlamaSession::batch_generation_parallel(const std::vector<int>& current_tokens,
1368 const std::vector<int>& token_positions,
1369 const std::vector<int>& sequence_indices,
1370 std::vector<std::vector<float>>& batch_logits) {
1371 Logger::info("[DEBUG] Entering batch_generation_parallel");
1372
1373 int num_sequences = static_cast<int>(current_tokens.size());
1374 
1375 if (num_sequences == 0 || token_positions.size() != current_tokens.size() || sequence_indices.size() != current_tokens.size()) {
1376 Logger::error("[Batch Generation] Invalid input sizes");
1377 return false;
1378 }
1379
1380 Logger::info("[Batch Generation] Processing " + std::to_string(num_sequences) +
1381 " sequences in parallel generation step");
1382
1383 // Create batch embeddings for all sequences
1384 std::vector<float> batch_embeddings;
1385 batch_embeddings.reserve(num_sequences * config_.hidden_size);
1386
1387 for (int i = 0; i < num_sequences; ++i) {
1388 std::vector<float> token_embedding = model_->lookup_embedding(current_tokens[i]);
1389 if (token_embedding.empty()) {
1390 Logger::error("[Batch Generation] Embedding lookup failed for token " + std::to_string(current_tokens[i]));
1391 return false;
1392 }
1393 batch_embeddings.insert(batch_embeddings.end(), token_embedding.begin(), token_embedding.end());
1394 }
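  // batch_embeddings is now a flat [num_sequences x hidden_size] buffer holding the embedding
  // of each sequence's current token, one row per sequence.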
1395
1396 // Process through CPU layers if any
1397 if (config_.num_cpu_offload_layers > 0) { // [assumed] branch when some layers are offloaded to CPU
1398 Logger::info("[Batch Generation] Processing " + std::to_string(config_.num_cpu_offload_layers) +
1399 " CPU layers for batch generation");
1400
1401 std::vector<std::vector<float>> cpu_batch_logits = model_->forward_cpu_batch_generation(
1402 batch_embeddings, token_positions, sequence_indices, num_sequences, &kv_cache_);
1403
1404 if (cpu_batch_logits.size() != static_cast<size_t>(num_sequences)) {
1405 Logger::error("[Batch Generation] CPU batch generation returned wrong number of results");
1406 return false;
1407 }
1408
1409 // If all layers are on CPU, we're done
1410 if (config_.num_cpu_offload_layers >= config_.num_hidden_layers) { // [assumed] all layers run on CPU
1411 batch_logits = cpu_batch_logits;
1412 Logger::info("[Batch Generation] All CPU layers processed, returning logits");
1413 return true;
1414 }
1415
1416 // Convert CPU results back to batch embeddings for GPU processing
1417 batch_embeddings.clear();
1418 batch_embeddings.resize(num_sequences * config_.hidden_size);
1419 // Note: This would need the CPU layer output activations, not logits
1420 // For now, fall back to sequential processing if mixed CPU/GPU
1421 Logger::warning("[Batch Generation] Mixed CPU/GPU not yet implemented for batch generation");
1422 return false;
1423 }
1424
1425 // GPU-only processing
1426 if (config_.num_cpu_offload_layers == 0) { // [assumed] GPU-only path
1427#ifdef HAS_CUDA
1428 Logger::info("[Batch Generation] Processing GPU layers for batch generation");
1429
1430 float* d_batch_embeddings = nullptr;
1431 size_t batch_size_bytes = batch_embeddings.size() * sizeof(float);
1432
1433 gpuErrchk(cudaMalloc(&d_batch_embeddings, batch_size_bytes));
1434 gpuErrchk(cudaMemcpy(d_batch_embeddings, batch_embeddings.data(),
1435 batch_size_bytes, cudaMemcpyHostToDevice));
1436
1437 std::vector<std::vector<float>> gpu_batch_logits = model_->forward_device_batch_generation(
1438 d_batch_embeddings, token_positions, sequence_indices, num_sequences, &kv_cache_, 0);
1439
1440 gpuErrchk(cudaFree(d_batch_embeddings));
1441
1442 if (gpu_batch_logits.size() != static_cast<size_t>(num_sequences)) {
1443 Logger::error("[Batch Generation] GPU batch generation returned wrong number of results");
1444 return false;
1445 }
1446
1447 batch_logits = gpu_batch_logits;
1448 Logger::info("[Batch Generation] GPU batch generation completed successfully");
1449 return true;
1450#else
1451 Logger::error("[Batch Generation] GPU processing requested but CUDA not available.");
1452 return false;
1453#endif
1454 }
1455
1456 Logger::error("[Batch Generation] No valid processing path found");
1457 return false;
1458}
1459
1460} // namespace tinyllama
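
The batched driver that chains these two helpers (generate_batch, defined at api.cpp:780) is not reproduced on this page. The following sketch illustrates how batch_prefill_parallel and batch_generation_parallel might be combined, assuming it runs as a member of TinyLlamaSession so the private helpers are reachable; run_batch_greedy is a hypothetical name, greedy argmax stands in for the top-k/top-p sampler, and EOS handling is omitted.

// Hypothetical illustration only -- not the actual generate_batch implementation.
std::vector<std::vector<int>> TinyLlamaSession::run_batch_greedy(
    std::vector<std::vector<int>> tokens, int max_new_tokens) {
  std::vector<int> prompt_lengths;
  for (const auto& t : tokens) prompt_lengths.push_back(static_cast<int>(t.size()));

  // Prefill all prompts in one pass; last_logits gets one vocab-sized row per sequence.
  std::vector<std::vector<float>> last_logits;
  if (!batch_prefill_parallel(tokens, prompt_lengths, last_logits)) return tokens;

  const size_t n = tokens.size();
  std::vector<int> current_tokens(n), positions(n), seq_indices(n);
  for (size_t s = 0; s < n; ++s) {
    current_tokens[s] = argmax(last_logits[s]);   // first generated token (greedy)
    positions[s] = prompt_lengths[s];             // next KV-cache position for this sequence
    seq_indices[s] = static_cast<int>(s);
    tokens[s].push_back(current_tokens[s]);
  }

  // One parallel decode step per new token across all sequences.
  for (int step = 1; step < max_new_tokens; ++step) {
    std::vector<std::vector<float>> step_logits;
    if (!batch_generation_parallel(current_tokens, positions, seq_indices, step_logits)) break;
    for (size_t s = 0; s < n; ++s) {
      current_tokens[s] = argmax(step_logits[s]);
      tokens[s].push_back(current_tokens[s]);
      ++positions[s];
    }
  }
  return tokens;
}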
static void debug(const std::string &message)
Definition logger.cpp:131
static void warning(const std::string &message)
Definition logger.cpp:139
static void info(const std::string &message)
Definition logger.cpp:135
static void error(const std::string &message)
Definition logger.cpp:143
Main class for loading tensors from SafeTensors format files (single or sharded)
static bool load_model_config_from_json(const std::string &model_path_or_dir, ModelConfig &config_to_populate)
Loads model configuration from a JSON file corresponding to a .safetensors model path.
ModelConfig config_
Definition api.h:125
bool batch_generation_parallel(const std::vector< int > &current_tokens, const std::vector< int > &token_positions, const std::vector< int > &original_sequence_indices, std::vector< std::vector< float > > &batch_logits)
Definition api.cpp:1367
bool batch_prefill_parallel(const std::vector< std::vector< int > > &all_tokens, const std::vector< int > &prompt_lengths, std::vector< std::vector< float > > &batch_final_logits)
Definition api.cpp:1088
std::mt19937 rng_
Definition api.h:128
std::stringstream generated_stream_
Definition api.h:131
std::unique_ptr< TinyLlamaModel > model_
Definition api.h:123
TinyLlamaSession(const std::string &model_path, const std::string &tokenizer_path, int threads=1, int num_gpu_layers_from_cli=0, bool cli_use_mmap=true, bool use_kv_quant=false, bool use_batch_generation=false, int max_batch_size=1)
Loads the model, config, and tokenizer from the specified directory or GGUF file.
Definition api.cpp:175
std::unique_ptr< Tokenizer > tokenizer_
Definition api.h:124
std::string generate(const std::string &prompt, int steps=128, float temperature=0.1f, int top_k=40, float top_p=0.9f, const std::string &system_prompt="", bool apply_q_a_format=false)
Generates text based on a given prompt.
Definition api.cpp:433
std::vector< std::string > generate_batch(const std::vector< std::string > &prompts, int steps=128, float temperature=0.1f, int top_k=40, float top_p=0.9f, const std::string &system_prompt="", bool apply_q_a_format=false)
Generates text for multiple prompts in a single batch (parallel processing).
Definition api.cpp:780
std::string generated_text_for_api_return_
Definition api.h:132
~TinyLlamaSession()
Destructor to ensure proper cleanup (e.g., KVCache CUDA memory).
Definition api.cpp:429
Parser for GGUF (GPT-Generated Unified Format) files.
Logging utilities for the TinyLlama implementation.
static std::string read_file_api(const std::string &path)
Definition api.cpp:65
static int sample_top_k_top_p_temperature(const std::vector< float > &logits, float temperature, int top_k, float top_p, std::mt19937 &rng)
Definition api.cpp:82
static int argmax(const std::vector< float > &v)
Definition api.cpp:73
static void log_vector_summary_detailed(const std::string &name, const std::vector< float > &v, int current_pos, int current_layer, int N)
Definition api.cpp:32
SafeTensors format loader for efficient tensor loading, supporting single and sharded models.
Complete representation of a GGUF file's contents.
void initialize_batch(int batch_size)
Initialize batch mode with specified number of sequences.
Definition model.h:201
void initialize(const ModelConfig &config, int total_num_model_layers, int num_gpu_layers_to_allocate, int max_seq_len_arg, int num_kv_heads, int head_dim, int max_batch_size_arg=1)
Initializes the KV cache with given dimensions.
Definition kv_cache.cpp:10
void clear_data()
Definition model.h:180
int seq_len
Definition model.h:155
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
int hidden_size
Definition model.h:81
int vocab_size
Definition model.h:86
int num_attention_heads
Definition model.h:83
bool use_mmap_for_gguf
Definition model.h:102
int eos_token_id
Definition model.h:93
int num_cpu_offload_layers
Definition model.h:104
bool is_gguf_file_loaded
Definition model.h:101
bool use_kvcache_quantization
Definition model.h:103
int num_hidden_layers
Definition model.h:85
int num_key_value_heads
Definition model.h:84
TokenizerFamily tokenizer_family
Definition model.h:117
int max_position_embeddings
Definition model.h:87
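
For reference, a minimal caller of the public API defined in this file, using the TinyLlamaSession constructor, generate, and generate_batch signatures listed above. The model and tokenizer paths are placeholders, and the example assumes the class is declared in api.h inside the tinyllama namespace, as the definitions on this page indicate.

#include "api.h"

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Placeholder paths -- point these at a real GGUF model and its tokenizer.
  tinyllama::TinyLlamaSession session("models/tinyllama.gguf", "models/tokenizer.model",
                                      /*threads=*/4, /*num_gpu_layers_from_cli=*/0);

  // Single-prompt generation with explicit sampling parameters.
  std::string reply = session.generate("What is the capital of France?", /*steps=*/64,
                                       /*temperature=*/0.1f, /*top_k=*/40, /*top_p=*/0.9f);
  std::cout << reply << std::endl;

  // Batched generation: one output string per prompt.
  std::vector<std::string> prompts = {"Prompt A", "Prompt B"};
  std::vector<std::string> replies = session.generate_batch(prompts, /*steps=*/64);
  for (const std::string& r : replies) std::cout << r << std::endl;
  return 0;
}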