                                 float temperature, int top_k,
                                 float top_p, std::mt19937& rng) {
  if (logits.empty()) {
    throw std::runtime_error("Cannot sample from empty logits.");
  }

  // Near-zero temperature: fall back to greedy argmax.
  if (temperature < 0.05f) {
    return std::distance(logits.begin(),
                         std::max_element(logits.begin(), logits.end()));
  }

  int vocab_size = logits.size();
  top_k = (std::min)(top_k, vocab_size);
  if (top_k <= 0) top_k = vocab_size;

  // Temperature-scale the logits, subtracting the max for numerical stability.
  std::vector<float> scaled_logits(vocab_size);
  float max_logit = -std::numeric_limits<float>::infinity();
  for (float logit : logits) max_logit = (std::max)(max_logit, logit);

  const float scale = 1.0f / temperature;
  for (int i = 0; i < vocab_size; ++i) {
    scaled_logits[i] = (logits[i] - max_logit) * scale;
  }

  // Softmax in double precision to reduce accumulation error.
  std::vector<double> probs_double(vocab_size);
  double sum_exp = 0.0;
  for (int i = 0; i < vocab_size; ++i) {
    probs_double[i] = std::exp(static_cast<double>(scaled_logits[i]));
    sum_exp += probs_double[i];
  }
  if (sum_exp > 0.0) {
    for (int i = 0; i < vocab_size; ++i) {
      probs_double[i] /= sum_exp;
    }
  } else {
    // Degenerate distribution: fall back to uniform probabilities.
    for (int i = 0; i < vocab_size; ++i) {
      probs_double[i] = 1.0 / vocab_size;
    }
  }

  // Pair each probability with its token index and sort descending.
  std::vector<std::pair<float, int>> prob_idx(vocab_size);
  for (int i = 0; i < vocab_size; ++i) {
    prob_idx[i] = {static_cast<float>(probs_double[i]), i};
  }
  std::sort(prob_idx.begin(), prob_idx.end(),
            std::greater<std::pair<float, int>>());

  // Top-k filtering.
  if (top_k < vocab_size) {
    prob_idx.resize(top_k);
  }

  // Top-p (nucleus) filtering: keep the smallest prefix whose mass reaches top_p.
  float cumulative_prob = 0.0f;
  size_t last_idx = prob_idx.size() - 1;
  for (size_t i = 0; i < prob_idx.size(); ++i) {
    cumulative_prob += prob_idx[i].first;
    if (cumulative_prob >= top_p) {
      last_idx = i;
      break;
    }
  }
  prob_idx.resize(last_idx + 1);

  // Renormalize the surviving probabilities.
  float final_sum = 0.0f;
  for (const auto& pi : prob_idx) {
    final_sum += pi.first;
  }

  std::vector<float> final_probs(prob_idx.size());
  if (final_sum > 0.0f) {
    for (size_t i = 0; i < prob_idx.size(); ++i) {
      final_probs[i] = prob_idx[i].first / final_sum;
    }
  } else {
    float uniform_prob = 1.0f / prob_idx.size();
    std::fill(final_probs.begin(), final_probs.end(), uniform_prob);
  }

  // Draw one index from the filtered distribution and map it back to a token id.
  std::discrete_distribution<int> dist(final_probs.begin(), final_probs.end());
  int sampled_idx_in_filtered = dist(rng);
  return prob_idx[sampled_idx_in_filtered].second;
}
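// Usage sketch (illustrative only): the sampler above is a helper whose exact
// name is not visible in this excerpt; "sample_token" below is a placeholder
// for it, and `logits` is assumed to hold one vocab-sized row of model output.
//
//   std::mt19937 rng{std::random_device{}()};
//   std::vector<float> logits = /* model output for the current position */;
//   int next_id = sample_token(logits, /*temperature=*/0.7f, /*top_k=*/40,
//                              /*top_p=*/0.95f, rng);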
// NOTE: the full parameter list is not visible in this excerpt; it is
// reconstructed from the member-initializer list and the body below, so the
// exact order/types in the original header may differ.
TinyLlamaSession::TinyLlamaSession(const std::string& model_path_arg,
                                   const std::string& tokenizer_path_arg,
                                   int threads, int num_gpu_layers_from_cli,
                                   bool cli_use_mmap, bool use_kv_quant,
                                   bool use_batch_generation, int max_batch_size)
    : threads_(threads), use_batch_generation_(use_batch_generation),
      max_batch_size_(max_batch_size), rng_(std::random_device{}()) {
  Logger::info(
      "TinyLlamaSession constructor entered. Model path: " + model_path_arg +
      ", Tokenizer path: " + tokenizer_path_arg +
      ", Threads: " + std::to_string(threads) +
      ", Num GPU Layers (CLI): " + std::to_string(num_gpu_layers_from_cli) +
      ", Use MMAP (CLI): " + (cli_use_mmap ? "true" : "false") +
      ", Use KV Quant (CLI): " + (use_kv_quant ? "true" : "false"));

  std::string effective_model_file_path = model_path_arg;
  std::string path_for_config_json = model_path_arg;
  if (num_gpu_layers_from_cli < 0) {
    // ...
  }

  std::filesystem::path fs_model_path(model_path_arg);
  bool is_dir = std::filesystem::is_directory(fs_model_path);

  if (is_dir) {
    Logger::info("Model path is a directory. Assuming SafeTensors model directory: " +
                 model_path_arg);
    effective_model_file_path = (fs_model_path / "model.safetensors").string();
    std::string config_json_path_in_dir = (fs_model_path / "config.json").string();

    Logger::info("Derived SafeTensors model file path: " + effective_model_file_path);
    Logger::info("Path for loading config.json: " + config_json_path_in_dir);

    // ... (config.json is parsed here into initial_model_config_for_model_ctor,
    //      setting st_config_loaded; elided in this excerpt)

    if (st_config_loaded) {
      Logger::info("Successfully loaded config.json directly into initial_model_config_for_model_ctor.");
      std::string family_after_json_load = "UNKNOWN_POST_JSON_LOAD_DIR_CASE";
      // ...
      Logger::info("[API_CPP_POST_JSON_LOAD_DIR_CASE] Tokenizer family in initial_model_config_for_model_ctor: " +
                   family_after_json_load);
    } else {
      Logger::warning("Failed to load config.json for SafeTensors. initial_model_config_for_model_ctor will have defaults/CLI overrides for some fields, tokenizer_family likely UNKNOWN.");
    }
    int total_layers_from_config = initial_model_config_for_model_ctor.num_hidden_layers;
    if (total_layers_from_config <= 0 && st_config_loaded) {
      Logger::warning("config.json loaded but num_hidden_layers is <= 0. GPU offload logic might be incorrect.");
    } else if (total_layers_from_config <= 0 && !st_config_loaded) {
      Logger::warning("config.json NOT loaded and num_hidden_layers is <= 0 (default). GPU offload may not work as expected. Model load will likely fail.");
    }

    if (num_gpu_layers_from_cli < 0) {
      // ...
    } else if (num_gpu_layers_from_cli == 0) {
      // ...
    } else {
      if (total_layers_from_config > 0) {
        initial_model_config_for_model_ctor.num_cpu_offload_layers =
            total_layers_from_config - num_gpu_layers_from_cli;
      } else {
        Logger::warning("Total hidden layers unknown from config.json before model load; passing num_gpu_layers_from_cli as num_cpu_offload_layers hint.");
      }
    }

    if (total_layers_from_config > 0) {
      // ...
    }

    // ... (st_loader construction from effective_model_file_path elided)
    model_ = std::make_unique<TinyLlamaModel>(initial_model_config_for_model_ctor, st_loader);

    // ...
    Logger::info(
        "TinyLlamaSession: Finalizing ModelConfig for KVCache initialization. "
        "use_kvcache_quantization set to: " /* + ... (value elided) */);
  } else {
    std::string extension = fs_model_path.extension().string();
    std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower);

    if (extension == ".gguf") {
      Logger::info("GGUF model type detected by extension for Session constructor: " + model_path_arg);
      model_ = std::make_unique<TinyLlamaModel>(initial_model_config_for_model_ctor, model_path_arg);
    } else if (extension == ".safetensors") {
      Logger::info("SafeTensors model type detected by extension for Session constructor (file case): " + model_path_arg);
      effective_model_file_path = model_path_arg;

      // ... (config.json lookup next to the .safetensors file; sets st_config_loaded)
      if (st_config_loaded) {
        Logger::info("Successfully loaded config.json for SafeTensors in Session ctor (file case).");
        std::string family_after_json_load_file_case = "UNKNOWN_POST_JSON_LOAD_FILE_CASE";
        // ...
        Logger::info("[API_CPP_POST_JSON_LOAD_FILE_CASE] Tokenizer family in initial_model_config_for_model_ctor: " +
                     family_after_json_load_file_case);
      } else {
        Logger::warning("Failed to load config.json for SafeTensors in Session ctor (file case). Model will use defaults or CLI overrides.");
      }

      int total_layers_from_config_file_case = initial_model_config_for_model_ctor.num_hidden_layers;
      if (num_gpu_layers_from_cli < 0) {
        // ...
      } else if (num_gpu_layers_from_cli == 0) {
        // ...
      } else {
        if (total_layers_from_config_file_case > 0) {
          initial_model_config_for_model_ctor.num_cpu_offload_layers =
              total_layers_from_config_file_case - num_gpu_layers_from_cli;
        } else {
          Logger::warning("Total hidden layers unknown from config.json (file case); passing num_gpu_layers_from_cli as num_cpu_offload_layers hint.");
        }
      }

      if (total_layers_from_config_file_case > 0) {
        // ...
      }

      // ... (st_loader construction for the single .safetensors file elided)
      model_ = std::make_unique<TinyLlamaModel>(initial_model_config_for_model_ctor, st_loader);

      // ...
      Logger::info(
          "TinyLlamaSession: Finalizing ModelConfig for KVCache initialization. "
          "use_kvcache_quantization set to: " /* + ... (value elided) */);
    } else {
      throw std::runtime_error(
          "Unsupported model file type or extension in Session constructor: " + model_path_arg +
          ". Please provide a directory for SafeTensors, a .gguf file, or a .safetensors file.");
    }
  }

  if (!model_) {
    throw std::runtime_error("Model pointer is null after instantiation attempt in Session constructor.");
  }
  // Tokenizer initialization. For GGUF models the tokenizer comes from GGUF
  // metadata; for SafeTensors models it is loaded from external files.
  // NOTE: "model_is_gguf" and "gguf_data_is_null" are placeholders for the
  // original checks, which are not visible in this excerpt.
  try {
    if (model_is_gguf) {
      if (gguf_data_is_null) {
        throw std::runtime_error("GGUF model loaded but GGUFData is null in Session constructor.");
      }
      // ... (tokenizer_ constructed from GGUF metadata; elided)
      Logger::info("Tokenizer initialized from GGUF metadata.");
    } else {
      std::filesystem::path p_tokenizer_arg(tokenizer_path_arg);
      std::string tokenizer_dir = p_tokenizer_arg.parent_path().string();
      if (tokenizer_dir.empty()) {
        // ...
      }
      std::string vocab_json_path = (std::filesystem::path(tokenizer_dir) / "tokenizer.json").string();
      std::string sp_model_path = tokenizer_path_arg;

      Logger::info("Initializing Tokenizer for SafeTensors. Vocab JSON path: " + vocab_json_path +
                   ", SP Model path: " + sp_model_path);

      std::string family_to_log = "UNKNOWN_IN_API_CPP";
      // ...
      Logger::info("[API_CPP_TOKENIZER_INIT] Tokenizer family from session config for SafeTensors: " + family_to_log);

      tokenizer_ = std::make_unique<Tokenizer>(vocab_json_path, sp_model_path, config_);
      Logger::info("Tokenizer initialized from external files for SafeTensors model.");
    }
  } catch (const std::exception& e) {
    throw std::runtime_error(std::string("Failed to initialize Tokenizer: ") + e.what());
  }

  if (!tokenizer_) {
    throw std::runtime_error("Tokenizer pointer is null after instantiation attempt.");
  }
  // ... (total_model_layers and effective_cpu_offload_layers are read back from
  //      the model's finalized config; elided)
  int gpu_layers_for_kvcache = total_model_layers - effective_cpu_offload_layers;
  if (gpu_layers_for_kvcache < 0) gpu_layers_for_kvcache = 0;
  if (gpu_layers_for_kvcache > total_model_layers) gpu_layers_for_kvcache = total_model_layers;

  Logger::info("[Session KVCache Init] Total Layers: " + std::to_string(total_model_layers) +
               ", Effective CPU Offload by Model: " + std::to_string(effective_cpu_offload_layers) +
               ", GPU Layers for KVCache: " + std::to_string(gpu_layers_for_kvcache));

  if (total_model_layers <= 0) {
    throw std::runtime_error("Model config has zero or negative num_hidden_layers before KVCache init.");
  }
  if (config_.num_attention_heads <= 0) {   // guard reconstructed from the error message
    throw std::runtime_error("Model config has zero or negative num_attention_heads before KVCache init.");
  }

  // ... (kv_cache_ is initialized here; gpu_layers_for_kvcache is among its
  //      arguments, the rest of the call is elided in this excerpt)

  Logger::info("TinyLlamaSession initialization complete (after KVCache init).");
}
std::string TinyLlamaSession::generate(const std::string& user_prompt, int steps,
                                       float temperature, int top_k, float top_p,
                                       const std::string& system_prompt_arg,
                                       bool apply_q_a_format_cli_hint) {
  auto t_start = std::chrono::high_resolution_clock::now();

  Logger::info("[Generate API] User prompt: \"" + user_prompt +
               "\", System prompt: \"" + system_prompt_arg +
               "\", Steps: " + std::to_string(steps));

  if (!model_ || !tokenizer_) {
    throw std::runtime_error("Model or tokenizer not loaded.");
  }

  std::string final_prompt_for_encoding;
  bool used_chat_template = false;

  if (tokenizer_) {
    bool gguf_template_empty = tokenizer_->get_gguf_chat_template().empty();
    Logger::info("[Generate API] GGUF chat template from tokenizer is empty: " +
                 std::string(gguf_template_empty ? "true" : "false"));
    if (!gguf_template_empty) {
      Logger::info("[Generate API] GGUF Template Content (first 100 chars): " +
                   tokenizer_->get_gguf_chat_template().substr(0, 100));
    }
  } else {
    Logger::warning("[Generate API] Tokenizer is null before checking chat template!");
  }

  std::string family_log_str = "UNKNOWN";
  // ... (family_log_str is filled in from config_.tokenizer_family; elided)
  Logger::info("[Generate API] Configured tokenizer_family: " + family_log_str);
  if (apply_q_a_format_cli_hint) {
    Logger::info("[Generate API] Using legacy Q/A formatting (CLI Hint is true - Priority 1).");
    std::string temp_prompt = user_prompt;
    if (!system_prompt_arg.empty()) {
      temp_prompt = system_prompt_arg + "\n\nQ: " + user_prompt + "\nA:";
    } else {
      temp_prompt = "Q: " + user_prompt + "\nA:";
    }
    final_prompt_for_encoding = temp_prompt;
    used_chat_template = false;
  } else if (tokenizer_ && !tokenizer_->get_gguf_chat_template().empty()) {
    // (condition reconstructed: Priority 2 applies when a GGUF chat template exists)
    std::string gguf_template_content = tokenizer_->get_gguf_chat_template();
    // ... (is_llama_sentencepiece_family is derived from config_.tokenizer_family; elided)
    bool looks_like_jinja = (gguf_template_content.find("{%") != std::string::npos);

    if (is_llama_sentencepiece_family && looks_like_jinja) {
      Logger::info("[Generate API] Detected LLAMA_SENTENCEPIECE model with a Jinja-like GGUF template. Forcing Q/A format to avoid C++ Jinja processing issues (Priority 2 Override).");
      std::string temp_prompt = user_prompt;
      if (!system_prompt_arg.empty()) {
        temp_prompt = system_prompt_arg + "\n\nQ: " + user_prompt + "\nA:";
      } else {
        temp_prompt = "Q: " + user_prompt + "\nA:";
      }
      final_prompt_for_encoding = temp_prompt;
      used_chat_template = false;
    } else {
      Logger::info("[Generate API] Using GGUF chat template from tokenizer (Q/A Hint false - Priority 2).");
      final_prompt_for_encoding = tokenizer_->apply_chat_template(user_prompt, system_prompt_arg, config_);
      used_chat_template = true;
    }
  } else if (is_llama3_family) {   // placeholder for the original Llama-3 family check (elided)
    Logger::info("[Generate API] Llama 3 tokenizer family detected, using apply_chat_template (Q/A Hint false, No GGUF template - Priority 3).");
    final_prompt_for_encoding = tokenizer_->apply_chat_template(user_prompt, system_prompt_arg, config_);
    used_chat_template = true;
  } else {
    Logger::info("[Generate API] No applicable template/hint. Using user prompt as is (prepending system prompt if available - Priority 4).");
    if (!system_prompt_arg.empty()) {
      final_prompt_for_encoding = system_prompt_arg + "\n\n" + user_prompt;
    } else {
      final_prompt_for_encoding = user_prompt;
    }
    used_chat_template = false;
  }

  Logger::debug("[Generate API] Final prompt for encoding (first 100 chars): \"" +
                final_prompt_for_encoding.substr(0, 100) + "\"");
  // ... (the prompt is encoded into `tokens` via the tokenizer; elided)
  if (tokens.empty()) {
    Logger::warning("Tokenization resulted in empty ID list for prompt: " +
                    final_prompt_for_encoding);
    // ...
  }

  int num_prompt_tokens = tokens.size();
  Logger::info("[Generate API] Number of prompt tokens: " + std::to_string(num_prompt_tokens));

  int total_steps = num_prompt_tokens + steps - 1;
  int generated_count = 0;
  int next_token_id = -1;

  std::vector<float> logits;
  std::vector<int> generated_token_ids;

  std::vector<float> current_data_host;
  int start_pos_for_loop = 0;

  // Prompts of 32 tokens or more go through the batched prefill path below.
  bool prefill_enabled = num_prompt_tokens >= 32;
  if (prefill_enabled) {
    Logger::info("[Generate API] Prefill enabled. num_prompt_tokens: " +
                 std::to_string(num_prompt_tokens) /* + ... (rest of message elided) */);

    // Gather all prompt-token embeddings into one contiguous host buffer.
    // (Allocation statement reconstructed; size inferred from the copy below.)
    std::vector<float> batch_initial_embeddings(
        static_cast<size_t>(num_prompt_tokens) * config_.hidden_size);
    for (int i = 0; i < num_prompt_tokens; ++i) {
      std::vector<float> token_embedding = model_->lookup_embedding(tokens[i]);
      if (token_embedding.empty()) {
        Logger::error("Prefill: Embedding lookup returned empty vector for token ID: " +
                      std::to_string(tokens[i]) + " at prompt pos " + std::to_string(i));
        // ...
      }
      std::copy(token_embedding.begin(), token_embedding.end(),
                batch_initial_embeddings.begin() + i * config_.hidden_size);
    }

    std::vector<float> cpu_processed_embeddings;
    if (has_cpu_layers) {   // placeholder for the original CPU-offload check
      // ... (CPU layers run over the whole prompt batch via forward_cpu_batch; elided)
      if (cpu_processed_embeddings.empty()) {
        Logger::error("Prefill: forward_cpu_batch returned empty or failed.");
        // ...
      }
    } else {
      cpu_processed_embeddings = batch_initial_embeddings;
    }

    if (all_layers_on_cpu) {   // placeholder: true when no layers are GPU-resident
      Logger::info("[Generate API] Prefill: All layers are on CPU. Getting logits from final CPU layer output.");
      std::vector<float> batch_logits =
          model_->forward_cpu_logits_batch(cpu_processed_embeddings, num_prompt_tokens);
      if (batch_logits.empty()) {   // guard reconstructed from the error below
        Logger::error("Prefill: forward_cpu_logits_batch returned invalid logits.");
        // ...
      }
      // Keep only the logits of the last prompt token.
      logits.assign(batch_logits.begin() + (num_prompt_tokens - 1) * config_.vocab_size,
                    batch_logits.begin() + num_prompt_tokens * config_.vocab_size);
    } else {
      Logger::info("[Generate API] Prefill: Processing GPU layers for the batch.");
      if (cuda_available) {   // placeholder for the original CUDA-availability guard
        float* d_temp_batch_embeddings = nullptr;
        size_t batch_embeddings_size_bytes = cpu_processed_embeddings.size() * sizeof(float);
        if (batch_embeddings_size_bytes == 0) {
          Logger::error("Prefill: cpu_processed_embeddings is empty, cannot proceed with GPU batch prefill.");
          // ...
        }
        gpuErrchk(cudaMalloc(&d_temp_batch_embeddings, batch_embeddings_size_bytes));
        if (!d_temp_batch_embeddings) {
          Logger::error("Prefill: cudaMalloc failed for d_temp_batch_embeddings.");
          // ...
        }
        gpuErrchk(cudaMemcpy(d_temp_batch_embeddings, cpu_processed_embeddings.data(),
                             batch_embeddings_size_bytes, cudaMemcpyHostToDevice));
        logits = model_->forward_device_batch_prefill(d_temp_batch_embeddings, num_prompt_tokens,
                                                      start_pos_for_loop, &kv_cache_, 0);
        if (d_temp_batch_embeddings) {
          gpuErrchk(cudaFree(d_temp_batch_embeddings));
        }
      } else {
        Logger::error("[Generate API] GPU layers requested but CUDA not available. Cannot proceed.");
        // ...
      }
    }

    if (logits.empty()) {
      Logger::error("Prefill: Logits are empty after prefill processing.");
      // ...
    }

    // ... (the first post-prompt token is sampled from `logits` into next_token_id; elided)
    generated_token_ids.push_back(next_token_id);
    // ...
    start_pos_for_loop = num_prompt_tokens;

    Logger::info("[Generate API] Prefill completed. next_token_id: " + std::to_string(next_token_id) +
                 ", Decoded: \"" + tokenizer_->decode({next_token_id}, false) + "\"" +
                 ", start_pos_for_loop set to: " + std::to_string(start_pos_for_loop));
  }
  for (int pos = start_pos_for_loop; pos < total_steps; ++pos) {
    // ...
    int input_token_id = -1;
    if (pos == num_prompt_tokens && start_pos_for_loop == num_prompt_tokens) {
      // First iteration after a prefill: consume the token sampled during prefill.
      input_token_id = next_token_id;
      Logger::debug("[Generate Loop] First token post-prefill. Using prefill's next_token_id: " +
                    std::to_string(input_token_id) + " for pos " + std::to_string(pos));
    } else {
      input_token_id = (pos < num_prompt_tokens && start_pos_for_loop == 0) ? tokens[pos] : next_token_id;
      if (start_pos_for_loop == 0 && pos < num_prompt_tokens) {
        Logger::debug("[Generate Loop] No prefill, prompt token. Using tokens[" + std::to_string(pos) +
                      "]: " + std::to_string(input_token_id) + " for pos " + std::to_string(pos));
      } else {
        Logger::debug("[Generate Loop] Standard generation. Using previously sampled next_token_id: " +
                      std::to_string(input_token_id) + " for pos " + std::to_string(pos));
      }
    }

    current_data_host = model_->lookup_embedding(input_token_id);
    if (pos == 14 || pos == 15 || pos == 16) {
      // Ad-hoc debugging of a few fixed positions.
      log_vector_summary_detailed(
          "[API_CPP GenLoop] current_data_host after lookup_embedding for input_token_id=" +
              std::to_string(input_token_id),
          current_data_host, pos, -100, 8);
    }
    if (current_data_host.empty()) {
      Logger::error("Embedding lookup returned empty vector for token ID: " +
                    std::to_string(input_token_id) + " at pos " + std::to_string(pos));
      // ...
    }

    // Forward pass. Three configurations are handled (guards partially elided):
    // all-CPU, mixed CPU/GPU, and GPU-only.
    if (has_cpu_layers) {   // placeholder for the original CPU-offload check
      std::vector<float> intermediate_activations =
          model_->forward(current_data_host, pos, &kv_cache_, nullptr);
      if (!all_layers_on_cpu) {   // placeholder: some layers remain on the GPU
        if (cuda_available) {     // placeholder for the original CUDA guard
          Logger::debug("[Mixed Mode] CPU layers complete, transferring to GPU for remaining layers");
          gpuErrchk(cudaMemcpy(model_->get_x_dev(), intermediate_activations.data(),
                               intermediate_activations.size() * sizeof(float),
                               cudaMemcpyHostToDevice));
        } else {
          Logger::error("[Mixed Mode] Mixed CPU/GPU mode requested but CUDA not available. Cannot proceed.");
          // ...
        }
      }
    } else {
      if (cuda_available) {
        gpuErrchk(cudaMemcpy(model_->get_x_dev(), current_data_host.data(),
                             current_data_host.size() * sizeof(float), cudaMemcpyHostToDevice));
      } else {
        Logger::error("[GPU-only Mode] GPU-only mode requested but CUDA not available. Cannot proceed.");
        // ...
      }
    }
    // ... (the remaining layers run and `logits` for this position are produced; elided)

    // Sample only once the whole prompt has been consumed.
    if (pos == num_prompt_tokens - 1 || pos >= num_prompt_tokens) {
      // ... (next_token_id is sampled from `logits`; elided)
      if (pos >= num_prompt_tokens) {
        generated_token_ids.push_back(next_token_id);
        // ...
      } else {
        // pos == num_prompt_tokens - 1: first token sampled from the prompt's last position.
        generated_token_ids.push_back(next_token_id);
        // ...
        Logger::info("[Generate API] First token sampled from prompt: " + std::to_string(next_token_id) +
                     ", Decoded: \"" + tokenizer_->decode({next_token_id}, false) + "\"");
      }
      // ... (generated_count is updated here; elided)

      if (next_token_id == eos_token_id_ && pos >= num_prompt_tokens) {
        Logger::info("EOS token (" + std::to_string(eos_token_id_) +   // message prefix reconstructed
                     ") sampled at pos " + std::to_string(pos) + ". Stopping.");
        break;
      }
      if (generated_count >= steps) {
        Logger::info("Reached max generation steps (" + std::to_string(steps) + "). Stopping.");
        break;
      }
    }
    if (!prefill_enabled || pos >= num_prompt_tokens) {
      // ... (per-step bookkeeping; elided)
    }
  }

  std::string generated_ids_str = "[Generated IDs Pre-Decode] ";
  for (int gen_id : generated_token_ids) {
    generated_ids_str += std::to_string(gen_id) + " ";
  }
  // ... (the assembled ID string is logged; elided)

  std::string result = tokenizer_->decode(generated_token_ids, true);

  auto t_end = std::chrono::high_resolution_clock::now();
  double time_taken_ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();

  std::ostringstream time_ss;
  time_ss << std::fixed << std::setprecision(4) << time_taken_ms;
  Logger::info("[INFO] Total generation processing time: " + time_ss.str() + " ms");

  return result;
}
// NOTE: the declaration line of this batch-generation method is not visible in
// this excerpt; the name and parameter order below are reconstructed from the
// sequential fallback further down, which forwards to generate().
std::vector<std::string> TinyLlamaSession::generate_batch(
    const std::vector<std::string>& prompts, int steps, float temperature,
    int top_k, float top_p, const std::string& system_prompt_arg,
    bool apply_q_a_format_cli_hint) {
  auto t_start = std::chrono::high_resolution_clock::now();

  if (prompts.empty()) {
    throw std::runtime_error("Cannot process empty prompts vector for batch generation.");
  }
  if (static_cast<int>(prompts.size()) > max_batch_size_) {   // guard reconstructed from the message
    throw std::runtime_error("Batch size " + std::to_string(prompts.size())
                             /* + ... (limit text elided) */);
  }

  Logger::info("[Batch Generate API] Processing " + std::to_string(prompts.size()) +
               " prompts in batch. Steps: " + std::to_string(steps));

  if (!model_ || !tokenizer_) {
    throw std::runtime_error("Model or tokenizer not loaded for batch generation.");
  }

  std::vector<std::string> final_prompts(prompts.size());
  std::vector<std::vector<int>> all_tokens(prompts.size());
  std::vector<int> prompt_lengths(prompts.size());
  int max_prompt_length = 0;
  for (size_t i = 0; i < prompts.size(); ++i) {
    std::string final_prompt_for_encoding;
    bool used_chat_template = false;

    if (apply_q_a_format_cli_hint) {
      Logger::info("[Batch Generate API] Using legacy Q/A formatting for prompt " + std::to_string(i));
      if (!system_prompt_arg.empty()) {
        final_prompt_for_encoding = system_prompt_arg + "\n\nQ: " + prompts[i] + "\nA:";
      } else {
        final_prompt_for_encoding = "Q: " + prompts[i] + "\nA:";
      }
    } else if (tokenizer_ && !tokenizer_->get_gguf_chat_template().empty()) {
      // (condition reconstructed to mirror the single-prompt path above)
      std::string gguf_template_content = tokenizer_->get_gguf_chat_template();
      // ... (is_llama_sentencepiece_family derived from config_.tokenizer_family; elided)
      bool looks_like_jinja = (gguf_template_content.find("{%") != std::string::npos);

      if (is_llama_sentencepiece_family && looks_like_jinja) {
        Logger::info("[Batch Generate API] Using Q/A format override for prompt " + std::to_string(i));
        if (!system_prompt_arg.empty()) {
          final_prompt_for_encoding = system_prompt_arg + "\n\nQ: " + prompts[i] + "\nA:";
        } else {
          final_prompt_for_encoding = "Q: " + prompts[i] + "\nA:";
        }
      } else {
        Logger::info("[Batch Generate API] Using GGUF chat template for prompt " + std::to_string(i));
        final_prompt_for_encoding = tokenizer_->apply_chat_template(prompts[i], system_prompt_arg, config_);
        used_chat_template = true;
      }
    } else if (is_llama3_family) {   // placeholder for the original Llama-3 family check (elided)
      Logger::info("[Batch Generate API] Using Llama 3 chat template for prompt " + std::to_string(i));
      final_prompt_for_encoding = tokenizer_->apply_chat_template(prompts[i], system_prompt_arg, config_);
      used_chat_template = true;
    } else {
      Logger::info("[Batch Generate API] Using raw prompt for prompt " + std::to_string(i));
      if (!system_prompt_arg.empty()) {
        final_prompt_for_encoding = system_prompt_arg + "\n\n" + prompts[i];
      } else {
        final_prompt_for_encoding = prompts[i];
      }
    }

    final_prompts[i] = final_prompt_for_encoding;

    // ... (all_tokens[i] is filled by encoding final_prompts[i]; elided)
    if (all_tokens[i].empty()) {
      Logger::warning("Batch tokenization resulted in empty ID list for prompt " + std::to_string(i));
      all_tokens[i].push_back(tokenizer_->bos_token_id());
    }

    prompt_lengths[i] = all_tokens[i].size();
    max_prompt_length = std::max(max_prompt_length, prompt_lengths[i]);
    Logger::info("[Batch Generate API] Prompt " + std::to_string(i) + ": " +
                 std::to_string(prompt_lengths[i]) + " tokens");
  }
  std::vector<std::string> results(prompts.size());

  if (use_batch_generation_) {   // guard reconstructed; the flag is set in the constructor
    Logger::info("[Batch Generate API] Using parallel batch processing");

    Logger::info("[DEBUG] Initializing KV cache for batch mode");
    // ... (kv_cache_ is (re)initialized for batch use; elided)
    Logger::info("[DEBUG] KV cache batch initialization completed");

    Logger::info("[DEBUG] About to call batch_prefill_parallel");
    Logger::info("[DEBUG] all_tokens.size()=" + std::to_string(all_tokens.size()) +
                 ", prompt_lengths.size()=" + std::to_string(prompt_lengths.size()));

    std::vector<std::vector<float>> batch_final_logits;
    bool prefill_success = batch_prefill_parallel(all_tokens, prompt_lengths, batch_final_logits);

    if (prefill_success && batch_final_logits.size() == prompts.size()) {
      Logger::info("[Batch Generate API] Batch prefill successful, starting parallel generation");

      Logger::info("[DEBUG] Checking batch_final_logits integrity after prefill");
      for (size_t i = 0; i < batch_final_logits.size(); ++i) {
        if (batch_final_logits[i].empty()) {
          Logger::error("[DEBUG] batch_final_logits[" + std::to_string(i) + "] is empty!");
          goto fallback_sequential;
        } else if (batch_final_logits[i].size() != static_cast<size_t>(config_.vocab_size)) {
          Logger::error("[DEBUG] batch_final_logits[" + std::to_string(i) + "] has wrong size: " +
                        std::to_string(batch_final_logits[i].size()) + " vs expected " +
                        std::to_string(config_.vocab_size));
          goto fallback_sequential;
        }
        for (size_t j = 0; j < std::min(static_cast<size_t>(10UL), batch_final_logits[i].size()); ++j) {
          if (!std::isfinite(batch_final_logits[i][j])) {
            Logger::error("[DEBUG] batch_final_logits[" + std::to_string(i) + "][" + std::to_string(j) +
                          "] is not finite: " + std::to_string(batch_final_logits[i][j]));
            goto fallback_sequential;
          }
        }
      }
      Logger::info("[DEBUG] batch_final_logits integrity check passed");

      std::vector<int> current_tokens(prompts.size());
      std::vector<std::vector<int>> all_generated_tokens(prompts.size());
      std::vector<int> sequence_positions(prompts.size());
      std::vector<bool> sequence_finished(prompts.size(), false);

      Logger::info("[DEBUG] Starting token sampling for " + std::to_string(prompts.size()) + " sequences");
      for (size_t i = 0; i < prompts.size(); ++i) {
        Logger::info("[DEBUG] Sampling token for sequence " + std::to_string(i));

        if (i >= batch_final_logits.size()) {
          Logger::error("[DEBUG] Index " + std::to_string(i) +
                        " out of bounds for batch_final_logits (size: " +
                        std::to_string(batch_final_logits.size()) + ")");
          goto fallback_sequential;
        }
        try {
          // ... (current_tokens[i] is sampled from batch_final_logits[i] with the
          //      temperature/top_k/top_p sampler at the top of this file; elided)
          Logger::info("[DEBUG] Sampled token " + std::to_string(current_tokens[i]) +
                       " for sequence " + std::to_string(i));
        } catch (const std::exception& e) {
          Logger::error("[DEBUG] Exception during sampling for sequence " + std::to_string(i) +
                        ": " + std::string(e.what()));
          goto fallback_sequential;
        }

        all_generated_tokens[i].push_back(current_tokens[i]);
        sequence_positions[i] = prompt_lengths[i];
        if (current_tokens[i] == eos_token_id_) {   // guard reconstructed from the log below
          sequence_finished[i] = true;
          Logger::info("[DEBUG] Sequence " + std::to_string(i) + " finished with EOS token");
        }
      }
      Logger::info("[DEBUG] Token sampling completed, starting generation loop");
      for (int step = 1; step < steps; ++step) {
        Logger::info("[DEBUG] Starting generation step " + std::to_string(step));

        bool all_finished = true;
        for (bool finished : sequence_finished) {
          if (!finished) {
            all_finished = false;
            break;
          }
        }
        if (all_finished) {
          Logger::info("[Batch Generate API] All sequences finished at step " + std::to_string(step));
          break;
        }

        Logger::info("[DEBUG] Collecting active sequences for step " + std::to_string(step));
        std::vector<int> active_tokens;
        std::vector<int> active_positions;
        std::vector<int> active_sequence_indices;
        std::vector<int> batch_to_original_seq_mapping;

        for (size_t i = 0; i < prompts.size(); ++i) {
          if (!sequence_finished[i]) {
            active_tokens.push_back(current_tokens[i]);
            active_positions.push_back(sequence_positions[i]);
            active_sequence_indices.push_back(active_tokens.size() - 1);
            batch_to_original_seq_mapping.push_back(i);
            Logger::info("[DEBUG] Active sequence " + std::to_string(i) + " mapped to batch index " +
                         std::to_string(active_tokens.size() - 1) +
                         ": token=" + std::to_string(current_tokens[i]) +
                         ", pos=" + std::to_string(sequence_positions[i]));
          }
        }

        if (active_tokens.empty()) {
          Logger::info("[DEBUG] No active tokens, breaking from generation loop");
          break;
        }

        Logger::info("[DEBUG] About to call batch_generation_parallel with " +
                     std::to_string(active_tokens.size()) + " active sequences");

        std::vector<std::vector<float>> step_logits;
        bool generation_success = batch_generation_parallel(active_tokens, active_positions,
                                                            batch_to_original_seq_mapping, step_logits);
        Logger::info("[DEBUG] batch_generation_parallel returned: " +
                     std::string(generation_success ? "success" : "failure"));

        if (!generation_success || step_logits.size() != active_tokens.size()) {
          Logger::warning("[Batch Generate API] Parallel generation failed at step " + std::to_string(step) +
                          ", falling back to sequential processing");
          goto fallback_sequential;
        }

        Logger::info("[DEBUG] Starting token sampling for step " + std::to_string(step));
        for (size_t active_idx = 0; active_idx < active_tokens.size(); ++active_idx) {
          size_t original_seq_idx = batch_to_original_seq_mapping[active_idx];
          Logger::info("[DEBUG] Sampling for active_idx=" + std::to_string(active_idx) +
                       ", original_seq_idx=" + std::to_string(original_seq_idx));

          if (active_idx >= step_logits.size()) {
            Logger::error("[DEBUG] active_idx " + std::to_string(active_idx) +
                          " out of bounds for step_logits (size: " + std::to_string(step_logits.size()) + ")");
            goto fallback_sequential;
          }
          if (original_seq_idx >= prompts.size()) {
            Logger::error("[DEBUG] original_seq_idx " + std::to_string(original_seq_idx) +
                          " out of bounds for prompts (size: " + std::to_string(prompts.size()) + ")");
            goto fallback_sequential;
          }

          try {
            int next_token = -1;
            // ... (next_token is sampled from step_logits[active_idx]; elided)
            Logger::info("[DEBUG] Sampled next token " + std::to_string(next_token) +
                         " for original_seq_idx " + std::to_string(original_seq_idx));

            current_tokens[original_seq_idx] = next_token;
            all_generated_tokens[original_seq_idx].push_back(next_token);
            sequence_positions[original_seq_idx]++;
            if (next_token == eos_token_id_) {   // guard reconstructed from the log below
              sequence_finished[original_seq_idx] = true;
              Logger::info("[DEBUG] Sequence " + std::to_string(original_seq_idx) +
                           " finished with EOS at step " + std::to_string(step));
            }
          } catch (const std::exception& e) {
            Logger::error("[DEBUG] Exception during sampling at step " + std::to_string(step) +
                          " for original_seq_idx " + std::to_string(original_seq_idx) +
                          ": " + std::string(e.what()));
            goto fallback_sequential;
          }
        }

        Logger::info("[DEBUG] Completed generation step " + std::to_string(step));
      }

      for (size_t i = 0; i < prompts.size(); ++i) {
        results[i] = tokenizer_->decode(all_generated_tokens[i], true);
      }

      Logger::info("[Batch Generate API] Parallel batch processing completed successfully");
      // ... (on success the sequential fallback below is skipped; exact control flow elided)
    } else {
      Logger::warning("[Batch Generate API] Batch prefill failed, falling back to sequential processing");
      goto fallback_sequential;
    }
  }
fallback_sequential:
  Logger::info("[Batch Generate API] Using sequential processing");
  for (size_t i = 0; i < prompts.size(); ++i) {
    Logger::info("[Batch Generate API] Processing prompt " + std::to_string(i + 1) +
                 "/" + std::to_string(prompts.size()));
    // ...
    std::string result = generate(prompts[i], steps, temperature, top_k, top_p,
                                  system_prompt_arg, apply_q_a_format_cli_hint);
    results[i] = result;
  }

  auto t_end = std::chrono::high_resolution_clock::now();
  double time_taken_ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();

  std::ostringstream time_ss;
  time_ss << std::fixed << std::setprecision(4) << time_taken_ms;
  Logger::info("[Batch Generate API] Total batch processing time: " + time_ss.str() + " ms for " +
               std::to_string(prompts.size()) + " prompts");

  return results;
}
bool TinyLlamaSession::batch_prefill_parallel(
    const std::vector<std::vector<int>>& all_tokens,
    const std::vector<int>& prompt_lengths,
    std::vector<std::vector<float>>& batch_final_logits) {
  Logger::info("[EMERGENCY_DEBUG] batch_prefill_parallel function entry - FIRST LINE");
  Logger::info("[DEBUG] Entering batch_prefill_parallel");

  int total_tokens_across_all_prompts = 0;
  for (int len : prompt_lengths) {
    total_tokens_across_all_prompts += len;
  }
  if (total_tokens_across_all_prompts == 0) {
    Logger::error("[Batch Prefill] No tokens to process in batch prefill.");
    return false;
  }

  Logger::info("[Batch Prefill] Processing " + std::to_string(all_tokens.size()) +
               " sequences with total " + std::to_string(total_tokens_across_all_prompts) + " tokens");
  Logger::info("[Batch Prefill] Preparing batch embeddings for " +
               std::to_string(total_tokens_across_all_prompts) + " tokens");

  size_t required_memory_bytes =
      static_cast<size_t>(total_tokens_across_all_prompts) * config_.hidden_size * sizeof(float);
  Logger::info("[DEBUG] About to allocate " + std::to_string(required_memory_bytes) + " bytes (" +
               std::to_string(required_memory_bytes / (1024 * 1024)) + " MB) for batch embeddings");

  std::vector<float> batch_embeddings(
      static_cast<size_t>(total_tokens_across_all_prompts) * config_.hidden_size);
  Logger::info("[DEBUG] batch_embeddings allocation completed successfully");
  int token_offset = 0;

  Logger::info("[DEBUG] Starting token embedding processing for " + std::to_string(all_tokens.size()) +
               " sequences");

  for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
    Logger::info("[DEBUG] Processing sequence " + std::to_string(seq_idx) + " with " +
                 std::to_string(prompt_lengths[seq_idx]) + " tokens");

    std::string token_ids_str = "Token IDs: ";
    for (int i = 0; i < std::min(5, prompt_lengths[seq_idx]); ++i) {
      token_ids_str += std::to_string(all_tokens[seq_idx][i]) + " ";
    }
    if (prompt_lengths[seq_idx] > 5) token_ids_str += "...";
    Logger::info("[DEBUG] Sequence " + std::to_string(seq_idx) + " " + token_ids_str);

    for (int token_idx = 0; token_idx < prompt_lengths[seq_idx]; ++token_idx) {
      int current_token_id = all_tokens[seq_idx][token_idx];

      if (seq_idx < 2 && token_idx < 3) {
        Logger::info("[DEBUG] Placing token " + std::to_string(current_token_id) +
                     " from seq " + std::to_string(seq_idx) + " pos " + std::to_string(token_idx) +
                     " at batch offset " + std::to_string(token_offset));
      }

      std::vector<float> token_embedding = model_->lookup_embedding(current_token_id);
      if (token_embedding.empty() || token_embedding.size() != static_cast<size_t>(config_.hidden_size)) {
        Logger::error("[Batch Prefill] Embedding lookup failed for token " +
                      std::to_string(current_token_id) + " in sequence " + std::to_string(seq_idx));
        return false;
      }

      // Destination offset of this token's embedding inside the flat batch buffer
      // (declaration and overflow condition reconstructed from the copy below).
      size_t target_offset = static_cast<size_t>(token_offset) * config_.hidden_size;
      if (target_offset + config_.hidden_size > batch_embeddings.size()) {
        Logger::error("[Batch Prefill] Buffer overflow detected at token offset " + std::to_string(token_offset));
        return false;
      }
      std::copy(token_embedding.begin(), token_embedding.end(),
                batch_embeddings.begin() + target_offset);
      ++token_offset;
    }

    Logger::info("[DEBUG] Sequence " + std::to_string(seq_idx) +
                 " complete. Next token_offset: " + std::to_string(token_offset));
  }
  std::vector<float> cpu_processed_embeddings;
  if (has_cpu_layers) {   // placeholder for the original CPU-offload check
    // ... (elided: log of how many CPU layers will run for batch prefill)
    cpu_processed_embeddings = model_->forward_cpu_batch(
        batch_embeddings, total_tokens_across_all_prompts /* , ... (remaining arguments elided) */);
    if (cpu_processed_embeddings.empty()) {
      Logger::error("[Batch Prefill] CPU batch processing failed.");
      return false;
    }
  } else {
    cpu_processed_embeddings = batch_embeddings;
  }

  std::vector<float> final_batch_logits;
  if (all_layers_on_cpu) {   // placeholder: true when no layers are GPU-resident
    Logger::info("[Batch Prefill] All layers on CPU, computing logits");
    final_batch_logits = model_->forward_cpu_logits_batch(cpu_processed_embeddings,
                                                          total_tokens_across_all_prompts);
  } else if (cuda_available) {   // placeholder for the original CUDA guard
    Logger::info("[Batch Prefill] Processing GPU layers for batch prefill");

    Logger::info("[DEBUG] About to allocate GPU memory for batch prefill");
    float* d_batch_embeddings = nullptr;
    size_t batch_size_bytes = cpu_processed_embeddings.size() * sizeof(float);
    Logger::info("[DEBUG] GPU allocation size: " + std::to_string(batch_size_bytes) + " bytes (" +
                 std::to_string(batch_size_bytes / (1024 * 1024)) + " MB)");

    gpuErrchk(cudaMalloc(&d_batch_embeddings, batch_size_bytes));
    Logger::info("[DEBUG] cudaMalloc completed successfully");

    Logger::info("[DEBUG] Calling cudaMemcpy host to device...");
    gpuErrchk(cudaMemcpy(d_batch_embeddings, cpu_processed_embeddings.data(),
                         batch_size_bytes, cudaMemcpyHostToDevice));
    Logger::info("[DEBUG] cudaMemcpy completed successfully");

    Logger::info("[DEBUG] Calling forward_device_batch_prefill with " +
                 std::to_string(total_tokens_across_all_prompts) + " total tokens");
    std::vector<float> all_batch_logits = model_->forward_device_batch_prefill(
        d_batch_embeddings, total_tokens_across_all_prompts, 0, &kv_cache_, 0);
    Logger::info("[DEBUG] forward_device_batch_prefill completed, returned " +
                 std::to_string(all_batch_logits.size()) + " total logits");

    gpuErrchk(cudaFree(d_batch_embeddings));
    final_batch_logits = all_batch_logits;
  } else {
    Logger::error("[Batch Prefill] GPU processing requested but CUDA not available.");
    return false;
  }

  Logger::info("[Batch Prefill] Successfully processed batch prefill for " +
               std::to_string(all_tokens.size()) + " sequences");

  Logger::info("[DEBUG] About to return from batch_prefill_parallel");
  Logger::info("[DEBUG] batch_final_logits.size()=" + std::to_string(batch_final_logits.size()));
  for (size_t i = 0; i < batch_final_logits.size() && i < 3; ++i) {
    Logger::info("[DEBUG] batch_final_logits[" + std::to_string(i) + "].size()=" +
                 std::to_string(batch_final_logits[i].size()));
  }

  batch_final_logits.clear();
  batch_final_logits.resize(all_tokens.size());

  if (all_layers_on_cpu) {   // placeholder: mirrors the CPU/GPU split above
    if (final_batch_logits.size() !=
        static_cast<size_t>(total_tokens_across_all_prompts * config_.vocab_size)) {
      Logger::error("[Batch Prefill] CPU logits size mismatch. Expected: " +
                    std::to_string(total_tokens_across_all_prompts * config_.vocab_size) +
                    ", got: " + std::to_string(final_batch_logits.size()));
      return false;
    }

    int token_offset = 0;
    for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
      int last_token_pos = token_offset + prompt_lengths[seq_idx] - 1;

      Logger::info("[DEBUG] Extracting logits for sequence " + std::to_string(seq_idx) +
                   ": token_offset=" + std::to_string(token_offset) +
                   ", prompt_length=" + std::to_string(prompt_lengths[seq_idx]) +
                   ", last_token_pos=" + std::to_string(last_token_pos));

      // Source range of the last prompt token's logits (reconstructed from the
      // bounds check and the copy below).
      size_t src_start = static_cast<size_t>(last_token_pos) * config_.vocab_size;
      size_t src_end = src_start + config_.vocab_size;
      batch_final_logits[seq_idx].resize(config_.vocab_size);

      if (src_end > final_batch_logits.size()) {
        Logger::error("[Batch Prefill] CPU logits bounds check failed for sequence " + std::to_string(seq_idx));
        return false;
      }
      std::copy(final_batch_logits.begin() + src_start,
                final_batch_logits.begin() + src_end,
                batch_final_logits[seq_idx].begin());

      std::string logit_sample = "First 5 logits: ";
      for (size_t i = 0; i < 5 && i < batch_final_logits[seq_idx].size(); ++i) {
        logit_sample += std::to_string(batch_final_logits[seq_idx][i]) + " ";
      }
      Logger::info("[DEBUG] Sequence " + std::to_string(seq_idx) + " " + logit_sample);

      token_offset += prompt_lengths[seq_idx];
    }
  } else {
    Logger::info("[DEBUG] GPU batch logits size: " + std::to_string(final_batch_logits.size()) +
                 ", expected for all tokens: " +
                 std::to_string(total_tokens_across_all_prompts * config_.vocab_size) +
                 ", expected for last tokens only: " +
                 std::to_string(all_tokens.size() * config_.vocab_size));

    if (final_batch_logits.size() ==
        static_cast<size_t>(total_tokens_across_all_prompts * config_.vocab_size)) {
      // The GPU path returned logits for every token of every sequence.
      Logger::info("[DEBUG] GPU returned logits for all tokens, extracting last token logits");
      int token_offset = 0;
      for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
        int last_token_pos = token_offset + prompt_lengths[seq_idx] - 1;

        Logger::info("[DEBUG] GPU: Extracting logits for sequence " + std::to_string(seq_idx) +
                     ": token_offset=" + std::to_string(token_offset) +
                     ", prompt_length=" + std::to_string(prompt_lengths[seq_idx]) +
                     ", last_token_pos=" + std::to_string(last_token_pos));

        // (source range and per-sequence resize reconstructed as in the CPU path)
        size_t src_start = static_cast<size_t>(last_token_pos) * config_.vocab_size;
        size_t src_end = src_start + config_.vocab_size;
        batch_final_logits[seq_idx].resize(config_.vocab_size);

        if (src_end > final_batch_logits.size()) {
          Logger::error("[Batch Prefill] GPU logits bounds check failed for sequence " + std::to_string(seq_idx));
          return false;
        }
        std::copy(final_batch_logits.begin() + src_start,
                  final_batch_logits.begin() + src_end,
                  batch_final_logits[seq_idx].begin());

        std::string logit_sample = "First 5 logits: ";
        for (size_t i = 0; i < 5 && i < batch_final_logits[seq_idx].size(); ++i) {
          logit_sample += std::to_string(batch_final_logits[seq_idx][i]) + " ";
        }
        Logger::info("[DEBUG] GPU Sequence " + std::to_string(seq_idx) + " " + logit_sample);

        token_offset += prompt_lengths[seq_idx];
      }
    } else if (final_batch_logits.size() ==
               static_cast<size_t>(all_tokens.size() * config_.vocab_size)) {
      // The GPU path returned only the last-token logits, one block per sequence.
      Logger::info("[DEBUG] GPU returned logits for last tokens only");
      for (size_t seq_idx = 0; seq_idx < all_tokens.size(); ++seq_idx) {
        Logger::info("[DEBUG] GPU Last-Token-Only: Processing sequence " + std::to_string(seq_idx)
                     /* + ... (rest of message elided) */);

        // (source range reconstructed: one vocab_size block per sequence)
        size_t src_start = seq_idx * static_cast<size_t>(config_.vocab_size);
        size_t src_end = src_start + config_.vocab_size;
        batch_final_logits[seq_idx].resize(config_.vocab_size);

        if (src_end > final_batch_logits.size()) {
          Logger::error("[Batch Prefill] GPU logits bounds check failed for sequence " + std::to_string(seq_idx));
          return false;
        }
        std::copy(final_batch_logits.begin() + src_start,
                  final_batch_logits.begin() + src_end,
                  batch_final_logits[seq_idx].begin());

        std::string logit_sample = "First 5 logits: ";
        for (size_t i = 0; i < 5 && i < batch_final_logits[seq_idx].size(); ++i) {
          logit_sample += std::to_string(batch_final_logits[seq_idx][i]) + " ";
        }
        Logger::info("[DEBUG] GPU Last-Token Sequence " + std::to_string(seq_idx) + " " + logit_sample);
      }
    } else {
      Logger::error("[Batch Prefill] GPU logits size doesn't match expected patterns");
      return false;
    }
  }

  return true;
}
bool TinyLlamaSession::batch_generation_parallel(
    const std::vector<int>& current_tokens,
    const std::vector<int>& token_positions,
    const std::vector<int>& sequence_indices,
    std::vector<std::vector<float>>& batch_logits) {
  Logger::info("[DEBUG] Entering batch_generation_parallel");

  int num_sequences = current_tokens.size();
  if (num_sequences == 0 || token_positions.size() != current_tokens.size()) {
    return false;
  }

  Logger::info("[Batch Generation] Processing " + std::to_string(num_sequences) +
               " sequences in parallel generation step");

  std::vector<float> batch_embeddings;
  for (int i = 0; i < num_sequences; ++i) {
    std::vector<float> token_embedding = model_->lookup_embedding(current_tokens[i]);
    if (token_embedding.empty()) {
      Logger::error("[Batch Generation] Embedding lookup failed for token " +
                    std::to_string(current_tokens[i]));
      return false;
    }
    batch_embeddings.insert(batch_embeddings.end(), token_embedding.begin(), token_embedding.end());
  }

  if (has_cpu_layers) {   // placeholder for the original CPU-offload check
    // ... (elided: log of how many CPU layers will run for batch generation)
    std::vector<std::vector<float>> cpu_batch_logits = model_->forward_cpu_batch_generation(
        batch_embeddings, token_positions, sequence_indices, num_sequences, &kv_cache_);

    if (cpu_batch_logits.size() != static_cast<size_t>(num_sequences)) {
      Logger::error("[Batch Generation] CPU batch generation returned wrong number of results");
      return false;
    }

    if (all_layers_on_cpu) {   // placeholder: no GPU layers remain
      batch_logits = cpu_batch_logits;
      Logger::info("[Batch Generation] All CPU layers processed, returning logits");
      return true;
    }
    // Mixed CPU/GPU: the partially processed activations would have to be moved
    // to the GPU here, which is not implemented yet.
    batch_embeddings.clear();
    // ...
    Logger::warning("[Batch Generation] Mixed CPU/GPU not yet implemented for batch generation");
    // ...
  }

  if (cuda_available) {   // placeholder for the original CUDA guard
    Logger::info("[Batch Generation] Processing GPU layers for batch generation");

    float* d_batch_embeddings = nullptr;
    size_t batch_size_bytes = batch_embeddings.size() * sizeof(float);

    gpuErrchk(cudaMalloc(&d_batch_embeddings, batch_size_bytes));
    gpuErrchk(cudaMemcpy(d_batch_embeddings, batch_embeddings.data(),
                         batch_size_bytes, cudaMemcpyHostToDevice));

    std::vector<std::vector<float>> gpu_batch_logits = model_->forward_device_batch_generation(
        d_batch_embeddings, token_positions, sequence_indices, num_sequences, &kv_cache_, 0);

    gpuErrchk(cudaFree(d_batch_embeddings));

    if (gpu_batch_logits.size() != static_cast<size_t>(num_sequences)) {
      Logger::error("[Batch Generation] GPU batch generation returned wrong number of results");
      return false;
    }

    batch_logits = gpu_batch_logits;
    Logger::info("[Batch Generation] GPU batch generation completed successfully");
    return true;
  } else {
    Logger::error("[Batch Generation] GPU processing requested but CUDA not available.");
    // ...
  }

  Logger::error("[Batch Generation] No valid processing path found");
  return false;
}