60   const std::string& text) const {
61 std::vector<std::string> all_tokens;
62 std::vector<std::string> initial_units;
65   boost::regex llama_regex(
67       R"([\r\n]+|[[:space:]]+|[^\r\n[:space:][:alnum:]]+|[[:alnum:]]+)");
69 std::string text_to_search = text;
72 while (boost::regex_search(text_to_search, match, llama_regex)) {
73 if (!match.str(0).empty()) {
74 initial_units.push_back(match.str(0));
76 text_to_search = match.suffix().str();
78 if (!text_to_search.empty()) {
79 initial_units.push_back(text_to_search);
82   Logger::debug("[BPE_SCORES] Regex pre-tokenization resulted in " + std::to_string(initial_units.size()) + " initial units.");
84   const std::string sp_space_prefix = "\xE2\x96\x81";
85   bool next_word_needs_prefix = true;
87   for (const std::string& unit_raw : initial_units) {
88     if (unit_raw.empty()) continue;
91     bool unit_is_whitespace = true;
92     for (char c : unit_raw) {
93       if (!std::isspace(static_cast<unsigned char>(c))) {
94         unit_is_whitespace = false;
99     if (unit_is_whitespace) {
101       next_word_needs_prefix = true;
102       Logger::debug("[BPE_SCORES] Unit '" + unit_raw + "' is whitespace. Setting prefix flag for next word.");
107 std::string unit_to_bpe = unit_raw;
108 if (next_word_needs_prefix) {
109 unit_to_bpe = sp_space_prefix + unit_to_bpe;
110       Logger::debug("[BPE_SCORES] Prefixed unit: '" + unit_raw + "' -> '" + unit_to_bpe + "'");
111       next_word_needs_prefix = false;
113       Logger::debug("[BPE_SCORES] Processing unit without prefix: '" + unit_to_bpe + "'");
116     if (unit_raw == "\n") {
117       Logger::debug("[BPE_SCORES] Raw unit is newline. It will be split into chars. Current unit_to_bpe: '" + unit_to_bpe + "'");
121 std::vector<std::string> chars;
123     for (size_t i = 0; i < unit_to_bpe.size();) {
124       int bytes = unicode_char_len(unit_to_bpe[i]);
126       if (i + bytes <= unit_to_bpe.size()) {
127         chars.push_back(unit_to_bpe.substr(i, bytes));
129         Logger::warning("[BPE_SCORES] Invalid UTF-8 sequence or length error for: '" + unit_to_bpe.substr(i) + "'");
130         chars.push_back(unit_to_bpe.substr(i));
137       Logger::warning("[BPE_SCORES] Unit '" + unit_to_bpe + "' (original: '" + unit_raw + "') produced no chars for BPE.");
143 while (changes && chars.size() > 1) {
145 int best_rank = std::numeric_limits<int>::max();
148       for (size_t i = 0; i < chars.size() - 1; ++i) {
149         std::string pair = chars[i] + chars[i + 1];
151         if (it != bpe_merges_.end() && it->second < best_rank) {
152           best_rank = it->second;
158 std::string merged = chars[best_i] + chars[best_i + 1];
159 chars[best_i] = merged;
160 chars.erase(chars.begin() + best_i + 1);
165 all_tokens.insert(all_tokens.end(), chars.begin(), chars.end());
168   Logger::debug("[BPE_SCORES] Final token count after BPE: " + std::to_string(all_tokens.size()));
172     const std::vector<std::string>& tokens) const {
173 std::vector<int> ids;
174 ids.reserve(tokens.size());
176   for (const auto& token : tokens) {
179     Logger::debug("[TOK_TO_ID_NL_DEBUG] Processing token: '\n' (actual newline char). Length: " + std::to_string(token.length()));
180     bool found_in_added = false;
182       if (pair.first == "\n") {
183         Logger::debug("[TOK_TO_ID_NL_DEBUG] Found '\n' key in added_tokens_ map. ID: " + std::to_string(pair.second));
184         found_in_added = true;
188     if (!found_in_added) {
189       Logger::debug("[TOK_TO_ID_NL_DEBUG] '\n' key NOT found in added_tokens_ map by direct string compare.");
191       std::string keys_in_map = "Keys in added_tokens_: ";
193         std::string key_escaped;
194         for (char c_key : pair.first) {
195           if (c_key == '\n') key_escaped += "<NL>";
196           else if (c_key == '\r') key_escaped += "<CR>";
197           else if (c_key == '\t') key_escaped += "<TAB>";
198           else if (std::isprint(static_cast<unsigned char>(c_key))) key_escaped += c_key;
199           else { std::stringstream ss_hex; ss_hex << "<0x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(static_cast<unsigned char>(c_key)) << ">"; key_escaped += ss_hex.str(); }
201         keys_in_map += "['" + key_escaped + "' (len:" + std::to_string(pair.first.length()) + ")] ";
210 ids.push_back(added_it->second);
212 "' -> ID: " + std::to_string(added_it->second));
216 ids.push_back(base_it->second);
218 "' -> ID: " + std::to_string(base_it->second));
221 if (capitalized_token != token) {
222 auto capitalized_it =
token_to_id_.find(capitalized_token);
224 ids.push_back(capitalized_it->second);
226 "[TOK_TO_ID] FALLBACK: Found capitalized base token: '" +
227 token +
"' -> '" + capitalized_token +
228 "' -> ID: " + std::to_string(capitalized_it->second));
234 if (token.length() == 1) {
238 ids.push_back(byte_it->second);
239 Logger::debug(
"[TOK_TO_ID] FALLBACK: Mapped single-byte token '" +
240 std::string(1, c) +
"' to byte token ID " +
241 std::to_string(byte_it->second));
248 "' not found in added, base, capitalized fallback, or "
249 "byte tokens. Using UNK ID: " +
286 const std::string& model_path,
288 : tokenizer_family_(config.tokenizer_family),
292       pad_token_("<pad>") {
293   Logger::info("[Tokenizer Constructor JSON] vocab_path: '" + vocab_path + "', model_path: '" + model_path + "'");
295   std::filesystem::path vocab_json_path_abs(vocab_path);
296   if (!std::filesystem::exists(vocab_json_path_abs)) {
297     throw std::runtime_error("Tokenizer vocab_path (tokenizer.json) does not exist: " + vocab_json_path_abs.string());
300   Logger::info(std::string("Loading tokenizer and vocab from: ") + vocab_json_path_abs.string());
301   std::string family_str = "UNKNOWN";
304   Logger::info(std::string("Tokenizer family based on config: ") + family_str);
309     Logger::info("LLAMA_SENTENCEPIECE family detected for JSON constructor, attempting to load BPE merges from: " + vocab_json_path_abs.string());
329   std::string init_log_message = "Tokenizer successfully initialized from JSON/Config. Detected type based on config: ";
334 if (model_path.size() > 0) {
335     if (model_path.size() > 6 &&
336         model_path.substr(model_path.size() - 6) == ".model") {
337       Logger::info("Loading SentencePiece model: " + model_path);
339     } else if (model_path.size() > 5 &&
340                model_path.substr(model_path.size() - 5) == ".json") {
341       Logger::info("Loading BPE merges from JSON: " + model_path);
344       Logger::info("Unsupported model format: " + model_path +
345                    " - falling back to space tokenization");
349           "No model path provided - falling back to space tokenization");
351   } catch (const std::exception& e) {
352     std::cerr << "Failed to load tokenizer or vocab from " << vocab_path << ": "
353               << e.what() << std::endl;
354     Logger::error(std::string("Failed to load tokenizer or vocab from \"") +
355                   vocab_path + "\": " + e.what());
360     throw std::runtime_error(
361         "Failed to initialize tokenizer vocabulary from: " + vocab_path);
365 " tokens from vocabulary file: " + vocab_path);
368   std::string first_few_tokens_log = "First few (up to 10 or vocab size) tokens from " + vocab_path + ": ";
369   for (size_t i = 0; i < std::min((size_t)10, id_to_token_.size()); ++i) {
370     first_few_tokens_log += "ID[" + std::to_string(i) + "]=";
371     std::string escaped_token;
374       escaped_token += "\\\\";
375     } else if (c_tok == '\'') {
376       escaped_token += "\\'";
377     } else if (std::isprint(static_cast<unsigned char>(c_tok))) {
378       escaped_token += c_tok;
380       std::stringstream ss_hex;
381       ss_hex << "<0x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(static_cast<unsigned char>(c_tok)) << ">";
382       escaped_token += ss_hex.str();
385     first_few_tokens_log += "'" + escaped_token + "' ";
390   const std::vector<std::pair<std::string, int>> known_chat_tokens = {
391       {"<|system|>", 32000}, {"<|user|>", 32001}, {"<|assistant|>", 32002}};
392 int manually_injected_count = 0;
394   for (const auto& pair : known_chat_tokens) {
395     const std::string& tok = pair.first;
396     int id = pair.second;
402       manually_injected_count++;
403       Logger::info("[MANUAL INJECT] Added missing chat token: '" + tok +
404                    "' with assumed ID: " + std::to_string(id));
407           "' already loaded from JSON. Skipping injection.");
410           "', assumed ID " + std::to_string(id) +
411           " clashes with loaded vocab size (" +
415   if (manually_injected_count > 0) {
417                  std::to_string(manually_injected_count) +
418                  " missing chat tokens.");
423 const std::vector<std::string>& id_to_token,
424 const std::vector<float>& token_scores) {
426 std::unordered_map<std::string, int> generated_merges;
428 if (token_scores.empty() || id_to_token.empty()) {
429     Logger::warning("Cannot generate BPE merges: empty scores or vocabulary");
430     return generated_merges;
433   Logger::info("Generating BPE merges from vocabulary and scores for older Llama models...");
436   std::vector<std::pair<float, std::string>> scored_tokens;
437   for (size_t id = 0; id < id_to_token.size(); ++id) {
438     if (id < token_scores.size()) {
439       const std::string& token = id_to_token[id];
441       if (token.length() > 1 &&
442           token.find("<") == std::string::npos &&
443           token.find(">") == std::string::npos &&
445         scored_tokens.emplace_back(token_scores[id], token);
451   std::sort(scored_tokens.begin(), scored_tokens.end(),
452             [](const auto& a, const auto& b) { return a.first > b.first; });
454   Logger::info("Found " + std::to_string(scored_tokens.size()) + " candidate tokens for merge generation");
458 std::unordered_set<std::string> processed_tokens;
460   for (const auto& [score, token] : scored_tokens) {
461     if (processed_tokens.count(token)) continue;
464     std::string best_left, best_right;
465     float best_combined_score = -std::numeric_limits<float>::infinity();
468     for (size_t split = 1; split < token.length(); ++split) {
469       std::string left = token.substr(0, split);
470       std::string right = token.substr(split);
473       auto left_it = std::find(id_to_token.begin(), id_to_token.end(), left);
474       auto right_it = std::find(id_to_token.begin(), id_to_token.end(), right);
476       if (left_it != id_to_token.end() && right_it != id_to_token.end()) {
478         size_t left_id = std::distance(id_to_token.begin(), left_it);
479         size_t right_id = std::distance(id_to_token.begin(), right_it);
480         float left_score = (left_id < token_scores.size()) ? token_scores[left_id] : 0.0f;
482         float right_score = (right_id < token_scores.size()) ? token_scores[right_id] : 0.0f;
484         float combined_score = left_score + right_score;
486 if (combined_score > best_combined_score) {
487 best_combined_score = combined_score;
495 if (!best_left.empty() && !best_right.empty()) {
496 std::string merge_key = best_left + best_right;
497 if (generated_merges.find(merge_key) == generated_merges.end()) {
498 generated_merges[merge_key] = merge_rank++;
499         Logger::debug("Generated merge: '" + best_left + "' + '" + best_right + "' -> '" + token + "' (rank " + std::to_string(merge_rank - 1) + ")");
503 processed_tokens.insert(token);
506 if (merge_rank >= 50000) {
507       Logger::info("Reached maximum merge limit (50000), stopping generation");
512   Logger::info("Generated " + std::to_string(generated_merges.size()) + " BPE merge rules from vocabulary and scores");
513 return generated_merges;
517 : tokenizer_family_(config.tokenizer_family),
518 initialized_from_gguf_(true) {
519   Logger::info("Initializing Tokenizer from GGUFData...");
520   std::string family_str_gguf = "UNKNOWN";
523   Logger::info(std::string("Tokenizer family from ModelConfig: ") + family_str_gguf);
527     auto it = gguf_data.metadata.find("tokenizer.chat_template");
528     if (it != gguf_data.metadata.end()) {
529       if (std::holds_alternative<std::string>(it->second)) {
532         Logger::info("[Tokenizer GGUF Init] Found and loaded 'tokenizer.chat_template' from GGUF metadata.");
538         std::string loggable_snippet;
539         for (char ch : template_snippet) {
540           if (ch == '\n') loggable_snippet += "\\n";
541           else if (ch == '\r') loggable_snippet += "\\r";
542           else if (ch == '\t') loggable_snippet += "\\t";
543           else if (std::isprint(static_cast<unsigned char>(ch))) loggable_snippet += ch;
544           else loggable_snippet += ".";
546         Logger::debug("[Tokenizer GGUF Init] Chat template snippet: " + loggable_snippet);
548         Logger::info("[Tokenizer GGUF Init] 'tokenizer.chat_template' found in GGUF metadata but is empty.");
551       Logger::warning("[Tokenizer GGUF Init] 'tokenizer.chat_template' found in GGUF metadata but is not a string type.");
554     Logger::info("[Tokenizer GGUF Init] 'tokenizer.chat_template' not found in GGUF metadata.");
556   } catch (const std::exception& e) {
557     Logger::error("[Tokenizer GGUF Init] Exception while trying to access 'tokenizer.chat_template': " + std::string(e.what()));
561   throw std::runtime_error(
562       "GGUF data does not contain 'tokenizer.ggml.tokens'");
572     if (static_cast<int>(i) == 1734) {
574       std::string escaped_token_1734;
575       for (char c : token_at_1734) {
576         if (c == '\n') escaped_token_1734 += "\\n";
577         else if (c == '\r') escaped_token_1734 += "\\r";
578         else if (c == '\t') escaped_token_1734 += "\\t";
579         else if (c == '\\') escaped_token_1734 += "\\\\";
580         else if (std::isprint(static_cast<unsigned char>(c))) escaped_token_1734 += c;
582           std::stringstream ss_hex;
583           ss_hex << "<0x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(static_cast<unsigned char>(c)) << ">";
584           escaped_token_1734 += ss_hex.str();
587       Logger::info("[GGUF_VOCAB_SCAN] Token string at ID 1734 is: '" + escaped_token_1734 + "' (length: " + std::to_string(token_at_1734.length()) + ")");
592 " tokens from GGUF tokenizer_tokens.");
596   std::string first_few_tokens_log = "First few (up to 10 or vocab size) GGUF tokens: ";
597   for (size_t i = 0; i < std::min((size_t)10, id_to_token_.size()); ++i) {
598     first_few_tokens_log += "ID[" + std::to_string(i) + "]='";
601     if (std::isprint(static_cast<unsigned char>(c_tok))) {
602       first_few_tokens_log += c_tok;
604       std::stringstream ss_hex;
605       ss_hex << "<0x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(static_cast<unsigned char>(c_tok)) << ">";
606       first_few_tokens_log += ss_hex.str();
609     first_few_tokens_log += "' ";
617     Logger::info("Configuring for LLAMA3_TIKTOKEN (gpt2-style BPE).");
620       Logger::warning("Llama 3 Tiktoken family specified, but GGUF data does not contain 'tokenizer.ggml.merges'. Tiktoken BPE may not function correctly without explicit merges.");
626         std::string part1, part2;
627         size_t space_pos = merge_str.find(' ');
628         if (space_pos != std::string::npos && space_pos > 0 && space_pos < merge_str.length() - 1) {
629           part1 = merge_str.substr(0, space_pos);
630           part2 = merge_str.substr(space_pos + 1);
631           std::string merged = part1 + part2;
634           Logger::warning("Skipping malformed Tiktoken merge rule from GGUF: '" + merge_str + "'");
639           " Tiktoken merges from GGUF tokenizer_merges into bpe_merges_ map with ranks.");
652     std::string target_sub_oplasm = "oplasm";
655     auto check_and_log_vocab = [&](const std::string& token_to_check) {
657         Logger::debug("[DEBUG_VOCAB] Found '" + token_to_check + "' in vocab with ID: " + std::to_string(token_to_id_.at(token_to_check)));
659         Logger::debug("[DEBUG_VOCAB] Token '" + token_to_check + "' NOT FOUND in vocab.");
663     auto check_and_log_merge = [&](const std::string& p1, const std::string& p2) {
666         Logger::debug("[DEBUG_VOCAB] Found merge for '" + p1 + "' + '" + p2 + "' ('" + (p1 + p2) + "') with rank: " + std::to_string(merge_it->second));
668         Logger::debug("[DEBUG_VOCAB] Merge for '" + p1 + "' + '" + p2 + "' ('" + (p1 + p2) + "') NOT FOUND.");
679       Logger::warning("GGUF (SentencePiece path) token and score array sizes mismatch: tokens=" +
683       Logger::warning("SentencePiece family: No scores found. BPE merging will likely not work if no other SP model data is available.");
688       Logger::info("SentencePiece family path: Found 'tokenizer.ggml.merges' in GGUF. Loading them into bpe_merges_ map.");
692         std::string part1, part2;
693         size_t space_pos = merge_str.find(' ');
694         if (space_pos != std::string::npos && space_pos > 0 && space_pos < merge_str.length() - 1) {
695           part1 = merge_str.substr(0, space_pos);
696           part2 = merge_str.substr(space_pos + 1);
699           Logger::warning("Skipping malformed SentencePiece merge rule from GGUF: '" + merge_str + "'");
703           " merges from GGUF tokenizer_merges into bpe_merges_ map (SentencePiece path).");
705       Logger::warning("SentencePiece family path: No 'tokenizer.ggml.merges' found in GGUF. Attempting to generate merges from vocabulary and scores...");
709       if (!generated_merges.empty()) {
711         Logger::info("Successfully generated " + std::to_string(bpe_merges_.size()) + " BPE merges from vocabulary and scores for SentencePiece tokenizer");
713         Logger::warning("Failed to generate BPE merges. Tokenization may be suboptimal for this model.");
720     Logger::warning("Tokenizer family is UNKNOWN. Tokenizer may not function as expected. Will attempt to load basic vocab and scores if present.");
723       Logger::info("Loaded " + std::to_string(token_scores_.size()) + " token scores from GGUF for UNKNOWN family as a fallback.");
731                  [](unsigned int u) { return static_cast<int32_t>(u); });
738 int byte_tokens_from_type = 0;
739 int special_tokens_from_type = 0;
744     int token_id = static_cast<int>(i);
745     bool processed_as_byte = false;
748       bool added_byte = false;
749       if (token_str.length() == 1) {
752       } else if (token_str.rfind("<0x", 0) == 0 && token_str.back() == '>' && token_str.length() == 6) {
754           int byte_val = std::stoi(token_str.substr(3, 2), nullptr, 16);
757         } catch (const std::exception& e) {
758           Logger::warning("Could not parse byte value from type-BYTE (6) token string: '" + token_str + "'");
762         Logger::warning("Token type is BYTE (6) but does not match single char or <0xNN> format: '" + token_str + "' ID: " + std::to_string(token_id));
766         byte_tokens_from_type++;
767         processed_as_byte = true;
770     if (!processed_as_byte && (tt == 2 || tt == 3 || tt == 4 || tt == 5)) {
774       special_tokens_from_type++;
779   Logger::info("From GGUF token_types (BYTE=6): Identified " + std::to_string(byte_tokens_from_type) + " byte tokens (for byte_char_to_id_). " +
780                "Identified " + std::to_string(special_tokens_from_type) + " other special/added tokens (types 2,3,4,5).");
785     Logger::warning("No byte tokens identified via token_types metadata for Tiktoken. Attempting fallback scan of vocabulary.");
788 int bytes_found_in_vocab_fallback = 0;
789 for (
int i = 0; i < 256; ++i) {
790 std::stringstream ss_hex_repr;
791 ss_hex_repr <<
"<0x" << std::hex << std::setw(2) << std::setfill(
'0') << i <<
">";
792 std::string byte_token_str_repr = ss_hex_repr.str();
793 std::string literal_byte_char_str(1,
static_cast<char>(i));
794 bool is_space_char = (
static_cast<char>(i) ==
' ');
798 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Checking for SPACE (byte 32). Looking for '<0x20>' and ' '.");
806 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Found '<0x20>' token with ID: " + std::to_string(it->second) +
". Adding to map.");
810 bytes_found_in_vocab_fallback++;
813 if (std::isprint(
static_cast<unsigned char>(i))) {
818 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Did not find '<0x20>', but found literal ' ' token with ID: " + std::to_string(lit_it->second));
823 bool id_already_mapped =
false;
824 for(
const auto& pair :
byte_char_to_id_) {
if (pair.second == lit_it->second) { id_already_mapped =
true;
break; } }
825 if (!id_already_mapped) {
828 Logger::debug(
"[BYTE_FALLBACK_DEBUG] ID " + std::to_string(lit_it->second) +
" for ' ' not already mapped. Adding to map.");
832 bytes_found_in_vocab_fallback++;
837 Logger::debug(
"[BYTE_FALLBACK_DEBUG] ID " + std::to_string(lit_it->second) +
" for ' ' was already mapped (likely by <0x20>). Skipping literal add.");
844 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Did not find '<0x20>' OR literal ' ' token in vocab.");
851 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Did not find '<0x20>' token, and space is not printable, so didn't check for literal ' '.");
857 Logger::info(
"Fallback byte_char_to_id_ map population: Found representations for " + std::to_string(bytes_found_in_vocab_fallback) +
858 " byte values in GGUF vocab (using <0xNN> or literal). Intended for Tiktoken BPE.");
859 byte_tokens_from_type = bytes_found_in_vocab_fallback;
864 Logger::warning(
"GGUF tokenizer_token_types array missing or size mismatch. Byte token and special token identification will be limited.");
867 int bytes_found_in_vocab_fallback = 0;
868 for (
int i = 0; i < 256; ++i) {
869 std::stringstream ss_hex_repr;
870 ss_hex_repr <<
"<0x" << std::hex << std::setw(2) << std::setfill(
'0') << i <<
">";
871 std::string byte_token_str_repr = ss_hex_repr.str();
872 std::string literal_byte_char_str(1,
static_cast<char>(i));
873 bool is_space_char = (
static_cast<char>(i) ==
' ');
877 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Checking for SPACE (byte 32). Looking for '<0x20>' and ' '.");
885 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Found '<0x20>' token with ID: " + std::to_string(it->second) +
". Adding to map.");
889 bytes_found_in_vocab_fallback++;
892 if (std::isprint(
static_cast<unsigned char>(i))) {
897 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Did not find '<0x20>', but found literal ' ' token with ID: " + std::to_string(lit_it->second));
902 bool id_already_mapped =
false;
903 for(
const auto& pair :
byte_char_to_id_) {
if (pair.second == lit_it->second) { id_already_mapped =
true;
break; } }
904 if (!id_already_mapped) {
907 Logger::debug(
"[BYTE_FALLBACK_DEBUG] ID " + std::to_string(lit_it->second) +
" for ' ' not already mapped. Adding to map.");
911 bytes_found_in_vocab_fallback++;
916 Logger::debug(
"[BYTE_FALLBACK_DEBUG] ID " + std::to_string(lit_it->second) +
" for ' ' was already mapped (likely by <0x20>). Skipping literal add.");
923 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Did not find '<0x20>' OR literal ' ' token in vocab.");
930 Logger::debug(
"[BYTE_FALLBACK_DEBUG] Did not find '<0x20>' token, and space is not printable, so didn't check for literal ' '.");
936 Logger::info(
"Fallback byte_char_to_id_ map population: Found representations for " + std::to_string(bytes_found_in_vocab_fallback) +
937 " byte values in GGUF vocab (using <0xNN> or literal). Intended for Tiktoken BPE.");
943 Logger::info(
"[GENERAL_BYTE_FALLBACK] Space ' ' not found in byte_char_to_id_. Attempting to populate from vocab.");
944 int general_fallback_bytes_added = 0;
945 for (
int i = 0; i < 256; ++i) {
946 char current_char =
static_cast<char>(i);
952 std::stringstream ss_hex_repr;
953 ss_hex_repr <<
"<0x" << std::hex << std::setw(2) << std::setfill(
'0') << i <<
">";
954 std::string byte_token_str_repr = ss_hex_repr.str();
955 std::string literal_byte_char_str(1, current_char);
960 general_fallback_bytes_added++;
961 if (current_char ==
' ')
Logger::debug(
"[GENERAL_BYTE_FALLBACK] Found space as '" + byte_token_str_repr +
"' -> ID: " + std::to_string(it_hex->second));
966 general_fallback_bytes_added++;
967 if (current_char ==
' ')
Logger::debug(
"[GENERAL_BYTE_FALLBACK] Found space as literal '" + literal_byte_char_str +
"' -> ID: " + std::to_string(it_lit->second));
971 Logger::info(
"[GENERAL_BYTE_FALLBACK] Added " + std::to_string(general_fallback_bytes_added) +
972 " new entries to byte_char_to_id_ map. Final size: " + std::to_string(
byte_char_to_id_.size()));
974 Logger::warning(
"[GENERAL_BYTE_FALLBACK] Space ' ' still not found in byte_char_to_id_ after fallback scan!");
979     const std::string sp_space_token = "\xE2\x96\x81";
983       Logger::info("[GENERAL_BYTE_FALLBACK] SUCCESS: Found SentencePiece space token '" + sp_space_token +
984                    "' (ID: " + std::to_string(it_sp_space->second) + "). Mapped standard space ' ' to this ID.");
987       Logger::warning("[GENERAL_BYTE_FALLBACK] Space ' ' still not found in byte_char_to_id_ after fallback scan AND specific SP space check!");
1006   auto setup_special_token = [&](const std::string& name, int& id_field, std::string& str_field, const std::string& default_str_val) {
1007     if (id_field >= 0 && static_cast<size_t>(id_field) < id_to_token_.size()) {
1010       str_field = default_str_val;
1011       if (id_field != -1) {
1013             " from config is out of vocab bounds or invalid. Using default string: '" + default_str_val + "'.");
1018       if (id_field == -1 || (id_field >= 0 && static_cast<size_t>(id_field) >= id_to_token_.size())) {
1019         id_field = it->second;
1020         Logger::info("Set " + name + " token ID to " + std::to_string(id_field) + " based on default string '" + default_str_val + "'.");
1022       } else if (id_field != -1) {
1023         Logger::warning("Default string '" + default_str_val + "' for " + name + " token also not found in vocab.");
1045   Logger::info(std::string("Tokenizer successfully initialized from GGUFData. Final type: ") +
1144 std::vector<int> final_ids;
1145   std::string family_str_enc = "UNKNOWN";
1152   std::stringstream log_ss_main;
1153   log_ss_main << "[ENCODE] Encoding text: '" << text << "'"
1154               << " (add_bos=" << add_bos
1155               << ", add_eos=" << add_eos
1156               << ", family=" << family_str_enc
1157               << ", pre_tok_override=" << static_cast<int>(pre_tok_override)
1163     Logger::debug("[ENCODE] Using LLAMA3_TIKTOKEN (bpe_tokenize_to_ids) path.");
1167     if (this->bos_token_.empty() || text.rfind(this->bos_token_, 0) != 0) {
1170           " (text did not already start with it).");
1172       Logger::debug("[ENCODE Llama 3 Path] BOS token flag was true, but text already started with BOS string. Skipping explicit BOS ID addition.");
1177 final_ids.insert(final_ids.end(), token_ids.begin(), token_ids.end());
1186     Logger::debug("[ENCODE] Using LLAMA_SENTENCEPIECE (old SentencePiece/BPE logic) path.");
1190           "[ENCODE SPM Path] Using simplified merge-based tokenizer path (calling "
1191           "bpe_tokenize directly).");
1193       std::vector<std::string> bpe_pieces = this->bpe_tokenize(text);
1195           std::to_string(bpe_pieces.size()) + " pieces.");
1200         final_ids.insert(final_ids.begin(), this->bos_token_id_);
1209       Logger::debug("[ENCODE SPM Path] Final IDs (Simplified Merge Path): " +
1210                     std::to_string(final_ids.size()) + " tokens.");
1212       Logger::debug("[ENCODE SPM Path] Using GGUF score-based tokenizer path.");
1220 std::vector<std::pair<std::string, bool>> segments;
1221 std::string text_to_process = text;
1227         Logger::debug("[ENCODE SPM GGUF Path] Using DEFAULT pre-tokenization (split by special, BPE for non-specials).");
1230         Logger::debug("[ENCODE SPM GGUF Path] Using LLAMA_REGEX pre-tokenization.");
1232         Logger::warning("[ENCODE SPM GGUF Path] pre_tok_type_ is '" + this->pre_tok_type_ + "' or unset. Defaulting to WHITESPACE pre-tokenization for GGUF/SPM path.");
1236         method_to_use = pre_tok_override;
1239       std::string method_str_log;
1241       else method_str_log = "DEFAULT (Special Token Split or WHITESPACE Fallback)";
1242       Logger::debug("[ENCODE SPM GGUF Path] Effective pre-tokenization method: " + method_str_log);
1245 std::unordered_set<std::string> all_special_tokens_set;
1247 if (!pair.first.empty()) all_special_tokens_set.insert(pair.first);
1249       if (!this->bos_token_.empty()) all_special_tokens_set.insert(this->bos_token_);
1250       if (!this->eos_token_.empty()) all_special_tokens_set.insert(this->eos_token_);
1251       if (!this->unk_token_.empty()) all_special_tokens_set.insert(this->unk_token_);
1253       std::string special_pattern_str = "(";
1254       bool first_special = true;
1255       for (const std::string& st : all_special_tokens_set) {
1256         if (!first_special) special_pattern_str += "|";
1257         std::string escaped_st;
1259           if (strchr(".^$*+?()[{\\|", c)) escaped_st += '\\';
1262         special_pattern_str += escaped_st;
1263         first_special = false;
1265       special_pattern_str += ")";
1267       if (all_special_tokens_set.empty()) {
1268         Logger::debug("[ENCODE SPM GGUF Path] No special tokens defined for DEFAULT pre-tok. Treating whole text as one segment.");
1269         segments.push_back({text_to_process, false});
1271         Logger::debug("[ENCODE SPM GGUF Path] Splitting by special tokens regex: " + special_pattern_str);
1273           boost::regex special_regex(special_pattern_str);
1274           boost::sregex_iterator it(text_to_process.begin(), text_to_process.end(), special_regex);
1275           boost::sregex_iterator end;
1276           size_t last_pos = 0;
1278             boost::smatch match = *it;
1279             if (match.position() > last_pos) {
1280               segments.push_back({text_to_process.substr(last_pos, match.position() - last_pos), false});
1282             segments.push_back({match.str(), true});
1283             last_pos = match.position() + match.length();
1286           if (last_pos < text_to_process.length()) {
1287             segments.push_back({text_to_process.substr(last_pos), false});
1289         } catch (const boost::regex_error& e) {
1290           Logger::error("[ENCODE SPM GGUF Path] Regex error splitting by special tokens: " + std::string(e.what()) + ". Treating as single segment.");
1292           segments.push_back({text_to_process, false});
1296       boost::regex llama_segment_regex(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");
1297       Logger::debug("[ENCODE SPM GGUF Path] Using LLAMA_REGEX for pre-tokenization.");
1299         boost::sregex_iterator it(text_to_process.begin(), text_to_process.end(), llama_segment_regex);
1300         boost::sregex_iterator end;
1301         size_t last_pos = 0;
1303           boost::smatch match = *it;
1304           if (match.position() > last_pos) {
1305             segments.push_back({text_to_process.substr(last_pos, match.position() - last_pos), false});
1307           segments.push_back({match.str(), false});
1308           last_pos = match.position() + match.length();
1311         if (last_pos < text_to_process.length()) {
1312           segments.push_back({text_to_process.substr(last_pos), false});
1314       } catch (const boost::regex_error& e) {
1315         Logger::error("[ENCODE SPM GGUF Path] Regex error during LLAMA_REGEX splitting: " + std::string(e.what()) + ". Treating as single segment.");
1317         segments.push_back({text_to_process, false});
1320       Logger::debug("[ENCODE SPM GGUF Path] Using WHITESPACE pre-tokenization (or fallback).");
1321       std::string current_ws_segment;
1322       for (char c : text_to_process) {
1323         if (std::isspace(static_cast<unsigned char>(c))) {
1324           if (!current_ws_segment.empty()) {
1325             segments.push_back({current_ws_segment, false});
1326             current_ws_segment.clear();
1328           segments.push_back({{c}, false});
1330           current_ws_segment += c;
1333       if (!current_ws_segment.empty()) {
1334         segments.push_back({current_ws_segment, false});
1338     Logger::debug("[ENCODE SPM GGUF Path] Pre-tokenization resulted in " + std::to_string(segments.size()) + " segments.");
1340 std::vector<int> segment_ids;
1341     for (const auto& seg_pair : segments) {
1342       const std::string& segment_str = seg_pair.first;
1343       bool is_special = seg_pair.second;
1345       if (segment_str.empty()) continue;
1350           segment_ids.push_back(it->second);
1351           Logger::debug("[ENCODE SPM GGUF Path] Found special segment: '" + segment_str + "' -> ID: " + std::to_string(it->second));
1353           Logger::warning("[ENCODE SPM GGUF Path] Special segment '" + segment_str +
1354                           "' not in vocab. Using UNK ID: " + std::to_string(this->unk_token_id_));
1360         segment_ids.insert(segment_ids.end(), piece_ids.begin(), piece_ids.end());
1361         Logger::debug("[ENCODE SPM GGUF Path] BPE for non-special segment '" + segment_str + "' -> " + std::to_string(piece_ids.size()) + " IDs.");
1364     final_ids.insert(final_ids.end(), segment_ids.begin(), segment_ids.end());
1368       Logger::debug("[ENCODE SPM GGUF Path] Appended EOS token: " +
1371     Logger::debug("[ENCODE SPM GGUF Path] Final IDs (GGUF Score Path): " + std::to_string(final_ids.size()) + " tokens.");
1375     Logger::error("[ENCODE] Unknown or unsupported tokenizer family: " + family_str_enc + ". Cannot encode text.");
1386   Logger::debug("[ENCODE] Final IDs count (end of function): " + std::to_string(final_ids.size()));
1387   if (final_ids.empty() && !text.empty()) {
1388     Logger::warning("[ENCODE] Tokenization resulted in empty ID list for non-empty text: '" + text + "'");
1477     bool skip_special_tokens) const {
1478   Logger::debug("[decode_sentencepiece] Decoding using SentencePiece logic.");
1480   std::stringstream ss;
1481   bool first_token = true;
1482   const std::string sp_space_prefix = "\xE2\x96\x81";   // U+2581, SentencePiece word-boundary marker
1483   const std::string gpt2_space_prefix = "\xC4\xA0";     // U+0120 "Ġ", GPT-2 byte-level space marker
1485   for (int id : ids) {
1486     std::string token_str;
1487     bool is_special_or_invalid = false;
1490     if (skip_special_tokens) {
1492         is_special_or_invalid = true;
1496         is_special_or_invalid = true;
1502     if (id >= 0 && static_cast<size_t>(id) < id_to_token_.size()) {
1507         token_str = added_it->second;
1510       if (!skip_special_tokens) {
1511         token_str = "[INVALID_ID:" + std::to_string(id) + "]";
1515       is_special_or_invalid = true;
1520     if (token_str.length() == 6 && token_str.rfind("<0x", 0) == 0 && token_str[5] == '>') {
1522         std::string hex_val_str = token_str.substr(3, 2);
1523         int byte_val = std::stoi(hex_val_str, nullptr, 16);
1524         token_str = std::string(1, static_cast<char>(byte_val));
1525         Logger::debug("[decode_sentencepiece] Converted '<0x" + hex_val_str + ">' to char: " + std::to_string(byte_val));
1526       } catch (const std::exception& e) {
1527         Logger::warning("[decode_sentencepiece] Failed to parse hex from token: '" + token_str + "'. Error: " + e.what());
1532     if (token_str.empty() && !is_special_or_invalid) {
1539         is_special_or_invalid = true;
1543         if (!skip_special_tokens) {
1544           token_str = "[EMPTY_TOKEN_FOR_ID:" + std::to_string(id) + "]";
1546           is_special_or_invalid = true;
1550       if (!is_special_or_invalid) {
1551         Logger::warning("[decode_sentencepiece] Encountered empty token string for valid ID " + std::to_string(id) +
1552                         ". Using: '" + token_str + "'");
1557     if (token_str == sp_space_prefix || token_str == gpt2_space_prefix) {
1561         Logger::debug("[decode_sentencepiece] Ignored leading standalone space prefix token.");
1565         std::string current_output_check = ss.str();
1566         if (current_output_check.empty() || current_output_check.back() != ' ') {
1568           Logger::debug("[decode_sentencepiece] Added space for standalone prefix token mid-sequence.");
1570         first_token = false;
1575     if (!token_str.empty()) {
1576       bool starts_with_sp_prefix = (token_str.rfind(sp_space_prefix, 0) == 0);
1578       bool starts_with_gpt2_prefix = (!starts_with_sp_prefix && token_str.rfind(gpt2_space_prefix, 0) == 0);
1580       if (starts_with_sp_prefix) {
1581         std::string current_output = ss.str();
1582         if (!first_token && (current_output.empty() || current_output.back() != ' ')) {
1585         std::string content = token_str.substr(sp_space_prefix.length());
1587         size_t first_non_space = content.find_first_not_of(' ');
1588         if (std::string::npos != first_non_space) {
1589           content = content.substr(first_non_space);
1592         first_token = false;
1594       else if (starts_with_gpt2_prefix) {
1595         std::string current_output = ss.str();
1596         if (!first_token && (current_output.empty() || current_output.back() != ' ')) {
1599         std::string content = token_str.substr(gpt2_space_prefix.length());
1601         size_t first_non_space = content.find_first_not_of(' ');
1602         if (std::string::npos != first_non_space) {
1603           content = content.substr(first_non_space);
1606         first_token = false;
1610       first_token = false;
1629 const std::string& system_message,
1639 if (is_jinja_template) {
1640     Logger::info("[apply_chat_template] GGUF chat template detected as Jinja2-like.");
1642     Logger::info("[apply_chat_template] GGUF chat template detected as simple placeholder template.");
1647     Logger::info("[apply_chat_template] Using simple GGUF chat template (non-Jinja).");
1653     processed_template = replace_all(processed_template, "{{bos_token}}", bos_s);
1654     processed_template = replace_all(processed_template, "{{eos_token}}", eos_s);
1655     processed_template = replace_all(processed_template, "{{user_prompt}}", user_prompt);
1656     if (!system_message.empty()) {
1657       processed_template = replace_all(processed_template, "{{system_message}}", system_message);
1659       processed_template = replace_all(processed_template, "{{system_message}}", "");
1662     std::string snippet_to_log = processed_template.substr(0, std::min((size_t)100, processed_template.length()));
1663     Logger::debug(std::string("[apply_chat_template] Processed simple GGUF template. Snippet: ") + snippet_to_log);
1664     return processed_template;
1666   if (is_jinja_template) {
1667     Logger::warning("[apply_chat_template] GGUF chat template appears to be Jinja2, which is not fully supported by this C++ implementation. Falling back to hardcoded Llama 3 Instruct template. The model's intended GGUF chat template will be ignored.");
1669     Logger::info("[apply_chat_template] GGUF chat template not found or empty. Falling back to hardcoded Llama 3 Instruct template.");
1673   auto find_added_token_str_fallback = [&](const std::string& content,
1674                                            const std::string& fallback_value) -> std::string {
1677     if ((!this->added_tokens_.empty() || !this->token_to_id_.empty()) &&
            content.rfind("<", 0) == 0 && content.find('|') != std::string::npos && content.back() == '>') {
1678       Logger::warning("[apply_chat_template_fallback] Could not find special token string '" + content +
1679                       "' in added_tokens_ or vocab. Using default/fallback string: '" + fallback_value + "'");
1681     return fallback_value;
1687   std::string start_header_s_fallback = find_added_token_str_fallback("<|start_header_id|>", "<|start_header_id|>");
1688   std::string end_header_s_fallback = find_added_token_str_fallback("<|end_header_id|>", "<|end_header_id|>");
1689   std::string eot_s_fallback = find_added_token_str_fallback("<|eot_id|>", "<|eot_id|>");
1691   std::string system_role_name = "system";
1692   std::string user_role_name = "user";
1693   std::string assistant_role_name = "assistant";
1695 std::stringstream ss;
1696 ss << bos_s_fallback;
1697 if (!system_message.empty()) {
1698     ss << start_header_s_fallback << system_role_name << end_header_s_fallback << "\n\n" << system_message << eot_s_fallback;
1700   ss << start_header_s_fallback << user_role_name << end_header_s_fallback << "\n\n" << user_prompt << eot_s_fallback;
1701   ss << start_header_s_fallback << assistant_role_name << end_header_s_fallback << "\n\n";
1703   Logger::info("[apply_chat_template] Applied hardcoded Llama 3 Instruct-like chat template as fallback. Prompt snippet: " + ss.str().substr(0, 100));
1709 const std::string& vocab_path,
1710 std::unordered_map<std::string, int>& token_to_id_map,
1711 std::vector<std::string>& id_to_token_vec) {
1712 token_to_id_map.clear();
1713 id_to_token_vec.clear();
1716 std::ifstream file(vocab_path);
1717 if (!file.is_open()) {
1718     throw std::runtime_error("Failed to open vocabulary file: " + vocab_path);
1725     if (vocab_json.contains("model") && vocab_json["model"].is_object() &&
1726         vocab_json["model"].contains("vocab") && vocab_json["model"]["vocab"].is_object()) {
1727       Logger::info("load_vocab_from_json: Detected HuggingFace tokenizer.json format.");
1728       const auto& vocab = vocab_json["model"]["vocab"];
1732       for (auto it = vocab.begin(); it != vocab.end(); ++it) {
1733         int id = it.value().get<int>();
1735           Logger::warning("load_vocab_from_json: Skipping token with negative ID: " + it.key());
1738         if (static_cast<size_t>(id) > max_id) {
1739           max_id = static_cast<size_t>(id);
1742       id_to_token_vec.resize(max_id + 1, "<unk>");
1745       for (auto it = vocab.begin(); it != vocab.end(); ++it) {
1746         std::string token = it.key();
1747         int id = it.value().get<int>();
1748         if (id < 0) continue;
1750         token_to_id_map[token] = id;
1751         if (static_cast<size_t>(id) < id_to_token_vec.size()) {
1752           id_to_token_vec[id] = token;
1755           Logger::warning("load_vocab_from_json: ID out of bounds during vocab population: " + std::to_string(id));
1760       if (vocab_json.contains("added_tokens") &&
1761           vocab_json["added_tokens"].is_array()) {
1762         const auto& added_tokens_json = vocab_json["added_tokens"];
1763         Logger::info("load_vocab_from_json: Processing " + std::to_string(added_tokens_json.size()) + " added_tokens.");
1764         for (const auto& token_obj : added_tokens_json) {
1765           if (token_obj.contains("content") && token_obj.contains("id")) {
1766             std::string token_content = token_obj["content"];
1767             int token_id = token_obj["id"];
1770               Logger::warning("load_vocab_from_json: Skipping added_token with negative ID: " + token_content);
1775             token_to_id_map[token_content] = token_id;
1779             if (static_cast<size_t>(token_id) >= id_to_token_vec.size()) {
1780               id_to_token_vec.resize(token_id + 1, "<unk>");
1782             id_to_token_vec[token_id] = token_content;
1789             Logger::debug("load_vocab_from_json: Processed added_token: '" + token_content + "' with ID " +
1790                           std::to_string(token_id));
1795     } else if (vocab_json.is_object()) {
1796       Logger::info("load_vocab_from_json: Detected plain vocabulary format (direct map).");
1798       for (auto it = vocab_json.begin(); it != vocab_json.end(); ++it) {
1799         int id = it.value().get<int>();
1800         if (id < 0) continue;
1801         if (static_cast<size_t>(id) > max_id) {
1802           max_id = static_cast<size_t>(id);
1805       id_to_token_vec.resize(max_id + 1, "<unk>");
1807       for (auto it = vocab_json.begin(); it != vocab_json.end(); ++it) {
1808         std::string token = it.key();
1809         int id = it.value().get<int>();
1811           Logger::warning("load_vocab_from_json: Skipping token with negative ID: " + token);
1814         token_to_id_map[token] = id;
1815         if (static_cast<size_t>(id) < id_to_token_vec.size()) {
1816           id_to_token_vec[id] = token;
1825       throw std::runtime_error("load_vocab_from_json: Vocabulary JSON has an unsupported format.");
1828     for (size_t i = 0; i < id_to_token_vec.size(); ++i) {
1829       if (id_to_token_vec[i].empty() || id_to_token_vec[i] == "<unk>") {
1832           id_to_token_vec[i] = added_it->second;
1833         } else if (id_to_token_vec[i].empty()) {
1834           id_to_token_vec[i] = "<missing_id_" + std::to_string(i) + ">";
1839     Logger::info("load_vocab_from_json: Loaded vocabulary with " +
1840                  std::to_string(token_to_id_map.size()) + " unique token strings and " +
1841                  std::to_string(id_to_token_vec.size()) + " ID entries.");
1848   } catch (const json::exception& e) {
1849     throw std::runtime_error("Error parsing vocabulary JSON from " + vocab_path + ": " + e.what());
1850   } catch (const std::exception& e) {
1851     throw std::runtime_error("Error loading vocabulary from " + vocab_path + ": " + std::string(e.what()));
1862 std::ifstream file(tokenizer_json_path);
1863 if (!file.is_open()) {
1864     throw std::runtime_error("load_bpe_merges_from_json: Failed to open BPE merges file: " + tokenizer_json_path);
1874     if (model_json.contains("model") && model_json["model"].is_object()) {
1875       const auto& model_section = model_json["model"];
1876       if (model_section.contains("merges") && model_section["merges"].is_array()) {
1877         Logger::info("load_bpe_merges_from_json: Detected HuggingFace tokenizer.json format with BPE merges from: " + tokenizer_json_path);
1878         const auto& merges = model_section["merges"];
1880         for (const auto& merge_entry_json : merges) {
1881           if (merge_entry_json.is_string()) {
1882             std::string merge_entry = merge_entry_json.get<std::string>();
1883             size_t space_pos = merge_entry.find(' ');
1886             if (space_pos != std::string::npos && space_pos > 0 && space_pos < merge_entry.length() - 1) {
1887               std::string first = merge_entry.substr(0, space_pos);
1888               std::string second = merge_entry.substr(space_pos + 1);
1890               std::string pair_key = first + second;
1893               Logger::warning("load_bpe_merges_from_json: Skipping malformed merge rule: '" + merge_entry + "' from " + tokenizer_json_path);
1896             Logger::warning("load_bpe_merges_from_json: Merge entry is not a string, skipping. File: " + tokenizer_json_path);
1901         Logger::warning("load_bpe_merges_from_json: HuggingFace format detected, but no 'model.merges' array found in model section of: " + tokenizer_json_path);
1905     else if (model_json.contains("merges") && model_json["merges"].is_array()) {
1906       Logger::info("load_bpe_merges_from_json: Detected simple top-level 'merges' array format in: " + tokenizer_json_path);
1907       const auto& merges = model_json["merges"];
1909       for (const auto& merge_entry_json : merges) {
1910         if (merge_entry_json.is_string()) {
1911           std::string merge_entry = merge_entry_json.get<std::string>();
1912           size_t space_pos = merge_entry.find(' ');
1913           if (space_pos != std::string::npos && space_pos > 0 && space_pos < merge_entry.length() - 1) {
1914             std::string first = merge_entry.substr(0, space_pos);
1915             std::string second = merge_entry.substr(space_pos + 1);
1916             std::string pair_key = first + second;
1919             Logger::warning("load_bpe_merges_from_json: Skipping malformed merge rule from top-level array: '" + merge_entry + "' from " + tokenizer_json_path);
1922             Logger::warning("load_bpe_merges_from_json: Merge entry in top-level array is not a string, skipping. File: " + tokenizer_json_path);
1927       throw std::runtime_error(
1928           "load_bpe_merges_from_json: Unsupported BPE model format: no 'model.merges' or top-level 'merges' array found in '" + tokenizer_json_path + "'");
1932       Logger::warning("load_bpe_merges_from_json: No BPE merges were loaded from the file: " + tokenizer_json_path);
1935           " BPE merges with ranks from " + tokenizer_json_path);
1938   } catch (const json::exception& e) {
1939     throw std::runtime_error("Error parsing BPE merges JSON from " + tokenizer_json_path + ": " + e.what());
1940   } catch (const std::exception& e) {
1941     throw std::runtime_error("An unexpected error occurred while loading BPE merges from " + tokenizer_json_path + ": " + std::string(e.what()));
1976   Logger::debug("[Original bpe_tokenize for SentencePiece] Entered. bpe_merges_ size: " + std::to_string(bpe_merges_.size()));
1977   std::vector<std::string> all_final_tokens;
1978   const std::string sp_space_prefix = "\xE2\x96\x81";
1980   std::vector<std::string> pieces;
1981   std::string current_piece;
1982   bool last_char_was_space = true;
1984   for (char c : text) {
1985     if (std::isspace(static_cast<unsigned char>(c))) {
1986       if (!current_piece.empty()) {
1987         pieces.push_back(current_piece);
1988         current_piece.clear();
1990       pieces.push_back(std::string(1, c));
1991       last_char_was_space = true;
1994       last_char_was_space = false;
1997   if (!current_piece.empty()) {
1998     pieces.push_back(current_piece);
2001   Logger::debug("[Original bpe_tokenize for SentencePiece] Split text into " + std::to_string(pieces.size()) + " pieces (words/spaces).");
2003   bool next_word_needs_prefix = true;
2005   for (const std::string& piece : pieces) {
2006     if (piece.empty()) continue;
2008     bool piece_is_whitespace = std::all_of(piece.begin(), piece.end(),
2009                                            [](char c) { return std::isspace(static_cast<unsigned char>(c)); });
2011     if (piece_is_whitespace) {
2012       next_word_needs_prefix = true;
2013       Logger::debug("[Original bpe_tokenize for SentencePiece] Piece '" + piece + "' is whitespace. Setting prefix flag.");
2017     std::string word_to_process = piece;
2018     if (next_word_needs_prefix) {
2019       word_to_process = sp_space_prefix + word_to_process;
2020       Logger::debug("[Original bpe_tokenize for SentencePiece] Prefixed word: '" + piece + "' -> '" + word_to_process + "'");
2021       next_word_needs_prefix = false;
2023       Logger::debug("[Original bpe_tokenize for SentencePiece] Processing word without prefix: '" + word_to_process + "'");
2026     std::vector<std::string> chars;
2027     for (size_t i = 0; i < word_to_process.size();) {
2028       size_t bytes = unicode_char_len(word_to_process[i]);
2029       if (i + bytes <= word_to_process.size()) {
2030         chars.push_back(word_to_process.substr(i, bytes));
2032         Logger::warning("[Original bpe_tokenize for SentencePiece] Invalid UTF-8 near: '" + word_to_process.substr(i) + "'");
2033         chars.push_back(word_to_process.substr(i, 1));
2039     if (chars.empty()) {
2040       Logger::warning("[Original bpe_tokenize for SentencePiece] Word '" + word_to_process + "' produced no chars.");
2044     bool changes = true;
2045     while (changes && chars.size() > 1) {
2047       int best_rank = std::numeric_limits<int>::max();
2050       for (size_t i = 0; i < chars.size() - 1; ++i) {
2051         std::string pair = chars[i] + chars[i + 1];
2053         if (it != bpe_merges_.end() && it->second < best_rank) {
2054           best_rank = it->second;
2060       std::string merged = chars[best_i] + chars[best_i + 1];
2061       chars[best_i] = merged;
2062       chars.erase(chars.begin() + best_i + 1);
2064       Logger::debug("[Original bpe_tokenize for SentencePiece] Applied merge: '" + merged + "' with rank " +
2065                     std::to_string(best_rank));
2068     all_final_tokens.insert(all_final_tokens.end(), chars.begin(), chars.end());
2071   Logger::debug("[Original bpe_tokenize for SentencePiece] Final token count: " + std::to_string(all_final_tokens.size()));
2072   return all_final_tokens;
2095     bool add_bos_token_param,
2096     bool add_eos_token_param,
2097     bool ignore_merges_param) const {
2098   Logger::debug(std::string("[bpe_tokenize_to_ids] Starting Tiktoken BPE tokenization for text length: ") + std::to_string(text.length()) +
2099                 ", add_bos=" + std::to_string(add_bos_token_param) +
2100                 ", add_eos=" + std::to_string(add_eos_token_param) +
2101                 ", ignore_merges=" + std::to_string(ignore_merges_param));
2103 std::vector<int> output_ids;
2105   if (add_bos_token_param) {
2107       Logger::warning("[bpe_tokenize_to_ids] BOS token requested but bos_token_id_ is -1.");
2117   const std::string tiktoken_pattern_str =
2118       R"(<\|[^|]+\||[[:alnum:]]+|\.(?![<|])|[^\s<|]+|\s+)";
2121   const boost::xpressive::sregex tiktoken_pattern_ = boost::xpressive::sregex::compile(
2122       tiktoken_pattern_str,
2123       boost::xpressive::regex_constants::icase
2126 size_t current_idx = 0;
2127 while (current_idx < text.length()) {
2128     bool special_match_found = false;
2129     if (!sorted_special_tokens.empty()) {
2130       for (const auto& special_pair : sorted_special_tokens) {
2131         const std::string& special_text = special_pair.first;
2132         int special_id = special_pair.second;
2133         if (text.compare(current_idx, special_text.length(), special_text) == 0) {
2134           output_ids.push_back(special_id);
2135           Logger::debug("[bpe_tokenize_to_ids] Matched special token: '" + special_text + "' -> ID: " + std::to_string(special_id));
2136           current_idx += special_text.length();
2137           special_match_found = true;
2143     if (special_match_found) {
2147     if (current_idx >= text.length()) break;
2149     std::string remaining_text_view_str = text.substr(current_idx);
2150     boost::xpressive::smatch word_match;
2152     if (!boost::xpressive::regex_search(remaining_text_view_str, word_match, tiktoken_pattern_, boost::xpressive::regex_constants::match_continuous)) {
2153       Logger::debug(std::string("[bpe_tokenize_to_ids] No more regex-matchable words at pos ") + std::to_string(current_idx) + ". Remainder: '" + remaining_text_view_str + "'");
2154       if (!remaining_text_view_str.empty()) {
2155         Logger::warning(std::string("[bpe_tokenize_to_ids] Regex could not process remainder. Processing byte-by-byte: '") + remaining_text_view_str + "'");
2156         for (char c : remaining_text_view_str) {
2157           std::string byte_str(1, c);
2160             output_ids.push_back(it->second);
2166             Logger::warning(std::string("[bpe_tokenize_to_ids] Unrecognized byte '") + byte_str + std::string("' replaced with UNK."));
2168             Logger::error(std::string("[bpe_tokenize_to_ids] Unrecognized byte '") + byte_str + std::string("' and no UNK token defined. Skipping."));
2173       current_idx = text.length();
2177     std::string original_word = word_match.str(0);
2179     if (original_word.empty()) {
2180       Logger::warning("[bpe_tokenize_to_ids] Regex search succeeded but matched an empty string. Advancing one char from pos " + std::to_string(current_idx));
2181       size_t advance_len = unicode_char_len(text[current_idx]);
2182       if (advance_len == 0) advance_len = 1;
2184       std::string problematic_char_str = text.substr(current_idx, advance_len);
2185       auto it_char = token_to_id_.find(problematic_char_str);
2187         output_ids.push_back(it_char->second);
2188       } else if (advance_len == 1 && byte_char_to_id_.count(problematic_char_str[0])) {
2192         Logger::debug("[bpe_tokenize_to_ids] Added UNK for unmatchable leading char after empty regex match: '" + problematic_char_str + "'");
2194       current_idx += advance_len;
2199     auto direct_match_it = token_to_id_.find(original_word);
2201       output_ids.push_back(direct_match_it->second);
2202       Logger::debug("[bpe_tokenize_to_ids] Regex-matched word '" + original_word + "' is a direct token ID: " + std::to_string(direct_match_it->second));
2203       current_idx += original_word.length();
2207     Logger::debug(std::string("[bpe_tokenize_to_ids] Processing regex-derived word for BPE: '") + original_word + "'");
2211     std::string word_to_process = original_word;
2212     if (!word_to_process.empty() && word_to_process[0] == ' ') {
2213       if (word_to_process.length() > 1) {
2218       Logger::debug(std::string("[bpe_tokenize_to_ids] Converted leading space. Word for BPE: '") + word_to_process + "'");
2221     if (ignore_merges_param) {
2224         output_ids.push_back(it_direct->second);
2225         Logger::debug(std::string("[bpe_tokenize_to_ids] Found word directly (ignore_merges): '") + word_to_process + "' -> ID: " + std::to_string(it_direct->second));
2226         current_idx += original_word.length();
2229         Logger::debug(std::string("[bpe_tokenize_to_ids] ignore_merges=true, but word '") + word_to_process + "' not in vocab directly. Proceeding with BPE char split (unusual for tiktoken special words).");
2233     std::vector<llm_symbol> symbols;
2234     symbols.reserve(word_to_process.length());
2236     while (offset < word_to_process.length()) {
2237       size_t char_len = unicode_char_len(word_to_process[offset]);
2238       if (offset + char_len > word_to_process.length()) {
2239         Logger::error("[bpe_tokenize_to_ids] Invalid UTF-8 sequence in word: '" + word_to_process + "' at offset " + std::to_string(offset));
2245       symbols.emplace_back(llm_symbol{-1, -1, word_to_process.data() + offset, char_len});
2249     if (symbols.empty() && !word_to_process.empty()) {
2250       Logger::warning("[bpe_tokenize_to_ids] Word '" + word_to_process + "' resulted in no symbols. Skipping this word's BPE.");
2254       current_idx += original_word.length();
2257     if (symbols.empty() && word_to_process.empty()) {
2258       current_idx += original_word.length();
2262     for (size_t i = 0; i < symbols.size(); ++i) {
2263       symbols[i].prev = (i > 0) ? (i - 1) : -1;
2264       symbols[i].next = (i < symbols.size() - 1) ? (i + 1) : -1;
2268     std::priority_queue<std::pair<int, int>,
2269                         std::vector<std::pair<int, int>>,
2270                         std::greater<std::pair<int, int>>> merge_queue;
2272     for (size_t i = 0; i + 1 < symbols.size(); ++i) {
2277     while (!merge_queue.empty()) {
2278       auto top = merge_queue.top();
2281       int rank = top.first;
2282       int p1_idx = top.second;
2284       if (symbols[p1_idx].n == 0) continue;
2285       int p2_idx = symbols[p1_idx].next;
2286       if (p2_idx == -1 || symbols[p2_idx].n == 0) continue;
2289       symbols[p1_idx].n += symbols[p2_idx].n;
2290       symbols[p2_idx].n = 0;
2291       symbols[p1_idx].next = symbols[p2_idx].next;
2292       if (symbols[p1_idx].next != -1) {
2293         symbols[symbols[p1_idx].next].prev = p1_idx;
2298       if (symbols[p1_idx].prev != -1) {
2301       if (symbols[p1_idx].next != -1) {
2306     std::vector<int> final_word_ids;
2307     if (!symbols.empty()) {
2308       for (int i = 0; i != -1; i = symbols[i].next) {
2310         if (symbol.n == 0) continue;
2312         std::string s(symbol.text, symbol.n);
2313         std::string lookup_s = s;
2318           final_word_ids.push_back(token_it->second);
2320           Logger::warning(std::string("[bpe_tokenize_to_ids] Symbol not found in vocab: '") + lookup_s + "'. Attempting byte-level tokenization.");
2321           for (char c_char : lookup_s) {
2324             final_word_ids.push_back(byte_map_it->second);
2329             Logger::error(std::string("[bpe_tokenize_to_ids] Unhandled char '") + std::string(1, c_char) + "' and no UNK token ID.");
2335     } else if (!word_to_process.empty()) {
2336       Logger::warning(std::string("[bpe_tokenize_to_ids] Word '") + word_to_process + std::string("' yielded no final symbols. UNK if available."));
2340     if (final_word_ids.empty() && !original_word.empty()) {
2341       Logger::warning(std::string("[bpe_tokenize_to_ids] Word '") + original_word + "' resulted in no tokens. Adding UNK.");
2344     output_ids.insert(output_ids.end(), final_word_ids.begin(), final_word_ids.end());
2346 current_idx += original_word.length();
2349 if (add_eos_token_param) {
2351       Logger::warning("[bpe_tokenize_to_ids] EOS token requested but eos_token_id_ is -1.");
2357   Logger::debug("[bpe_tokenize_to_ids] Finished Tiktoken BPE tokenization. Total IDs: " + std::to_string(output_ids.size()));
2364     const std::vector<llm_symbol>& symbols,
2366     std::priority_queue<std::pair<int, int>,
2367                         std::vector<std::pair<int, int>>,
2368                         std::greater<std::pair<int, int>>>& work_queue) const {
2369   if (first_symbol_idx < 0 || static_cast<size_t>(first_symbol_idx) >= symbols.size()) {
2370     Logger::error(std::string("[ADD_BIGRAM_REFACTORED] Invalid first_symbol_idx: ") + std::to_string(first_symbol_idx));
2374   const llm_symbol& s1 = symbols[first_symbol_idx];
2377   if (s2_idx < 0 || static_cast<size_t>(s2_idx) >= symbols.size() || s2_idx <= first_symbol_idx) {
2382   if (s1.n == 0 || s2.n == 0) {
2386   std::string token_left_str(s1.text, s1.n);
2387   std::string token_right_str(s2.text, s2.n);
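// Several spellings of the candidate pair are tried against bpe_merges_ below, because merge
// tables loaded from different sources key their rules differently (plain concatenation vs. the
// two halves separated by a space, with or without the Ġ/▁ space marker). The lowest rank among
// the attempted spellings wins and is pushed onto the work queue for this position.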
2389   std::vector<std::string> merge_attempts;
2393     merge_attempts.push_back(BPE_SPACE_CHAR + " " + token_right_str);
2398   merge_attempts.push_back(token_left_str + token_right_str);
2399   Logger::debug("[ADD_BIGRAM] Attempting standard merge: '" + (token_left_str + token_right_str) + "'");
2403     merge_attempts.push_back(token_left_str + " " + token_right_str);
2404     Logger::debug("[ADD_BIGRAM] Attempting Ġword+space merge: '" + (token_left_str + " " + token_right_str) + "'");
2408   if (token_left_str.length() == 2 && token_right_str.length() == 1) {
2409     std::string attempt = token_left_str.substr(0, 1) + " " + token_right_str;
2410     merge_attempts.push_back(attempt);
2411     Logger::debug("[ADD_BIGRAM] Attempting char split merge: '" + attempt + "'");
2414   int best_rank = std::numeric_limits<int>::max();
2415   bool found_merge = false;
2416   std::string matched_merge;
2418   for (const auto& merge_attempt : merge_attempts) {
2420     if (it != bpe_merges_.end() && it->second < best_rank) {
2421       best_rank = it->second;
2423       matched_merge = merge_attempt;
2428     work_queue.push({best_rank, first_symbol_idx});
2429     Logger::debug("[ADD_BIGRAM] Found merge: '" + matched_merge + "' with rank " + std::to_string(best_rank));
2431     Logger::debug("[ADD_BIGRAM] No valid merges found for attempts with left='" + token_left_str +
2432                   "' right='" + token_right_str + "'");