TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
tokenizer.cpp File Reference
#include "tokenizer.h"
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <nlohmann/json.hpp>
#include <queue>
#include <boost/regex.hpp>
#include <boost/xpressive/xpressive.hpp>
#include <sstream>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <string>
#include <limits>
#include <utility>
#include <functional>
#include <filesystem>
#include "logger.h"

Typedefs

using json = nlohmann::json
 

Functions

bool is_numeric (const std::string &s)
 
static std::unordered_map< std::string, int > generate_bpe_merges_from_vocab_scores (const std::vector< std::string > &id_to_token, const std::vector< float > &token_scores)
 
static std::string replace_all (std::string str, const std::string &from, const std::string &to)
 
static std::vector< std::pair< std::string, int > > sort_tokens_by_length_desc (const std::unordered_map< std::string, int > &tokens_map)
 

Variables

const std::string BPE_SPACE_CHAR = "\xC4\xA0"
 

Typedef Documentation

◆ json

using json = nlohmann::json

Definition at line 28 of file tokenizer.cpp.

Function Documentation

◆ generate_bpe_merges_from_vocab_scores()

static std::unordered_map< std::string, int > generate_bpe_merges_from_vocab_scores (const std::vector< std::string > & id_to_token,
                                                                                     const std::vector< float > & token_scores)

Definition at line 422 of file tokenizer.cpp.

{
    std::unordered_map<std::string, int> generated_merges;

    if (token_scores.empty() || id_to_token.empty()) {
        Logger::warning("Cannot generate BPE merges: empty scores or vocabulary");
        return generated_merges;
    }

    Logger::info("Generating BPE merges from vocabulary and scores for older Llama models...");

    // Create a list of tokens with their scores, sorted by score (higher score = higher priority)
    std::vector<std::pair<float, std::string>> scored_tokens;
    for (size_t id = 0; id < id_to_token.size(); ++id) {
        if (id < token_scores.size()) {
            const std::string& token = id_to_token[id];
            // Skip special tokens and single characters
            if (token.length() > 1 &&
                token.find("<") == std::string::npos &&
                token.find(">") == std::string::npos &&
                token != "▁") { // Skip SentencePiece space token
                scored_tokens.emplace_back(token_scores[id], token);
            }
        }
    }

    // Sort by score (descending - higher scores first)
    std::sort(scored_tokens.begin(), scored_tokens.end(),
              [](const auto& a, const auto& b) { return a.first > b.first; });

    Logger::info("Found " + std::to_string(scored_tokens.size()) + " candidate tokens for merge generation");

    // Generate merges by finding tokens that can be decomposed into pairs
    int merge_rank = 0;
    std::unordered_set<std::string> processed_tokens;

    for (const auto& [score, token] : scored_tokens) {
        if (processed_tokens.count(token)) continue;

        // Try to find the best split point for this token
        std::string best_left, best_right;
        float best_combined_score = -std::numeric_limits<float>::infinity();

        // Try all possible split points
        for (size_t split = 1; split < token.length(); ++split) {
            std::string left = token.substr(0, split);
            std::string right = token.substr(split);

            // Check if both parts exist in vocabulary (linear scan over id_to_token)
            auto left_it = std::find(id_to_token.begin(), id_to_token.end(), left);
            auto right_it = std::find(id_to_token.begin(), id_to_token.end(), right);

            if (left_it != id_to_token.end() && right_it != id_to_token.end()) {
                // Both parts exist, calculate combined score
                size_t left_id = std::distance(id_to_token.begin(), left_it);
                size_t right_id = std::distance(id_to_token.begin(), right_it);
                float left_score = (left_id < token_scores.size()) ? token_scores[left_id] : 0.0f;
                float right_score = (right_id < token_scores.size()) ? token_scores[right_id] : 0.0f;
                float combined_score = left_score + right_score;

                if (combined_score > best_combined_score) {
                    best_combined_score = combined_score;
                    best_left = left;
                    best_right = right;
                }
            }
        }

        // If we found a valid decomposition, add it as a merge rule
        if (!best_left.empty() && !best_right.empty()) {
            std::string merge_key = best_left + best_right;
            if (generated_merges.find(merge_key) == generated_merges.end()) {
                generated_merges[merge_key] = merge_rank++;
                Logger::debug("Generated merge: '" + best_left + "' + '" + best_right + "' -> '" + token + "' (rank " + std::to_string(merge_rank - 1) + ")");
            }
        }

        processed_tokens.insert(token);

        // Limit the number of merges to prevent excessive computation
        if (merge_rank >= 50000) {
            Logger::info("Reached maximum merge limit (50000), stopping generation");
            break;
        }
    }

    Logger::info("Generated " + std::to_string(generated_merges.size()) + " BPE merge rules from vocabulary and scores");
    return generated_merges;
}

References Logger::debug(), Logger::info(), and Logger::warning().

Referenced by Tokenizer::Tokenizer().
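
For intuition, here is a hypothetical call on a toy vocabulary. The tokens and scores below are invented for illustration, and since the function is file-static, such a call would have to live inside tokenizer.cpp:

std::vector<std::string> id_to_token = {"h", "e", "l", "o", "he", "ll", "hell", "hello"};
std::vector<float> token_scores      = {-1.0f, -1.0f, -1.0f, -1.0f, -2.0f, -2.5f, -3.0f, -4.0f};

auto merges = generate_bpe_merges_from_vocab_scores(id_to_token, token_scores);
// Candidates are visited in score order: "he" (rank 0, from "h" + "e"),
// "ll" (rank 1), "hell" (rank 2, from "he" + "ll"), "hello" (rank 3, from "hell" + "o").
// Each merge key is the concatenation of its two parts, i.e. the token itself.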

◆ is_numeric()

bool is_numeric ( const std::string &  s)

Definition at line 36 of file tokenizer.cpp.

{
    if (s.empty()) {
        return false; // An empty string is not considered numeric
    }
    for (char c : s) {
        if (!std::isdigit(static_cast<unsigned char>(c))) {
            return false; // Found a non-digit character
        }
    }
    return true; // All characters are digits
}
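
A few illustrative calls (hypothetical, not from the source):

is_numeric("12345"); // true  - every character is a decimal digit
is_numeric("12a45"); // false - contains a non-digit
is_numeric("");      // false - empty strings are rejected up front
is_numeric("-42");   // false - the sign character is not a digit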

◆ replace_all()

static std::string replace_all (std::string str,
                                const std::string & from,
                                const std::string & to)

Definition at line 1619 of file tokenizer.cpp.

{
    size_t start_pos = 0;
    while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
        str.replace(start_pos, from.length(), to);
        start_pos += to.length(); // Skip past the replacement so the loop cannot spin forever when 'to' contains 'from'
    }
    return str;
}

Referenced by Tokenizer::apply_chat_template().
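
A minimal sketch of the kind of substitution apply_chat_template() relies on; the template string and "{user_message}" placeholder are invented for illustration:

std::string tmpl = "<|user|>\n{user_message}</s>\n<|assistant|>\n";
std::string prompt = replace_all(tmpl, "{user_message}", "Hello!");
// prompt == "<|user|>\nHello!</s>\n<|assistant|>\n"

Note that str is taken by value, so the caller's template string is left unmodified.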

◆ sort_tokens_by_length_desc()

static std::vector< std::pair< std::string, int > > sort_tokens_by_length_desc (const std::unordered_map< std::string, int > & tokens_map)

Definition at line 2080 of file tokenizer.cpp.

{
    std::vector<std::pair<std::string, int>> sorted_tokens;
    for (const auto& pair : tokens_map) {
        sorted_tokens.push_back(pair);
    }
    // Sort by token length, longest first (std::sort is not stable, so
    // equal-length tokens end up in no particular order)
    std::sort(sorted_tokens.begin(), sorted_tokens.end(),
              [](const auto& a, const auto& b) {
                  return a.first.length() > b.first.length();
              });
    return sorted_tokens;
}

Referenced by Tokenizer::bpe_tokenize_to_ids().
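
For intuition, a hypothetical call on a toy token map (entries invented), ordered the way a greedy longest-match tokenizer would want them:

std::unordered_map<std::string, int> tokens = {{"a", 0}, {"ab", 1}, {"abc", 2}};
auto sorted = sort_tokens_by_length_desc(tokens);
// sorted == {{"abc", 2}, {"ab", 1}, {"a", 0}}: longest first, so a greedy
// longest-match pass can try "abc" before falling back to "ab" and then "a".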

Variable Documentation

◆ BPE_SPACE_CHAR

const std::string BPE_SPACE_CHAR = "\xC4\xA0"
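
"\xC4\xA0" is the UTF-8 encoding of U+0120 (Ġ), the marker that GPT-2-style BPE vocabularies prepend to tokens that begin a new word. A minimal sketch of mapping it back to a space on decode (illustrative, not taken from tokenizer.cpp):

std::string piece = BPE_SPACE_CHAR + std::string("world");   // token text "Ġworld"
std::string text  = replace_all(piece, BPE_SPACE_CHAR, " "); // text == " world"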