TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
tokenizer.h
1#pragma once
2
3#include <limits>
4#include <map>
5#include <memory>
6#include <string>
7#include <unordered_map>
8#include <unordered_set>
9#include <utility>
10#include <vector>
11#include <queue>
12#include <functional>
13
14#include "gguf_structs.h"
15#include "logger.h"
16#include "model.h"
17
18
19
20// Helper struct to represent segments during BPE tokenization
21struct llm_symbol {
22 using index = int;
23 index prev; // index of the previous symbol in the linked list
24 index next; // index of the next symbol in the linked list
25 const char * text; // pointer to the start of the symbol's text in the original string
26 size_t n; // length of the symbol's text
27};
28
29// Helper struct representing a potential byte pair merge
30struct llm_bigram_bpe {
31 // Comparator for the priority queue: lowest rank pops first; on equal rank, the lower left index wins
32 struct comparator {
33 bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
34 // Prioritize lower rank (higher priority in BPE merges)
35 // If ranks are equal, prioritize the one starting earlier (lower left index)
36 return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
37 }
38 };
39
40 using queue_storage = std::vector<llm_bigram_bpe>;
41 // Define a min-priority queue based on the comparator
42 using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
43
44 llm_symbol::index left; // index of the left symbol in the pair
45 llm_symbol::index right; // index of the right symbol in the pair
46 std::string text; // the merged text of the pair (for checking against merges map)
47 int rank; // rank of the merge (lower is better)
48 size_t size; // size of the merged text (for validation)
49};
50
51
52
61class Tokenizer {
62 public:
66 enum class PreTokenizeMethod {
67 DEFAULT,
68 LLAMA_REGEX,
69 };
70
71 enum class Type {
72 UNKNOWN,
75 };
76
83 Tokenizer(const std::string& vocab_path, const std::string& model_path, const ModelConfig& config);
84
90 explicit Tokenizer(const GGUFData& gguf_data, const ModelConfig& config);
91
97 std::vector<std::string> tokenize(const std::string& text) const;
98
99
105 std::vector<std::string> ids_to_tokens(const std::vector<int>& ids) const;
106
112 std::string detokenize(const std::vector<std::string>& tokens) const;
113
122 std::vector<int> encode(
123 const std::string& text, bool add_bos = true, bool add_eos = false,
124 PreTokenizeMethod pre_tok_override = PreTokenizeMethod::DEFAULT) const;
125
132 std::string decode(const std::vector<int>& ids,
133 bool skip_special_tokens = true) const;
134
142 std::string apply_chat_template(const std::string& user_prompt,
143 const std::string& system_message,
144 const ModelConfig& config) const;
145
150 int vocab_size() const;
151
157 bool is_added_token(int id) const;
158
163 int bos_token_id() const { return bos_token_id_; }
164
169 int eos_token_id() const { return eos_token_id_; }
170
175 int pad_token_id() const { return pad_token_id_; }
176
181 int unk_token_id() const { return unk_token_id_; }
182
183 const std::string& get_gguf_chat_template() const;
184
185 private:
189 void load_vocab_from_json(const std::string& vocab_path,
190 std::unordered_map<std::string, int>& token_to_id,
191 std::vector<std::string>& id_to_token);
192
196 void load_bpe_merges_from_json(const std::string& model_path);
197
201 void load_sentencepiece_model(const std::string& model_path);
202
203
204 // Token mappings
205 std::unordered_map<std::string, int> token_to_id_;
206 std::vector<std::string> id_to_token_;
207 std::unordered_map<std::string, int> bpe_merges_;
208 std::vector<std::string> tiktoken_merges_list_;
209 std::vector<float> token_scores_;
210 std::vector<int32_t> token_types_;
211 ModelConfig::TokenizerFamily tokenizer_family_;
212 bool initialized_from_gguf_;
213 std::unordered_map<std::string, int> added_tokens_;
214
215 // Special tokens
216 std::string unk_token_;
217 std::string bos_token_;
218 std::string eos_token_;
219 std::string pad_token_;
220
221 // Special token IDs
222 int unk_token_id_ = -1;
223 int bos_token_id_ = -1;
224 int eos_token_id_ = -1;
225 int pad_token_id_ = -1;
226
227 bool sentencepiece_model_loaded_;
228 std::string pre_tok_type_ = "unknown";
229 std::unordered_map<int, std::string> id_to_added_token_;
230 std::unordered_set<std::string> chat_template_special_tokens;
231 std::unordered_map<char, int> byte_char_to_id_;
232
233 Type type_;
234 std::string gguf_chat_template_;
235
236 // SentencePiece-specific helper methods
237 std::vector<std::string> bpe_tokenize(const std::string& text) const;
238 std::vector<std::string> bpe_tokenize_from_scores(const std::string& text) const;
239 std::vector<int> tokens_to_ids(const std::vector<std::string>& tokens) const;
240 std::string decode_sentencepiece(const std::vector<int>& ids, bool skip_special_tokens) const;
241 std::string capitalize_first_letter(std::string s) const;
242
243
244 // BPE tokenization method for the tiktoken (Llama 3) path
245 std::vector<int> bpe_tokenize_to_ids(const std::string& text,
246 bool add_bos_token_param,
247 bool add_eos_token_param,
248 bool ignore_merges_param) const;
249
250 // Helper for the tiktoken BPE tokenization path
251 void add_bigram_to_queue_refactored(const char* text_data_base,
252 const std::vector<llm_symbol>& symbols,
253 llm_symbol::index first_symbol_idx,
254 std::priority_queue<std::pair<int, int>,
255 std::vector<std::pair<int, int>>,
256 std::greater<std::pair<int, int>>>& work_queue) const;
257
258 int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
259
260};
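Typical usage, as a minimal sketch. The load_gguf_file and config_from_gguf helpers below are hypothetical placeholders for whatever loader the rest of TinyLlama.cpp provides; only the Tokenizer calls come from this header.

    #include "tokenizer.h"

    int main() {
        // Hypothetical loaders -- the real entry points live outside this header.
        GGUFData gguf = load_gguf_file("model.gguf");
        ModelConfig config = config_from_gguf(gguf);

        Tokenizer tok(gguf, config);

        // Encode with BOS prepended (the default) and no EOS appended.
        std::vector<int> ids = tok.encode("Hello, world!", /*add_bos=*/true, /*add_eos=*/false);

        // Round-trip back to text, dropping special tokens.
        std::string text = tok.decode(ids, /*skip_special_tokens=*/true);
        return 0;
    }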
Tokenizer
A lightweight tokenizer implementation for text processing.
Definition tokenizer.h:61
int eos_token_id_
Definition tokenizer.h:224
PreTokenizeMethod
Enumeration of available pre-tokenization methods.
Definition tokenizer.h:66
@ LLAMA_REGEX
Definition tokenizer.h:68
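The override is passed as the last argument of encode; for example, forcing the Llama regex pre-tokenizer instead of the model's default (tok is an already-constructed Tokenizer):

    std::vector<int> ids = tok.encode(prompt,
                                      /*add_bos=*/true,
                                      /*add_eos=*/false,
                                      Tokenizer::PreTokenizeMethod::LLAMA_REGEX);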
int bos_token_id_
Definition tokenizer.h:223
std::unordered_map< std::string, int > token_to_id_
Definition tokenizer.h:205
std::unordered_map< int, std::string > id_to_added_token_
Definition tokenizer.h:229
std::string capitalize_first_letter(std::string s) const
void load_bpe_merges_from_json(const std::string &model_path)
Loads BPE merge rules from JSON file.
void add_bigram_to_queue_refactored(const char *text_data_base, const std::vector< llm_symbol > &symbols, llm_symbol::index first_symbol_idx, std::priority_queue< std::pair< int, int >, std::vector< std::pair< int, int > >, std::greater< std::pair< int, int > > > &work_queue) const
int pad_token_id_
Definition tokenizer.h:225
bool is_added_token(int id) const
Checks if a token ID represents an added token.
ModelConfig::TokenizerFamily tokenizer_family_
Definition tokenizer.h:211
std::vector< int > bpe_tokenize_to_ids(const std::string &text, bool add_bos_token_param, bool add_eos_token_param, bool ignore_merges_param) const
int unk_token_id() const
Gets the unknown token ID.
Definition tokenizer.h:181
bool sentencepiece_model_loaded_
Definition tokenizer.h:227
std::string decode_sentencepiece(const std::vector< int > &ids, bool skip_special_tokens) const
std::vector< std::string > ids_to_tokens(const std::vector< int > &ids) const
Converts token IDs back to token strings.
std::vector< int > tokens_to_ids(const std::vector< std::string > &tokens) const
std::vector< float > token_scores_
Definition tokenizer.h:209
void load_sentencepiece_model(const std::string &model_path)
Loads a SentencePiece model.
const std::string & get_gguf_chat_template() const
std::vector< std::string > bpe_tokenize(const std::string &text) const
int find_bpe_rank(const std::string &token_left, const std::string &token_right) const
Definition tokenizer.cpp:51
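A plausible shape for this lookup, assuming bpe_merges_ keys each pair in the space-joined "left right" form used by GPT-2-style merges files; the actual key convention is fixed in tokenizer.cpp:

    int Tokenizer::find_bpe_rank(const std::string& token_left,
                                 const std::string& token_right) const {
        // Assumed key format: "left right", one merge rule per map entry.
        auto it = bpe_merges_.find(token_left + " " + token_right);
        return it == bpe_merges_.end() ? -1 : it->second;  // -1: pair never merges
    }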
std::vector< std::string > tiktoken_merges_list_
Definition tokenizer.h:208
std::string unk_token_
Definition tokenizer.h:216
std::vector< std::string > bpe_tokenize_from_scores(const std::string &text) const
Definition tokenizer.cpp:59
std::vector< std::string > tokenize(const std::string &text) const
Tokenizes input text into token strings.
std::vector< int > encode(const std::string &text, bool add_bos=true, bool add_eos=false, PreTokenizeMethod pre_tok_override=PreTokenizeMethod::DEFAULT) const
Encodes text into token IDs with optional special tokens.
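The two flags control the special tokens independently; a short sketch using the defaults from the declaration above:

    std::vector<int> a = tok.encode("Hi");                   // BOS only (defaults)
    std::vector<int> b = tok.encode("Hi", /*add_bos=*/true,
                                          /*add_eos=*/true); // BOS and EOS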
int unk_token_id_
Definition tokenizer.h:222
bool initialized_from_gguf_
Definition tokenizer.h:212
std::string eos_token_
Definition tokenizer.h:218
std::unordered_map< char, int > byte_char_to_id_
Definition tokenizer.h:231
int vocab_size() const
Returns the size of the vocabulary.
void load_vocab_from_json(const std::string &vocab_path, std::unordered_map< std::string, int > &token_to_id, std::vector< std::string > &id_to_token)
Loads vocabulary from JSON file.
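After loading, the two out-parameters are inverse mappings; a sketch of the invariant (the token string and id below are invented for illustration):

    // For every valid id i with token string s:
    //   id_to_token[i] == s   if and only if   token_to_id[s] == i
    // e.g. token_to_id["▁Hello"] == 15043 implies id_to_token[15043] == "▁Hello"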
std::vector< int32_t > token_types_
Definition tokenizer.h:210
int eos_token_id() const
Gets the end-of-sequence token ID.
Definition tokenizer.h:169
std::string pad_token_
Definition tokenizer.h:219
std::unordered_set< std::string > chat_template_special_tokens
Definition tokenizer.h:230
std::string detokenize(const std::vector< std::string > &tokens) const
Combines tokens back into text.
int bos_token_id() const
Gets the beginning-of-sequence token ID.
Definition tokenizer.h:163
std::unordered_map< std::string, int > bpe_merges_
Definition tokenizer.h:207
std::string pre_tok_type_
Definition tokenizer.h:228
int pad_token_id() const
Gets the padding token ID.
Definition tokenizer.h:175
std::unordered_map< std::string, int > added_tokens_
Definition tokenizer.h:213
std::vector< std::string > id_to_token_
Definition tokenizer.h:206
std::string gguf_chat_template_
Definition tokenizer.h:234
std::string apply_chat_template(const std::string &user_prompt, const std::string &system_message, const ModelConfig &config) const
Applies chat template formatting to the input prompt.
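A usage sketch; the strings are placeholders, and the formatted result depends entirely on the chat template carried by the model configuration or GGUF metadata:

    std::string prompt = tok.apply_chat_template(
        "What is the capital of France?",   // user_prompt
        "You are a helpful assistant.",     // system_message
        config);
    std::vector<int> ids = tok.encode(prompt, /*add_bos=*/true);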
std::string bos_token_
Definition tokenizer.h:217
std::string decode(const std::vector< int > &ids, bool skip_special_tokens=true) const
Decodes token IDs back to text.
Type type_
Definition tokenizer.h:233
Data structures for GGUF (GPT-Generated Unified Format) file format.
Logging utilities for the TinyLlama implementation.
GGUFData
Complete representation of a GGUF file's contents.
ModelConfig
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
TokenizerFamily
Definition model.h:112
bool operator()(const llm_bigram_bpe &l, const llm_bigram_bpe &r) const
Definition tokenizer.h:33
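With this comparator, std::priority_queue behaves as a min-heap on rank, breaking ties toward the lower left index; a self-contained illustration:

    llm_bigram_bpe::queue q;
    q.push({/*left=*/0, /*right=*/1, /*text=*/"ab", /*rank=*/5, /*size=*/2});
    q.push({/*left=*/4, /*right=*/5, /*text=*/"ef", /*rank=*/1, /*size=*/2});
    q.push({/*left=*/2, /*right=*/3, /*text=*/"cd", /*rank=*/1, /*size=*/2});
    // q.top() is the "cd" bigram: rank 1 beats rank 5, and on the rank tie
    // left index 2 beats left index 4.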
std::string text
Definition tokenizer.h:46
llm_symbol::index right
Definition tokenizer.h:45
std::priority_queue< llm_bigram_bpe, queue_storage, comparator > queue
Definition tokenizer.h:42
std::vector< llm_bigram_bpe > queue_storage
Definition tokenizer.h:40
llm_symbol::index left
Definition tokenizer.h:44
const char * text
Definition tokenizer.h:25
index prev
Definition tokenizer.h:23
size_t n
Definition tokenizer.h:26
index next
Definition tokenizer.h:24
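For orientation, this is how the prev/next links are typically rewired when a winning bigram is merged (a sketch in the llama.cpp style this struct mirrors; the actual merge loop lives in tokenizer.cpp):

    // Merge symbols[right] into symbols[left]; assumes right == symbols[left].next.
    static void merge_adjacent(std::vector<llm_symbol>& symbols,
                               llm_symbol::index left, llm_symbol::index right) {
        symbols[left].n += symbols[right].n;       // left now spans both texts
        symbols[right].n = 0;                      // mark right as consumed
        symbols[left].next = symbols[right].next;  // unlink right from the list
        if (symbols[right].next >= 0) {
            symbols[symbols[right].next].prev = left;  // repair the back link
        }
    }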