7#include <unordered_map>
8#include <unordered_set>
42 using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
83 Tokenizer(
const std::string& vocab_path,
const std::string& model_path,
const ModelConfig& config);
97 std::vector<std::string>
tokenize(
const std::string& text)
const;
105 std::vector<std::string>
ids_to_tokens(
const std::vector<int>& ids)
const;
112 std::string
detokenize(
const std::vector<std::string>& tokens)
const;
123 const std::string& text,
bool add_bos =
true,
bool add_eos =
false,
132 std::string
decode(
const std::vector<int>& ids,
133 bool skip_special_tokens =
true)
const;
143 const std::string& system_message,
190 std::unordered_map<std::string, int>& token_to_id,
191 std::vector<std::string>& id_to_token);
237 std::vector<std::string>
bpe_tokenize(
const std::string& text)
const;
239 std::vector<int>
tokens_to_ids(
const std::vector<std::string>& tokens)
const;
246 bool add_bos_token_param,
247 bool add_eos_token_param,
248 bool ignore_merges_param)
const;
252 const std::vector<llm_symbol>& symbols,
254 std::priority_queue<std::pair<int, int>,
255 std::vector<std::pair<int, int>>,
256 std::greater<std::pair<int, int>>>& work_queue)
const;
258 int find_bpe_rank(
const std::string & token_left,
const std::string & token_right)
const;
A lightweight tokenizer implementation for text processing.
PreTokenizeMethod
Enumeration of available pre-tokenization methods.
std::unordered_map< std::string, int > token_to_id_
std::unordered_map< int, std::string > id_to_added_token_
std::string capitalize_first_letter(std::string s) const
void load_bpe_merges_from_json(const std::string &model_path)
Loads BPE merge rules from a JSON file.
void add_bigram_to_queue_refactored(const char *text_data_base, const std::vector< llm_symbol > &symbols, llm_symbol::index first_symbol_idx, std::priority_queue< std::pair< int, int >, std::vector< std::pair< int, int > >, std::greater< std::pair< int, int > > > &work_queue) const
bool is_added_token(int id) const
Checks if a token ID represents an added token.
ModelConfig::TokenizerFamily tokenizer_family_
std::vector< int > bpe_tokenize_to_ids(const std::string &text, bool add_bos_token_param, bool add_eos_token_param, bool ignore_merges_param) const
int unk_token_id() const
Gets the unknown token ID.
bool sentencepiece_model_loaded_
std::string decode_sentencepiece(const std::vector< int > &ids, bool skip_special_tokens) const
std::vector< std::string > ids_to_tokens(const std::vector< int > &ids) const
Converts token IDs back to token strings.
std::vector< int > tokens_to_ids(const std::vector< std::string > &tokens) const
std::vector< float > token_scores_
void load_sentencepiece_model(const std::string &model_path)
Loads a SentencePiece model.
const std::string & get_gguf_chat_template() const
std::vector< std::string > bpe_tokenize(const std::string &text) const
int find_bpe_rank(const std::string &token_left, const std::string &token_right) const
std::vector< std::string > tiktoken_merges_list_
std::vector< std::string > bpe_tokenize_from_scores(const std::string &text) const
std::vector< std::string > tokenize(const std::string &text) const
Tokenizes input text into token strings.
std::vector< int > encode(const std::string &text, bool add_bos=true, bool add_eos=false, PreTokenizeMethod pre_tok_override=PreTokenizeMethod::DEFAULT) const
Encodes text into token IDs with optional special tokens.
bool initialized_from_gguf_
std::unordered_map< char, int > byte_char_to_id_
int vocab_size() const
Returns the size of the vocabulary.
void load_vocab_from_json(const std::string &vocab_path, std::unordered_map< std::string, int > &token_to_id, std::vector< std::string > &id_to_token)
Loads vocabulary from a JSON file.
std::vector< int32_t > token_types_
int eos_token_id() const
Gets the end-of-sequence token ID.
std::unordered_set< std::string > chat_template_special_tokens
std::string detokenize(const std::vector< std::string > &tokens) const
Combines tokens back into text.
int bos_token_id() const
Gets the beginning-of-sequence token ID.
std::unordered_map< std::string, int > bpe_merges_
std::string pre_tok_type_
int pad_token_id() const
Gets the padding token ID.
std::unordered_map< std::string, int > added_tokens_
std::vector< std::string > id_to_token_
std::string gguf_chat_template_
std::string apply_chat_template(const std::string &user_prompt, const std::string &system_message, const ModelConfig &config) const
Applies chat template formatting to the input prompt.
std::string decode(const std::vector< int > &ids, bool skip_special_tokens=true) const
Decodes token IDs back to text.
Data structures for the GGUF (GPT-Generated Unified Format) file format.
Logging utilities for the TinyLlama implementation.
Complete representation of a GGUF file's contents.
Model configuration structure holding architecture and hyperparameters.
bool operator()(const llm_bigram_bpe &l, const llm_bigram_bpe &r) const
std::priority_queue< llm_bigram_bpe, queue_storage, comparator > queue
std::vector< llm_bigram_bpe > queue_storage