TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
tokenizer.h
1#pragma once
2
3#include <limits>
4#include <map>
5#include <memory>
6#include <string>
7#include <unordered_map>
8#include <unordered_set>
9#include <utility>
10#include <vector>
11#include <queue>
12#include <functional>
13
14#include "gguf_structs.h"
15#include "logger.h"
16#include "model.h"
17
18
19
20// Helper struct to represent segments during BPE tokenization
21struct llm_symbol {
22 using index = int;
23 index prev; // index of the previous symbol in the linked list
24 index next; // index of the next symbol in the linked list
25 const char * text; // pointer to the start of the symbol's text in the original string
26 size_t n; // length of the symbol's text
27};
28
29// Helper struct representing a potential byte pair merge
30struct llm_bigram_bpe {
31 // Comparator for the priority queue: lowest rank pops first; on equal rank, the lower left index wins
32 struct comparator {
33 bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
34 // Prioritize lower rank (higher priority in BPE merges)
35 // If ranks are equal, prioritize the one starting earlier (lower left index)
36 return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
37 }
38 };
39
40 using queue_storage = std::vector<llm_bigram_bpe>;
41 // Define a min-priority queue based on the comparator
42 using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
43
44 llm_symbol::index left; // index of the left symbol in the pair
45 llm_symbol::index right; // index of the right symbol in the pair
46 std::string text; // the merged text of the pair (for checking against merges map)
47 int rank; // rank of the merge (lower is better)
48 size_t size; // size of the merged text (for validation)
49};
50
51
52
61class Tokenizer {
62 public:
66 enum class PreTokenizeMethod {
67 DEFAULT,
68 LLAMA_REGEX,
69 };
70
71 enum class Type {
72 UNKNOWN,
75 };
76
83 Tokenizer(const std::string& vocab_path, const std::string& model_path, const ModelConfig& config);
84
90 explicit Tokenizer(const GGUFData& gguf_data, const ModelConfig& config);
91
97 std::vector<std::string> tokenize(const std::string& text) const;
98
99
105 std::vector<std::string> ids_to_tokens(const std::vector<int>& ids) const;
106
112 std::string detokenize(const std::vector<std::string>& tokens) const;
113
122 std::vector<int> encode(
123 const std::string& text, bool add_bos = true, bool add_eos = false,
124 PreTokenizeMethod pre_tok_override = PreTokenizeMethod::DEFAULT) const;
125
132 std::string decode(const std::vector<int>& ids,
133 bool skip_special_tokens = true) const;
134
142 std::string apply_chat_template(const std::string& user_prompt,
143 const std::string& system_message,
144 const ModelConfig& config) const;
145
150 int vocab_size() const;
151
157 bool is_added_token(int id) const;
158
163 int bos_token_id() const { return bos_token_id_; }
164
169 int eos_token_id() const { return eos_token_id_; }
170
175 int pad_token_id() const { return pad_token_id_; }
176
181 int unk_token_id() const { return unk_token_id_; }
182
183 const std::string& get_gguf_chat_template() const;
184
185 private:
189 void load_vocab_from_json(const std::string& vocab_path,
190 std::unordered_map<std::string, int>& token_to_id,
191 std::vector<std::string>& id_to_token);
192
196 void load_bpe_merges_from_json(const std::string& model_path);
197
201 void load_sentencepiece_model(const std::string& model_path);
202
203
204 // Token mappings
205 std::unordered_map<std::string, int> token_to_id_;
206 std::vector<std::string> id_to_token_;
207 std::unordered_map<std::string, int> bpe_merges_;
208 std::vector<std::string> tiktoken_merges_list_;
209 std::vector<float> token_scores_;
210 std::vector<int32_t> token_types_;
211 ModelConfig::TokenizerFamily tokenizer_family_;
212 bool initialized_from_gguf_;
213 std::unordered_map<std::string, int> added_tokens_;
214
215 // Special tokens
216 std::string unk_token_;
217 std::string bos_token_;
218 std::string eos_token_;
219 std::string pad_token_;
220
221 // Special token IDs
222 int unk_token_id_ = -1;
223 int bos_token_id_ = -1;
224 int eos_token_id_ = -1;
225 int pad_token_id_ = -1;
226
227 bool sentencepiece_model_loaded_;
228 std::string pre_tok_type_ = "unknown";
229 std::unordered_map<int, std::string> id_to_added_token_;
230 std::unordered_set<std::string> chat_template_special_tokens;
231 std::unordered_map<char, int> byte_char_to_id_;
232
233 Type type_;
234 std::string gguf_chat_template_;
235
236 // SentencePiece-specific helper methods
237 std::vector<std::string> bpe_tokenize(const std::string& text) const;
238 std::vector<std::string> bpe_tokenize_from_scores(const std::string& text) const;
239 std::vector<int> tokens_to_ids(const std::vector<std::string>& tokens) const;
240 std::string decode_sentencepiece(const std::vector<int>& ids, bool skip_special_tokens) const;
241 std::string capitalize_first_letter(std::string s) const;
242
243
244 // BPE tokenization method for the tiktoken (Llama 3) path
245 std::vector<int> bpe_tokenize_to_ids(const std::string& text,
246 bool add_bos_token_param,
247 bool add_eos_token_param,
248 bool ignore_merges_param) const;
249
250 // Helper for the tiktoken BPE tokenization path
251 void add_bigram_to_queue_refactored(const char* text_data_base,
252 const std::vector<llm_symbol>& symbols,
253 llm_symbol::index first_symbol_idx,
254 std::priority_queue<std::pair<int, int>,
255 std::vector<std::pair<int, int>>,
256 std::greater<std::pair<int, int>>>& work_queue) const;
257
258 int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
259
260};
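Typical usage, as a minimal sketch. The load_gguf_file and config_from_gguf helpers below are hypothetical placeholders for whatever loader the rest of TinyLlama.cpp provides; only the Tokenizer calls come from this header.

    #include "tokenizer.h"

    int main() {
        // Hypothetical loaders -- the real entry points live outside this header.
        GGUFData gguf = load_gguf_file("model.gguf");
        ModelConfig config = config_from_gguf(gguf);

        Tokenizer tok(gguf, config);

        // Encode with BOS prepended (the default) and no EOS appended.
        std::vector<int> ids = tok.encode("Hello, world!", /*add_bos=*/true, /*add_eos=*/false);

        // Round-trip back to text, dropping special tokens.
        std::string text = tok.decode(ids, /*skip_special_tokens=*/true);
        return 0;
    }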
Tokenizer
A lightweight tokenizer implementation for text processing.
Definition tokenizer.h:61
int eos_token_id_
Definition tokenizer.h:224
PreTokenizeMethod
Enumeration of available pre-tokenization methods.
Definition tokenizer.h:66
@ LLAMA_REGEX
Definition tokenizer.h:68
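The override is passed as the last argument of encode; for example, forcing the Llama regex pre-tokenizer instead of the model's default (tok is an already-constructed Tokenizer):

    std::vector<int> ids = tok.encode(prompt,
                                      /*add_bos=*/true,
                                      /*add_eos=*/false,
                                      Tokenizer::PreTokenizeMethod::LLAMA_REGEX);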
int bos_token_id_
Definition tokenizer.h:223
std::unordered_map< std::string, int > token_to_id_
Definition tokenizer.h:205
std::unordered_map< int, std::string > id_to_added_token_
Definition tokenizer.h:229
std::string capitalize_first_letter(std::string s) const
void load_bpe_merges_from_json(const std::string &model_path)
Loads BPE merge rules from JSON file.
void add_bigram_to_queue_refactored(const char *text_data_base, const std::vector< llm_symbol > &symbols, llm_symbol::index first_symbol_idx, std::priority_queue< std::pair< int, int >, std::vector< std::pair< int, int > >, std::greater< std::pair< int, int > > > &work_queue) const
int pad_token_id_
Definition tokenizer.h:225
bool is_added_token(int id) const
Checks if a token ID represents an added token.
ModelConfig::TokenizerFamily tokenizer_family_
Definition tokenizer.h:211
std::vector< int > bpe_tokenize_to_ids(const std::string &text, bool add_bos_token_param, bool add_eos_token_param, bool ignore_merges_param) const
int unk_token_id() const
Gets the unknown token ID.
Definition tokenizer.h:181
bool sentencepiece_model_loaded_
Definition tokenizer.h:227
std::string decode_sentencepiece(const std::vector< int > &ids, bool skip_special_tokens) const
std::vector< std::string > ids_to_tokens(const std::vector< int > &ids) const
Converts token IDs back to token strings.
std::vector< int > tokens_to_ids(const std::vector< std::string > &tokens) const
std::vector< float > token_scores_
Definition tokenizer.h:209
void load_sentencepiece_model(const std::string &model_path)
Loads a SentencePiece model.
const std::string & get_gguf_chat_template() const
std::vector< std::string > bpe_tokenize(const std::string &text) const
int find_bpe_rank(const std::string &token_left, const std::string &token_right) const
Definition tokenizer.cpp:51
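A plausible shape for this lookup, assuming bpe_merges_ keys each pair in the space-joined "left right" form used by GPT-2-style merges files; the actual key convention is fixed in tokenizer.cpp:

    int Tokenizer::find_bpe_rank(const std::string& token_left,
                                 const std::string& token_right) const {
        // Assumed key format: "left right", one merge rule per map entry.
        auto it = bpe_merges_.find(token_left + " " + token_right);
        return it == bpe_merges_.end() ? -1 : it->second;  // -1: pair never merges
    }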
std::vector< std::string > tiktoken_merges_list_
Definition tokenizer.h:208
std::string unk_token_
Definition tokenizer.h:216
std::vector< std::string > bpe_tokenize_from_scores(const std::string &text) const
Definition tokenizer.cpp:59
std::vector< std::string > tokenize(const std::string &text) const
Tokenizes input text into token strings.
std::vector< int > encode(const std::string &text, bool add_bos=true, bool add_eos=false, PreTokenizeMethod pre_tok_override=PreTokenizeMethod::DEFAULT) const
Encodes text into token IDs with optional special tokens.
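The two flags control the special tokens independently; a short sketch using the defaults from the declaration above:

    std::vector<int> a = tok.encode("Hi");                   // BOS only (defaults)
    std::vector<int> b = tok.encode("Hi", /*add_bos=*/true,
                                          /*add_eos=*/true); // BOS and EOS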
int unk_token_id_
Definition tokenizer.h:222
bool initialized_from_gguf_
Definition tokenizer.h:212
std::string eos_token_
Definition tokenizer.h:218
std::unordered_map< char, int > byte_char_to_id_
Definition tokenizer.h:231
int vocab_size() const
Returns the size of the vocabulary.
void load_vocab_from_json(const std::string &vocab_path, std::unordered_map< std::string, int > &token_to_id, std::vector< std::string > &id_to_token)
Loads vocabulary from JSON file.
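After loading, the two out-parameters are inverse mappings; a sketch of the invariant (the token string and id below are invented for illustration):

    // For every valid id i with token string s:
    //   id_to_token[i] == s   if and only if   token_to_id[s] == i
    // e.g. token_to_id["▁Hello"] == 15043 implies id_to_token[15043] == "▁Hello"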
std::vector< int32_t > token_types_
Definition tokenizer.h:210
int eos_token_id() const
Gets the end-of-sequence token ID.
Definition tokenizer.h:169
std::string pad_token_
Definition tokenizer.h:219
std::unordered_set< std::string > chat_template_special_tokens
Definition tokenizer.h:230
std::string detokenize(const std::vector< std::string > &tokens) const
Combines tokens back into text.
int bos_token_id() const
Gets the beginning-of-sequence token ID.
Definition tokenizer.h:163
std::unordered_map< std::string, int > bpe_merges_
Definition tokenizer.h:207
std::string pre_tok_type_
Definition tokenizer.h:228
int pad_token_id() const
Gets the padding token ID.
Definition tokenizer.h:175
std::unordered_map< std::string, int > added_tokens_
Definition tokenizer.h:213
std::vector< std::string > id_to_token_
Definition tokenizer.h:206
std::string gguf_chat_template_
Definition tokenizer.h:234
std::string apply_chat_template(const std::string &user_prompt, const std::string &system_message, const ModelConfig &config) const
Applies chat template formatting to the input prompt.
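A usage sketch; the strings are placeholders, and the formatted result depends entirely on the chat template carried by the model configuration or GGUF metadata:

    std::string prompt = tok.apply_chat_template(
        "What is the capital of France?",   // user_prompt
        "You are a helpful assistant.",     // system_message
        config);
    std::vector<int> ids = tok.encode(prompt, /*add_bos=*/true);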
std::string bos_token_
Definition tokenizer.h:217
std::string decode(const std::vector< int > &ids, bool skip_special_tokens=true) const
Decodes token IDs back to text.
Type type_
Definition tokenizer.h:233
Data structures for GGUF (GPT-Generated Unified Format) file format.
Logging utilities for the TinyLlama implementation.
GGUFData
Complete representation of a GGUF file's contents.
ModelConfig
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
TokenizerFamily
Definition model.h:112
bool operator()(const llm_bigram_bpe &l, const llm_bigram_bpe &r) const
Definition tokenizer.h:33
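With this comparator, std::priority_queue behaves as a min-heap on rank, breaking ties toward the lower left index; a self-contained illustration:

    llm_bigram_bpe::queue q;
    q.push({/*left=*/0, /*right=*/1, /*text=*/"ab", /*rank=*/5, /*size=*/2});
    q.push({/*left=*/4, /*right=*/5, /*text=*/"ef", /*rank=*/1, /*size=*/2});
    q.push({/*left=*/2, /*right=*/3, /*text=*/"cd", /*rank=*/1, /*size=*/2});
    // q.top() is the "cd" bigram: rank 1 beats rank 5, and on the rank tie
    // left index 2 beats left index 4.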
std::string text
Definition tokenizer.h:46
llm_symbol::index right
Definition tokenizer.h:45
std::priority_queue< llm_bigram_bpe, queue_storage, comparator > queue
Definition tokenizer.h:42
std::vector< llm_bigram_bpe > queue_storage
Definition tokenizer.h:40
llm_symbol::index left
Definition tokenizer.h:44
const char * text
Definition tokenizer.h:25
index prev
Definition tokenizer.h:23
size_t n
Definition tokenizer.h:26
index next
Definition tokenizer.h:24
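For orientation, this is how the prev/next links are typically rewired when a winning bigram is merged (a sketch in the llama.cpp style this struct mirrors; the actual merge loop lives in tokenizer.cpp):

    // Merge symbols[right] into symbols[left]; assumes right == symbols[left].next.
    static void merge_adjacent(std::vector<llm_symbol>& symbols,
                               llm_symbol::index left, llm_symbol::index right) {
        symbols[left].n += symbols[right].n;       // left now spans both texts
        symbols[right].n = 0;                      // mark right as consumed
        symbols[left].next = symbols[right].next;  // unlink right from the list
        if (symbols[right].next >= 0) {
            symbols[symbols[right].next].prev = left;  // repair the back link
        }
    }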