TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
#include "tokenizer.h"
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <nlohmann/json.hpp>
#include <queue>
#include <boost/regex.hpp>
#include <boost/xpressive/xpressive.hpp>
#include <sstream>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <string>
#include <limits>
#include <utility>
#include <functional>
#include <filesystem>
#include "logger.h"
Go to the source code of this file.
Typedefs | |
| using | json = nlohmann::json |
Functions | |
| bool | is_numeric (const std::string &s) |
| static std::unordered_map< std::string, int > | generate_bpe_merges_from_vocab_scores (const std::vector< std::string > &id_to_token, const std::vector< float > &token_scores) |
| static std::string | replace_all (std::string str, const std::string &from, const std::string &to) |
| static std::vector< std::pair< std::string, int > > | sort_tokens_by_length_desc (const std::unordered_map< std::string, int > &tokens_map) |
Variables | |
| const std::string | BPE_SPACE_CHAR = "\xC4\xA0" |
| using json = nlohmann::json |
Definition at line 28 of file tokenizer.cpp.
|
static |
Definition at line 422 of file tokenizer.cpp.
References Logger::debug(), Logger::info(), and Logger::warning().
Referenced by Tokenizer::Tokenizer().
| bool is_numeric | ( | const std::string & | s | ) |
Definition at line 36 of file tokenizer.cpp.
|
static |
Definition at line 1619 of file tokenizer.cpp.
Referenced by Tokenizer::apply_chat_template().
|
static |
Definition at line 2080 of file tokenizer.cpp.
Referenced by Tokenizer::bpe_tokenize_to_ids().
| const std::string BPE_SPACE_CHAR = "\xC4\xA0" |
Definition at line 26 of file tokenizer.cpp.
Referenced by Tokenizer::add_bigram_to_queue_refactored(), Tokenizer::bpe_tokenize_to_ids(), Tokenizer::decode(), and Tokenizer::Tokenizer().