TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
Generated API reference for the tinyllama namespace (extracted from Doxygen output for api.cpp).
Classes | Functions
tinyllama Namespace Reference

Classes

class  TinyLlamaSession
 Represents an active TinyLlama session holding the loaded model and tokenizer. More...
 

Functions

static void log_vector_summary_detailed (const std::string &name, const std::vector< float > &v, int current_pos, int current_layer, int N)
 
static std::string read_file_api (const std::string &path)
 
static int argmax (const std::vector< float > &v)
 
static int sample_top_k_top_p_temperature (const std::vector< float > &logits, float temperature, int top_k, float top_p, std::mt19937 &rng)
 

Function Documentation

◆ argmax()

static int tinyllama::argmax ( const std::vector< float > &  v)
static

Definition at line 73 of file api.cpp.

73 {
74 if (v.empty()) {
75 Logger::error("Cannot perform argmax on empty vector");
76 return -1;
77 }
78
79 return std::distance(v.begin(), std::max_element(v.begin(), v.end()));
80}
static void error(const std::string &message)
Definition logger.cpp:143

References Logger::error().

◆ log_vector_summary_detailed()

static void tinyllama::log_vector_summary_detailed ( const std::string &  name,
const std::vector< float > &  v,
int  current_pos,
int  current_layer,
int  N 
)
static

Definition at line 32 of file api.cpp.

35 {
36 if (v.empty()) {
37 Logger::info(name + " (pos=" + std::to_string(current_pos) + ", layer=" +
38 std::to_string(current_layer) + "): EMPTY VECTOR");
39 return;
40 }
41 std::stringstream ss;
42 ss << name << " (pos=" << std::to_string(current_pos)
43 << ", layer=" << std::to_string(current_layer) << "): size=" << v.size();
44 ss << ", first " << N << ": [";
45 for (int i = 0; i < N && i < v.size(); ++i) {
46 ss << std::fixed << std::setprecision(4) << v[i]
47 << (i == N - 1 || i == v.size() - 1 ? "" : ", ");
48 }
49 ss << "]";
50 float min_val = v[0], max_val = v[0], sum = 0.0f;
51 bool all_finite = true;
52 for (float val : v) {
53 if (val < min_val) min_val = val;
54 if (val > max_val) max_val = val;
55 sum += val;
56 if (!std::isfinite(val)) all_finite = false;
57 }
58 ss << ", min=" << std::fixed << std::setprecision(4) << min_val;
59 ss << ", max=" << std::fixed << std::setprecision(4) << max_val;
60 ss << ", mean=" << std::fixed << std::setprecision(4) << (sum / v.size());
61 ss << ", finite=" << (all_finite ? "yes" : "no");
62 Logger::info(ss.str());
63}
static void info(const std::string &message)
Definition logger.cpp:135

References Logger::info().

Referenced by tinyllama::TinyLlamaSession::generate().

◆ read_file_api()

/// @brief Reads the entire file at @p path into a string, byte-for-byte.
/// @param path Filesystem path of the file to read.
/// @return The raw file contents.
/// @throws std::runtime_error if the file cannot be opened.
static std::string read_file_api(const std::string &path) {
  std::filesystem::path fs_path(path);
  // Binary mode: no newline translation, contents come back untouched.
  std::ifstream file(fs_path, std::ios::binary);
  if (!file) {
    throw std::runtime_error("Failed to open file: " + path);
  }
  std::ostringstream contents;
  contents << file.rdbuf();
  return contents.str();
}

◆ sample_top_k_top_p_temperature()

/// @brief Samples a token index from @p logits using temperature scaling
///        followed by top-k and top-p (nucleus) filtering.
/// @param logits      Raw, unnormalized scores, one per vocabulary entry.
/// @param temperature Softmax temperature; values below 0.05 fall back to
///                    greedy argmax decoding.
/// @param top_k       Keep at most this many highest-probability tokens;
///                    values <= 0 mean "no top-k limit".
/// @param top_p       Keep the smallest prefix of tokens whose cumulative
///                    probability reaches this threshold.
/// @param rng         Random engine; consumed by exactly one draw.
/// @return Index of the sampled token.
/// @throws std::runtime_error if @p logits is empty.
static int sample_top_k_top_p_temperature(const std::vector<float> &logits,
                                          float temperature, int top_k,
                                          float top_p, std::mt19937 &rng) {
  if (logits.empty()) {
    throw std::runtime_error("Cannot sample from empty logits.");
  }

  // Near-zero temperature degenerates to greedy decoding.
  if (temperature < 0.05f) {
    const auto best = std::max_element(logits.begin(), logits.end());
    return static_cast<int>(std::distance(logits.begin(), best));
  }

  const int vocab_size = static_cast<int>(logits.size());

  int k = (std::min)(top_k, vocab_size);
  if (k <= 0) k = vocab_size;

  // Subtract the max logit before exponentiating for numerical stability.
  float max_logit = -std::numeric_limits<float>::infinity();
  for (const float logit : logits) max_logit = (std::max)(max_logit, logit);
  const float inv_temp = 1.0f / temperature;

  // Softmax in double precision.
  std::vector<double> probs(vocab_size);
  double denom = 0.0;
  for (int i = 0; i < vocab_size; ++i) {
    probs[i] = std::exp(
        static_cast<double>((logits[i] - max_logit) * inv_temp));
    denom += probs[i];
  }
  if (denom > 0.0) {
    for (double &p : probs) p /= denom;
  } else {
    // Degenerate case: fall back to a uniform distribution.
    for (double &p : probs) p = 1.0 / vocab_size;
  }

  // Pair each probability with its token index and rank by probability,
  // descending (ties broken lexicographically on the pair, exactly as
  // std::greater<std::pair<float,int>> would).
  std::vector<std::pair<float, int>> ranked(vocab_size);
  for (int i = 0; i < vocab_size; ++i) {
    ranked[i] = {static_cast<float>(probs[i]), i};
  }
  std::sort(ranked.begin(), ranked.end(),
            [](const std::pair<float, int> &a,
               const std::pair<float, int> &b) { return a > b; });

  // Top-k cut.
  if (k < vocab_size) {
    ranked.resize(k);
  }

  // Top-p cut: keep the shortest prefix whose mass reaches the threshold.
  float cumulative = 0.0f;
  std::size_t keep = 0;
  for (std::size_t i = 0; i < ranked.size(); ++i) {
    cumulative += ranked[i].first;
    keep = i;
    if (cumulative >= top_p) break;
  }
  ranked.resize(keep + 1);

  // Renormalize the surviving probabilities before sampling.
  float kept_mass = 0.0f;
  for (const auto &entry : ranked) kept_mass += entry.first;

  std::vector<float> weights(ranked.size());
  if (kept_mass > 0.0f) {
    for (std::size_t i = 0; i < ranked.size(); ++i) {
      weights[i] = ranked[i].first / kept_mass;
    }
  } else {
    std::fill(weights.begin(), weights.end(),
              1.0f / static_cast<float>(ranked.size()));
  }

  std::discrete_distribution<int> dist(weights.begin(), weights.end());
  return ranked[dist(rng)].second;
}

Referenced by tinyllama::TinyLlamaSession::generate(), and tinyllama::TinyLlamaSession::generate_batch().