TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
model_utils.cpp
1#include "model_utils.h"
2#include "logger.h"
3#include "utils.h"
4#include "quantization.h"
5#include "model_constants.h"
6#include "model_macros.h"
7#include <algorithm>
8#include <cstring>
9#include <cmath>
10
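// Looks up the embedding row for token_id, dispatching on whichever
// embedding table was loaded: Q4_K, Q8_0, or Q6_K quantized blocks, or
// unquantized F32/BF16. Returns a zero vector of hidden_size on any error.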
std::vector<float> TinyLlamaModel::lookup_embedding(int token_id) {
  int hs = config_.hidden_size;
  int vs = config_.vocab_size;

  if (token_id < 0 || token_id >= vs) {
    Logger::error("Token ID out of bounds in lookup_embedding: " +
                  std::to_string(token_id));
    return std::vector<float>(hs, 0.0f);
  }

  std::vector<float> embedding_vec(hs, 0.0f);

  if (!embed_tokens_q4k.empty()) {
    if (hs % GGML_QK_K != 0) {
      Logger::error("Hidden size (" + std::to_string(hs) +
                    ") is not divisible by GGML_QK_K (" +
                    std::to_string(GGML_QK_K) + ") for Q4_K embedding lookup.");
      return embedding_vec;
    }

    size_t blocks_per_row = hs / GGML_QK_K;
    size_t start_block_idx = (size_t)token_id * blocks_per_row;
    size_t end_block_idx = start_block_idx + blocks_per_row;

    if (end_block_idx > embed_tokens_q4k.size()) {
      Logger::error(
          "Calculated block index out of bounds for Q4_K embedding table. "
          "Token: " +
          std::to_string(token_id) +
          ", StartBlock: " + std::to_string(start_block_idx) +
          ", EndBlock: " + std::to_string(end_block_idx) +
          ", TableSize: " + std::to_string(embed_tokens_q4k.size()));
      return embedding_vec;
    }

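    // Each block_q4_K packs GGML_QK_K weights; dequantize the row one
    // super-block at a time directly into the output vector.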
    float dequantized_block[GGML_QK_K];
    for (size_t block_n = 0; block_n < blocks_per_row; ++block_n) {
      dequantize_q4_k_m(&embed_tokens_q4k[start_block_idx + block_n],
                        dequantized_block, GGML_QK_K, false);

      size_t dest_offset = block_n * GGML_QK_K;

      size_t elements_to_copy =
          SAFE_MIN((size_t)GGML_QK_K, (size_t)(hs - dest_offset));
      std::memcpy(&embedding_vec[dest_offset], dequantized_block,
                  elements_to_copy * sizeof(float));
    }
    return embedding_vec;
  }

  else if (!embed_tokens_q8_0.empty()) {
    if (hs % GGML_QK8_0 != 0) {
      Logger::error("Hidden size (" + std::to_string(hs) +
                    ") is not divisible by GGML_QK8_0 (" +
                    std::to_string(GGML_QK8_0) +
                    ") for Q8_0 embedding lookup.");
      return embedding_vec;
    }
    size_t blocks_per_row = hs / GGML_QK8_0;
    size_t start_block_idx = (size_t)token_id * blocks_per_row;
    size_t end_block_idx = start_block_idx + blocks_per_row;

    if (end_block_idx > embed_tokens_q8_0.size()) {
      Logger::error(
          "Calculated block index out of bounds for Q8_0 embedding table. "
          "Token: " +
          std::to_string(token_id) +
          ", StartBlock: " + std::to_string(start_block_idx) +
          ", EndBlock: " + std::to_string(end_block_idx) +
          ", TableSize: " + std::to_string(embed_tokens_q8_0.size()));
      return embedding_vec;
    }

    float dequantized_block[GGML_QK8_0];

    for (size_t block_n = 0; block_n < blocks_per_row; ++block_n) {
      dequantize_q8_0_block(&embed_tokens_q8_0[start_block_idx + block_n],
                            dequantized_block);
      size_t dest_offset = block_n * GGML_QK8_0;
      size_t elements_to_copy = SAFE_MIN(static_cast<size_t>(GGML_QK8_0),
                                         static_cast<size_t>(hs - dest_offset));
      std::memcpy(&embedding_vec[dest_offset], dequantized_block,
                  elements_to_copy * sizeof(float));
    }

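    // Debug aid: for the first two token IDs, log summary statistics of the
    // dequantized row so Q8_0 output can be sanity-checked against a reference.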
    if (token_id < 2) {
      float sum = 0.0f, min_val = embedding_vec[0], max_val = embedding_vec[0];
      for (int i = 0; i < hs; ++i) {
        sum += embedding_vec[i];
        min_val = std::min(min_val, embedding_vec[i]);
        max_val = std::max(max_val, embedding_vec[i]);
      }
      Logger::info("[Q8_0_EMBED_FINAL] Token " + std::to_string(token_id) +
                   " embedding stats: sum=" + std::to_string(sum) +
                   ", mean=" + std::to_string(sum / hs) +
                   ", min=" + std::to_string(min_val) +
                   ", max=" + std::to_string(max_val) +
                   ", first_4=[" + std::to_string(embedding_vec[0]) +
                   ", " + std::to_string(embedding_vec[1]) +
                   ", " + std::to_string(embedding_vec[2]) +
                   ", " + std::to_string(embedding_vec[3]) + "]");
    }
    return embedding_vec;
  }

  else if (!embed_tokens_q6k.empty()) {
    if (hs % GGML_QK_K != 0) {
      Logger::error("Hidden size (" + std::to_string(hs) +
                    ") is not divisible by GGML_QK_K (" +
                    std::to_string(GGML_QK_K) + ") for Q6_K embedding lookup.");
      return embedding_vec;
    }
    size_t blocks_per_row = hs / GGML_QK_K;
    size_t start_block_idx = (size_t)token_id * blocks_per_row;
    size_t end_block_idx = start_block_idx + blocks_per_row;

    if (end_block_idx > embed_tokens_q6k.size()) {
      Logger::error(
          "Calculated block index out of bounds for Q6_K embedding table. "
          "Token: " +
          std::to_string(token_id) +
          ", StartBlock: " + std::to_string(start_block_idx) +
          ", EndBlock: " + std::to_string(end_block_idx) +
          ", TableSize: " + std::to_string(embed_tokens_q6k.size()));
      return embedding_vec;
    }

    float dequantized_block[GGML_QK_K];
    for (size_t block_n = 0; block_n < blocks_per_row; ++block_n) {
      dequantize_q6_k(&embed_tokens_q6k[start_block_idx + block_n],
                      dequantized_block, GGML_QK_K);
      size_t dest_offset = block_n * GGML_QK_K;
      size_t elements_to_copy = SAFE_MIN(static_cast<size_t>(GGML_QK_K),
                                         static_cast<size_t>(hs - dest_offset));
      std::memcpy(&embedding_vec[dest_offset], dequantized_block,
                  elements_to_copy * sizeof(float));
    }
    return embedding_vec;
  }

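  // Unquantized fallbacks: F32 rows are copied directly; BF16 rows are
  // widened to float via bf16vec_to_float_vec.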
  else if (!embed_tokens_f32.empty()) {
    size_t offset = (size_t)token_id * hs;
    if (offset + hs > embed_tokens_f32.size()) {
      Logger::error("Embedding offset out of bounds in F32 lookup for token: " +
                    std::to_string(token_id));
      return embedding_vec;
    }

    std::copy(embed_tokens_f32.begin() + offset,
              embed_tokens_f32.begin() + offset + hs, embedding_vec.begin());
    return embedding_vec;

  } else if (!embed_tokens.empty()) {
    size_t offset = (size_t)token_id * hs;
    if (offset + hs > embed_tokens.size()) {
      Logger::error(
          "Embedding offset out of bounds in BF16 lookup for token: " +
          std::to_string(token_id));
      return embedding_vec;
    }
    std::vector<uint16_t> token_embedding_bf16(
        embed_tokens.begin() + offset, embed_tokens.begin() + offset + hs);

    embedding_vec = bf16vec_to_float_vec(token_embedding_bf16);
    return embedding_vec;

  } else {
    Logger::error(
        "No valid embedding table found (Q4_K, Q8_0, Q6_K, F32, BF16) for token: " +
        std::to_string(token_id));

    return embedding_vec;
  }
}

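// Precomputes the per-position RoPE rotation table (cos/sin pairs) used by
// the attention layers; runs once, on the first call.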
void TinyLlamaModel::initialize_rope_freqs() {
  Logger::info("[ROPE_FREQ_ENTRY] Entered initialize_rope_freqs.");

  Logger::info("[ROPE_FREQ_CHECK] num_attention_heads: " + std::to_string(config_.num_attention_heads));
  if (config_.num_attention_heads == 0) {
    Logger::error("Cannot initialize RoPE frequencies: num_attention_heads is zero.");
    return;
  }
  int head_dim = config_.hidden_size / config_.num_attention_heads;
  Logger::info("[ROPE_FREQ_CHECK] calculated head_dim: " + std::to_string(head_dim));
  if (head_dim == 0) {
    Logger::error("Cannot initialize RoPE frequencies: calculated head_dim is zero.");
    return;
  }
  Logger::info("[ROPE_FREQ_CHECK] head_dim % 2 check. head_dim: " + std::to_string(head_dim));
  if (head_dim % 2 != 0) {
    Logger::error("Cannot initialize RoPE frequencies: head_dim must be even.");
    return;
  }

  Logger::info("[ROPE_INIT] Initializing RoPE with head_dim=" + std::to_string(head_dim) +
               ", configured max_pos_emb=" + std::to_string(config_.max_position_embeddings) +
               ", using internal rope::MAX_SEQUENCE_LENGTH=" + std::to_string(rope::MAX_SEQUENCE_LENGTH) +
               ", configured rope_theta=" + std::to_string(config_.rope_theta));

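  // For each position p and even dimension index i, the table stores the pair
  // (cos(p * f_i), sin(p * f_i)) with f_i = rope_theta^(-i / head_dim),
  // flattened into max_seq_len * head_dim / 2 entries.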
  if (precomputed_freqs_cis_.empty()) {
    int max_seq_len = rope::MAX_SEQUENCE_LENGTH;
    size_t required_size = (static_cast<size_t>(max_seq_len) * head_dim) / 2;
    if (required_size == 0) {
      Logger::warning("RoPE precomputation resulted in zero size. Max seq len: " +
                      std::to_string(max_seq_len) + ", head_dim: " + std::to_string(head_dim));
      return;
    }
    precomputed_freqs_cis_.resize(required_size);

    float rope_theta = config_.rope_theta > 0 ? config_.rope_theta : rope::ROPE_THETA;

    for (int pos = 0; pos < max_seq_len; ++pos) {
      for (int i = 0; i < head_dim; i += 2) {
        float freq = 1.0f / std::pow(rope_theta, float(i) / head_dim);
        float val = static_cast<float>(pos) * freq;
        float cos_val = std::cos(val);
        float sin_val = std::sin(val);
        size_t flat_idx = (static_cast<size_t>(pos) * head_dim / 2) + (i / 2);
        if (flat_idx < precomputed_freqs_cis_.size()) {
          precomputed_freqs_cis_[flat_idx] = {cos_val, sin_val};
        } else {
          Logger::error("RoPE precomputation index out of bounds: " + std::to_string(flat_idx) +
                        " vs size " + std::to_string(precomputed_freqs_cis_.size()));
          return;
        }
      }
    }
    Logger::info("Precomputed RoPE frequencies on CPU. Size: " + std::to_string(precomputed_freqs_cis_.size()));
  } else {
    Logger::info("RoPE frequencies already precomputed.");
  }
}

int TinyLlamaModel::get_vocab_size() const {
  return config_.vocab_size;
}
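
A minimal usage sketch (hypothetical driver code, not part of this file): it assumes an already-constructed TinyLlamaModel instance with loaded weights, since construction and weight loading live elsewhere in the project; only the three functions defined in this file are exercised.

#include "model_utils.h"
#include <iostream>
#include <vector>

// Hypothetical helper; `model` must already hold loaded embedding tables.
void dump_token_embedding(TinyLlamaModel &model, int token_id) {
  model.initialize_rope_freqs();  // idempotent: precomputes on first call only
  if (token_id < 0 || token_id >= model.get_vocab_size()) {
    std::cerr << "token_id out of range\n";
    return;
  }
  std::vector<float> emb = model.lookup_embedding(token_id);
  std::cout << "dim=" << emb.size();
  if (!emb.empty()) std::cout << " first=" << emb[0];
  std::cout << '\n';
}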