TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
bindings.cpp
Go to the documentation of this file.
1#include <pybind11/pybind11.h>
2#include <pybind11/stl.h>
3#include <pybind11/iostream.h>
4
5#include "model_constants.h"
6#include "model.h"
7#include "api.h"
8
9namespace py = pybind11;
10
11PYBIND11_MODULE(tinyllama_bindings, m) {
12 m.doc() = R"pbdoc(
13 TinyLlama.cpp Python Bindings
14 =============================
15
16 Python bindings for the tinyllama.cpp inference engine supporting both GGUF and SafeTensors formats.
17
18 Key Classes:
19 - TinyLlamaSession: Main interface for model loading and text generation
20 - ModelConfig: Configuration object containing model parameters and metadata
21
22 Example Usage:
23 import tinyllama_cpp
24
25 # Basic CPU usage
26 session = tinyllama_cpp.TinyLlamaSession(
27 model_path="path/to/model.gguf",
28 tokenizer_path="path/to/tokenizer.json",
29 threads=4
30 )
31
32 # GPU usage with quantized KV cache
33 session = tinyllama_cpp.TinyLlamaSession(
34 model_path="path/to/model.gguf",
35 tokenizer_path="path/to/tokenizer.json",
36 threads=4,
37 n_gpu_layers=-1,
38 use_kv_quant=True
39 )
40
41 # Generate text
42 response = session.generate("What is AI?", steps=64, temperature=0.7)
43 print(response)
44
45 # Batch generation
46 prompts = ["What is AI?", "Explain quantum computing", "Tell me a joke"]
47 responses = session.generate_batch(prompts, steps=64)
48 for prompt, response in zip(prompts, responses):
49 print(f"Q: {prompt}\nA: {response}\n")
50 )pbdoc";
51
52 py::add_ostream_redirect(m, "ostream_redirect");
53
54 py::class_<ModelConfig> model_config_class(m, "ModelConfig", R"pbdoc(
55 Model configuration containing architecture parameters and metadata.
56
57 This class holds all the configuration parameters for a loaded model,
58 including architecture details, tokenizer information, and model metadata.
59 Most fields are automatically populated when loading a model.
60 )pbdoc");
61
62 model_config_class
63 .def(py::init<>(), "Create an empty ModelConfig object")
64 .def_readwrite("hidden_size", &ModelConfig::hidden_size, "Hidden dimension size of the model")
65 .def_readwrite("intermediate_size", &ModelConfig::intermediate_size, "Intermediate size in feed-forward layers")
66 .def_readwrite("num_attention_heads", &ModelConfig::num_attention_heads, "Number of attention heads")
67 .def_readwrite("num_key_value_heads", &ModelConfig::num_key_value_heads, "Number of key-value heads (for GQA)")
68 .def_readwrite("num_hidden_layers", &ModelConfig::num_hidden_layers, "Number of transformer layers")
69 .def_readwrite("vocab_size", &ModelConfig::vocab_size, "Vocabulary size")
70 .def_readwrite("max_position_embeddings", &ModelConfig::max_position_embeddings, "Maximum sequence length")
71 .def_readwrite("rms_norm_eps", &ModelConfig::rms_norm_eps, "RMS normalization epsilon")
72 .def_readwrite("rope_theta", &ModelConfig::rope_theta, "RoPE theta parameter")
73 .def_readwrite("hidden_act", &ModelConfig::hidden_act, "Activation function name")
74 .def_readwrite("torch_dtype", &ModelConfig::torch_dtype, "Original PyTorch data type")
75 .def_readwrite("bos_token_id", &ModelConfig::bos_token_id, "Beginning-of-sequence token ID")
76 .def_readwrite("eos_token_id", &ModelConfig::eos_token_id, "End-of-sequence token ID")
77 .def_readwrite("architecture", &ModelConfig::architecture, "Model architecture name")
78 .def_readwrite("model_name", &ModelConfig::model_name, "Model name")
79 .def_readwrite("chat_template_type", &ModelConfig::chat_template_type, "Chat template type")
80 .def_readwrite("pre_tokenizer_type", &ModelConfig::pre_tokenizer_type, "Pre-tokenizer type")
81 .def_readwrite("chat_template_string", &ModelConfig::chat_template_string, "Chat template string")
82 .def_readwrite("is_gguf_file_loaded", &ModelConfig::is_gguf_file_loaded, "Whether model was loaded from GGUF format")
83 .def_readonly("tokenizer_family", &ModelConfig::tokenizer_family, "Tokenizer family (LLAMA_SENTENCEPIECE or LLAMA3_TIKTOKEN)")
84 .def("__repr__",
85 [](const ModelConfig &cfg) {
86 std::string tf_str = "UNKNOWN";
87 if (cfg.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE) tf_str = "LLAMA_SENTENCEPIECE";
88 else if (cfg.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) tf_str = "LLAMA3_TIKTOKEN";
89 return "<ModelConfig: vocab_size=" + std::to_string(cfg.vocab_size) +
90 ", hidden_size=" + std::to_string(cfg.hidden_size) +
91 ", tokenizer_family=" + tf_str +
92 ">";
93 }
94 );
95
96 py::enum_<ModelConfig::TokenizerFamily>(model_config_class, "TokenizerFamily", "Enumeration of supported tokenizer families")
97 .value("UNKNOWN", ModelConfig::TokenizerFamily::UNKNOWN, "Unknown tokenizer family")
98 .value("LLAMA_SENTENCEPIECE", ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE, "Llama/Llama2 SentencePiece tokenizer")
99 .value("LLAMA3_TIKTOKEN", ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN, "Llama3 TikToken-style tokenizer")
100 .export_values();
101
102 py::class_<tinyllama::TinyLlamaSession>(m, "TinyLlamaSession", R"pbdoc(
103 Main interface for TinyLlama model inference.
104
105 This class provides a high-level interface for loading models and generating text.
106 It supports both GGUF and SafeTensors formats, CPU and GPU inference, and various
107 sampling strategies for text generation.
108
109 The session manages the model, tokenizer, and KV cache, providing both single
110 prompt generation and efficient batch processing capabilities.
111 )pbdoc")
112 .def(py::init([](const py::object& model_path, const py::object& tokenizer_path, int threads, int n_gpu_layers, bool use_mmap, bool use_kv_quant, bool use_batch_generation, int max_batch_size) {
113 // Convert path-like objects to strings
114 std::string model_path_str = py::str(model_path);
115 std::string tokenizer_path_str = py::str(tokenizer_path);
116 return new tinyllama::TinyLlamaSession(model_path_str, tokenizer_path_str, threads, n_gpu_layers, use_mmap, use_kv_quant, use_batch_generation, max_batch_size);
117 }),
118 py::arg("model_path"),
119 py::arg("tokenizer_path"),
120 py::arg("threads") = 1,
121 py::arg("n_gpu_layers") = 0,
122 py::arg("use_mmap") = true,
123 py::arg("use_kv_quant") = false,
124 py::arg("use_batch_generation") = false,
125 py::arg("max_batch_size") = 1,
126 R"pbdoc(
127 Initialize a TinyLlama inference session.
128
129 Args:
130 model_path (str or Path): Path to model directory (SafeTensors) or .gguf file
131 tokenizer_path (str or Path): Path to tokenizer.json file. For GGUF models with
132 embedded tokenizer, this can be the same as model_path
133 threads (int, optional): Number of CPU threads for inference. Defaults to 1.
134 n_gpu_layers (int, optional): Number of layers to offload to GPU.
135 -1 = all layers, 0 = CPU only. Defaults to 0.
136 use_mmap (bool, optional): Use memory mapping for model loading. Defaults to True.
137 use_kv_quant (bool, optional): Use INT8 quantization for KV cache on GPU.
138 Reduces VRAM usage. Defaults to False.
139 use_batch_generation (bool, optional): Enable optimized batch generation mode.
140 Defaults to False.
141 max_batch_size (int, optional): Maximum number of sequences for batch processing.
142 Defaults to 1.
143
144 Raises:
145 RuntimeError: If model loading fails due to invalid paths, unsupported format,
146 or insufficient resources.
147
148 Example:
149 # Basic CPU usage
150 session = TinyLlamaSession("model.gguf", "tokenizer.json", threads=4)
151
152 # Using pathlib.Path objects
153 from pathlib import Path
154 session = TinyLlamaSession(
155 model_path=Path("model.gguf"),
156 tokenizer_path=Path("tokenizer.json"),
157 threads=4,
158 n_gpu_layers=-1,
159 use_kv_quant=True
160 )
161 )pbdoc"
162 )
163 .def("generate", &tinyllama::TinyLlamaSession::generate,
164 py::arg("prompt"),
165 py::arg("steps") = 128,
166 py::arg("temperature") = 0.1f,
167 py::arg("top_k") = 40,
168 py::arg("top_p") = 0.9f,
169 py::arg("system_prompt") = "",
170 py::arg("apply_q_a_format") = true,
171 R"pbdoc(
172 Generate text based on a prompt using various sampling strategies.
173
174 Args:
175 prompt (str): Input text prompt to generate from
176 steps (int, optional): Number of tokens to generate. Defaults to 128.
177 temperature (float, optional): Sampling temperature. Lower values (0.1)
178 produce more focused/deterministic output,
179 higher values (1.0+) more creative/random.
180 Defaults to 0.1.
181 top_k (int, optional): Top-K sampling - limit to K most likely tokens.
182 Set to 0 to disable. Defaults to 40.
183 top_p (float, optional): Nucleus sampling - limit to tokens comprising
184 top P probability mass (0.0-1.0). Defaults to 0.9.
185 system_prompt (str, optional): System prompt to guide generation behavior.
186 Defaults to empty string.
187 apply_q_a_format (bool, optional): Apply Q:A formatting to prompt.
188 Recommended for most models. Defaults to True.
189
190 Returns:
191 str: Generated text (excluding the original prompt)
192
193 Raises:
194 RuntimeError: If generation fails due to tokenization errors or model issues.
195
196 Example:
197 # Basic generation
198 response = session.generate("What is artificial intelligence?")
199
200 # Creative generation with higher temperature
201 story = session.generate(
202 "Once upon a time",
203 steps=200,
204 temperature=0.8,
205 top_k=50
206 )
207
208 # Focused generation with system prompt
209 answer = session.generate(
210 "Explain quantum computing",
211 steps=100,
212 temperature=0.1,
213 system_prompt="You are a helpful physics teacher."
214 )
215 )pbdoc"
216 )
217 .def("generate_batch", &tinyllama::TinyLlamaSession::generate_batch,
218 py::arg("prompts"),
219 py::arg("steps") = 128,
220 py::arg("temperature") = 0.1f,
221 py::arg("top_k") = 40,
222 py::arg("top_p") = 0.9f,
223 py::arg("system_prompt") = "",
224 py::arg("apply_q_a_format") = true,
225 R"pbdoc(
226 Generate text for multiple prompts in parallel (batch processing).
227
228 This method processes multiple independent prompts simultaneously, providing
229 significant efficiency gains over sequential generate() calls. Each prompt
230 maintains its own KV cache state and is processed independently.
231
232 Args:
233 prompts (List[str]): List of input prompts to process in batch
234 steps (int, optional): Number of tokens to generate per prompt. Defaults to 128.
235 temperature (float, optional): Sampling temperature applied to all prompts.
236 Defaults to 0.1.
237 top_k (int, optional): Top-K sampling parameter for all prompts. Defaults to 40.
238 top_p (float, optional): Nucleus sampling parameter for all prompts. Defaults to 0.9.
239 system_prompt (str, optional): System prompt applied to all prompts.
240 Defaults to empty string.
241 apply_q_a_format (bool, optional): Apply Q:A formatting to all prompts.
242 Defaults to True.
243
244 Returns:
245 List[str]: List of generated text strings, one for each input prompt
246
247 Raises:
248 RuntimeError: If batch generation fails or prompts list is empty.
249
250 Example:
251 prompts = [
252 "What is machine learning?",
253 "Explain neural networks",
254 "How does backpropagation work?"
255 ]
256
257 responses = session.generate_batch(
258 prompts,
259 steps=100,
260 temperature=0.2
261 )
262
263 for prompt, response in zip(prompts, responses):
264 print(f"Q: {prompt}")
265 print(f"A: {response}\n")
266 )pbdoc"
267 )
268 .def("get_config", &tinyllama::TinyLlamaSession::get_config,
269 py::return_value_policy::reference_internal,
270 R"pbdoc(
271 Get the model configuration.
272
273 Returns:
274 ModelConfig: Reference to the session's model configuration containing
275 architecture parameters, tokenizer info, and metadata.
276
277 Example:
278 config = session.get_config()
279 print(f"Model has {config.num_hidden_layers} layers")
280 print(f"Vocabulary size: {config.vocab_size}")
281 print(f"Hidden size: {config.hidden_size}")
282 )pbdoc"
283 );
284}
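The ModelConfig bindings and the TokenizerFamily enum registered above are mainly useful for inspecting a model after it has been loaded. A minimal sketch of that, importing the extension under the name it is compiled with here (tinyllama_bindings; the docstring examples above go through a tinyllama_cpp package instead) and using placeholder file paths:

    import tinyllama_bindings

    # Placeholder paths; a GGUF file with an embedded tokenizer can be passed
    # for both arguments, as noted in the constructor docstring.
    session = tinyllama_bindings.TinyLlamaSession(
        model_path="model.gguf",
        tokenizer_path="model.gguf",
        threads=4,
    )

    config = session.get_config()   # ModelConfig tied to the session's lifetime (reference_internal)
    print(config)                   # uses the __repr__ defined above
    print("layers:", config.num_hidden_layers)
    print("context length:", config.max_position_embeddings)
    print("loaded from GGUF:", config.is_gguf_file_loaded)

    # TokenizerFamily is scoped to ModelConfig; export_values() additionally
    # re-exports the individual values onto ModelConfig itself.
    Family = tinyllama_bindings.ModelConfig.TokenizerFamily
    if config.tokenizer_family == Family.LLAMA3_TIKTOKEN:
        print("Llama 3 style (TikToken) tokenizer")
    elif config.tokenizer_family == Family.LLAMA_SENTENCEPIECE:
        print("Llama/Llama 2 SentencePiece tokenizer")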
PYBIND11_MODULE(tinyllama_bindings, m) - Definition: bindings.cpp:11
tinyllama::TinyLlamaSession - Represents an active TinyLlama session holding the loaded model and tokenizer. Definition: api.h:26
std::string generate(const std::string &prompt, int steps=128, float temperature=0.1f, int top_k=40, float top_p=0.9f, const std::string &system_prompt="", bool apply_q_a_format=false) - Generates text based on a given prompt. Definition: api.cpp:433
std::vector< std::string > generate_batch(const std::vector< std::string > &prompts, int steps=128, float temperature=0.1f, int top_k=40, float top_p=0.9f, const std::string &system_prompt="", bool apply_q_a_format=false) - Generates text for multiple prompts in a single batch (parallel processing). Definition: api.cpp:780
const ModelConfig & get_config() const - Definition: api.h:106
model_constants.h - Constants used throughout the TinyLlama model implementation.
ModelConfig - Model configuration structure holding architecture and hyperparameters. Definition: model.h:80
ModelConfig members (model.h): int hidden_size (81), int intermediate_size (82), int num_attention_heads (83),
int num_key_value_heads (84), int num_hidden_layers (85), int vocab_size (86), int max_position_embeddings (87),
float rms_norm_eps (88), float rope_theta (89), std::string hidden_act (90), std::string torch_dtype (91),
int bos_token_id (92), int eos_token_id (93), std::string architecture (96), std::string model_name (97),
std::string chat_template_type (98), std::string pre_tokenizer_type (99), std::string chat_template_string (100),
bool is_gguf_file_loaded (101), TokenizerFamily tokenizer_family (117)