#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/iostream.h>

namespace py = pybind11;

PYBIND11_MODULE(tinyllama_bindings, m) {
    m.doc() = R"pbdoc(
        TinyLlama.cpp Python Bindings
        =============================

        Python bindings for the tinyllama.cpp inference engine, supporting both the GGUF
        and SafeTensors model formats.

        Classes:
            - TinyLlamaSession: Main interface for model loading and text generation
            - ModelConfig: Configuration object containing model parameters and metadata
        Example:
            # Basic CPU usage
            session = tinyllama_cpp.TinyLlamaSession(
                model_path="path/to/model.gguf",
                tokenizer_path="path/to/tokenizer.json",
            )

            # GPU usage with quantized KV cache
            session = tinyllama_cpp.TinyLlamaSession(
                model_path="path/to/model.gguf",
                tokenizer_path="path/to/tokenizer.json",
                n_gpu_layers=-1,
                use_kv_quant=True,
            )

            # Single-prompt generation
            response = session.generate("What is AI?", steps=64, temperature=0.7)

            # Batch generation
            prompts = ["What is AI?", "Explain quantum computing", "Tell me a joke"]
            responses = session.generate_batch(prompts, steps=64)
            for prompt, response in zip(prompts, responses):
                print(f"Q: {prompt}\nA: {response}\n")
    )pbdoc";

    py::add_ostream_redirect(m, "ostream_redirect");
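    // add_ostream_redirect() (from pybind11/iostream.h) registers a Python helper class named
    // "ostream_redirect" that forwards C++ std::cout/std::cerr to Python's sys.stdout/sys.stderr
    // while active. Illustrative usage from Python (module name assumed to be tinyllama_bindings):
    //
    //   with tinyllama_bindings.ostream_redirect(stdout=True, stderr=True):
    //       session.generate("...")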

    py::class_<ModelConfig> model_config_class(m, "ModelConfig", R"pbdoc(
        Model configuration containing architecture parameters and metadata.

        This class holds all the configuration parameters for a loaded model,
        including architecture details, tokenizer information, and model metadata.
        Most fields are automatically populated when loading a model.
    )pbdoc");

    model_config_class
        .def(py::init<>(), "Create an empty ModelConfig object")
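        // An empty config is rarely constructed by hand from Python; in practice, populated
        // instances come back from TinyLlamaSession.get_config() (bound further below).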
        .def("__repr__", [](const ModelConfig& cfg) {
            // Printable tokenizer-family label; "UNKNOWN" is the fallback when the family
            // cannot be resolved from cfg.tokenizer_family.
            std::string tf_str = "UNKNOWN";
            return "<ModelConfig: vocab_size=" + std::to_string(cfg.vocab_size) +
                   ", hidden_size=" + std::to_string(cfg.hidden_size) +
                   ", tokenizer_family=" + tf_str + ">";
        });

    py::enum_<ModelConfig::TokenizerFamily>(model_config_class, "TokenizerFamily",
                                            "Enumeration of supported tokenizer families");

    py::class_<tinyllama::TinyLlamaSession>(m, "TinyLlamaSession", R"pbdoc(
        Main interface for TinyLlama model inference.

        This class provides a high-level interface for loading models and generating text.
        It supports both GGUF and SafeTensors formats, CPU and GPU inference, and various
        sampling strategies for text generation.

        The session manages the model, tokenizer, and KV cache, providing both single-prompt
        generation and efficient batch-processing capabilities.
    )pbdoc")
        .def(py::init([](const py::object& model_path,
                         const py::object& tokenizer_path,
                         int threads,
                         int n_gpu_layers,
                         bool use_mmap,
                         bool use_kv_quant,
                         bool use_batch_generation,
                         int max_batch_size) {
                 // Accept str or pathlib.Path: py::str() stringifies the Python object.
                 std::string model_path_str = py::str(model_path);
                 std::string tokenizer_path_str = py::str(tokenizer_path);
                 return new tinyllama::TinyLlamaSession(model_path_str, tokenizer_path_str,
                                                        threads, n_gpu_layers, use_mmap,
                                                        use_kv_quant, use_batch_generation,
                                                        max_batch_size);
             }),
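             // The factory returns a raw pointer; pybind11 takes ownership and stores it in
             // the class holder (std::unique_ptr by default), so the session is destroyed
             // when the Python object is garbage-collected.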
             py::arg("model_path"),
             py::arg("tokenizer_path"),
             py::arg("threads") = 1,
             py::arg("n_gpu_layers") = 0,
             py::arg("use_mmap") = true,
             py::arg("use_kv_quant") = false,
             py::arg("use_batch_generation") = false,
             py::arg("max_batch_size") = 1,
             R"pbdoc(
        Initialize a TinyLlama inference session.

        Args:
            model_path (str or Path): Path to the model directory (SafeTensors) or a .gguf file.
            tokenizer_path (str or Path): Path to the tokenizer.json file. For GGUF models with
                an embedded tokenizer, this can be the same as model_path.
            threads (int, optional): Number of CPU threads for inference. Defaults to 1.
            n_gpu_layers (int, optional): Number of layers to offload to the GPU.
                -1 = all layers, 0 = CPU only. Defaults to 0.
            use_mmap (bool, optional): Use memory mapping for model loading. Defaults to True.
            use_kv_quant (bool, optional): Use INT8 quantization for the KV cache on GPU.
                Reduces VRAM usage. Defaults to False.
            use_batch_generation (bool, optional): Enable optimized batch-generation mode.
                Defaults to False.
            max_batch_size (int, optional): Maximum number of sequences for batch processing.
                Defaults to 1.

        Raises:
            RuntimeError: If model loading fails due to invalid paths, an unsupported format,
                or insufficient resources.

        Example:
            session = TinyLlamaSession("model.gguf", "tokenizer.json", threads=4)

            # Using pathlib.Path objects
            from pathlib import Path
            session = TinyLlamaSession(
                model_path=Path("model.gguf"),
                tokenizer_path=Path("tokenizer.json"),
            )
        )pbdoc")
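        // Illustrative sketch of a batch-oriented session: the batch flags bound above pair
        // with generate_batch() below; the thread count and batch size are example values only.
        //
        //   session = TinyLlamaSession(
        //       "model.gguf", "tokenizer.json",
        //       threads=8,
        //       use_batch_generation=True,
        //       max_batch_size=4,
        //   )
        //   responses = session.generate_batch(["prompt one", "prompt two"], steps=64)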

        .def("generate", &tinyllama::TinyLlamaSession::generate,
             py::arg("prompt"),
             py::arg("steps") = 128,
             py::arg("temperature") = 0.1f,
             py::arg("top_k") = 40,
             py::arg("top_p") = 0.9f,
             py::arg("system_prompt") = "",
             py::arg("apply_q_a_format") = true,
             R"pbdoc(
        Generate text based on a prompt using various sampling strategies.

        Args:
            prompt (str): Input text prompt to generate from.
            steps (int, optional): Number of tokens to generate. Defaults to 128.
            temperature (float, optional): Sampling temperature. Lower values (e.g. 0.1)
                produce more focused/deterministic output; higher values (1.0+) produce
                more creative/random output. Defaults to 0.1.
            top_k (int, optional): Top-K sampling - limit sampling to the K most likely tokens.
                Set to 0 to disable. Defaults to 40.
            top_p (float, optional): Nucleus sampling - limit sampling to the tokens comprising
                the top P probability mass (0.0-1.0). Defaults to 0.9.
            system_prompt (str, optional): System prompt to guide generation behavior.
                Defaults to an empty string.
            apply_q_a_format (bool, optional): Apply Q:A formatting to the prompt.
                Recommended for most models. Defaults to True.

        Returns:
            str: Generated text (excluding the original prompt).

        Raises:
            RuntimeError: If generation fails due to tokenization errors or model issues.

        Example:
            response = session.generate("What is artificial intelligence?")

            # Creative generation with higher temperature
            story = session.generate(
                "Tell me a story",
                temperature=1.0,
            )

            # Focused generation with a system prompt
            answer = session.generate(
                "Explain quantum computing",
                system_prompt="You are a helpful physics teacher.",
            )
        )pbdoc")

        .def("generate_batch", &tinyllama::TinyLlamaSession::generate_batch,
             py::arg("prompts"),
             py::arg("steps") = 128,
             py::arg("temperature") = 0.1f,
             py::arg("top_k") = 40,
             py::arg("top_p") = 0.9f,
             py::arg("system_prompt") = "",
             py::arg("apply_q_a_format") = true,
             R"pbdoc(
        Generate text for multiple prompts in parallel (batch processing).

        This method processes multiple independent prompts simultaneously, providing
        significant efficiency gains over sequential generate() calls. Each prompt
        maintains its own KV cache state and is processed independently.

        Args:
            prompts (List[str]): List of input prompts to process in a single batch.
            steps (int, optional): Number of tokens to generate per prompt. Defaults to 128.
            temperature (float, optional): Sampling temperature applied to all prompts.
                Defaults to 0.1.
            top_k (int, optional): Top-K sampling parameter for all prompts. Defaults to 40.
            top_p (float, optional): Nucleus sampling parameter for all prompts. Defaults to 0.9.
            system_prompt (str, optional): System prompt applied to all prompts.
                Defaults to an empty string.
            apply_q_a_format (bool, optional): Apply Q:A formatting to all prompts.
                Defaults to True.

        Returns:
            List[str]: Generated text strings, one for each input prompt.

        Raises:
            RuntimeError: If batch generation fails or the prompts list is empty.

        Example:
            prompts = [
                "What is machine learning?",
                "Explain neural networks",
                "How does backpropagation work?"
            ]

            responses = session.generate_batch(
                prompts,
                steps=64,
            )

            for prompt, response in zip(prompts, responses):
                print(f"Q: {prompt}")
                print(f"A: {response}\n")
        )pbdoc")

        .def("get_config", &tinyllama::TinyLlamaSession::get_config,
             py::return_value_policy::reference_internal,
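             // reference_internal returns the config by reference and keeps the session
             // alive for as long as the returned ModelConfig is referenced from Python.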
             R"pbdoc(
        Get the model configuration.

        Returns:
            ModelConfig: Reference to the session's model configuration containing
                architecture parameters, tokenizer info, and metadata.

        Example:
            config = session.get_config()
            print(f"Model has {config.num_hidden_layers} layers")
            print(f"Vocabulary size: {config.vocab_size}")
            print(f"Hidden size: {config.hidden_size}")
        )pbdoc");
}