TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
bindings.cpp File Reference
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/iostream.h>
#include "model_constants.h"
#include "model.h"
#include "api.h"
Include dependency graph for bindings.cpp:

Go to the source code of this file.

Functions

 PYBIND11_MODULE (tinyllama_bindings, m)
 

Function Documentation

◆ PYBIND11_MODULE()

PYBIND11_MODULE ( tinyllama_bindings, m )

Definition at line 11 of file bindings.cpp.

11 {
12 m.doc() = R"pbdoc(
13 TinyLlama.cpp Python Bindings
14 =============================
15
16 Python bindings for the tinyllama.cpp inference engine supporting both GGUF and SafeTensors formats.
17
18 Key Classes:
19 - TinyLlamaSession: Main interface for model loading and text generation
20 - ModelConfig: Configuration object containing model parameters and metadata
21
22 Example Usage:
23 import tinyllama_cpp
24
25 # Basic CPU usage
26 session = tinyllama_cpp.TinyLlamaSession(
27 model_path="path/to/model.gguf",
28 tokenizer_path="path/to/tokenizer.json",
29 threads=4
30 )
31
32 # GPU usage with quantized KV cache
33 session = tinyllama_cpp.TinyLlamaSession(
34 model_path="path/to/model.gguf",
35 tokenizer_path="path/to/tokenizer.json",
36 threads=4,
37 n_gpu_layers=-1,
38 use_kv_quant=True
39 )
40
41 # Generate text
42 response = session.generate("What is AI?", steps=64, temperature=0.7)
43 print(response)
44
45 # Batch generation
46 prompts = ["What is AI?", "Explain quantum computing", "Tell me a joke"]
47 responses = session.generate_batch(prompts, steps=64)
48 for prompt, response in zip(prompts, responses):
49 print(f"Q: {prompt}\nA: {response}\n")
50 )pbdoc";
51
52 py::add_ostream_redirect(m, "ostream_redirect");
53
54 py::class_<ModelConfig> model_config_class(m, "ModelConfig", R"pbdoc(
55 Model configuration containing architecture parameters and metadata.
56
57 This class holds all the configuration parameters for a loaded model,
58 including architecture details, tokenizer information, and model metadata.
59 Most fields are automatically populated when loading a model.
60 )pbdoc");
61
62 model_config_class
63 .def(py::init<>(), "Create an empty ModelConfig object")
64 .def_readwrite("hidden_size", &ModelConfig::hidden_size, "Hidden dimension size of the model")
65 .def_readwrite("intermediate_size", &ModelConfig::intermediate_size, "Intermediate size in feed-forward layers")
66 .def_readwrite("num_attention_heads", &ModelConfig::num_attention_heads, "Number of attention heads")
67 .def_readwrite("num_key_value_heads", &ModelConfig::num_key_value_heads, "Number of key-value heads (for GQA)")
68 .def_readwrite("num_hidden_layers", &ModelConfig::num_hidden_layers, "Number of transformer layers")
69 .def_readwrite("vocab_size", &ModelConfig::vocab_size, "Vocabulary size")
70 .def_readwrite("max_position_embeddings", &ModelConfig::max_position_embeddings, "Maximum sequence length")
71 .def_readwrite("rms_norm_eps", &ModelConfig::rms_norm_eps, "RMS normalization epsilon")
72 .def_readwrite("rope_theta", &ModelConfig::rope_theta, "RoPE theta parameter")
73 .def_readwrite("hidden_act", &ModelConfig::hidden_act, "Activation function name")
74 .def_readwrite("torch_dtype", &ModelConfig::torch_dtype, "Original PyTorch data type")
75 .def_readwrite("bos_token_id", &ModelConfig::bos_token_id, "Beginning-of-sequence token ID")
76 .def_readwrite("eos_token_id", &ModelConfig::eos_token_id, "End-of-sequence token ID")
77 .def_readwrite("architecture", &ModelConfig::architecture, "Model architecture name")
78 .def_readwrite("model_name", &ModelConfig::model_name, "Model name")
79 .def_readwrite("chat_template_type", &ModelConfig::chat_template_type, "Chat template type")
80 .def_readwrite("pre_tokenizer_type", &ModelConfig::pre_tokenizer_type, "Pre-tokenizer type")
81 .def_readwrite("chat_template_string", &ModelConfig::chat_template_string, "Chat template string")
82 .def_readwrite("is_gguf_file_loaded", &ModelConfig::is_gguf_file_loaded, "Whether model was loaded from GGUF format")
83 .def_readonly("tokenizer_family", &ModelConfig::tokenizer_family, "Tokenizer family (LLAMA_SENTENCEPIECE or LLAMA3_TIKTOKEN)")
84 .def("__repr__",
85 [](const ModelConfig &cfg) {
86 std::string tf_str = "UNKNOWN";
87 if (cfg.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE) tf_str = "LLAMA_SENTENCEPIECE";
88 else if (cfg.tokenizer_family == ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN) tf_str = "LLAMA3_TIKTOKEN";
89 return "<ModelConfig: vocab_size=" + std::to_string(cfg.vocab_size) +
90 ", hidden_size=" + std::to_string(cfg.hidden_size) +
91 ", tokenizer_family=" + tf_str +
92 ">";
93 }
94 );
95
96 py::enum_<ModelConfig::TokenizerFamily>(model_config_class, "TokenizerFamily", "Enumeration of supported tokenizer families")
97 .value("UNKNOWN", ModelConfig::TokenizerFamily::UNKNOWN, "Unknown tokenizer family")
98 .value("LLAMA_SENTENCEPIECE", ModelConfig::TokenizerFamily::LLAMA_SENTENCEPIECE, "Llama/Llama2 SentencePiece tokenizer")
99 .value("LLAMA3_TIKTOKEN", ModelConfig::TokenizerFamily::LLAMA3_TIKTOKEN, "Llama3 TikToken-style tokenizer")
100 .export_values();
101
102 py::class_<tinyllama::TinyLlamaSession>(m, "TinyLlamaSession", R"pbdoc(
103 Main interface for TinyLlama model inference.
104
105 This class provides a high-level interface for loading models and generating text.
106 It supports both GGUF and SafeTensors formats, CPU and GPU inference, and various
107 sampling strategies for text generation.
108
109 The session manages the model, tokenizer, and KV cache, providing both single
110 prompt generation and efficient batch processing capabilities.
111 )pbdoc")
112 .def(py::init([](const py::object& model_path, const py::object& tokenizer_path, int threads, int n_gpu_layers, bool use_mmap, bool use_kv_quant, bool use_batch_generation, int max_batch_size) {
113 // Convert path-like objects to strings
114 std::string model_path_str = py::str(model_path);
115 std::string tokenizer_path_str = py::str(tokenizer_path);
116 return new tinyllama::TinyLlamaSession(model_path_str, tokenizer_path_str, threads, n_gpu_layers, use_mmap, use_kv_quant, use_batch_generation, max_batch_size);
117 }),
118 py::arg("model_path"),
119 py::arg("tokenizer_path"),
120 py::arg("threads") = 1,
121 py::arg("n_gpu_layers") = 0,
122 py::arg("use_mmap") = true,
123 py::arg("use_kv_quant") = false,
124 py::arg("use_batch_generation") = false,
125 py::arg("max_batch_size") = 1,
126 R"pbdoc(
127 Initialize a TinyLlama inference session.
128
129 Args:
130 model_path (str or Path): Path to model directory (SafeTensors) or .gguf file
131 tokenizer_path (str or Path): Path to tokenizer.json file. For GGUF models with
132 embedded tokenizer, this can be the same as model_path
133 threads (int, optional): Number of CPU threads for inference. Defaults to 1.
134 n_gpu_layers (int, optional): Number of layers to offload to GPU.
135 -1 = all layers, 0 = CPU only. Defaults to 0.
136 use_mmap (bool, optional): Use memory mapping for model loading. Defaults to True.
137 use_kv_quant (bool, optional): Use INT8 quantization for KV cache on GPU.
138 Reduces VRAM usage. Defaults to False.
139 use_batch_generation (bool, optional): Enable optimized batch generation mode.
140 Defaults to False.
141 max_batch_size (int, optional): Maximum number of sequences for batch processing.
142 Defaults to 1.
143
144 Raises:
145 RuntimeError: If model loading fails due to invalid paths, unsupported format,
146 or insufficient resources.
147
148 Example:
149 # Basic CPU usage
150 session = TinyLlamaSession("model.gguf", "tokenizer.json", threads=4)
151
152 # Using pathlib.Path objects

References ModelConfig::architecture, ModelConfig::bos_token_id, ModelConfig::chat_template_string, ModelConfig::chat_template_type, ModelConfig::eos_token_id, tinyllama::TinyLlamaSession::generate(), tinyllama::TinyLlamaSession::generate_batch(), tinyllama::TinyLlamaSession::get_config(), ModelConfig::hidden_act, ModelConfig::hidden_size, ModelConfig::intermediate_size, ModelConfig::is_gguf_file_loaded, ModelConfig::LLAMA3_TIKTOKEN, ModelConfig::LLAMA_SENTENCEPIECE, ModelConfig::max_position_embeddings, ModelConfig::model_name, ModelConfig::num_attention_heads, ModelConfig::num_hidden_layers, ModelConfig::num_key_value_heads, ModelConfig::pre_tokenizer_type, ModelConfig::rms_norm_eps, ModelConfig::rope_theta, ModelConfig::tokenizer_family, ModelConfig::torch_dtype, ModelConfig::UNKNOWN, and ModelConfig::vocab_size.
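
A minimal usage sketch of the bindings defined in this file, assuming the compiled extension is importable under the module name passed to PYBIND11_MODULE (tinyllama_bindings; the packaged wrapper in the module docstring is imported as tinyllama_cpp instead). The constructor keywords and defaults are the py::arg bindings shown above; generate() and get_config() are among the TinyLlamaSession methods listed under References. Model and tokenizer paths are placeholders.

import tinyllama_bindings as tl

# Constructor keyword arguments and defaults mirror the py::arg bindings above.
session = tl.TinyLlamaSession(
    model_path="path/to/model.gguf",
    tokenizer_path="path/to/tokenizer.json",
    threads=4,
    n_gpu_layers=0,      # 0 = CPU only, -1 = offload all layers
    use_kv_quant=False,  # INT8 KV-cache quantization on GPU
)

# get_config() returns the bound ModelConfig; its fields and the
# TokenizerFamily enum are exposed by the bindings above.
cfg = session.get_config()
print(cfg)  # formatted by the __repr__ lambda bound above
if cfg.tokenizer_family == tl.ModelConfig.TokenizerFamily.LLAMA3_TIKTOKEN:
    print("Llama3 TikToken-style tokenizer")

# add_ostream_redirect(m, "ostream_redirect") registers a context manager
# that forwards C++ std::cout/std::cerr output to Python's sys.stdout/sys.stderr.
with tl.ostream_redirect(stdout=True, stderr=True):
    print(session.generate("What is AI?", steps=64, temperature=0.7))

Whether GPU offload (n_gpu_layers != 0) and KV-cache quantization actually take effect depends on how the extension was built.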