TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
api.h
#ifndef TINYLLAMA_API_H
#define TINYLLAMA_API_H

#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
#include <random>
#include <sstream>

#include "model.h"
#include "tokenizer.h"

struct KVCache;

namespace tinyllama {

/// Represents an active TinyLlama session holding the loaded model and
/// tokenizer.
class TinyLlamaSession {
 public:
  TinyLlamaSession(const std::string& model_path,
                   const std::string& tokenizer_path, int threads = 1,
                   int num_gpu_layers_from_cli = 0, bool cli_use_mmap = true,
                   bool use_kv_quant = false, bool use_batch_generation = false,
                   int max_batch_size = 1);

  /// Destructor to ensure proper cleanup (e.g., KVCache CUDA memory).
  ~TinyLlamaSession();

  /// Sessions own unique model/tokenizer resources and are non-copyable.
  TinyLlamaSession(const TinyLlamaSession&) = delete;
  TinyLlamaSession& operator=(const TinyLlamaSession&) = delete;

  /// Generates text based on a given prompt (see the usage sketch after this
  /// listing).
  std::string generate(const std::string& prompt, int steps = 128,
                       float temperature = 0.1f, int top_k = 40,
                       float top_p = 0.9f,
                       const std::string& system_prompt = "",
                       bool apply_q_a_format = false);

  /// Generates text for multiple prompts in a single batch (parallel
  /// processing); see the batch sketch after this listing.
  std::vector<std::string> generate_batch(
      const std::vector<std::string>& prompts, int steps = 128,
      float temperature = 0.1f, int top_k = 40, float top_p = 0.9f,
      const std::string& system_prompt = "", bool apply_q_a_format = false);

  const Tokenizer* get_tokenizer() const { return tokenizer_.get(); }
  const ModelConfig& get_config() const { return config_; }
  KVCache& get_kv_cache();

 private:
  // Parallel batch processing methods
  bool batch_prefill_parallel(
      const std::vector<std::vector<int>>& all_tokens,
      const std::vector<int>& prompt_lengths,
      std::vector<std::vector<float>>& batch_final_logits);

  bool batch_generation_parallel(
      const std::vector<int>& current_tokens,
      const std::vector<int>& token_positions,
      const std::vector<int>& original_sequence_indices,
      std::vector<std::vector<float>>& batch_logits);

  std::unique_ptr<TinyLlamaModel> model_;
  std::unique_ptr<Tokenizer> tokenizer_;
  ModelConfig config_;
  std::mt19937 rng_{std::random_device{}()};   // RNG for sampling
  bool use_batch_generation_;                  // Enable batch generation
  std::stringstream generated_stream_;         // Buffers streamed output
  std::string generated_text_for_api_return_;  // Accumulates the full response

  // Batch processing support
  int max_batch_size_ = 1;  // Maximum number of sequences for batch processing
};

}  // namespace tinyllama

#endif  // TINYLLAMA_API_H
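
A minimal sketch of the single-prompt path, assuming the header above is on the include path. The model and tokenizer paths, the prompt, and the sampling values are illustrative placeholders, not values mandated by the project; every sampling argument also has a default and could be omitted.

// usage_sketch.cpp -- minimal single-prompt example (illustrative paths).
#include <iostream>
#include <string>

#include "api.h"

int main() {
    // Hypothetical file locations; substitute a real model/tokenizer pair.
    tinyllama::TinyLlamaSession session("models/tinyllama.gguf",
                                        "models/tokenizer.model",
                                        /*threads=*/4);

    // The sampling arguments are spelled out here only for clarity.
    std::string reply = session.generate("What is the capital of France?",
                                         /*steps=*/64,
                                         /*temperature=*/0.1f,
                                         /*top_k=*/40,
                                         /*top_p=*/0.9f,
                                         /*system_prompt=*/"",
                                         /*apply_q_a_format=*/true);

    std::cout << reply << std::endl;
    return 0;
}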
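
For the batch path, generate_batch pairs with the use_batch_generation and max_batch_size constructor parameters. The sketch below follows the header's parameter order; the paths, flag values, and prompts are again assumptions for illustration.

// batch_sketch.cpp -- several completions in one batch (illustrative values).
#include <iostream>
#include <string>
#include <vector>

#include "api.h"

int main() {
    tinyllama::TinyLlamaSession session(
        "models/tinyllama.gguf", "models/tokenizer.model",
        /*threads=*/4,
        /*num_gpu_layers_from_cli=*/0,
        /*cli_use_mmap=*/true,
        /*use_kv_quant=*/false,
        /*use_batch_generation=*/true,  // enable the parallel batch path
        /*max_batch_size=*/4);

    std::vector<std::string> prompts = {
        "Explain KV caching in one sentence.",
        "Explain top-k sampling in one sentence."};

    // One completion per input prompt, returned in the same order.
    std::vector<std::string> replies = session.generate_batch(prompts,
                                                              /*steps=*/64);
    for (const std::string& r : replies) {
        std::cout << r << "\n";
    }
    return 0;
}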