const std::string& tokenizer_path,
int threads = 1,
int num_gpu_layers_from_cli = 0,
bool cli_use_mmap = true,
bool use_kv_quant = false,
bool use_batch_generation = false,
int max_batch_size = 1);
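These trailing constructor parameters configure threading, GPU offload, mmap, KV-cache quantization, and batching. A minimal construction sketch, assuming a header named "tinyllama.h" and a leading model-path argument (neither is shown in this fragment):

#include "tinyllama.h"  // assumed header name

int main() {
    // 4 CPU threads, no GPU layers, mmap enabled, no KV-cache quantization,
    // batch generation enabled for up to 8 concurrent prompts.
    TinyLlamaSession session(
        "models/tinyllama.gguf",   // assumed leading model-path argument
        "models/tokenizer.model",  // tokenizer_path
        4,                         // threads
        0,                         // num_gpu_layers_from_cli
        true,                      // cli_use_mmap
        false,                     // use_kv_quant
        true,                      // use_batch_generation
        8);                        // max_batch_size
    return 0;
}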
std::string generate(const std::string& prompt,
                     int steps = 128,
                     float temperature = 0.1f,
                     int top_k = 40,
                     float top_p = 0.9f,
                     const std::string& system_prompt = "",
                     bool apply_q_a_format = false);
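generate() runs single-prompt decoding with the sampling controls above. A usage sketch continuing the construction example (prompt and system prompt text are illustrative):

// Inside main() from the construction sketch above; <iostream> and <string> needed.
// Low-temperature sampling with default top_k/top_p and Q/A formatting disabled.
std::string answer = session.generate(
    "What does a key-value cache store?",  // prompt (illustrative)
    128,                                   // steps
    0.1f,                                  // temperature
    40,                                    // top_k
    0.9f,                                  // top_p
    "You are a concise assistant.",        // system_prompt (illustrative)
    false);                                // apply_q_a_format
std::cout << answer << std::endl;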
std::vector<std::string> generate_batch(const std::vector<std::string>& prompts,
                                        int steps = 128,
                                        float temperature = 0.1f,
                                        int top_k = 40,
                                        float top_p = 0.9f,
                                        const std::string& system_prompt = "",
                                        bool apply_q_a_format = false);
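generate_batch() applies the same sampling controls to several prompts in one call. A sketch, assuming the session was constructed with use_batch_generation = true and a max_batch_size of at least prompts.size():

// Inside main() from the construction sketch above; <vector> and <iostream> needed.
std::vector<std::string> prompts = {
    "Summarize attention in one sentence.",
    "Summarize quantization in one sentence."};
std::vector<std::string> answers = session.generate_batch(
    prompts, 128, 0.1f, 40, 0.9f, "", false);
for (const std::string& a : answers) {
    std::cout << a << "\n";
}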
bool batch_prefill_parallel(const std::vector<std::vector<int>>& all_tokens,
                            const std::vector<int>& prompt_lengths,
                            std::vector<std::vector<float>>& batch_final_logits);
bool batch_generation_parallel(const std::vector<int>& current_tokens,
                               const std::vector<int>& token_positions,
                               const std::vector<int>& original_sequence_indices,
                               std::vector<std::vector<float>>& batch_logits);
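These two helpers suggest the usual prefill/decode split for batched generation: one pass over every prompt's tokens to populate the KV cache and return each prompt's final logits, then repeated one-token steps across the batch. A generic driver-loop sketch under that assumption (not this class's actual code; the position/index semantics are assumed, greedy argmax stands in for the real sampler, and the helpers are passed in as callbacks):

#include <algorithm>
#include <functional>
#include <vector>

using PrefillFn = std::function<bool(const std::vector<std::vector<int>>&,
                                     const std::vector<int>&,
                                     std::vector<std::vector<float>>&)>;
using DecodeFn = std::function<bool(const std::vector<int>&,
                                    const std::vector<int>&,
                                    const std::vector<int>&,
                                    std::vector<std::vector<float>>&)>;

// Prefill once, then decode one token per sequence per step (greedy argmax).
std::vector<std::vector<int>> drive_batch(std::vector<std::vector<int>> tokens,
                                          int steps,
                                          const PrefillFn& prefill,
                                          const DecodeFn& decode) {
    std::vector<int> lengths;
    for (const auto& t : tokens) lengths.push_back(static_cast<int>(t.size()));

    std::vector<std::vector<float>> logits;  // one logits vector per sequence
    if (!prefill(tokens, lengths, logits)) return tokens;

    for (int step = 0; step < steps; ++step) {
        std::vector<int> next_tokens, positions, seq_indices;
        for (std::size_t i = 0; i < tokens.size(); ++i) {
            const std::vector<float>& l = logits[i];
            int next = static_cast<int>(
                std::max_element(l.begin(), l.end()) - l.begin());
            tokens[i].push_back(next);
            next_tokens.push_back(next);
            positions.push_back(static_cast<int>(tokens[i].size()) - 1);
            seq_indices.push_back(static_cast<int>(i));
        }
        if (!decode(next_tokens, positions, seq_indices, logits)) break;
    }
    return tokens;
}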
std::mt19937 rng_{std::random_device{}()};
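The RNG member is seeded once from std::random_device and would drive token sampling. A sketch of typical temperature plus top-k sampling over a logits vector with such an RNG (illustrative only, not this class's actual sampler; a real implementation also applies top-p and treats temperature 0 as greedy argmax):

#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

// Sample a token id from raw logits with temperature scaling and top-k filtering.
// temperature is assumed > 0 here.
int sample_token(const std::vector<float>& logits, float temperature, int top_k,
                 std::mt19937& rng) {
    // Rank token ids by logit and keep the top_k best.
    std::vector<int> ids(logits.size());
    std::iota(ids.begin(), ids.end(), 0);
    if (top_k > 0 && top_k < static_cast<int>(ids.size())) {
        std::partial_sort(ids.begin(), ids.begin() + top_k, ids.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });
        ids.resize(top_k);
    }
    // Softmax over the kept logits at the given temperature.
    double max_logit = logits[ids[0]];
    for (int id : ids) max_logit = std::max(max_logit, static_cast<double>(logits[id]));
    std::vector<double> weights;
    for (int id : ids) weights.push_back(std::exp((logits[id] - max_logit) / temperature));
    std::discrete_distribution<int> pick(weights.begin(), weights.end());
    return ids[pick(rng)];
}

// Usage with a member-style RNG:
//   std::mt19937 rng{std::random_device{}()};
//   int next = sample_token(logits, 0.1f, 40, rng);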
A lightweight tokenizer implementation for text processing.
Represents an active TinyLlama session holding the loaded model and tokenizer.
bool use_batch_generation_
bool batch_generation_parallel(const std::vector< int > &current_tokens, const std::vector< int > &token_positions, const std::vector< int > &original_sequence_indices, std::vector< std::vector< float > > &batch_logits)
bool batch_prefill_parallel(const std::vector< std::vector< int > > &all_tokens, const std::vector< int > &prompt_lengths, std::vector< std::vector< float > > &batch_final_logits)
std::stringstream generated_stream_
std::unique_ptr< TinyLlamaModel > model_
TinyLlamaSession(const TinyLlamaSession &)=delete
std::unique_ptr< Tokenizer > tokenizer_
const Tokenizer * get_tokenizer() const
std::string generate(const std::string &prompt, int steps=128, float temperature=0.1f, int top_k=40, float top_p=0.9f, const std::string &system_prompt="", bool apply_q_a_format=false)
Generates text based on a given prompt.
std::vector< std::string > generate_batch(const std::vector< std::string > &prompts, int steps=128, float temperature=0.1f, int top_k=40, float top_p=0.9f, const std::string &system_prompt="", bool apply_q_a_format=false)
Generates text for multiple prompts in a single batch (parallel processing).
const ModelConfig & get_config() const
std::string generated_text_for_api_return_
~TinyLlamaSession()
Destructor to ensure proper cleanup (e.g., KVCache CUDA memory).
TinyLlamaSession & operator=(const TinyLlamaSession &)=delete
Complete Key-Value cache for all transformer layers.
Model configuration structure holding architecture and hyperparameters.