| Member | Class | Attributes |
|---|---|---|
| clear_layer_dequantized_weights(int layer_idx) | TinyLlamaModel | |
| config_ | TinyLlamaModel | private |
| cpu_batch_processor_ | TinyLlamaModel | private |
| CPUBatchProcessor | TinyLlamaModel | friend |
| embed_tokens | TinyLlamaModel | private |
| embed_tokens_f32 | TinyLlamaModel | private |
| embed_tokens_q4k | TinyLlamaModel | private |
| embed_tokens_q6k | TinyLlamaModel | private |
| embed_tokens_q8_0 | TinyLlamaModel | private |
| embed_tokens_q8k | TinyLlamaModel | private |
| ensure_bf16_concatenated_weights_loaded() | TinyLlamaModel | |
| ensure_down_proj_dequantized(int layer_idx) | TinyLlamaModel | |
| ensure_embed_tokens_dequantized() | TinyLlamaModel | |
| ensure_f32_concatenated_weights_loaded() | TinyLlamaModel | |
| ensure_gate_proj_dequantized(int layer_idx) | TinyLlamaModel | |
| ensure_k_proj_dequantized(int layer_idx) | TinyLlamaModel | |
| ensure_layer_weights_on_gpu(int layer_idx) | TinyLlamaModel | |
| ensure_lm_head_dequantized() | TinyLlamaModel | |
| ensure_o_proj_dequantized(int layer_idx) | TinyLlamaModel | |
| ensure_q_proj_dequantized(int layer_idx) | TinyLlamaModel | |
| ensure_up_proj_dequantized(int layer_idx) | TinyLlamaModel | |
| ensure_v_proj_dequantized(int layer_idx) | TinyLlamaModel | |
| f32_concatenated_weights_loaded_ | TinyLlamaModel | private |
| final_norm | TinyLlamaModel | private |
| final_norm_f32 | TinyLlamaModel | private |
| final_norm_q4k | TinyLlamaModel | private |
| final_norm_q6k | TinyLlamaModel | private |
| forward(std::vector< float > &input, int n_tokens, KVCache *kv_cache, const std::vector< int > *attention_mask) | TinyLlamaModel | |
| forward_cpu_batch(const std::vector< float > &batch_input_activations, int num_tokens_in_batch, int num_cpu_layers_to_process, int start_pos_in_sequence, KVCache *kv_cache, const std::vector< int > &prompt_lengths={}) | TinyLlamaModel | |
| forward_cpu_batch_generation(const std::vector< float > &batch_input_activations, const std::vector< int > &token_positions, const std::vector< int > &original_sequence_indices, int num_tokens_in_batch, KVCache *kv_cache) | TinyLlamaModel | |
| forward_cpu_logits_batch(const std::vector< float > &final_batch_activations, int num_tokens_in_batch) | TinyLlamaModel | |
| free_bf16_concatenated_weights() | TinyLlamaModel | |
| free_layer_gpu_weights(int layer_idx) | TinyLlamaModel | |
| get_config() const | TinyLlamaModel | inline |
| get_embed_tokens() const | TinyLlamaModel | inline |
| get_gguf_data() const | TinyLlamaModel | inline |
| get_gguf_data_ptr() | TinyLlamaModel | inline |
| get_layers() | TinyLlamaModel | inline |
| get_lm_head() const | TinyLlamaModel | inline |
| get_vocab_size() const | TinyLlamaModel | |
| gguf_data_ | TinyLlamaModel | private |
| initialize_gpu_and_rope() | TinyLlamaModel | |
| initialize_rope_freqs() | TinyLlamaModel | |
| initialize_weights(const SafeTensorsLoader *loader, const GGUFData *gguf) | TinyLlamaModel | private |
| layers | TinyLlamaModel | private |
| lm_head | TinyLlamaModel | private |
| lm_head_f32 | TinyLlamaModel | private |
| lm_head_q4k | TinyLlamaModel | private |
| lm_head_q6k | TinyLlamaModel | private |
| lm_head_q8_0 | TinyLlamaModel | private |
| lm_head_q8k | TinyLlamaModel | private |
| lookup_embedding(int token_id) | TinyLlamaModel | |
| map_gguf_weights | TinyLlamaModel | friend |
| model_path_ | TinyLlamaModel | private |
| precomputed_freqs_cis_ | TinyLlamaModel | private |
| smart_gemm_batch_cuda(bool transa_user, bool transb_user, int m_user, int n_user, int k_user, const float *alpha_user, const float *A_f32_user, int lda_user, const float *B_f32_user, int ldb_user, const float *beta_user, float *C_f32_user, int ldc_user, cudaStream_t stream, const char *operation_name="GEMM") | TinyLlamaModel | |
| TinyLlamaModel(const ModelConfig &config, const SafeTensorsLoader &loader) | TinyLlamaModel | |
| TinyLlamaModel(const ModelConfig &initial_config, const std::string &model_path) | TinyLlamaModel | |
| TinyLlamaModel(const ModelConfig &config_from_session, std::unique_ptr< GGUFData > gguf_data_from_session) | TinyLlamaModel | |
| use_bf16_tensor_cores_ | TinyLlamaModel | private |
| ~TinyLlamaModel() | TinyLlamaModel | |
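
The constructors and `forward` entry point above suggest a simple single-token decode loop. The sketch below is a usage guess, not the project's documented API: the header name `tinyllama_model.h` is hypothetical, and the return types of `lookup_embedding`, `forward`, and `get_vocab_size` are assumptions, since this member list records parameter lists but not return types.

```cpp
#include <string>
#include <vector>

#include "tinyllama_model.h"  // hypothetical header for TinyLlamaModel, ModelConfig, KVCache

int main() {
    ModelConfig config;  // assumed default-constructible; real code would populate it
    // Second constructor overload from the list: config plus a path to the model file.
    TinyLlamaModel model(config, std::string("models/tinyllama.gguf"));

    KVCache kv_cache;  // construction/sizing of the cache is an assumption

    // lookup_embedding's return type is not shown above; std::vector<float> is an assumption.
    std::vector<float> activations = model.lookup_embedding(/*token_id=*/1);

    // Run one step; n_tokens is assumed to be the token's position in the sequence,
    // and a null attention_mask is assumed to mean the default causal mask.
    std::vector<float> logits =
        model.forward(activations, /*n_tokens=*/0, &kv_cache, /*attention_mask=*/nullptr);

    // Logits are assumed to span the full vocabulary.
    return logits.size() == static_cast<size_t>(model.get_vocab_size()) ? 0 : 1;
}
```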
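The `ensure_*_dequantized` / `clear_layer_dequantized_weights` pairs, together with the per-format member variants (`*_q4k`, `*_q6k`, `*_q8_0`, `*_q8k`, `*_f32`), point to a lazy per-projection dequantization scheme: f32 copies of quantized weights are materialized on demand and can be released per layer. A minimal sketch, assuming these helpers return `void` and may be called explicitly by the caller rather than only internally:

```cpp
#include "tinyllama_model.h"  // hypothetical header, as above

// Materialize f32 copies of one layer's quantized projection weights.
void warm_layer(TinyLlamaModel &model, int layer_idx) {
    model.ensure_q_proj_dequantized(layer_idx);
    model.ensure_k_proj_dequantized(layer_idx);
    model.ensure_v_proj_dequantized(layer_idx);
    model.ensure_o_proj_dequantized(layer_idx);
    model.ensure_gate_proj_dequantized(layer_idx);
    model.ensure_up_proj_dequantized(layer_idx);
    model.ensure_down_proj_dequantized(layer_idx);
}

// Release the dequantized copies to bound resident memory.
void cool_layer(TinyLlamaModel &model, int layer_idx) {
    model.clear_layer_dequantized_weights(layer_idx);
}
```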
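Similarly, `ensure_layer_weights_on_gpu` / `free_layer_gpu_weights` read as per-layer GPU residency controls for a hybrid CPU/GPU layer split. The sketch below assumes `initialize_gpu_and_rope()` is safe to call before uploading layers and that all three calls return `void`; whether the constructors already perform this setup is not documented here.

```cpp
#include "tinyllama_model.h"  // hypothetical header, as above

// Upload a contiguous window of layers to the GPU.
void move_layer_window_to_gpu(TinyLlamaModel &model, int first, int last) {
    model.initialize_gpu_and_rope();  // CUDA state + RoPE tables (assumed idempotent)
    for (int l = first; l <= last; ++l) {
        model.ensure_layer_weights_on_gpu(l);  // upload this layer's weights if absent
    }
}

// Release the device buffers for a single layer.
void evict_layer_from_gpu(TinyLlamaModel &model, int layer_idx) {
    model.free_layer_gpu_weights(layer_idx);
}
```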