TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
#include <cstdint>
#include <functional>
#include <nlohmann/json.hpp>
#include <string>
#include <unordered_map>
#include <vector>
#include "safetensors_loader.h"
#include <memory>
#include "quantization.h"

Classes

    struct ModelConfig
        Model configuration structure holding architecture and hyperparameters.
    struct KVCacheLayer
        Key-Value cache for a single transformer layer.
    struct KVCache
        Complete Key-Value cache for all transformer layers.
    struct LayerWeights
        Structure holding all weights for a single transformer layer.
    class TinyLlamaModel
        Main transformer model class for TinyLlama.
Typedefs

    using ForwardDiagCallback = std::function<void(int layer, const std::string &name, const std::vector<float> &v)>
Enumerations

    enum class TensorName { Q_PROJ, K_PROJ, V_PROJ, O_PROJ, GATE_PROJ, UP_PROJ, DOWN_PROJ, TOKEN_EMBD, LM_HEAD, UNKNOWN }
        Enumeration of tensor names used in the TinyLlama model.
Functions

    static std::string tensor_name_to_string(TensorName tn)
    ModelConfig parse_model_config_from_gguf(const GGUFData &gguf)
    ModelConfig parse_model_config(const nlohmann::json &json)
    int argmax(const std::vector<float> &v)
    float bfloat16_to_float32(uint16_t b16)
    void rmsnorm(const std::vector<float> &x, const std::vector<uint16_t> &weight, float eps, std::vector<float> &out)
    void matvec_bf16_f32(const std::vector<uint16_t> &mat, const std::vector<float> &vec, std::vector<float> &out, int M, int N)
    void softmax(std::vector<float> &x)
    std::vector<uint16_t> uint8_vector_to_uint16_vector(const std::vector<uint8_t> &bytes, size_t numel)
    void log_vector_summary(const std::string &name, const std::vector<float> &v, int head_count = 5)
    void log_vector_summary_batch(const std::string &name, const std::vector<float> &batch_vector, int num_tokens_in_batch, int single_token_vector_size, int head_count = 5)
Typedef Documentation

using ForwardDiagCallback = std::function<void(int layer, const std::string &name, const std::vector<float> &v)>
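Below is a minimal sketch of a diagnostic callback matching this typedef; it only prints the layer index, tensor name, and L2 norm of the vector. How (or whether) TinyLlamaModel::forward() accepts such a callback is not documented on this page, so the wiring is left out, and the header name model.h is an assumption.

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include "model.h"  // assumed header name; declares ForwardDiagCallback per this page

// A ForwardDiagCallback that logs layer index, tensor name, element count,
// and the L2 norm of the intermediate vector. Registration with the model
// is intentionally not shown (not documented here).
ForwardDiagCallback diag_cb = [](int layer, const std::string& name,
                                 const std::vector<float>& v) {
    double sumsq = 0.0;
    for (float x : v) sumsq += static_cast<double>(x) * x;
    std::printf("layer %d | %s | %zu elems | L2=%.4f\n",
                layer, name.c_str(), v.size(), std::sqrt(sumsq));
};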
Enumeration Type Documentation

enum class TensorName { Q_PROJ, K_PROJ, V_PROJ, O_PROJ, GATE_PROJ, UP_PROJ, DOWN_PROJ, TOKEN_EMBD, LM_HEAD, UNKNOWN }

Enumeration of tensor names used in the TinyLlama model.

This enum class defines the different types of tensors used in the transformer architecture, including attention projections, feed-forward layers, and embeddings.
Function Documentation

int argmax(const std::vector<float> &v)
Definition at line 185 of file utils.cpp.
References Logger::debug(), and Logger::error().
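A hedged usage sketch: under greedy decoding, the next token id is simply the index of the largest logit. The header name model.h is assumed from this page.

#include <vector>
#include "model.h"  // assumed header; declares argmax() per this page

// Greedy sampling sketch: logits is assumed to hold vocab_size scores for
// one position; argmax() returns the index of the largest score, i.e. the
// next token id.
int pick_next_token(const std::vector<float>& logits) {
    return argmax(logits);
}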
float bfloat16_to_float32(uint16_t b16)
Definition at line 144 of file utils.cpp.
References bfloat16::EXPONENT_MASK, bfloat16::MANTISSA_MASK, bfloat16::NEG_ZERO, bfloat16::SHIFT_BITS, bfloat16::SIGN_BIT, and bfloat16::ZERO.
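For reference, bfloat16 keeps the sign bit, the full 8-bit exponent, and the top 7 mantissa bits of IEEE-754 binary32, so the usual widening is a 16-bit left shift. The sketch below follows that convention; the utils.cpp implementation (note the ZERO/NEG_ZERO and mask constants it references) may handle special values separately.

#include <cstdint>
#include <cstring>

// Reference bf16 -> f32 widening sketch: place the 16 bfloat16 bits in the
// high half of a 32-bit word and reinterpret it as a float. Not necessarily
// identical to the project's bfloat16_to_float32().
inline float bf16_widen_sketch(uint16_t b16) {
    uint32_t bits = static_cast<uint32_t>(b16) << 16;
    float out;
    std::memcpy(&out, &bits, sizeof(out));  // safe type-pun
    return out;
}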
void log_vector_summary(const std::string &name, const std::vector<float> &v, int head_count = 5)
Definition at line 207 of file utils.cpp.
References Logger::info(), and SAFE_MIN.
Referenced by TinyLlamaModel::forward().
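A small usage sketch; the exact text produced depends on Logger::info() and is not shown on this page. The header name model.h is assumed.

#include <vector>
#include "model.h"  // assumed header; declares log_vector_summary()

// Logs a named summary of an activation vector, showing roughly the first
// head_count values (the output format is up to Logger::info()).
void debug_hidden(const std::vector<float>& hidden) {
    log_vector_summary("hidden_after_final_norm", hidden, /*head_count=*/8);
}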
void log_vector_summary_batch(const std::string &name, const std::vector<float> &batch_vector, int num_tokens_in_batch, int single_token_vector_size, int head_count = 5)
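Judging from the parameter names, batch_vector is a flattened buffer of num_tokens_in_batch rows of single_token_vector_size floats; the sketch below assumes that layout and an assumed header name.

#include <vector>
#include "model.h"  // assumed header; declares log_vector_summary_batch()

// Assumed layout: batch_logits is row-major [num_tokens x vocab_size].
// head_count keeps its default of 5 here.
void debug_batch_logits(const std::vector<float>& batch_logits,
                        int num_tokens, int vocab_size) {
    log_vector_summary_batch("batch_logits", batch_logits, num_tokens, vocab_size);
}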
void matvec_bf16_f32(const std::vector<uint16_t> &mat, const std::vector<float> &vec, std::vector<float> &out, int M, int N)
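The signature suggests out = mat * vec with mat stored as M*N bfloat16 words; row-major storage and the header name are assumptions in the sketch below.

#include <cstdint>
#include <vector>
#include "model.h"  // assumed header; declares matvec_bf16_f32()

// Projection sketch y = W * x, with W given as M*N bf16 words (row-major
// assumed) and x of length N; y comes back with M entries.
std::vector<float> project(const std::vector<uint16_t>& W_bf16,
                           const std::vector<float>& x, int M, int N) {
    std::vector<float> y(static_cast<size_t>(M), 0.0f);
    matvec_bf16_f32(W_bf16, x, y, M, N);
    return y;
}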
ModelConfig parse_model_config(const nlohmann::json &json)
Definition at line 20 of file model_config.cpp.
References ModelConfig::architecture, ModelConfig::bos_token_id, ModelConfig::eos_token_id, ModelConfig::hidden_act, ModelConfig::hidden_size, Logger::info(), ModelConfig::intermediate_size, ModelConfig::LLAMA3_TIKTOKEN, ModelConfig::LLAMA_SENTENCEPIECE, ModelConfig::max_position_embeddings, ModelConfig::model_name, ModelConfig::num_attention_heads, ModelConfig::num_hidden_layers, ModelConfig::num_key_value_heads, ModelConfig::pad_token_id, ModelConfig::rms_norm_eps, ModelConfig::rope_theta, ModelConfig::tokenizer_family, ModelConfig::torch_dtype, ModelConfig::unk_token_id, ModelConfig::UNKNOWN, ModelConfig::vocab_size, and Logger::warning().
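A hedged sketch of the typical call pattern: parse a Hugging Face style config.json with nlohmann::json and hand it to parse_model_config(). The path, the helper name, and the header name are illustrative assumptions; error handling is minimal.

#include <fstream>
#include <string>
#include <nlohmann/json.hpp>
#include "model.h"  // assumed header; declares ModelConfig and parse_model_config()

// Reads a JSON config file (e.g. "config.json") and maps it onto ModelConfig.
ModelConfig load_config_json(const std::string& path) {
    std::ifstream in(path);
    nlohmann::json j;
    in >> j;                       // parse the whole JSON document
    return parse_model_config(j);  // populate ModelConfig fields
}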
ModelConfig parse_model_config_from_gguf(const GGUFData &gguf)
Definition at line 75 of file model_config.cpp.
References ModelConfig::architecture, ModelConfig::bos_token_id, ModelConfig::chat_template_string, ModelConfig::chat_template_type, ModelConfig::eos_token_id, ModelConfig::hidden_act, ModelConfig::hidden_size, Logger::info(), ModelConfig::intermediate_size, ModelConfig::LLAMA3_TIKTOKEN, ModelConfig::LLAMA_SENTENCEPIECE, ModelConfig::max_position_embeddings, GGUFData::metadata, ModelConfig::model_name, ModelConfig::num_attention_heads, ModelConfig::num_hidden_layers, ModelConfig::num_key_value_heads, ModelConfig::pad_token_id, ModelConfig::pre_tokenizer_type, ModelConfig::rms_norm_eps, ModelConfig::rope_theta, ModelConfig::tokenizer_family, GGUFData::tokenizer_merges, ModelConfig::unk_token_id, ModelConfig::UNKNOWN, ModelConfig::vocab_size, and Logger::warning().
Referenced by TinyLlamaModel::TinyLlamaModel().
void rmsnorm(const std::vector<float> &x, const std::vector<uint16_t> &weight, float eps, std::vector<float> &out)
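For orientation, RMSNorm computes out[i] = x[i] / sqrt(mean(x^2) + eps) * w[i]; the documented signature takes the scale weights as bfloat16 words. The sketch below follows that formula and is not the utils.cpp implementation; the header name is assumed.

#include <cmath>
#include <cstdint>
#include <vector>
#include "model.h"  // assumed header; declares bfloat16_to_float32()

// Reference RMSNorm sketch: normalize by the root-mean-square of x, then
// scale elementwise by the bf16 weight widened to float. Accumulation order
// and precision may differ from the project's rmsnorm().
void rmsnorm_sketch(const std::vector<float>& x,
                    const std::vector<uint16_t>& weight_bf16,
                    float eps, std::vector<float>& out) {
    if (x.empty()) { out.clear(); return; }
    double sumsq = 0.0;
    for (float v : x) sumsq += static_cast<double>(v) * v;
    const float inv_rms =
        1.0f / std::sqrt(static_cast<float>(sumsq / x.size()) + eps);
    out.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        out[i] = x[i] * inv_rms * bfloat16_to_float32(weight_bf16[i]);
}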
void softmax(std::vector<float> &x)
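A numerically stable in-place softmax sketch for reference (subtract the max before exponentiating); the utils.cpp version may differ in details.

#include <algorithm>
#include <cmath>
#include <vector>

// In-place softmax sketch: shift by the max for numerical stability,
// exponentiate, then normalize so the entries sum to 1.
void softmax_sketch(std::vector<float>& x) {
    if (x.empty()) return;
    const float max_val = *std::max_element(x.begin(), x.end());
    float sum = 0.0f;
    for (float& v : x) { v = std::exp(v - max_val); sum += v; }
    for (float& v : x) v /= sum;
}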
static std::string tensor_name_to_string(TensorName tn)
Definition at line 49 of file model.h.
References DOWN_PROJ, GATE_PROJ, K_PROJ, LM_HEAD, O_PROJ, Q_PROJ, TOKEN_EMBD, UP_PROJ, and V_PROJ.
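A small usage sketch; the exact strings returned are defined at model.h line 49 and are not reproduced on this page.

#include <iostream>
#include "model.h"  // assumed header; declares TensorName and tensor_name_to_string()

// Prints the string form of the attention projection tensor names.
void dump_attention_tensor_names() {
    for (TensorName tn : {TensorName::Q_PROJ, TensorName::K_PROJ,
                          TensorName::V_PROJ, TensorName::O_PROJ}) {
        std::cout << tensor_name_to_string(tn) << "\n";
    }
}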
std::vector<uint16_t> uint8_vector_to_uint16_vector(const std::vector<uint8_t> &bytes, size_t numel)