TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
cpu_attention.h
#pragma once

#include <vector>

#include "kv_cache.h"

// Appends a batch of key/value vectors for one layer to the cache. Tokens
// occupy contiguous positions starting at start_pos_in_sequence.
void update_kv_cache_batch_cpu(
    KVCache* kv_cache,
    int layer_idx,
    const std::vector<float>& k_batch_for_layer,
    const std::vector<float>& v_batch_for_layer,
    int num_tokens_in_batch,
    int start_pos_in_sequence,
    int num_kv_heads,
    int head_dim
);

// Computes scaled dot-product attention for a batch of RoPE'd query vectors
// against one layer's KV cache, writing results into batch_attn_output.
// Distinct num_q_heads / num_kv_heads allow grouped-query attention.
void attention_batch_cpu(
    const std::vector<float>& q_batch_roped,
    KVCacheLayer& current_layer_kv_cache,
    std::vector<float>& batch_attn_output,
    int num_tokens_in_batch,
    int start_pos_in_sequence,
    int num_q_heads,
    int num_kv_heads,
    int head_dim,
    float attention_scale
);

// Sequence-aware variant of update_kv_cache_batch_cpu: each token in the
// batch carries its own sequence index and position, so one call can cache
// tokens belonging to several sequences.
void update_kv_cache_batch_cpu_sequence_aware(
    KVCache* kv_cache,
    int layer_idx,
    const std::vector<float>& k_batch_for_layer,
    const std::vector<float>& v_batch_for_layer,
    int num_tokens_in_batch,
    const std::vector<int>& sequence_indices,
    const std::vector<int>& position_in_sequence,
    int num_kv_heads,
    int head_dim
);

// Sequence-aware variant of attention_batch_cpu: per-token sequence indices
// and positions let tokens from different sequences share one batch, with
// max_seq_len_per_sequence bounding each sequence's region of the cache.
void attention_batch_cpu_sequence_aware(
    const std::vector<float>& q_batch_roped,
    KVCacheLayer& current_layer_kv_cache,
    std::vector<float>& batch_attn_output,
    int num_tokens_in_batch,
    const std::vector<int>& sequence_indices,
    const std::vector<int>& position_in_sequence,
    int num_q_heads,
    int num_kv_heads,
    int head_dim,
    float attention_scale,
    int max_seq_len_per_sequence
);
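For orientation, here is a minimal sketch of driving the contiguous-batch pair for one layer during prefill. The head configuration matches TinyLlama-1.1B (32 query heads, 4 KV heads, head dimension 64), but the buffer layouts and the kv_cache.layers[layer_idx] accessor are assumptions for illustration; the real KVCache interface lives in model.h.

#include <cmath>
#include <vector>
#include "cpu_attention.h"

// Hypothetical prefill of a single layer: cache the batch's K/V, then
// attend every query against the positions cached so far.
void prefill_layer_sketch(KVCache& kv_cache, int layer_idx,
                          const std::vector<float>& q_roped,  // tokens * 32 * 64
                          const std::vector<float>& k_roped,  // tokens * 4 * 64
                          const std::vector<float>& v,        // tokens * 4 * 64
                          int num_tokens) {
    const int num_q_heads = 32, num_kv_heads = 4, head_dim = 64;
    const int start_pos = 0;  // prefill begins at position 0
    const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));

    // 1. Append this batch's keys/values to the layer's cache.
    update_kv_cache_batch_cpu(&kv_cache, layer_idx, k_roped, v,
                              num_tokens, start_pos, num_kv_heads, head_dim);

    // 2. Attend the queries against everything cached so far.
    //    ASSUMPTION: KVCache exposes per-layer caches as a `layers` member;
    //    check model.h for the actual accessor.
    std::vector<float> attn_out(
        static_cast<size_t>(num_tokens) * num_q_heads * head_dim);
    attention_batch_cpu(q_roped, kv_cache.layers[layer_idx], attn_out,
                        num_tokens, start_pos, num_q_heads, num_kv_heads,
                        head_dim, scale);
}

The 1/sqrt(head_dim) value is the conventional softmax scaling for attention; the header leaves it to the caller via attention_scale.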
Referenced types:
KVCacheLayer: Key-Value cache for a single transformer layer (defined at model.h:130).
KVCache: Complete Key-Value cache for all transformer layers (defined at model.h:151).
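The sequence-aware variants let one batch mix tokens from several in-flight sequences, as in continuous batching. Below is a sketch of how the per-token sequence_indices and position_in_sequence vectors might be assembled for a decode step; the ActiveSequence bookkeeping struct is hypothetical, not part of this project.

#include <vector>

// Hypothetical bookkeeping for one in-flight sequence.
struct ActiveSequence {
    int seq_idx;        // which per-sequence region of the KV cache it owns
    int tokens_so_far;  // length already cached, i.e. the next position
};

// One decode step emits one token per active sequence, each at a
// different position in a different sequence.
void build_decode_indices(const std::vector<ActiveSequence>& active,
                          std::vector<int>& sequence_indices,
                          std::vector<int>& position_in_sequence) {
    sequence_indices.clear();
    position_in_sequence.clear();
    for (const ActiveSequence& s : active) {
        sequence_indices.push_back(s.seq_idx);
        position_in_sequence.push_back(s.tokens_so_far);
    }
}

With these vectors, update_kv_cache_batch_cpu_sequence_aware can presumably write each token's K/V into its own sequence's region, and attention_batch_cpu_sequence_aware can restrict each token to its own sequence's history; the max_seq_len_per_sequence parameter suggests the flat cache is partitioned into fixed-size per-sequence regions.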