9 const std::vector<float>& k_batch_for_layer,
10 const std::vector<float>& v_batch_for_layer,
11 int num_tokens_in_batch,
12 int start_pos_in_sequence,
18 const std::vector<float>& q_batch_roped,
20 std::vector<float>& batch_attn_output,
21 int num_tokens_in_batch,
22 int start_pos_in_sequence,
32 const std::vector<float>& k_batch_for_layer,
33 const std::vector<float>& v_batch_for_layer,
34 int num_tokens_in_batch,
35 const std::vector<int>& sequence_indices,
36 const std::vector<int>& position_in_sequence,
42 const std::vector<float>& q_batch_roped,
44 std::vector<float>& batch_attn_output,
45 int num_tokens_in_batch,
46 const std::vector<int>& sequence_indices,
47 const std::vector<int>& position_in_sequence,
51 float attention_scale,
52 int max_seq_len_per_sequence
void attention_batch_cpu_sequence_aware(const std::vector< float > &q_batch_roped, KVCacheLayer &current_layer_kv_cache, std::vector< float > &batch_attn_output, int num_tokens_in_batch, const std::vector< int > &sequence_indices, const std::vector< int > &position_in_sequence, int num_q_heads, int num_kv_heads, int head_dim, float attention_scale, int max_seq_len_per_sequence)
void attention_batch_cpu(const std::vector< float > &q_batch_roped, KVCacheLayer &current_layer_kv_cache, std::vector< float > &batch_attn_output, int num_tokens_in_batch, int start_pos_in_sequence, int num_q_heads, int num_kv_heads, int head_dim, float attention_scale)
void update_kv_cache_batch_cpu_sequence_aware(KVCache *kv_cache, int layer_idx, const std::vector< float > &k_batch_for_layer, const std::vector< float > &v_batch_for_layer, int num_tokens_in_batch, const std::vector< int > &sequence_indices, const std::vector< int > &position_in_sequence, int num_kv_heads, int head_dim)
void update_kv_cache_batch_cpu(KVCache *kv_cache, int layer_idx, const std::vector< float > &k_batch_for_layer, const std::vector< float > &v_batch_for_layer, int num_tokens_in_batch, int start_pos_in_sequence, int num_kv_heads, int head_dim)
Key-Value cache for a single transformer layer.
Complete Key-Value cache for all transformer layers.