TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
cpu_attention.h
#pragma once

#include <vector>

#include "kv_cache.h"

// Appends a batch of key/value vectors for one layer to the cache. Tokens
// occupy contiguous positions starting at start_pos_in_sequence.
void update_kv_cache_batch_cpu(
    KVCache* kv_cache,
    int layer_idx,
    const std::vector<float>& k_batch_for_layer,
    const std::vector<float>& v_batch_for_layer,
    int num_tokens_in_batch,
    int start_pos_in_sequence,
    int num_kv_heads,
    int head_dim
);

// Computes scaled dot-product attention for a batch of RoPE'd query vectors
// against one layer's KV cache, writing results into batch_attn_output.
// Distinct num_q_heads / num_kv_heads allow grouped-query attention.
void attention_batch_cpu(
    const std::vector<float>& q_batch_roped,
    KVCacheLayer& current_layer_kv_cache,
    std::vector<float>& batch_attn_output,
    int num_tokens_in_batch,
    int start_pos_in_sequence,
    int num_q_heads,
    int num_kv_heads,
    int head_dim,
    float attention_scale
);

// Sequence-aware variant of update_kv_cache_batch_cpu: each token in the
// batch carries its own sequence index and position, so one call can cache
// tokens belonging to several sequences.
void update_kv_cache_batch_cpu_sequence_aware(
    KVCache* kv_cache,
    int layer_idx,
    const std::vector<float>& k_batch_for_layer,
    const std::vector<float>& v_batch_for_layer,
    int num_tokens_in_batch,
    const std::vector<int>& sequence_indices,
    const std::vector<int>& position_in_sequence,
    int num_kv_heads,
    int head_dim
);

// Sequence-aware variant of attention_batch_cpu: per-token sequence indices
// and positions let tokens from different sequences share one batch, with
// max_seq_len_per_sequence bounding each sequence's region of the cache.
void attention_batch_cpu_sequence_aware(
    const std::vector<float>& q_batch_roped,
    KVCacheLayer& current_layer_kv_cache,
    std::vector<float>& batch_attn_output,
    int num_tokens_in_batch,
    const std::vector<int>& sequence_indices,
    const std::vector<int>& position_in_sequence,
    int num_q_heads,
    int num_kv_heads,
    int head_dim,
    float attention_scale,
    int max_seq_len_per_sequence
);
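For orientation, here is a minimal sketch of driving the contiguous-batch pair for one layer during prefill. The head configuration matches TinyLlama-1.1B (32 query heads, 4 KV heads, head dimension 64), but the buffer layouts and the kv_cache.layers[layer_idx] accessor are assumptions for illustration; the real KVCache interface lives in model.h.

#include <cmath>
#include <vector>
#include "cpu_attention.h"

// Hypothetical prefill of a single layer: cache the batch's K/V, then
// attend every query against the positions cached so far.
void prefill_layer_sketch(KVCache& kv_cache, int layer_idx,
                          const std::vector<float>& q_roped,  // tokens * 32 * 64
                          const std::vector<float>& k_roped,  // tokens * 4 * 64
                          const std::vector<float>& v,        // tokens * 4 * 64
                          int num_tokens) {
    const int num_q_heads = 32, num_kv_heads = 4, head_dim = 64;
    const int start_pos = 0;  // prefill begins at position 0
    const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));

    // 1. Append this batch's keys/values to the layer's cache.
    update_kv_cache_batch_cpu(&kv_cache, layer_idx, k_roped, v,
                              num_tokens, start_pos, num_kv_heads, head_dim);

    // 2. Attend the queries against everything cached so far.
    //    ASSUMPTION: KVCache exposes per-layer caches as a `layers` member;
    //    check model.h for the actual accessor.
    std::vector<float> attn_out(
        static_cast<size_t>(num_tokens) * num_q_heads * head_dim);
    attention_batch_cpu(q_roped, kv_cache.layers[layer_idx], attn_out,
                        num_tokens, start_pos, num_q_heads, num_kv_heads,
                        head_dim, scale);
}

The 1/sqrt(head_dim) value is the conventional softmax scaling for attention; the header leaves it to the caller via attention_scale.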
Referenced types:
KVCacheLayer: Key-Value cache for a single transformer layer (defined at model.h:130).
KVCache: Complete Key-Value cache for all transformer layers (defined at model.h:151).
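The sequence-aware variants let one batch mix tokens from several in-flight sequences, as in continuous batching. Below is a sketch of how the per-token sequence_indices and position_in_sequence vectors might be assembled for a decode step; the ActiveSequence bookkeeping struct is hypothetical, not part of this project.

#include <vector>

// Hypothetical bookkeeping for one in-flight sequence.
struct ActiveSequence {
    int seq_idx;        // which per-sequence region of the KV cache it owns
    int tokens_so_far;  // length already cached, i.e. the next position
};

// One decode step emits one token per active sequence, each at a
// different position in a different sequence.
void build_decode_indices(const std::vector<ActiveSequence>& active,
                          std::vector<int>& sequence_indices,
                          std::vector<int>& position_in_sequence) {
    sequence_indices.clear();
    position_in_sequence.clear();
    for (const ActiveSequence& s : active) {
        sequence_indices.push_back(s.seq_idx);
        position_in_sequence.push_back(s.tokens_so_far);
    }
}

With these vectors, update_kv_cache_batch_cpu_sequence_aware can presumably write each token's K/V into its own sequence's region, and attention_batch_cpu_sequence_aware can restrict each token to its own sequence's history; the max_seq_len_per_sequence parameter suggests the flat cache is partitioned into fixed-size per-sequence regions.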