TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
cuda_kernels.h
#ifndef CUDA_KERNELS_H
#define CUDA_KERNELS_H

#ifdef HAS_CUDA

// Use safe headers only for the Windows CUDA 12.1+ workaround; use the normal headers everywhere else.
#if defined(WINDOWS_CUDA_12_1_WORKAROUND) && defined(_WIN32)
#include "cuda_safe_headers.h"
#else
// Normal CUDA header inclusion for non-problematic platforms (Ubuntu, etc.)
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#endif

#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

#include "logger.h"

// Checks a CUDA error code, logs the error, and (optionally) throws on failure.
inline void gpuAssert(cudaError_t code, const char* file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    std::string err_msg =
        "GPUassert: " + std::string(cudaGetErrorString(code)) + " " +
        std::string(file) + " " + std::to_string(line);
    Logger::error(err_msg);
    if (abort) throw std::runtime_error(err_msg);
  }
}

// Convenience wrapper that forwards the call site's file and line to gpuAssert.
#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }

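// Illustrative sketch (not part of the original API): a typical error-checked
// allocate/copy/free sequence using gpuErrchk. The buffer size and contents
// are placeholders chosen for the example.
inline void example_gpuErrchk_usage() {
  std::vector<float> host(1024, 1.0f);
  float* dev = nullptr;
  gpuErrchk(cudaMalloc(&dev, host.size() * sizeof(float)));
  gpuErrchk(cudaMemcpy(dev, host.data(), host.size() * sizeof(float),
                       cudaMemcpyHostToDevice));
  gpuErrchk(cudaFree(dev));
}
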
// RMSNorm over a single vector resident on the device.
void rmsnorm_vector_cuda(const float* x_dev, const float* weight_dev,
                         float* out_dev, int n, float eps,
                         cudaStream_t stream = 0);

// Host-vector convenience overload of the RMSNorm above.
void rmsnorm_vector_cuda(const std::vector<float>& x_in_host,
                         const std::vector<float>& weight_host,
                         std::vector<float>& out_host, int n, float eps);

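// Illustrative sketch: RMSNorm is conventionally
// out[i] = weight[i] * x[i] / sqrt(mean(x^2) + eps). The call below uses the
// host-vector overload; the hidden size and epsilon are placeholders, not
// values taken from a model config.
inline void example_rmsnorm_host_call() {
  const int n = 2048;
  std::vector<float> x(n, 0.5f), weight(n, 1.0f), out(n, 0.0f);
  rmsnorm_vector_cuda(x, weight, out, n, 1e-5f);
}
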
// Batched RMSNorm
void rmsnorm_batch_cuda(float* d_out, float* d_in, const float* d_weight,
                        int num_tokens, int hidden_size, float eps,
                        cudaStream_t stream);

__global__ void reduce_partial_sums_kernel(const float* partial_sums,
                                           float* total_sum_sq_out,
                                           int num_partial_sums);

// Matrix-vector product (FP32 matrix, FP32 vector), host-vector convenience overload.
void matvec_f32_f32_cuda(cublasHandle_t handle,
                         const std::vector<float>& mat_f32_host,
                         const std::vector<float>& vec_f32_host,
                         std::vector<float>& out_f32_host, int rows, int cols);

// Matrix-vector product (FP32 matrix, FP32 vector) on device buffers.
void matvec_f32_f32_cuda(cublasHandle_t handle, const float* mat_f32_dev,
                         const float* vec_f32_dev, float* out_f32_dev,
                         int rows, int cols, cudaStream_t stream = 0);

// Generic FP32 GEMM (can be adapted for BF16 later or routed through cublasGemmEx).
void gemm_f32_f32_cuda(cublasHandle_t handle,
                       bool transa, bool transb,
                       int m, int n, int k,
                       const float* alpha,
                       const float* A, int lda,
                       const float* B, int ldb,
                       const float* beta,
                       float* C, int ldc,
                       cudaStream_t stream);

185
203void silu_cuda(const std::vector<float>& x_host,
204 std::vector<float>& out_host, int n);
205
215void softmax_vector_cuda(const std::vector<float>& x_host,
216 std::vector<float>& out_host, int n);
217
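// Illustrative sketch: the host-vector SiLU and softmax overloads take and
// return std::vector buffers of the same length. Sizes and values here are
// placeholders for the example.
inline void example_silu_softmax_host() {
  const int n = 4096;
  std::vector<float> gate(n, 1.0f), gate_act(n, 0.0f);
  silu_cuda(gate, gate_act, n);

  std::vector<float> logits(n, 0.0f), probs(n, 0.0f);
  softmax_vector_cuda(logits, probs, n);
}
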
// Applies rotary position embeddings (RoPE) in place to a device vector.
void rope_cuda(float* vec, int num_heads, int head_dim,
               const float* freqs_cis_dev, int pos, bool use_adjacent_pairing,
               cudaStream_t stream);

// Single-token attention over an FP32 KV cache.
void attention_cuda(const float* Q_current_dev, const float* K_layer_cache_base,
                    const float* V_layer_cache_base, float* out_dev,
                    int num_heads, int current_seq_len, int head_dim,
                    float scale, int cache_max_seq_len, int cache_num_kv_heads,
                    cudaStream_t stream = 0);

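#include <cmath>  // std::sqrt, used only by the illustrative sketch below

// Illustrative sketch: a single-token decode step over an FP32 KV cache. The
// scale is the usual 1/sqrt(head_dim); all pointers and dimensions are
// placeholders supplied by the caller.
inline void example_attention_decode(const float* d_q, const float* d_k_cache,
                                     const float* d_v_cache, float* d_out,
                                     int num_heads, int num_kv_heads, int head_dim,
                                     int current_seq_len, int cache_max_seq_len,
                                     cudaStream_t stream) {
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  attention_cuda(d_q, d_k_cache, d_v_cache, d_out,
                 num_heads, current_seq_len, head_dim,
                 scale, cache_max_seq_len, num_kv_heads, stream);
}
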
// Attention over an INT8-quantized KV cache, dequantizing selectively into the
// provided scratch buffers.
void attention_cuda_selective_dequant(const float* Q_current_dev,
                                      const int8_t* K_quantized_cache_base,
                                      const int8_t* V_quantized_cache_base,
                                      const float* K_scales_cache_base,
                                      const float* V_scales_cache_base,
                                      float* selective_k_dequant_buffer,
                                      float* selective_v_dequant_buffer,
                                      float* out_dev,
                                      int num_heads, int current_seq_len, int head_dim,
                                      float scale, int cache_max_seq_len,
                                      int cache_num_kv_heads,
                                      cudaStream_t stream = 0);

// Element-wise addition of two device vectors.
void add_vectors_cuda(const float* a_dev, const float* b_dev,
                      float* result_dev, int n, cudaStream_t stream = 0);

// Adds the residual vector to a matvec output.
void add_residual_cuda(const float* matvec_out_dev, const float* residual_dev,
                       float* result_dev, int n, cudaStream_t stream = 0);

// Writes one K or V head vector into the layer cache at the given position.
void update_kv_cache_cuda(float* cache_base_ptr,
                          const float* current_kv_vector,
                          int pos, int kv_head_idx, int max_seq_len,
                          int num_kv_heads, int head_dim,
                          cudaStream_t stream = 0);

// Applies RoPE to a single K/V head vector and writes it into the cache in one pass.
void rope_and_update_kv_cache_cuda(float* cache_base_ptr,
                                   const float* kv_vector_head,
                                   const float* all_freqs_cis_base,
                                   int pos, int kv_head_idx, int max_seq_len,
                                   int num_kv_heads, int head_dim,
                                   cudaStream_t stream = 0);

// SwiGLU: SiLU(gate) multiplied element-wise with up, on device buffers.
void swiglu_cuda(const float* gate_dev, const float* up_dev,
                 float* out_dev, int n, cudaStream_t stream = 0);

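#include <cmath>  // std::exp, used only by the illustrative sketch below

// Illustrative CPU reference for the SwiGLU activation used above:
// out[i] = silu(gate[i]) * up[i], with silu(x) = x * sigmoid(x) = x / (1 + exp(-x)).
// This scalar loop is for clarity only; it is not a drop-in replacement for swiglu_cuda.
inline void swiglu_reference_host(const float* gate, const float* up,
                                  float* out, int n) {
  for (int i = 0; i < n; ++i) {
    const float silu = gate[i] / (1.0f + std::exp(-gate[i]));
    out[i] = silu * up[i];
  }
}
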
// Looks up one token's embedding row from a BF16 table into an FP32 vector.
void lookup_embedding_bf16_f32_cuda(const uint16_t* embedding_table_dev,
                                    float* output_vector_dev,
                                    int token_id, int hidden_size,
                                    int vocab_size, cudaStream_t stream = 0);

// Embedding lookup for either BF16 or FP32 tables, selected by is_bf16.
void lookup_embedding_cuda(const void* table_dev, float* output_dev,
                           int token_id, int hidden_size, int vocab_size,
                           bool is_bf16, cudaStream_t stream);

// Matrix-vector product with a BF16 matrix and FP32 vector, optionally using tensor cores.
void matvec_bf16_f32_cuda(cublasHandle_t handle,
                          const uint16_t* mat_bf16_dev,
                          const float* vec_f32_dev,
                          float* out_f32_dev,
                          int rows, int cols,
                          bool use_tensor_cores,
                          cudaStream_t stream = 0);

__global__ void convert_bf16_to_fp32_kernel(const uint16_t* __restrict__ bf16_in,
                                            float* __restrict__ fp32_out,
                                            size_t n_elements);

// KV cache quantization kernels (FP32 <-> INT8)

// Symmetric per-tensor quantization: writes INT8 values and a single scale.
void quantize_fp32_to_int8_symmetric_per_tensor_cuda(
    const float* fp32_in_dev,
    int8_t* int8_out_dev,
    float* scale_out_dev,
    int num_elements,
    cudaStream_t stream = 0);

// Inverse of the above: reconstructs FP32 values from INT8 data and the scale.
void dequantize_int8_to_fp32_symmetric_per_tensor_cuda(
    const int8_t* int8_in_dev,
    const float* scale_in_dev,
    float* fp32_out_dev,
    int num_elements,
    cudaStream_t stream = 0);

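// Illustrative CPU reference for symmetric per-tensor INT8 quantization as
// commonly formulated: scale = max(|x|) / 127, q = clamp(round(x / scale), -127, 127),
// and dequantization is x' = q * scale. The GPU kernels above may differ in
// details such as rounding mode; this sketch is for clarity only.
inline float quantize_int8_reference_host(const float* in, int8_t* out, int n) {
  float max_abs = 0.0f;
  for (int i = 0; i < n; ++i) {
    const float a = in[i] < 0.0f ? -in[i] : in[i];
    if (a > max_abs) max_abs = a;
  }
  const float scale = (max_abs > 0.0f) ? max_abs / 127.0f : 1.0f;
  for (int i = 0; i < n; ++i) {
    float q = in[i] / scale;
    if (q > 127.0f) q = 127.0f;
    if (q < -127.0f) q = -127.0f;
    // Round half away from zero; the GPU kernel's rounding may differ.
    out[i] = static_cast<int8_t>(q >= 0.0f ? q + 0.5f : q - 0.5f);
  }
  return scale;  // The caller stores this alongside the INT8 data.
}
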
// Batched SwiGLU (SiLU + element-wise multiply)
void swiglu_batch_cuda(float* d_out_batch,             // Output: [num_tokens, intermediate_size]
                       const float* d_gate_act_batch,  // Gate activations: [num_tokens, intermediate_size]
                       const float* d_up_act_batch,    // Up activations: [num_tokens, intermediate_size]
                       int num_tokens,
                       int intermediate_size,
                       cudaStream_t stream);

// Batched RoPE applied to Q and K for a whole batch of tokens.
void rope_batch_cuda(float* d_q_batch, float* d_k_batch,
                     const float* d_all_freqs_cis_base,
                     int num_tokens, int num_q_heads, int num_kv_heads, int head_dim,
                     int start_pos_offset,
                     bool use_adjacent_pairing,
                     cudaStream_t stream);

// Batched attention for prefill
void attention_batch_prefill_cuda(
    const float* d_q_batch_strided,   // Input Q: [B, H_q, D_h]
    const float* d_k_batch_strided,   // Input K for the current batch
    const float* d_v_batch_strided,   // Input V for the current batch
    float* d_kv_cache_k_base,         // K cache: [S_max, H_kv, D_h]
    float* d_kv_cache_v_base,         // V cache: [S_max, H_kv, D_h]
    float* d_output_batch_strided,    // Output: [B, H_q, D_h]
    int num_tokens_in_batch,          // B
    int start_pos_in_kv_cache,        // Start position of this batch in the KV cache
    int cache_max_seq_len,            // Maximum capacity of the KV cache
    int num_q_heads,                  // H_q
    int num_kv_heads,                 // H_kv
    int head_dim,                     // D_h
    float scale,
    cudaStream_t stream,
    const int* attention_mask_cu = nullptr);  // Optional attention mask

// Batched residual add
void add_residual_batch_cuda(float* d_output_batch,         // Output: [num_tokens, hidden_size]
                             const float* d_input_a_batch,  // Input A: [num_tokens, hidden_size]
                             const float* d_input_b_batch,  // Input B: [num_tokens, hidden_size]
                             int num_tokens, int hidden_size,
                             cudaStream_t stream);

// Batched KV cache update.
// Note: a separate update_kv_cache_batch_cuda_fp32 taking distinct K and V cache
// pointers is not defined; the generic function below is intended to be called
// twice per layer, once for the K cache and once for the V cache.
/*
void update_kv_cache_batch_cuda_fp32(
    float* d_kvcache_k, float* d_kvcache_v,
    const float* d_k_batch_current, const float* d_v_batch_current,
    int num_tokens_in_batch, int start_pos_in_kv_cache,
    int max_seq_len_in_cache, int num_kv_heads, int head_dim,
    cudaStream_t stream);
*/
void update_kv_cache_batch_cuda(
    float* d_kv_cache_layer_base,         // Device pointer to the K or V cache for the current layer
    const float* d_keys_or_values_batch,  // Device pointer to the batch of K or V vectors to write
    int start_pos_in_kv_cache,            // Sequence position in the cache where writing for this batch begins
    int num_tokens_in_batch,              // Number of tokens in d_keys_or_values_batch
    int num_kv_heads,                     // Number of K/V heads
    int head_dim,                         // Dimension of each K/V head
    int cache_max_seq_len,                // Maximum sequence-length capacity of the cache
    cudaStream_t stream);

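// Illustrative sketch of the note above: the generic batched update is called
// twice per layer, once for the K cache and once for the V cache. All pointers
// and dimensions are placeholders supplied by the caller.
inline void example_update_kv_cache_batch(float* d_k_cache_layer, float* d_v_cache_layer,
                                          const float* d_k_batch, const float* d_v_batch,
                                          int start_pos, int num_tokens,
                                          int num_kv_heads, int head_dim,
                                          int cache_max_seq_len, cudaStream_t stream) {
  update_kv_cache_batch_cuda(d_k_cache_layer, d_k_batch, start_pos, num_tokens,
                             num_kv_heads, head_dim, cache_max_seq_len, stream);
  update_kv_cache_batch_cuda(d_v_cache_layer, d_v_batch, start_pos, num_tokens,
                             num_kv_heads, head_dim, cache_max_seq_len, stream);
}
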
// Optimized single-vector RMSNorm variant.
void rmsnorm_vector_cuda_optimized(const float* x_dev, const float* weight_dev,
                                   float* out_dev, int n, float eps,
                                   cudaStream_t stream = 0);

// Optimized single-vector softmax variant operating on device buffers.
void softmax_vector_cuda_optimized(const float* x_dev, float* out_dev, int n,
                                   cudaStream_t stream = 0);

// Optimized single-token attention variant.
void attention_cuda_optimized(const float* Q_current_dev, const float* K_layer_cache_base,
                              const float* V_layer_cache_base, float* out_dev,
                              int num_heads, int current_seq_len, int head_dim,
                              float scale, int cache_max_seq_len, int cache_num_kv_heads,
                              cudaStream_t stream = 0);

// BF16 tensor-core matrix-matrix operations
void gemm_bf16_bf16_cuda(cublasHandle_t handle,
                         bool transa_user, bool transb_user,
                         int m_user, int n_user, int k_user,
                         const float* alpha_user,
                         const uint16_t* A_bf16_user, int lda_user,
                         const uint16_t* B_bf16_user, int ldb_user,
                         const float* beta_user,
                         uint16_t* C_bf16_user, int ldc_user,
                         cudaStream_t stream);

// Mixed-precision GEMM: FP32 A, BF16 B, FP32 C.
void gemm_f32_to_bf16_f32_cuda(cublasHandle_t handle,
                               bool transa_user, bool transb_user,
                               int m_user, int n_user, int k_user,
                               const float* alpha_user,
                               const float* A_f32_user, int lda_user,
                               const uint16_t* B_bf16_user, int ldb_user,
                               const float* beta_user,
                               float* C_f32_user, int ldc_user,
                               cudaStream_t stream);

// Conversion utilities
void convert_fp32_to_bf16_cuda(const float* fp32_in_dev, uint16_t* bf16_out_dev,
                               size_t n_elements, cudaStream_t stream);

void convert_bf16_to_fp32_cuda(const uint16_t* bf16_in_dev, float* fp32_out_dev,
                               size_t n_elements, cudaStream_t stream);

__global__ void convert_fp32_to_bf16_kernel(const float* __restrict__ fp32_in,
                                            uint16_t* __restrict__ bf16_out,
                                            size_t n_elements);

644
645#endif // HAS_CUDA
646
647#endif // CUDA_KERNELS_H