Advanced Chunk Processing Library 0.2.0
A comprehensive C++ library for advanced data chunking strategies and processing operations
gpu_chunking.hpp
#pragma once

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <cstddef>
#include <memory>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>

namespace gpu_chunking {

// CUDA error checking helper
#define CUDA_CHECK(call)                                                                       \
    do {                                                                                       \
        cudaError_t error = call;                                                              \
        if (error != cudaSuccess) {                                                            \
            throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(error)); \
        }                                                                                      \
    } while (0)

// Device code for parallel chunking
__global__ void chunk_kernel(const int* data, int* chunk_boundaries, int data_size, int window_size,
                             float threshold) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads whose window would run past the end do nothing; the host
    // zero-fills the boundary buffer so those positions read as 0
    if (idx >= data_size - window_size)
        return;

    // Compute local statistics for the window
    float window_sum = 0.0f;
    float window_max = static_cast<float>(data[idx]);
    float window_min = static_cast<float>(data[idx]);

    for (int i = 0; i < window_size; ++i) {
        float current = static_cast<float>(data[idx + i]);
        window_sum += current;
        window_max = fmaxf(window_max, current);
        window_min = fminf(window_min, current);
    }

    float window_mean = window_sum / window_size;
    float variance = 0.0f;

    // Compute variance
    for (int i = 0; i < window_size; ++i) {
        float diff = data[idx + i] - window_mean;
        variance += diff * diff;
    }
    variance /= window_size;

    // Determine if this position should be a chunk boundary
    bool is_boundary = false;
    if (idx > 0) {
        float prev_value = data[idx - 1];
        float current_value = data[idx];
        // fabsf avoids accidentally selecting the integer abs() overload in device code
        float value_diff = fabsf(current_value - prev_value);
        float range = window_max - window_min;

        is_boundary = (value_diff > threshold * range) && (variance > threshold * window_mean);
    }

    chunk_boundaries[idx] = is_boundary ? 1 : 0;
}
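
// Worked example of the boundary rule (illustrative values, not from the
// library's docs): with threshold = 0.1 and a window mixing a run of 10s and
// a run of 100s, range = 90, and the jump at the transition gives
// value_diff = 90, which exceeds threshold * range = 9. Such a mixed window
// also has a large variance relative to threshold * window_mean, so the
// transition is marked as a boundary. Note that a single clean step is not
// marked: the window starting at the jump is flat, so its variance is zero.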

template <typename T>
class GPUChunking {
private:
    // The device kernel and device buffers operate on int data; make that
    // constraint explicit instead of failing with an opaque template error
    static_assert(std::is_same<T, int>::value,
                  "GPUChunking currently supports only int element types");

    int window_size;
    float threshold;
    cudaStream_t stream;

    // Helper to allocate GPU memory
    template <typename U>
    U* allocate_device_memory(size_t size) {
        U* d_ptr;
        CUDA_CHECK(cudaMalloc(&d_ptr, size * sizeof(U)));
        return d_ptr;
    }

    // Helper to copy data to GPU
    template <typename U>
    void copy_to_device(U* d_ptr, const U* h_ptr, size_t size) {
        CUDA_CHECK(cudaMemcpyAsync(d_ptr, h_ptr, size * sizeof(U), cudaMemcpyHostToDevice, stream));
    }

    // Helper to copy data from GPU
    template <typename U>
    void copy_from_device(U* h_ptr, const U* d_ptr, size_t size) {
        CUDA_CHECK(cudaMemcpyAsync(h_ptr, d_ptr, size * sizeof(U), cudaMemcpyDeviceToHost, stream));
    }

public:
    GPUChunking(int window_sz = 32, float thresh = 0.1f)
        : window_size(window_sz), threshold(thresh) {
        CUDA_CHECK(cudaStreamCreate(&stream));
    }

    ~GPUChunking() {
        cudaStreamDestroy(stream);  // no CUDA_CHECK: a destructor must not throw
    }

    std::vector<std::vector<T>> chunk(const std::vector<T>& data) {
        if (data.empty())
            return {};

        // Allocate device memory
        int* d_data = allocate_device_memory<int>(data.size());
        int* d_boundaries = allocate_device_memory<int>(data.size());

        // Zero the boundary buffer: the kernel never writes the last
        // window_size positions, which would otherwise be read uninitialized
        CUDA_CHECK(cudaMemsetAsync(d_boundaries, 0, data.size() * sizeof(int), stream));

        // Copy input data to GPU
        copy_to_device(d_data, data.data(), data.size());

        // Configure kernel launch parameters
        const int BLOCK_SIZE = 256;
        int num_blocks = static_cast<int>((data.size() + BLOCK_SIZE - 1) / BLOCK_SIZE);

        // Launch kernel
        chunk_kernel<<<num_blocks, BLOCK_SIZE, 0, stream>>>(
            d_data, d_boundaries, static_cast<int>(data.size()), window_size, threshold);

        // Check for kernel launch errors
        CUDA_CHECK(cudaGetLastError());

        // Copy boundaries back to host
        std::vector<int> boundaries(data.size());
        copy_from_device(boundaries.data(), d_boundaries, data.size());

        // Wait for the async copy to finish before reading the results
        CUDA_CHECK(cudaStreamSynchronize(stream));

        // Free device memory
        CUDA_CHECK(cudaFree(d_data));
        CUDA_CHECK(cudaFree(d_boundaries));

        // Create chunks based on boundaries
        std::vector<std::vector<T>> chunks;
        std::vector<T> current_chunk;

        for (size_t i = 0; i < data.size(); ++i) {
            current_chunk.push_back(data[i]);
            if (boundaries[i] || i == data.size() - 1) {
                if (!current_chunk.empty()) {
                    chunks.push_back(std::move(current_chunk));
                    current_chunk = std::vector<T>();
                }
            }
        }

        return chunks;
    }

    // Getters and setters
    void set_window_size(int size) {
        if (size <= 0) {
            throw std::invalid_argument("Window size must be positive");
        }
        window_size = size;
    }

    void set_threshold(float thresh) {
        if (thresh <= 0.0f || thresh >= 1.0f) {
            throw std::invalid_argument("Threshold must be between 0 and 1");
        }
        threshold = thresh;
    }

    int get_window_size() const {
        return window_size;
    }
    float get_threshold() const {
        return threshold;
    }

    // Check if CUDA is available
    static bool is_gpu_available() {
        int device_count;
        cudaError_t error = cudaGetDeviceCount(&device_count);
        return (error == cudaSuccess) && (device_count > 0);
    }

    // Get GPU properties
    static std::string get_gpu_info() {
        if (!is_gpu_available()) {
            return "No CUDA-capable GPU found";
        }

        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));

        return std::string("GPU Device: ") + prop.name +
               "\nCompute capability: " + std::to_string(prop.major) + "." +
               std::to_string(prop.minor);
    }
};

} // namespace gpu_chunking
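
A minimal usage sketch (not part of the header): it assumes int data, which is
the element type the kernel supports, and a hypothetical example.cu translation
unit. The nvcc invocation at the end is likewise an assumption.

#include "gpu_chunking.hpp"

#include <iostream>
#include <vector>

int main() {
    using gpu_chunking::GPUChunking;

    // Bail out on machines without a CUDA device
    if (!GPUChunking<int>::is_gpu_available()) {
        std::cerr << "No CUDA-capable GPU found\n";
        return 1;
    }
    std::cout << GPUChunking<int>::get_gpu_info() << '\n';

    // Blocks of 16 values alternating between 10 and 100; each transition
    // that the kernel scans should come back as a chunk boundary
    std::vector<int> data(256);
    for (size_t i = 0; i < data.size(); ++i)
        data[i] = ((i / 16) % 2 == 0) ? 10 : 100;

    GPUChunking<int> chunker(/*window_sz=*/32, /*thresh=*/0.1f);
    auto chunks = chunker.chunk(data);
    std::cout << "Produced " << chunks.size() << " chunks\n";
    return 0;
}

Built with, for example: nvcc -std=c++14 example.cu -o example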