Advanced Chunk Processing Library 0.2.0
A comprehensive C++ library for advanced data chunking strategies and processing operations
neural_chunking.hpp
Go to the documentation of this file.
/**
 * @file neural_chunking.hpp
 * @brief Neural network-based chunking algorithms
 * @author Jonathan Reich
 * @date 2024-12-07
 */

#pragma once
#include "chunk_common.hpp"
#include <algorithm> // for std::min
#include <cmath>
#include <cstddef> // for size_t
#include <cstdlib> // for rand, RAND_MAX
#include <memory>
#include <numeric> // for std::accumulate
#include <stdexcept>
#include <string>
#include <type_traits> // for std::is_arithmetic_v
#include <vector>

namespace neural_chunking {

/**
 * @brief Neural network layer implementation
 * @tparam T Data type for layer computations
 */
template <typename T>
class Layer {
public:
    Layer(size_t input_size, size_t output_size)
        : input_size_(input_size), output_size_(output_size) {
        weights_.resize(input_size * output_size);
        biases_.resize(output_size);
        initialize_weights();
    }

    std::vector<T> forward(const std::vector<T>& input) {
        if (input.size() != input_size_) {
            throw std::invalid_argument("Invalid input size");
        }
        std::vector<T> output(output_size_);
        // Simple forward pass: output = weights * input + biases
        for (size_t i = 0; i < output_size_; ++i) {
            output[i] = biases_[i];
            for (size_t j = 0; j < input_size_; ++j) {
                output[i] += input[j] * weights_[i * input_size_ + j];
            }
        }
        return output;
    }

private:
    size_t input_size_;
    size_t output_size_;
    std::vector<T> weights_;
    std::vector<T> biases_;

    void initialize_weights() {
        // Simple Xavier initialization: weights drawn uniformly from
        // [-scale, scale] with scale = sqrt(2 / (fan_in + fan_out))
        T scale = std::sqrt(2.0 / (input_size_ + output_size_));
        for (auto& w : weights_) {
            w = (static_cast<T>(rand()) / RAND_MAX * 2 - 1) * scale;
        }
        for (auto& b : biases_) {
            b = 0;
        }
    }
};
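
// Usage sketch (illustrative): a 3-to-2 layer maps a 3-vector to a 2-vector.
//   Layer<double> layer(3, 2);
//   std::vector<double> out = layer.forward({0.5, -1.0, 2.0}); // out.size() == 2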

/**
 * @brief Configuration for neural network chunking
 */
struct NeuralChunkingConfig {
    size_t input_size;    ///< Size of input layer
    size_t hidden_size;   ///< Size of hidden layer
    double learning_rate; ///< Learning rate for training
    size_t batch_size;    ///< Batch size for processing
    double threshold;     ///< Decision threshold for chunk boundaries
};
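
// Usage sketch (illustrative; the config is a plain aggregate, so brace
// initialization fills the fields in declaration order):
//   NeuralChunkingConfig cfg{/*input_size=*/8, /*hidden_size=*/16,
//                            /*learning_rate=*/0.01, /*batch_size=*/32,
//                            /*threshold=*/0.5};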

/**
 * @brief Class implementing neural network-based chunking
 * @tparam T Data type of elements to chunk
 */
template <typename T>
class NeuralChunking {
private:
    size_t window_size_;
    double threshold_;
    double learning_rate_;
    size_t batch_size_;
    std::string activation_;
    size_t epochs_;

    // Private activation functions
    double apply_activation(double x) const {
        if (activation_ == "relu") {
            return x > 0 ? x : 0;
        } else if (activation_ == "sigmoid") {
            return 1.0 / (1.0 + std::exp(-x));
        } else { // tanh
            return std::tanh(x);
        }
    }

    double activation_derivative(double x) const {
        if (activation_ == "relu") {
            return x > 0 ? 1 : 0;
        } else if (activation_ == "sigmoid") {
            double sig = apply_activation(x);
            return sig * (1 - sig);
        } else { // tanh
            double tanh_x = std::tanh(x);
            return 1 - tanh_x * tanh_x;
        }
    }

    // Training helper methods
    std::vector<double> prepare_batch(const std::vector<T>& data, size_t start_idx) const {
        std::vector<double> batch;
        batch.reserve(std::min(batch_size_, data.size() - start_idx));

        for (size_t i = 0; i < batch_size_ && (start_idx + i) < data.size(); ++i) {
            if constexpr (!std::is_arithmetic_v<T>) {
                batch.push_back(compute_feature(data[start_idx + i]));
            } else {
                batch.push_back(static_cast<double>(data[start_idx + i]));
            }
        }
        return batch;
    }

    template <typename U>
    double compute_feature(const U& arr) const {
        if constexpr (!std::is_arithmetic_v<U>) {
            if constexpr (!std::is_arithmetic_v<typename U::value_type>) {
                // Handle 2D arrays: average the feature of each inner array
                double sum = 0.0;
                for (const auto& inner : arr) {
                    sum += compute_feature(inner);
                }
                return sum / arr.size();
            } else {
                // Handle 1D arrays: average of the elements
                return std::accumulate(arr.begin(), arr.end(), 0.0) / arr.size();
            }
        } else {
            // Handle scalar values
            return static_cast<double>(arr);
        }
    }
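
    // Usage sketch (illustrative): compute_feature reduces any value to one scalar.
    //   compute_feature(2.0) == 2.0                                        (scalar)
    //   compute_feature(std::vector<double>{1, 2}) == 1.5                  (1D mean)
    //   compute_feature(std::vector<std::vector<double>>{{1, 2}, {3, 4}}) == 2.5
    //   (2D case: mean of inner means, (1.5 + 3.5) / 2)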

public:
    NeuralChunking(size_t window_size = 8, double threshold = 0.5)
        : window_size_(window_size), threshold_(threshold), learning_rate_(0.01),
          batch_size_(32), activation_("relu"), epochs_(100) {}

    void set_window_size(size_t size) {
        window_size_ = size;
    }
    void set_threshold(double threshold) {
        threshold_ = threshold;
    }

    size_t get_window_size() const {
        return window_size_;
    }
    double get_threshold() const {
        return threshold_;
    }

    std::vector<std::vector<T>> chunk(const std::vector<T>& data) const {
        if (data.empty()) {
            return {};
        }

        // Handle case where data is no larger than the window size
        if (data.size() <= window_size_) {
            return {data};
        }

        std::vector<std::vector<T>> result;
        std::vector<T> current_chunk;

        for (const auto& value : data) {
            if constexpr (!std::is_arithmetic_v<T>) {
                // Multi-dimensional logic: compare per-element features
                double feature = compute_feature(value);
                if (!current_chunk.empty() &&
                    std::abs(feature - compute_feature(current_chunk.back())) > threshold_) {
                    result.push_back(current_chunk);
                    current_chunk.clear();
                }
            } else {
                // Single-dimension logic: compare raw values (cast each operand
                // first so unsigned element types cannot underflow)
                if (!current_chunk.empty() &&
                    std::abs(static_cast<double>(value) -
                             static_cast<double>(current_chunk.back())) > threshold_) {
                    result.push_back(current_chunk);
                    current_chunk.clear();
                }
            }
            current_chunk.push_back(value);
        }

        if (!current_chunk.empty()) {
            result.push_back(current_chunk);
        }

        return result;
    }
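
    // Usage sketch (illustrative): with window_size 2 and threshold 0.5, any
    // jump larger than 0.5 between neighbouring values starts a new chunk.
    //   NeuralChunking<int> chunker(2, 0.5);
    //   auto chunks = chunker.chunk({1, 1, 5, 5, 5}); // {{1, 1}, {5, 5, 5}}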

    /**
     * @brief Set the learning rate for neural network training
     * @param rate Learning rate value (must be positive)
     */
    void set_learning_rate(double rate) {
        if (rate <= 0.0) {
            throw std::invalid_argument("Learning rate must be positive");
        }
        learning_rate_ = rate;
    }

    /**
     * @brief Get the current learning rate
     * @return Current learning rate
     */
    double get_learning_rate() const {
        return learning_rate_;
    }

    /**
     * @brief Set the batch size for training
     * @param size Batch size (must be positive)
     */
    void set_batch_size(size_t size) {
        if (size == 0) {
            throw std::invalid_argument("Batch size must be positive");
        }
        batch_size_ = size;
    }

    /**
     * @brief Get the current batch size
     * @return Current batch size
     */
    size_t get_batch_size() const {
        return batch_size_;
    }

    /**
     * @brief Set the activation function type
     * @param activation Activation function name ("relu", "sigmoid", or "tanh")
     */
    void set_activation(const std::string& activation) {
        if (activation != "relu" && activation != "sigmoid" && activation != "tanh") {
            throw std::invalid_argument(
                "Invalid activation function. Supported: relu, sigmoid, tanh");
        }
        activation_ = activation;
    }

    /**
     * @brief Get the current activation function type
     * @return Current activation function name
     */
    std::string get_activation() const {
        return activation_;
    }

    /**
     * @brief Set the number of training epochs
     * @param num_epochs Number of epochs (must be positive)
     */
    void set_epochs(size_t num_epochs) {
        if (num_epochs == 0) {
            throw std::invalid_argument("Number of epochs must be positive");
        }
        epochs_ = num_epochs;
    }

    /**
     * @brief Get the current number of training epochs
     * @return Current number of epochs
     */
    size_t get_epochs() const {
        return epochs_;
    }

    /**
     * @brief Train the neural network on the provided data
     * @param data Training data
     * @return Vector of loss values for each epoch
     */
    std::vector<double> train(const std::vector<T>& data) {
        if (data.size() < window_size_) {
            throw std::invalid_argument("Training data size must be at least the window size");
        }

        // Initialize neural network layers
        Layer<double> input_layer(window_size_, window_size_);
        Layer<double> hidden_layer(window_size_, 1);

        std::vector<double> epoch_losses;
        epoch_losses.reserve(epochs_);

        // Training loop
        for (size_t epoch = 0; epoch < epochs_; ++epoch) {
            double epoch_loss = 0.0;
            size_t num_batches = (data.size() + batch_size_ - 1) / batch_size_;

            for (size_t batch = 0; batch < num_batches; ++batch) {
                size_t start_idx = batch * batch_size_;
                auto batch_data = prepare_batch(data, start_idx);
                if (batch_data.size() < window_size_)
                    break;
                // Truncate to the input layer width so forward() receives
                // exactly window_size_ values
                batch_data.resize(window_size_);

                // Forward pass
                auto hidden = input_layer.forward(batch_data);
                for (auto& h : hidden)
                    h = apply_activation(h);
                auto output = hidden_layer.forward(hidden);

                // Compute squared-error loss against the last input value
                double target = batch_data.back();
                double prediction = output[0];
                double loss = 0.5 * (prediction - target) * (prediction - target);
                epoch_loss += loss;

                // Simplified backward pass: the layers do not expose their
                // weights, so this nudges the hidden activations along the
                // error gradient rather than performing full backpropagation
                double error = prediction - target;
                double delta = error * activation_derivative(prediction);
                for (size_t i = 0; i < window_size_; ++i) {
                    hidden[i] -= learning_rate_ * delta * batch_data[i];
                }
            }

            epoch_losses.push_back(epoch_loss / num_batches);
        }

        return epoch_losses;
    }
};

} // namespace neural_chunking
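
A minimal end-to-end sketch (illustrative only; assumes the header above is on the include path):

#include "neural_chunking.hpp"
#include <iostream>
#include <vector>

int main() {
    neural_chunking::NeuralChunking<double> chunker(/*window_size=*/4, /*threshold=*/0.5);

    std::vector<double> data = {1.0, 1.1, 1.2, 5.0, 5.1, 5.2, 9.0, 9.1};
    auto losses = chunker.train(data); // one averaged loss value per epoch
    auto chunks = chunker.chunk(data); // splits where the jump exceeds 0.5

    std::cout << "final loss: " << losses.back() << '\n'
              << "chunks: " << chunks.size() << '\n'; // 3 chunks for this data
}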