Advanced Chunk Processing Library 0.2.0
A comprehensive C++ library for advanced data chunking strategies and processing operations
Loading...
Searching...
No Matches
sophisticated_chunking.hpp
Go to the documentation of this file.
1#pragma once
2#include "chunk_common.hpp"
3#include <algorithm>
4#include <cmath>
5#include <map>
6#include <memory>
7#include <numeric>
8#include <random>
9#include <set>
10#include <stdexcept>
11#include <string>
12#include <type_traits>
13#include <vector>
14
16
17/**
18 * @brief Wavelet-based chunking strategy using signal processing principles
19 * @tparam T The type of elements to be chunked
20 */
21template <typename T>
23private:
25 double threshold_;
26 std::string wavelet_type_;
27
28 /**
29 * @brief Compute discrete wavelet transform coefficients
30 * @param data Input data sequence
31 * @return Vector of wavelet coefficients
32 */
33 std::vector<double> computeWaveletCoefficients(const std::vector<T>& data) const {
34 if (data.size() < window_size_) {
35 return std::vector<double>();
36 }
37
38 std::vector<double> coefficients;
39 coefficients.reserve(data.size() - window_size_ + 1);
40
41 // Different wavelet implementations
42 if (wavelet_type_ == "haar" || wavelet_type_ == "db1") {
43 // Haar wavelet transform
44 for (size_t i = 0; i <= data.size() - window_size_; ++i) {
45 double sum = 0.0;
46 for (size_t j = 0; j < window_size_ / 2; ++j) {
47 double diff = static_cast<double>(data[i + j]) -
48 static_cast<double>(data[i + window_size_ - 1 - j]);
49 sum += diff * diff;
50 }
51 coefficients.push_back(std::sqrt(sum / window_size_));
52 }
53 } else if (wavelet_type_ == "sym2") {
54 // Symlet 2 wavelet transform
55 const std::vector<double> h = {-0.1294, 0.2241, 0.8365,
56 0.4830}; // Symlet 2 coefficients
57 for (size_t i = 0; i <= data.size() - window_size_; ++i) {
58 double sum = 0.0;
59 for (size_t j = 0; j < std::min(window_size_, size_t(4)); ++j) {
60 if (i + j < data.size()) {
61 sum += h[j] * static_cast<double>(data[i + j]);
62 }
63 }
64 coefficients.push_back(std::abs(sum));
65 }
66 } else {
67 throw std::invalid_argument("Unsupported wavelet type: " + wavelet_type_);
68 }
69
70 return coefficients;
71 }
72
73public:
74 /**
75 * @brief Constructor for wavelet-based chunking
76 * @param window_size Size of the sliding window
77 * @param threshold Coefficient threshold for chunk boundaries
78 */
79 WaveletChunking(size_t window_size = 8, double threshold = 0.5)
80 : window_size_(window_size), threshold_(threshold), wavelet_type_("haar") {}
81
82 /**
83 * @brief Chunk data based on wavelet transform analysis
84 * @param data Input data to be chunked
85 * @return Vector of chunks
86 */
87 std::vector<std::vector<T>> chunk(const std::vector<T>& data) const {
88 if (data.empty()) {
89 return {};
90 }
91
92 auto coefficients = computeWaveletCoefficients(data);
93 std::vector<std::vector<T>> chunks;
94 std::vector<T> current_chunk;
95
96 size_t i = 0;
97 for (const T& value : data) {
98 current_chunk.push_back(value);
99
100 if (i < coefficients.size() && coefficients[i] > threshold_) {
101 if (!current_chunk.empty()) {
102 chunks.push_back(current_chunk);
103 current_chunk.clear();
104 }
105 }
106 ++i;
107 }
108
109 if (!current_chunk.empty()) {
110 chunks.push_back(current_chunk);
111 }
112
113 return chunks;
114 }
115
116 /**
117 * @brief Get the size of the sliding window
118 * @return Size of the sliding window
119 */
120 size_t get_window_size() const {
121 return window_size_;
122 }
123
124 /**
125 * @brief Get the coefficient threshold for chunk boundaries
126 * @return Coefficient threshold for chunk boundaries
127 */
128 double get_threshold() const {
129 return threshold_;
130 }
131
132 /**
133 * @brief Set the size of the sliding window
134 * @param size Size of the sliding window
135 */
136 void set_window_size(size_t size) {
137 if (size == 0)
138 throw std::invalid_argument("Window size cannot be zero");
139 window_size_ = size;
140 }
141
142 /**
143 * @brief Set the coefficient threshold for chunk boundaries
144 * @param threshold Coefficient threshold for chunk boundaries
145 */
146 void set_threshold(double threshold) {
147 threshold_ = threshold;
148 }
149
150 /**
151 * @brief Get the current wavelet type
152 * @return Current wavelet type
153 */
154 std::string get_wavelet_type() const {
155 return wavelet_type_;
156 }
157
158 /**
159 * @brief Set the wavelet type
160 * @param type Wavelet type ("haar", "db1", or "sym2")
161 */
162 void set_wavelet_type(const std::string& type) {
163 if (type != "haar" && type != "db1" && type != "sym2") {
164 throw std::invalid_argument("Invalid wavelet type. Supported types: haar, db1, sym2");
165 }
166 wavelet_type_ = type;
167 }
168};
169
170/**
171 * @brief Information theory based chunking using mutual information
172 * @tparam T The type of elements to be chunked
173 */
174template <typename T>
176private:
179
180 /**
181 * @brief Calculate mutual information between adjacent segments
182 * @param segment1 First segment
183 * @param segment2 Second segment
184 * @return Mutual information value
185 */
186 double calculateMutualInformation(const std::vector<T>& segment1,
187 const std::vector<T>& segment2) const {
188 if (segment1.empty() || segment2.empty()) {
189 return 0.0;
190 }
191
192 // Calculate frequency distributions
193 std::map<T, double> p1, p2;
194 std::map<std::pair<T, T>, double> p12;
195
196 for (const auto& val : segment1) {
197 p1[val] += 1.0 / segment1.size();
198 }
199
200 for (const auto& val : segment2) {
201 p2[val] += 1.0 / segment2.size();
202 }
203
204 // Calculate joint distribution
205 size_t min_size = std::min(segment1.size(), segment2.size());
206 for (size_t i = 0; i < min_size; ++i) {
207 p12[{segment1[i], segment2[i]}] += 1.0 / min_size;
208 }
209
210 // Calculate mutual information
211 double mi = 0.0;
212 for (const auto& [val1, prob1] : p1) {
213 for (const auto& [val2, prob2] : p2) {
214 auto joint_prob = p12[{val1, val2}];
215 if (joint_prob > 0) {
216 mi += joint_prob * std::log2(joint_prob / (prob1 * prob2));
217 }
218 }
219 }
220
221 return mi;
222 }
223
224public:
225 /**
226 * @brief Constructor for mutual information based chunking
227 * @param context_size Size of context window
228 * @param mi_threshold Threshold for mutual information
229 */
230 MutualInformationChunking(size_t context_size = 5, double mi_threshold = 0.3)
231 : context_size_(context_size), mi_threshold_(mi_threshold) {}
232
233 /**
234 * @brief Chunk data based on mutual information analysis
235 * @param data Input data to be chunked
236 * @return Vector of chunks
237 */
238 std::vector<std::vector<T>> chunk(const std::vector<T>& data) const {
239 if (data.size() < 2 * context_size_) {
240 return {data};
241 }
242
243 std::vector<std::vector<T>> chunks;
244 std::vector<T> current_chunk;
245
246 for (size_t i = 0; i < data.size(); ++i) {
247 current_chunk.push_back(data[i]);
248
249 if (current_chunk.size() >= context_size_ && i + context_size_ < data.size()) {
250 std::vector<T> next_segment(data.begin() + i + 1,
251 data.begin() +
252 std::min(i + 1 + context_size_, data.size()));
253
254 double mi = calculateMutualInformation(current_chunk, next_segment);
255
256 if (mi < mi_threshold_) {
257 chunks.push_back(current_chunk);
258 current_chunk.clear();
259 }
260 }
261 }
262
263 if (!current_chunk.empty()) {
264 chunks.push_back(current_chunk);
265 }
266
267 return chunks;
268 }
269
270 /**
271 * @brief Get the size of context window
272 * @return Size of context window
273 */
274 size_t get_context_size() const {
275 return context_size_;
276 }
277
278 /**
279 * @brief Get the threshold for mutual information
280 * @return Threshold for mutual information
281 */
282 double get_mi_threshold() const {
283 return mi_threshold_;
284 }
285
286 /**
287 * @brief Set the size of context window
288 * @param size Size of context window
289 */
290 void set_context_size(size_t size) {
291 if (size == 0)
292 throw std::invalid_argument("Context size cannot be zero");
293 context_size_ = size;
294 }
295
296 /**
297 * @brief Set the threshold for mutual information
298 * @param threshold Threshold for mutual information
299 */
300 void set_mi_threshold(double threshold) {
301 mi_threshold_ = threshold;
302 }
303};
304
305/**
306 * @brief Dynamic time warping based chunking for sequence alignment
307 * @tparam T The type of elements to be chunked
308 */
309template <typename T>
311private:
314 std::string distance_metric_;
315
316 double calculate_distance(double a, double b) const {
317 if (distance_metric_ == "manhattan") {
318 return std::abs(a - b);
319 } else if (distance_metric_ == "cosine") {
320 double dot = a * b;
321 double norm_a = std::abs(a);
322 double norm_b = std::abs(b);
323 if (norm_a == 0 || norm_b == 0)
324 return 0.0;
325 return 1.0 - (dot / (norm_a * norm_b));
326 } else {
327 double diff = a - b;
328 return diff * diff;
329 }
330 }
331
332 double compute_dtw_core(const std::vector<double>& seq1,
333 const std::vector<double>& seq2) const {
334 const size_t n = seq1.size();
335 const size_t m = seq2.size();
336 std::vector<std::vector<double>> dp(
337 n + 1, std::vector<double>(m + 1, std::numeric_limits<double>::infinity()));
338
339 dp[0][0] = 0.0;
340
341 for (size_t i = 1; i <= n; ++i) {
342 for (size_t j = std::max(1ul, i - window_size_); j <= std::min(m, i + window_size_);
343 ++j) {
344 double cost = calculate_distance(seq1[i - 1], seq2[j - 1]);
345 dp[i][j] = cost + std::min({
346 dp[i - 1][j], // insertion
347 dp[i][j - 1], // deletion
348 dp[i - 1][j - 1] // match
349 });
350 }
351 }
352
353 return dp[n][m];
354 }
355
356 template <typename U>
357 std::vector<double> flatten_features(const U& data) const {
360 // Handle 2D arrays
361 std::vector<double> flattened;
362 for (const auto& inner : data) {
363 auto inner_features = flatten_features(inner);
364 flattened.insert(flattened.end(), inner_features.begin(), inner_features.end());
365 }
366 return flattened;
367 } else {
368 // Handle 1D arrays
369 return std::vector<double>(data.begin(), data.end());
370 }
371 } else {
372 // Handle scalar values
373 return {static_cast<double>(data)};
374 }
375 }
376
377 double compute_dtw_distance(const std::vector<T>& seq1, const std::vector<T>& seq2) const {
380 // For multi-dimensional data, flatten and compare features
381 auto features1 = flatten_features(seq1);
382 auto features2 = flatten_features(seq2);
383 return compute_dtw_core(features1, features2);
384 } else {
385 // For 1D vector data
386 return compute_dtw_core(seq1, seq2);
387 }
388 } else {
389 // For scalar data
390 return std::abs(static_cast<double>(seq1[0] - seq2[0]));
391 }
392 }
393
394 /**
395 * @brief Compute DTW distance between sequences
396 * @param seq1 First sequence
397 * @param seq2 Second sequence
398 * @return DTW distance
399 */
400 double computeDTWDistance(const std::vector<T>& seq1, const std::vector<T>& seq2) const;
401
402public:
403 /**
404 * @brief Constructor for DTW-based chunking
405 * @param window_size Size of the warping window
406 * @param dtw_threshold Threshold for chunk boundaries
407 */
408 DTWChunking(size_t window_size = 10, double dtw_threshold = 1.0)
409 : window_size_(window_size), dtw_threshold_(dtw_threshold), distance_metric_("euclidean") {}
410
411 /**
412 * @brief Chunk data based on DTW analysis
413 * @param data Input data to be chunked
414 * @return Vector of chunks
415 */
416 std::vector<std::vector<T>> chunk(const std::vector<T>& data) const {
417 if (data.empty()) {
418 return {};
419 }
420
421 std::vector<std::vector<T>> result;
422 std::vector<T> current_chunk;
423
424 for (const auto& value : data) {
426 if (!current_chunk.empty()) {
427 double distance = compute_dtw_distance(value, current_chunk.back());
428 if (distance > dtw_threshold_) {
429 result.push_back(current_chunk);
430 current_chunk.clear();
431 }
432 }
433 } else {
434 // Single-dimension logic
435 if (!current_chunk.empty() &&
436 std::abs(static_cast<double>(value - current_chunk.back())) > dtw_threshold_) {
437 result.push_back(current_chunk);
438 current_chunk.clear();
439 }
440 }
441 current_chunk.push_back(value);
442 }
443
444 if (!current_chunk.empty()) {
445 result.push_back(current_chunk);
446 }
447
448 return result;
449 }
450
451 /**
452 * @brief Get the size of the warping window
453 * @return Size of the warping window
454 */
455 size_t get_window_size() const {
456 return window_size_;
457 }
458
459 /**
460 * @brief Get the threshold for chunk boundaries
461 * @return Threshold for chunk boundaries
462 */
463 double get_dtw_threshold() const {
464 return dtw_threshold_;
465 }
466
467 /**
468 * @brief Set the size of the warping window
469 * @param size Size of the warping window
470 */
471 void set_window_size(size_t size) {
472 if (size == 0)
473 throw std::invalid_argument("Window size cannot be zero");
474 window_size_ = size;
475 }
476
477 /**
478 * @brief Set the threshold for chunk boundaries
479 * @param threshold Threshold for chunk boundaries
480 */
481 void set_dtw_threshold(double threshold) {
482 dtw_threshold_ = threshold;
483 }
484
485 /**
486 * @brief Get the distance metric
487 * @return Distance metric
488 */
489 std::string get_distance_metric() const {
490 return distance_metric_;
491 }
492
493 /**
494 * @brief Set the distance metric
495 * @param metric Distance metric
496 */
497 void set_distance_metric(const std::string& metric) {
498 if (metric != "euclidean" && metric != "manhattan" && metric != "cosine") {
499 throw std::invalid_argument(
500 "Invalid distance metric. Supported metrics: euclidean, manhattan, cosine");
501 }
502 distance_metric_ = metric;
503 }
504};
505
506} // namespace sophisticated_chunking
Dynamic time warping based chunking for sequence alignment.
std::vector< std::vector< T > > chunk(const std::vector< T > &data) const
Chunk data based on DTW analysis.
void set_distance_metric(const std::string &metric)
Set the distance metric.
double calculate_distance(double a, double b) const
DTWChunking(size_t window_size=10, double dtw_threshold=1.0)
Constructor for DTW-based chunking.
std::vector< double > flatten_features(const U &data) const
std::string get_distance_metric() const
Get the distance metric.
size_t get_window_size() const
Get the size of the warping window.
void set_window_size(size_t size)
Set the size of the warping window.
double compute_dtw_core(const std::vector< double > &seq1, const std::vector< double > &seq2) const
double computeDTWDistance(const std::vector< T > &seq1, const std::vector< T > &seq2) const
Compute DTW distance between sequences.
double get_dtw_threshold() const
Get the threshold for chunk boundaries.
void set_dtw_threshold(double threshold)
Set the threshold for chunk boundaries.
double compute_dtw_distance(const std::vector< T > &seq1, const std::vector< T > &seq2) const
Information theory based chunking using mutual information.
std::vector< std::vector< T > > chunk(const std::vector< T > &data) const
Chunk data based on mutual information analysis.
size_t get_context_size() const
Get the size of context window.
MutualInformationChunking(size_t context_size=5, double mi_threshold=0.3)
Constructor for mutual information based chunking.
void set_context_size(size_t size)
Set the size of context window.
double calculateMutualInformation(const std::vector< T > &segment1, const std::vector< T > &segment2) const
Calculate mutual information between adjacent segments.
void set_mi_threshold(double threshold)
Set the threshold for mutual information.
double get_mi_threshold() const
Get the threshold for mutual information.
Wavelet-based chunking strategy using signal processing principles.
double get_threshold() const
Get the coefficient threshold for chunk boundaries.
void set_wavelet_type(const std::string &type)
Set the wavelet type.
void set_window_size(size_t size)
Set the size of the sliding window.
std::vector< double > computeWaveletCoefficients(const std::vector< T > &data) const
Compute discrete wavelet transform coefficients.
std::string get_wavelet_type() const
Get the current wavelet type.
void set_threshold(double threshold)
Set the coefficient threshold for chunk boundaries.
WaveletChunking(size_t window_size=8, double threshold=0.5)
Constructor for wavelet-based chunking.
std::vector< std::vector< T > > chunk(const std::vector< T > &data) const
Chunk data based on wavelet transform analysis.
size_t get_window_size() const
Get the size of the sliding window.