Advanced Chunk Processing Library 0.2.0
A comprehensive C++ library for advanced data chunking strategies and processing operations
Loading...
Searching...
No Matches
chunk_strategies.hpp
Go to the documentation of this file.
/**
 * @file chunk_strategies.hpp
 * @brief Defines strategies for splitting a data sequence into chunks based on
 *        different criteria (fixed size, predicate, variance, entropy).
 */
5
6#pragma once
7
#include "chunk_common.hpp"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <map>
#include <memory>
#include <numeric>
#include <utility>
#include <vector>
15
// Forward declarations of every strategy class template defined in this header.
namespace chunk_processing {
template <typename T>
class ChunkStrategy;
template <typename T>
class PatternBasedStrategy;
template <typename T>
class VarianceStrategy;
template <typename T>
class EntropyStrategy;
} // namespace chunk_processing
25
26// Include implementation files
28
namespace chunk_processing {

/**
 * @brief Abstract interface for a strategy that splits a sequence into chunks.
 *
 * @tparam T Element type of the sequence being chunked.
 */
template <typename T>
class ChunkStrategy {
public:
    virtual ~ChunkStrategy() = default;

    /**
     * @brief Split @p data into consecutive chunks.
     * @param data Input sequence.
     * @return The chunks produced by the concrete strategy, in input order.
     */
    virtual std::vector<std::vector<T>> apply(const std::vector<T>& data) const = 0;
};

/**
 * @brief Chunks a sequence either into fixed-size groups or at elements
 *        selected by a predicate.
 *
 * Which mode is used depends on the constructor: a non-zero pattern size
 * selects fixed-size chunking, otherwise the predicate is consulted.
 *
 * @tparam T Element type of the sequence being chunked.
 */
template <typename T>
class PatternBasedStrategy : public ChunkStrategy<T> {
private:
    std::function<bool(T)> predicate_;
    std::size_t pattern_size_;

public:
    /**
     * @brief Construct a predicate-based strategy.
     * @param predicate Called for every element after the first of a chunk;
     *                  returning true starts a new chunk at that element.
     */
    explicit PatternBasedStrategy(std::function<bool(T)> predicate)
        : predicate_(std::move(predicate)), pattern_size_(0) {}

    /**
     * @brief Construct a fixed-size strategy.
     * @param pattern_size Number of elements per chunk (the last chunk may be
     *                     shorter). A size of 0 leaves no splitting rule.
     */
    explicit PatternBasedStrategy(std::size_t pattern_size) : pattern_size_(pattern_size) {}

    /**
     * @brief Split @p data according to the configured mode.
     * @param data Input sequence.
     * @return Empty for empty input. Otherwise fixed-size chunks, or chunks
     *         that each begin at an element accepted by the predicate. When
     *         neither a size nor a callable predicate is configured the whole
     *         input is returned as a single chunk.
     */
    std::vector<std::vector<T>> apply(const std::vector<T>& data) const override {
        std::vector<std::vector<T>> result;
        if (data.empty())
            return result;

        if (pattern_size_ > 0) {
            using Diff = typename std::vector<T>::difference_type;
            const std::size_t total = data.size();
            result.reserve(total / pattern_size_ + (total % pattern_size_ != 0 ? 1 : 0));

            for (std::size_t offset = 0; offset < total;) {
                // Clamp against the remaining length so offset + size cannot wrap.
                const std::size_t length = std::min(pattern_size_, total - offset);
                const auto first = data.begin() + static_cast<Diff>(offset);
                result.emplace_back(first, first + static_cast<Diff>(length));
                offset += length;
            }
            return result;
        }

        if (!predicate_) {
            // No splitting rule at all: invoking the empty std::function would
            // throw std::bad_function_call, so keep the input as one chunk.
            result.push_back(data);
            return result;
        }

        std::vector<T> current_chunk;
        for (const auto& value : data) {
            // The predicate is not consulted for the first element of a chunk,
            // so a chunk is never closed while empty.
            if (!current_chunk.empty() && predicate_(value)) {
                result.push_back(std::move(current_chunk));
                current_chunk.clear();  // restore a defined state after the move
            }
            current_chunk.push_back(value);
        }
        if (!current_chunk.empty()) {
            result.push_back(std::move(current_chunk));
        }
        return result;
    }
};

/**
 * @brief Starts a new chunk when a value deviates too far from the running
 *        mean of the current chunk.
 *
 * @tparam T Element type; must be convertible to double via static_cast.
 */
template <typename T>
class VarianceStrategy : public ChunkStrategy<T> {
private:
    double threshold_;

    /**
     * @brief Update the running mean and return the deviation measure of the
     *        newest sample.
     *
     * The returned value is the squared distance of @p new_value from the
     * updated mean divided by (n - 1). It reflects only the newest sample and
     * is not an accumulated sample variance.
     *
     * @param new_value Sample being added.
     * @param prev_mean Mean of the previous n - 1 samples.
     * @param mean      Receives the mean of all n samples.
     * @param n         Number of samples including @p new_value.
     * @return The deviation measure, or 0.0 when n <= 1.
     */
    double calculate_rolling_variance(const T& new_value, double prev_mean, double& mean,
                                      std::size_t n) const {
        const double sample = static_cast<double>(new_value);
        mean = prev_mean + (sample - prev_mean) / static_cast<double>(n);
        if (n <= 1) {
            return 0.0;
        }
        const double deviation = sample - mean;
        return deviation * deviation / static_cast<double>(n - 1);
    }

public:
    /**
     * @brief Construct the strategy.
     * @param threshold A value whose deviation measure exceeds this threshold
     *                  starts a new chunk.
     */
    explicit VarianceStrategy(double threshold) : threshold_(threshold) {}

    /**
     * @brief Split @p data at values that deviate strongly from the running
     *        mean of the current chunk.
     * @param data Input sequence.
     * @return Chunks whose concatenation equals @p data; empty for empty input.
     */
    std::vector<std::vector<T>> apply(const std::vector<T>& data) const override {
        std::vector<std::vector<T>> result;
        if (data.empty())
            return result;

        std::vector<T> current_chunk;
        double mean = 0.0;

        for (const auto& value : data) {
            if (current_chunk.empty()) {
                // First element of the input seeds the running mean.
                current_chunk.push_back(value);
                mean = static_cast<double>(value);
                continue;
            }

            double updated_mean = 0.0;
            const double variance =
                calculate_rolling_variance(value, mean, updated_mean, current_chunk.size() + 1);

            if (variance > threshold_) {
                // Close the chunk WITHOUT the triggering value; that value
                // starts the next chunk and must not appear in both.
                result.push_back(std::move(current_chunk));
                current_chunk.clear();  // restore a defined state after the move
                current_chunk.push_back(value);
                mean = static_cast<double>(value);
            } else {
                current_chunk.push_back(value);
                mean = updated_mean;
            }
        }

        if (!current_chunk.empty()) {
            result.push_back(std::move(current_chunk));
        }

        return result;
    }
};

/**
 * @brief Closes a chunk as soon as the Shannon entropy (in bits) of its
 *        elements exceeds a threshold.
 *
 * @tparam T Element type; must be usable as a std::map key.
 */
template <typename T>
class EntropyStrategy : public ChunkStrategy<T> {
private:
    double threshold_;

    /**
     * @brief Compute the entropy of a frequency table.
     * @param freq  Occurrence count per distinct value.
     * @param total Sum of all counts.
     * @return Sum over all entries of -p * log2(p) with p = count / total.
     */
    static double entropy_from_counts(const std::map<T, double>& freq, double total) {
        double entropy = 0.0;
        for (const auto& entry : freq) {
            const double p = entry.second / total;
            entropy -= p * std::log2(p);
        }
        return entropy;
    }

    /**
     * @brief Compute the entropy of the values in @p chunk.
     * @param chunk Values to analyse.
     * @return Entropy in bits; 0.0 for an empty chunk.
     */
    double calculate_entropy(const std::vector<T>& chunk) const {
        if (chunk.empty())
            return 0.0;

        std::map<T, double> freq;
        for (const auto& val : chunk) {
            freq[val] += 1.0;
        }
        return entropy_from_counts(freq, static_cast<double>(chunk.size()));
    }

public:
    /**
     * @brief Construct the strategy.
     * @param threshold Entropy above which the current chunk is closed. A
     *                  value <= 0 disables splitting.
     */
    explicit EntropyStrategy(double threshold) : threshold_(threshold) {}

    /**
     * @brief Split @p data whenever the entropy of the growing chunk exceeds
     *        the threshold; the element that pushes it over stays in the
     *        chunk being closed.
     * @param data Input sequence.
     * @return Chunks whose concatenation equals @p data; empty for empty input.
     */
    std::vector<std::vector<T>> apply(const std::vector<T>& data) const override {
        std::vector<std::vector<T>> result;
        if (data.empty())
            return result;

        // A non-positive threshold disables splitting: return one chunk.
        if (threshold_ <= 0.0) {
            result.push_back(data);
            return result;
        }

        std::vector<T> current_chunk;
        // Frequency table of current_chunk, maintained incrementally so the
        // chunk is not re-counted from scratch for every appended element.
        std::map<T, double> freq;

        for (const auto& value : data) {
            current_chunk.push_back(value);
            freq[value] += 1.0;

            if (current_chunk.size() > 1) {
                const double entropy =
                    entropy_from_counts(freq, static_cast<double>(current_chunk.size()));
                if (entropy > threshold_) {
                    result.push_back(std::move(current_chunk));
                    current_chunk.clear();  // restore a defined state after the move
                    freq.clear();
                }
            }
        }

        if (!current_chunk.empty()) {
            result.push_back(std::move(current_chunk));
        }

        return result;
    }
};

} // namespace chunk_processing
virtual std::vector< std::vector< T > > apply(const std::vector< T > &data) const =0
virtual ~ChunkStrategy()=default
double calculate_entropy(const std::vector< T > &chunk) const
std::vector< std::vector< T > > apply(const std::vector< T > &data) const override
std::vector< std::vector< T > > apply(const std::vector< T > &data) const override
PatternBasedStrategy(std::function< bool(T)> predicate)
double calculate_rolling_variance(const T &new_value, double prev_mean, double &mean, size_t n) const
std::vector< std::vector< T > > apply(const std::vector< T > &data) const override
Advanced sub-chunking strategies for hierarchical data processing.