Advanced Chunk Processing Library 0.2.0
A comprehensive C++ library for advanced data chunking strategies and processing operations
Loading...
Searching...
No Matches
chunk.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "chunk_common.hpp"
4#include <numeric>
5#include <stdexcept>
6#include <string>
7#include <vector>
8
9namespace chunk_processing {
10
11/**
12 * @brief A template class for managing and processing data in chunks
13 * @tparam T The type of elements stored in the chunk
14 */
15template <typename T>
17private:
19 std::vector<T> data_;
20 std::vector<std::vector<T>> chunks_;
21
22 // Helper class for handling jagged arrays
23 template <typename U>
25 public:
26 static std::vector<std::vector<U>> normalize(const std::vector<std::vector<U>>& jagged) {
27 if (jagged.empty())
28 return {};
29
30 // Find maximum size
31 size_t max_size = 0;
32 for (const auto& row : jagged) {
33 max_size = std::max(max_size, row.size());
34 }
35
36 // Normalize by padding with default values
37 std::vector<std::vector<U>> normalized;
38 normalized.reserve(jagged.size());
39
40 for (const auto& row : jagged) {
41 std::vector<U> normalized_row = row;
42 normalized_row.resize(max_size, U{}); // Pad with default values
43 normalized.push_back(std::move(normalized_row));
44 }
45
46 return normalized;
47 }
48
49 static std::vector<std::vector<std::vector<U>>>
50 normalize_3d(const std::vector<std::vector<std::vector<U>>>& jagged_3d) {
51 if (jagged_3d.empty())
52 return {};
53
54 // Find maximum sizes
55 size_t max_rows = 0, max_cols = 0;
56 for (const auto& matrix : jagged_3d) {
57 max_rows = std::max(max_rows, matrix.size());
58 for (const auto& row : matrix) {
59 max_cols = std::max(max_cols, row.size());
60 }
61 }
62
63 // Normalize by padding
64 std::vector<std::vector<std::vector<U>>> normalized;
65 normalized.reserve(jagged_3d.size());
66
67 for (const auto& matrix : jagged_3d) {
68 std::vector<std::vector<U>> norm_matrix;
69 norm_matrix.reserve(max_rows);
70
71 for (const auto& row : matrix) {
72 std::vector<U> norm_row = row;
73 norm_row.resize(max_cols, U{});
74 norm_matrix.push_back(std::move(norm_row));
75 }
76
77 while (norm_matrix.size() < max_rows) {
78 norm_matrix.push_back(std::vector<U>(max_cols));
79 }
80
81 normalized.push_back(std::move(norm_matrix));
82 }
83
84 return normalized;
85 }
86 };
87
88 // Helper function for validation
89 void validate_size(size_t size, const std::string& param) const {
90 if (size == 0) {
91 throw std::invalid_argument(param + " must be greater than 0");
92 }
93 }
94 // Ensure data is properly copied
95 std::vector<std::vector<T>> make_chunks(size_t size) const {
96 std::vector<std::vector<T>> result;
97 result.reserve((data_.size() + size - 1) / size);
98
99 for (size_t i = 0; i < data_.size(); i += size) {
100 size_t chunk_end = std::min(i + size, data_.size());
101 result.emplace_back(data_.begin() + i, data_.begin() + chunk_end);
102 }
103 return result;
104 }
105
107 chunks_.clear();
108 for (size_t i = 0; i < data_.size(); i += chunk_size_) {
109 std::vector<T> chunk;
110 for (size_t j = 0; j < chunk_size_ && i + j < data_.size(); ++j) {
111 chunk.push_back(data_[i + j]);
112 }
113 chunks_.push_back(chunk);
114 }
115 }
116
117 // Add support for checking dimensionality
118 template <typename U>
119 static constexpr size_t get_depth() {
120 if constexpr (is_vector<U>::value)
121 return 1 + get_depth<typename U::value_type>();
122 return 0;
123 }
124
125public:
126 explicit Chunk(size_t chunk_size = 1) : chunk_size_(chunk_size) {
127 validate_size(chunk_size, "Chunk size");
128 }
129
130 void add(const T& element) {
131 data_.push_back(element);
132 update_chunks();
133 }
134
135 void add(const std::vector<T>& elements) {
136 data_.insert(data_.end(), elements.begin(), elements.end());
137 update_chunks();
138 }
139
140 std::vector<std::vector<T>> chunk_by_size(size_t size) {
141 if (data_.empty()) {
142 throw std::invalid_argument("Cannot chunk empty data");
143 }
144 if (size == 0) {
145 throw std::invalid_argument("Chunk size cannot be zero");
146 }
147 return make_chunks(size);
148 }
149
150 std::vector<std::vector<T>> chunk_by_threshold(T threshold) {
151 if (data_.empty()) {
152 throw std::invalid_argument("Cannot chunk empty data");
153 }
154 if (data_.size() < chunk_size_) {
155 throw std::invalid_argument("Input size must be at least chunk size");
156 }
157 if (threshold <= T{}) {
158 throw std::invalid_argument("Threshold must be positive");
159 }
160
161 std::vector<std::vector<T>> result;
162 std::vector<T> current_chunk;
163 T running_sum = T{};
164
165 for (const T& value : data_) {
166 if (running_sum + value > threshold && !current_chunk.empty()) {
167 result.push_back(current_chunk);
168 current_chunk.clear();
169 running_sum = T{};
170 }
171 current_chunk.push_back(value);
172 running_sum += value;
173 }
174
175 if (!current_chunk.empty()) {
176 result.push_back(current_chunk);
177 }
178
179 return result;
180 }
181
182 std::vector<std::vector<T>> get_chunks() const {
183 return chunks_;
184 }
185
186 size_t size() const {
187 return data_.size();
188 }
189
190 size_t chunk_count() const {
191 return (data_.size() + chunk_size_ - 1) / chunk_size_;
192 }
193
194 size_t get_chunk_size() const {
195 return chunk_size_;
196 }
197 const std::vector<T>& get_data() const {
198 return data_;
199 }
200
201 void set_chunk_size(size_t new_size) {
202 validate_size(new_size, "Chunk size");
203 chunk_size_ = new_size;
204 update_chunks();
205 }
206
207 // Add methods to handle multi-dimensional data
208 template <typename U = T>
209 std::enable_if_t<is_vector<U>::value> add(const U& nested_data) {
210 // First validate dimensions
211 if (chunk_processing::is_jagged(nested_data)) {
212 throw std::invalid_argument("Jagged arrays are not supported");
213 }
214
215 // For 3D arrays, check additional level
218 reinterpret_cast<const std::vector<
219 std::vector<std::vector<typename U::value_type::value_type>>>&>(
220 nested_data))) {
221 throw std::invalid_argument("Jagged 3D arrays are not supported");
222 }
223 }
224
225 data_.push_back(nested_data);
226 update_chunks();
227 }
228
229 // Get the dimensionality of the data
230 static constexpr size_t dimensions() {
231 return get_depth<T>();
232 }
233
234 // Add methods to handle jagged arrays
235 template <typename U>
236 std::vector<std::vector<U>> handle_jagged_2d(const std::vector<std::vector<U>>& data) {
237 if (!chunk_processing::is_jagged(data)) {
238 return data; // Already uniform
239 }
241 }
242
243 template <typename U>
244 std::vector<std::vector<std::vector<U>>>
245 handle_jagged_3d(const std::vector<std::vector<std::vector<U>>>& data) {
247 return data; // Already uniform
248 }
250 }
251
252 template <typename U>
253 void validate_dimensions(const std::vector<U>& data, size_t expected_size = 0) {
255 // For jagged arrays, normalize instead of throwing error
256 if (chunk_processing::is_jagged(data)) {
257 auto normalized = handle_jagged_2d(data);
258 if (expected_size > 0 && normalized.size() != expected_size) {
259 throw std::invalid_argument("Inconsistent dimensions after normalization");
260 }
261 return;
262 }
263
264 // For 3D arrays, handle jagged data similarly
267 reinterpret_cast<const std::vector<
268 std::vector<std::vector<typename U::value_type::value_type>>>&>(
269 data))) {
270 auto normalized = handle_jagged_3d(data);
271 if (expected_size > 0 && normalized.size() != expected_size) {
272 throw std::invalid_argument(
273 "Inconsistent dimensions after 3D normalization");
274 }
275 return;
276 }
277 }
278
279 // Check size consistency for non-jagged arrays
280 if (expected_size > 0 && data.size() != expected_size) {
281 throw std::invalid_argument("Inconsistent dimensions in nested array");
282 }
283
284 // Recursively validate inner dimensions
285 if (!data.empty()) {
286 validate_dimensions(data[0], data[0].size());
287 }
288 }
289 }
290};
291
292} // namespace chunk_processing
#define CHUNK_EXPORT
static std::vector< std::vector< U > > normalize(const std::vector< std::vector< U > > &jagged)
Definition chunk.hpp:26
static std::vector< std::vector< std::vector< U > > > normalize_3d(const std::vector< std::vector< std::vector< U > > > &jagged_3d)
Definition chunk.hpp:50
A template class for managing and processing data in chunks.
Definition chunk.hpp:16
std::enable_if_t< is_vector< U >::value > add(const U &nested_data)
Definition chunk.hpp:209
std::vector< std::vector< T > > make_chunks(size_t size) const
Definition chunk.hpp:95
Chunk(size_t chunk_size=1)
Definition chunk.hpp:126
void validate_dimensions(const std::vector< U > &data, size_t expected_size=0)
Definition chunk.hpp:253
size_t chunk_count() const
Definition chunk.hpp:190
size_t size() const
Definition chunk.hpp:186
static constexpr size_t get_depth()
Definition chunk.hpp:119
std::vector< std::vector< T > > get_chunks() const
Definition chunk.hpp:182
void add(const T &element)
Definition chunk.hpp:130
size_t get_chunk_size() const
Definition chunk.hpp:194
void validate_size(size_t size, const std::string &param) const
Definition chunk.hpp:89
const std::vector< T > & get_data() const
Definition chunk.hpp:197
std::vector< std::vector< T > > chunk_by_size(size_t size)
Definition chunk.hpp:140
std::vector< std::vector< U > > handle_jagged_2d(const std::vector< std::vector< U > > &data)
Definition chunk.hpp:236
std::vector< std::vector< T > > chunk_by_threshold(T threshold)
Definition chunk.hpp:150
std::vector< std::vector< std::vector< U > > > handle_jagged_3d(const std::vector< std::vector< std::vector< U > > > &data)
Definition chunk.hpp:245
std::vector< T > data_
Definition chunk.hpp:19
void add(const std::vector< T > &elements)
Definition chunk.hpp:135
void set_chunk_size(size_t new_size)
Definition chunk.hpp:201
std::vector< std::vector< T > > chunks_
Definition chunk.hpp:20
static constexpr size_t dimensions()
Definition chunk.hpp:230
bool is_jagged(const std::vector< std::vector< T > > &data)
bool is_jagged_3d(const std::vector< std::vector< std::vector< T > > > &data)