Advanced Chunk Processing Library 0.2.0
A comprehensive C++ library for advanced data chunking strategies and processing operations
Loading...
Searching...
No Matches
advanced_structures.hpp
Go to the documentation of this file.
1#pragma once
2
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdlib>
#include <deque>
#include <memory>
#include <random>
#include <stack>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/connected_components.hpp>
15
17
/**
 * @brief Default NLP model used by SemanticChunker when no custom model is given.
 *
 * Provides a crude, length-based similarity metric; real deployments are
 * expected to substitute a proper model via the ModelType template parameter.
 */
struct DefaultNLPModel {
    /**
     * @brief Compute a similarity score between two pieces of content.
     *
     * Similarity is 1.0 minus the relative difference in length, i.e. two
     * contents of equal size score 1.0 regardless of their actual contents.
     *
     * @param content1 First content (any type exposing size())
     * @param content2 Second content
     * @return Score in [0.0, 1.0]
     */
    template <typename ContentType>
    double calculateSimilarity(const ContentType& content1, const ContentType& content2) {
        // Two empty contents are trivially identical; also avoids 0/0 -> NaN below.
        if (content1.empty() && content2.empty()) {
            return 1.0;
        }
        return 1.0 - static_cast<double>(std::abs(static_cast<int>(content1.size()) -
                                                  static_cast<int>(content2.size()))) /
                         std::max(content1.size(), content2.size());
    }
};
28
29/**
30 * @brief Template class for semantic-based content chunking
31 *
32 * SemanticChunker splits content based on semantic boundaries using
33 * configurable NLP models and similarity metrics.
34 *
35 * @tparam ContentType Type of content to be chunked
36 * @tparam ModelType Type of NLP model to use for similarity calculations
37 */
38template <typename ContentType, typename ModelType = DefaultNLPModel>
40private:
41 ModelType model; ///< NLP model instance
42 double similarity_threshold; ///< Threshold for determining chunk boundaries
43
44public:
45 /**
46 * @brief Construct a new Semantic Chunker
47 *
48 * @param threshold Similarity threshold for chunk boundaries (default: 0.7)
49 * @param custom_model Custom NLP model instance (optional)
50 */
51 explicit SemanticChunker(double threshold = 0.7, ModelType custom_model = ModelType())
52 : model(custom_model), similarity_threshold(threshold) {}
53
54 /**
55 * @brief Chunk content based on semantic boundaries
56 *
57 * @param content Input content to be chunked
58 * @return std::vector<ContentType> Vector of content chunks
59 */
60 std::vector<ContentType> chunk(const ContentType& content);
61
62 /**
63 * @brief Set a new NLP model
64 *
65 * @param new_model New model instance to use
66 */
67 void setModel(ModelType new_model) {
68 model = new_model;
69 }
70
71 /**
72 * @brief Set new similarity threshold
73 *
74 * @param threshold New threshold value between 0.0 and 1.0
75 */
76 void setSimilarityThreshold(double threshold) {
77 similarity_threshold = threshold;
78 }
79};
80
81/**
82 * @brief Specialization of SemanticChunker for string content
83 *
84 * Provides optimized implementation for string-based content chunking.
85 *
86 * @tparam ModelType Type of NLP model to use
87 */
88template <typename ModelType>
89class SemanticChunker<std::string, ModelType> {
90private:
91 ModelType model; ///< NLP model instance
92 double similarity_threshold; ///< Similarity threshold for chunking
93
94public:
95 /**
96 * @brief Construct a new string-specialized Semantic Chunker
97 *
98 * @param threshold Similarity threshold (default: 0.7)
99 * @param custom_model Custom NLP model instance
100 */
101 explicit SemanticChunker(double threshold = 0.7, ModelType custom_model = ModelType())
102 : model(custom_model), similarity_threshold(threshold) {}
103
104 /**
105 * @brief Chunk string content into semantic segments
106 *
107 * @param content Input string to be chunked
108 * @return std::vector<std::string> Vector of string chunks
109 */
110 std::vector<std::string> chunk(const std::string& content) {
111 std::vector<std::string> chunks;
112 if (content.empty()) {
113 return chunks;
114 }
115
116 // Split by sentences (simple implementation)
117 size_t start = 0;
118 size_t pos = 0;
119 std::string current_chunk;
120
121 while ((pos = content.find_first_of(".!?", start)) != std::string::npos) {
122 // Include the punctuation mark and any following whitespace
123 size_t end = pos + 1;
124 while (end < content.length() && std::isspace(content[end])) {
125 end++;
126 }
127
128 // Extract the sentence
129 std::string sentence = content.substr(start, end - start);
130 if (!sentence.empty()) {
131 chunks.push_back(sentence);
132 }
133
134 start = end;
135 }
136
137 // Add remaining content if any
138 if (start < content.length()) {
139 chunks.push_back(content.substr(start));
140 }
141
142 return chunks;
143 }
144};
145
/**
 * @brief A skip list implementation for efficient chunk searching
 * @tparam T The type of elements stored in the skip list
 */
template <typename T>
class ChunkSkipList {
private:
    struct Node {
        T value;                                     ///< Stored element
        std::vector<std::shared_ptr<Node>> forward;  ///< forward[i] = next node at level i
        explicit Node(T val, int level) : value(val), forward(level) {}
    };

    std::shared_ptr<Node> head; ///< Sentinel node with forward pointers at every level
    int max_level;              ///< Maximum number of levels
    float p;                    ///< Probability of promoting a node one level up
    int current_level;          ///< Highest level currently in use

    /**
     * @brief Generates a random level for node insertion
     * @return The random level in [1, max_level]
     */
    int random_level() {
        int lvl = 1;
        // Geometric distribution: promote with probability p per level.
        while ((static_cast<float>(rand()) / RAND_MAX) < p && lvl < max_level) {
            lvl++;
        }
        return lvl;
    }

public:
    /**
     * @brief Constructs a new ChunkSkipList object
     * @param max_lvl Maximum level of the skip list
     * @param prob Probability factor for level generation
     */
    ChunkSkipList(int max_lvl = 16, float prob = 0.5)
        : max_level(max_lvl), p(prob), current_level(1) {
        head = std::make_shared<Node>(T(), max_level);
    }

    /**
     * @brief Inserts a value into the skip list
     * @param value The value to insert
     */
    void insert(const T& value) {
        std::vector<std::shared_ptr<Node>> update(max_level);
        auto current = head;

        // Find the rightmost node < value on each level, top-down.
        for (int i = current_level - 1; i >= 0; i--) {
            while (current->forward[i] && current->forward[i]->value < value) {
                current = current->forward[i];
            }
            update[i] = current;
        }

        int new_level = random_level();
        if (new_level > current_level) {
            // New levels start from the head sentinel.
            for (int i = current_level; i < new_level; i++) {
                update[i] = head;
            }
            current_level = new_level;
        }

        auto new_node = std::make_shared<Node>(value, new_level);
        for (int i = 0; i < new_level; i++) {
            new_node->forward[i] = update[i]->forward[i];
            update[i]->forward[i] = new_node;
        }
    }

    /**
     * @brief Searches for a value in the skip list
     * @param value The value to search for
     * @return True if the value is found, false otherwise
     */
    bool search(const T& value) const {
        auto current = head;
        for (int i = current_level - 1; i >= 0; i--) {
            while (current->forward[i] && current->forward[i]->value < value) {
                current = current->forward[i];
            }
        }
        current = current->forward[0];
        return current && current->value == value;
    }
};
233
/**
 * @brief A B+ tree implementation for chunk indexing
 * @tparam T The type of elements stored in the tree
 */
template <typename T>
class ChunkBPlusTree {
    static constexpr int ORDER = 4; ///< Maximum keys per node is ORDER - 1

    struct Node {
        bool is_leaf;                                ///< True for leaf nodes
        std::vector<T> keys;                         ///< Sorted keys
        std::vector<std::shared_ptr<Node>> children; ///< Child pointers (internal nodes only)
        std::shared_ptr<Node> next;                  ///< Leaf-chain link (not used by current ops)

        Node(bool leaf = true) : is_leaf(leaf) {}
    };

    std::shared_ptr<Node> root;

public:
    /**
     * @brief Constructs a new ChunkBPlusTree object
     */
    ChunkBPlusTree() : root(std::make_shared<Node>()) {}

    /**
     * @brief Inserts a key into the B+ tree
     * @param key The key to insert
     */
    void insert(const T& key) {
        if (root->keys.empty()) {
            root->keys.push_back(key);
            return;
        }

        // Split a full root before descending, growing the tree by one level.
        if (root->keys.size() == static_cast<std::size_t>(ORDER - 1)) {
            auto new_root = std::make_shared<Node>(false);
            new_root->children.push_back(root);
            split_child(new_root, 0);
            root = new_root;
        }
        insert_non_full(root, key);
    }

    /**
     * @brief Searches for a key in the B+ tree
     * @param key The key to search for
     * @return True if the key is found, false otherwise
     */
    bool search(const T& key) const {
        if (root == nullptr)
            return false;

        return search_node(root, key);
    }

private:
    /// Split the full child at `index` of `parent`, promoting the middle key.
    void split_child(std::shared_ptr<Node> parent, int index) {
        auto child = parent->children[index];
        auto new_child = std::make_shared<Node>(child->is_leaf);

        // Promote the middle key into the parent.
        parent->keys.insert(parent->keys.begin() + index, child->keys[ORDER / 2 - 1]);

        parent->children.insert(parent->children.begin() + index + 1, new_child);

        // Upper half of keys moves to the new right sibling.
        new_child->keys.assign(child->keys.begin() + ORDER / 2, child->keys.end());

        child->keys.resize(ORDER / 2 - 1);

        if (!child->is_leaf) {
            new_child->children.assign(child->children.begin() + ORDER / 2, child->children.end());
            child->children.resize(ORDER / 2);
        }
    }

    /// Insert `key` into the subtree rooted at a non-full `node`.
    void insert_non_full(std::shared_ptr<Node> node, const T& key) {
        int i = static_cast<int>(node->keys.size()) - 1;

        if (node->is_leaf) {
            // Shift larger keys right and place the new key in order.
            node->keys.push_back(T());
            while (i >= 0 && key < node->keys[i]) {
                node->keys[i + 1] = node->keys[i];
                i--;
            }
            node->keys[i + 1] = key;
        } else {
            // Find the child subtree that should receive the key.
            while (i >= 0 && key < node->keys[i]) {
                i--;
            }
            i++;

            if (node->children[i]->keys.size() == static_cast<std::size_t>(ORDER - 1)) {
                split_child(node, i);
                if (key > node->keys[i]) {
                    i++;
                }
            }
            insert_non_full(node->children[i], key);
        }
    }

    /// Recursive lookup of `key` starting at `node`.
    bool search_node(const std::shared_ptr<Node>& node, const T& key) const {
        std::size_t i = 0;
        while (i < node->keys.size() && key > node->keys[i]) {
            i++;
        }

        if (i < node->keys.size() && key == node->keys[i]) {
            return true;
        }

        if (node->is_leaf) {
            return false;
        }

        return search_node(node->children[i], key);
    }
};
352
/**
 * @brief A deque-based chunk structure for double-ended operations
 * @tparam T The type of elements stored in the chunk deque
 */
template <typename T>
class ChunkDeque {
private:
    std::deque<T> data_; ///< Underlying storage

public:
    /// Append a value at the back.
    void push_back(const T& value) {
        data_.push_back(value);
    }

    /// Prepend a value at the front.
    void push_front(const T& value) {
        data_.push_front(value);
    }

    /**
     * @brief Remove and return the back element.
     * @throws std::out_of_range if the deque is empty
     */
    T pop_back() {
        if (data_.empty()) {
            throw std::out_of_range("ChunkDeque::pop_back: empty deque");
        }
        T value = data_.back();
        data_.pop_back();
        return value;
    }

    /**
     * @brief Remove and return the front element.
     * @throws std::out_of_range if the deque is empty
     */
    T pop_front() {
        if (data_.empty()) {
            throw std::out_of_range("ChunkDeque::pop_front: empty deque");
        }
        T value = data_.front();
        data_.pop_front();
        return value;
    }

    /// Number of stored elements.
    size_t size() const {
        return data_.size();
    }

    /// True when no elements are stored.
    bool empty() const {
        return data_.empty();
    }
};
391
/**
 * @brief A stack-based chunk structure for LIFO operations
 * @tparam T The type of elements stored in the chunk stack
 */
template <typename T>
class ChunkStack {
private:
    std::stack<T> data_; ///< Underlying storage

public:
    /// Push a value onto the stack.
    void push(const T& value) {
        data_.push(value);
    }

    /**
     * @brief Remove and return the top element.
     * @throws std::out_of_range if the stack is empty
     */
    T pop() {
        if (data_.empty()) {
            throw std::out_of_range("ChunkStack::pop: empty stack");
        }
        T value = data_.top();
        data_.pop();
        return value;
    }

    /// Number of stored elements.
    size_t size() const {
        return data_.size();
    }

    /// True when no elements are stored.
    bool empty() const {
        return data_.empty();
    }
};
420
/**
 * @brief A treap implementation for efficient chunk searching and manipulation
 *
 * A randomized balanced BST: ordered by value, heap-ordered by random priority.
 * @tparam T The type of elements stored in the treap
 */
template <typename T>
class ChunkTreap {
private:
    struct Node {
        T value;                     ///< BST key
        int priority;                ///< Heap priority (random)
        std::shared_ptr<Node> left;
        std::shared_ptr<Node> right;

        Node(T val, int prio) : value(val), priority(prio), left(nullptr), right(nullptr) {}
    };

    std::shared_ptr<Node> root;
    std::mt19937 gen; ///< RNG for node priorities

    /// Rotate `node` right; its left child becomes the subtree root.
    std::shared_ptr<Node> rotate_right(std::shared_ptr<Node> node) {
        auto new_root = node->left;
        node->left = new_root->right;
        new_root->right = node;
        return new_root;
    }

    /// Rotate `node` left; its right child becomes the subtree root.
    std::shared_ptr<Node> rotate_left(std::shared_ptr<Node> node) {
        auto new_root = node->right;
        node->right = new_root->left;
        new_root->left = node;
        return new_root;
    }

    /// Recursive BST insert, then rotate to restore the heap property.
    std::shared_ptr<Node> insert(std::shared_ptr<Node> node, T value, int priority) {
        if (!node) {
            return std::make_shared<Node>(value, priority);
        }

        if (value < node->value) {
            node->left = insert(node->left, value, priority);
            if (node->left->priority > node->priority) {
                node = rotate_right(node);
            }
        } else {
            // Duplicates go to the right subtree.
            node->right = insert(node->right, value, priority);
            if (node->right->priority > node->priority) {
                node = rotate_left(node);
            }
        }
        return node;
    }

    /// Standard BST lookup.
    bool search(const std::shared_ptr<Node>& node, T value) const {
        if (!node) {
            return false;
        }
        if (node->value == value) {
            return true;
        }
        if (value < node->value) {
            return search(node->left, value);
        } else {
            return search(node->right, value);
        }
    }

public:
    ChunkTreap() : root(nullptr), gen(std::random_device{}()) {}

    /// Insert a value with a freshly drawn random priority.
    void insert(T value) {
        int priority = std::uniform_int_distribution<>(1, 100)(gen);
        root = insert(root, value, priority);
    }

    /// @return True if the value is present.
    bool search(T value) const {
        return search(root, value);
    }
};
499
/**
 * @brief Semantic boundaries-based chunking implementation
 * @tparam T The type of elements to be chunked
 */
template <typename T>
class SemanticBoundariesChunk {
private:
    double boundary_threshold; ///< Reserved for boundary scoring; default isBoundary ignores it

public:
    explicit SemanticBoundariesChunk(double threshold = 0.5) : boundary_threshold(threshold) {}

    /// Virtual destructor: class is meant to be subclassed via isBoundary().
    virtual ~SemanticBoundariesChunk() = default;

    /**
     * @brief Split data into chunks, closing a chunk whenever isBoundary() fires.
     * @param data Elements to chunk
     * @return Chunks in original order; a trailing partial chunk is kept.
     */
    std::vector<std::vector<T>> chunk(const std::vector<T>& data) {
        std::vector<std::vector<T>> result;
        if (data.empty())
            return result;

        std::vector<T> current_chunk;
        for (const auto& item : data) {
            current_chunk.push_back(item);
            if (isBoundary(current_chunk)) {
                result.push_back(current_chunk);
                current_chunk.clear();
            }
        }

        if (!current_chunk.empty()) {
            result.push_back(current_chunk);
        }

        return result;
    }

protected:
    /// Default boundary rule: close the chunk once it reaches 3 elements.
    virtual bool isBoundary(const std::vector<T>& chunk) {
        return chunk.size() >= 3;
    }
};
538
539/**
540 * @brief Fractal pattern-based chunking implementation
541 * @tparam T The type of elements to be chunked
542 */
543template <typename T>
545private:
548
549public:
550 FractalPatternsChunk(size_t size = 3, double threshold = 0.8)
551 : pattern_size(size), similarity_threshold(threshold) {}
552
553 std::vector<std::vector<T>> chunk(const std::vector<T>& data) {
554 std::vector<std::vector<T>> result;
555 if (data.empty())
556 return result;
557
558 std::vector<T> current_chunk;
559 for (const auto& item : data) {
560 current_chunk.push_back(item);
561 if (hasPattern(current_chunk)) {
562 result.push_back(current_chunk);
563 current_chunk.clear();
564 }
565 }
566
567 if (!current_chunk.empty()) {
568 result.push_back(current_chunk);
569 }
570
571 return result;
572 }
573
574protected:
575 virtual bool hasPattern(const std::vector<T>& chunk) {
576 return chunk.size() >= pattern_size;
577 }
578};
579
580/**
581 * @brief Bloom filter-based chunking implementation
582 * @tparam T The type of elements to be chunked
583 */
584template <typename T>
586private:
589 std::vector<bool> filter;
590
591public:
592 BloomFilterChunk(size_t size = 1024, size_t num_funcs = 3)
593 : filter_size(size), num_hash_functions(num_funcs), filter(size, false) {}
594
595 std::vector<std::vector<T>> chunk(const std::vector<T>& data) {
596 std::vector<std::vector<T>> result;
597 if (data.empty())
598 return result;
599
600 std::vector<T> current_chunk;
601 for (const auto& item : data) {
602 current_chunk.push_back(item);
603 if (shouldSplit(current_chunk)) {
604 result.push_back(current_chunk);
605 current_chunk.clear();
606 }
607 }
608
609 if (!current_chunk.empty()) {
610 result.push_back(current_chunk);
611 }
612
613 return result;
614 }
615
616protected:
617 virtual bool shouldSplit(const std::vector<T>& chunk) {
618 return chunk.size() >= 4; // Default implementation
619 }
620};
621
622/**
623 * @brief Graph-based chunking implementation
624 * @tparam T The type of elements to be chunked
625 */
626template <typename T>
628private:
629 using Graph = boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS>;
631
632public:
633 explicit GraphBasedChunk(double threshold = 0.5) : edge_threshold(threshold) {}
634
635 std::vector<std::vector<T>> chunk(const std::vector<T>& data) {
636 std::vector<std::vector<T>> result;
637 if (data.empty())
638 return result;
639
640 Graph g(data.size());
641 buildGraph(data, g);
642
643 std::vector<int> components(data.size());
644 int num_components = boost::connected_components(g, &components[0]);
645
646 result.resize(num_components);
647 for (size_t i = 0; i < data.size(); ++i) {
648 result[components[i]].push_back(data[i]);
649 }
650
651 return result;
652 }
653
654protected:
655 virtual void buildGraph(const std::vector<T>& data, Graph& g) {
656 // Default implementation: connect adjacent elements
657 for (size_t i = 1; i < data.size(); ++i) {
658 boost::add_edge(i - 1, i, g);
659 }
660 }
661};
662
/**
 * @brief A simple LSM-tree: an unsorted in-memory buffer flushed into sorted,
 *        geometrically growing on-disk-style levels.
 * @tparam T The type of elements stored in the tree
 */
template <typename T>
class ChunkLSMTree {
private:
    struct Level {
        std::vector<T> data; ///< Sorted run
        size_t size_limit;   ///< Capacity before compaction into the next level

        explicit Level(size_t limit) : size_limit(limit) {}

        bool is_full() const {
            return data.size() >= size_limit;
        }
    };

    std::vector<std::shared_ptr<Level>> levels;
    std::vector<T> memtable;    ///< In-memory buffer; unsorted until flushed
    size_t memtable_size_limit; ///< Flush threshold for the memtable
    size_t size_ratio;          ///< Size ratio between successive levels

    /// Merge level `level_idx` into the next level, cascading if it overflows.
    void compact_level(size_t level_idx) {
        if (level_idx >= levels.size() - 1) {
            // Create new level if we're at the last one
            levels.push_back(std::make_shared<Level>(levels[level_idx]->size_limit * size_ratio));
        }

        auto& current_level = levels[level_idx]->data;
        auto& next_level = levels[level_idx + 1]->data;

        // Merge current level into next level (both are sorted runs).
        std::vector<T> merged;
        std::merge(current_level.begin(), current_level.end(), next_level.begin(),
                   next_level.end(), std::back_inserter(merged));

        // Update levels
        current_level.clear();
        next_level = std::move(merged);

        // If next level is full, compact it too
        if (levels[level_idx + 1]->is_full()) {
            compact_level(level_idx + 1);
        }
    }

    /// Sort the memtable and merge it into level 0, compacting if needed.
    void flush_memtable() {
        if (memtable.empty())
            return;

        // Sort memtable before flushing
        std::sort(memtable.begin(), memtable.end());

        if (levels.empty()) {
            levels.push_back(std::make_shared<Level>(memtable_size_limit * size_ratio));
        }

        // Merge memtable with first level
        std::vector<T> merged;
        std::merge(memtable.begin(), memtable.end(), levels[0]->data.begin(),
                   levels[0]->data.end(), std::back_inserter(merged));

        levels[0]->data = std::move(merged);
        memtable.clear();

        // If level 0 is full, trigger compaction
        if (levels[0]->is_full()) {
            compact_level(0);
        }
    }

public:
    ChunkLSMTree(size_t memtable_limit = 1024, size_t ratio = 4)
        : memtable_size_limit(memtable_limit), size_ratio(ratio) {}

    /// Insert a value; flushes automatically when the memtable is full.
    void insert(const T& value) {
        memtable.push_back(value);

        if (memtable.size() >= memtable_size_limit) {
            flush_memtable();
        }
    }

    /// @return True if the value is in the memtable or any level.
    bool search(const T& value) const {
        // The memtable is unsorted between flushes, so binary_search would be
        // invalid here (it requires a partitioned range) — scan linearly.
        if (std::find(memtable.begin(), memtable.end(), value) != memtable.end()) {
            return true;
        }

        // Each level holds a sorted run, so binary search is valid there.
        for (const auto& level : levels) {
            if (std::binary_search(level->data.begin(), level->data.end(), value)) {
                return true;
            }
        }
        return false;
    }

    /// Flush the memtable regardless of its fill level.
    void force_flush() {
        flush_memtable();
    }
};
762
763} // namespace advanced_structures
Bloom filter-based chunking implementation.
virtual bool shouldSplit(const std::vector< T > &chunk)
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
BloomFilterChunk(size_t size=1024, size_t num_funcs=3)
A B+ tree implementation for chunk indexing.
void insert(const T &key)
Inserts a key into the B+ tree.
void insert_non_full(std::shared_ptr< Node > node, const T &key)
void split_child(std::shared_ptr< Node > parent, int index)
bool search(const T &key) const
Searches for a key in the B+ tree.
ChunkBPlusTree()
Constructs a new ChunkBPlusTree object.
bool search_node(const std::shared_ptr< Node > &node, const T &key) const
A deque-based chunk structure for double-ended operations.
ChunkLSMTree(size_t memtable_limit=1024, size_t ratio=4)
std::vector< std::shared_ptr< Level > > levels
A skip list implementation for efficient chunk searching.
ChunkSkipList(int max_lvl=16, float prob=0.5)
Constructs a new ChunkSkipList object.
int random_level()
Generates a random level for node insertion.
void insert(const T &value)
Inserts a value into the skip list.
bool search(const T &value) const
Searches for a value in the skip list.
A stack-based chunk structure for LIFO operations.
A treap implementation for efficient chunk searching and manipulation.
std::shared_ptr< Node > insert(std::shared_ptr< Node > node, T value, int priority)
std::shared_ptr< Node > rotate_right(std::shared_ptr< Node > node)
std::shared_ptr< Node > rotate_left(std::shared_ptr< Node > node)
bool search(const std::shared_ptr< Node > &node, T value) const
Fractal pattern-based chunking implementation.
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
virtual bool hasPattern(const std::vector< T > &chunk)
FractalPatternsChunk(size_t size=3, double threshold=0.8)
Graph-based chunking implementation.
boost::adjacency_list< boost::vecS, boost::vecS, boost::undirectedS > Graph
virtual void buildGraph(const std::vector< T > &data, Graph &g)
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
Semantic boundaries-based chunking implementation.
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
virtual bool isBoundary(const std::vector< T > &chunk)
std::vector< std::string > chunk(const std::string &content)
Chunk string content into semantic segments.
SemanticChunker(double threshold=0.7, ModelType custom_model=ModelType())
Construct a new string-specialized Semantic Chunker.
Template class for semantic-based content chunking.
std::vector< ContentType > chunk(const ContentType &content)
Chunk content based on semantic boundaries.
SemanticChunker(double threshold=0.7, ModelType custom_model=ModelType())
Construct a new Semantic Chunker.
double similarity_threshold
Threshold for determining chunk boundaries.
void setModel(ModelType new_model)
Set a new NLP model.
void setSimilarityThreshold(double threshold)
Set new similarity threshold.
std::vector< std::shared_ptr< Node > > children
std::vector< std::shared_ptr< Node > > forward
double calculateSimilarity(const ContentType &content1, const ContentType &content2)