4#include <boost/graph/adjacency_list.hpp>
5#include <boost/graph/connected_components.hpp>
13#include <unordered_map>
20 template <
typename ContentType>
23 return 1.0 -
static_cast<double>(std::abs(
static_cast<int>(content1.size()) -
24 static_cast<int>(content2.size()))) /
25 std::max(content1.size(), content2.size());
38template <
typename ContentType,
typename ModelType = DefaultNLPModel>
51 explicit SemanticChunker(
double threshold = 0.7, ModelType custom_model = ModelType())
60 std::vector<ContentType>
chunk(
const ContentType& content);
88template <
typename ModelType>
101 explicit SemanticChunker(
double threshold = 0.7, ModelType custom_model = ModelType())
110 std::vector<std::string>
chunk(
const std::string& content) {
111 std::vector<std::string> chunks;
112 if (content.empty()) {
119 std::string current_chunk;
121 while ((pos = content.find_first_of(
".!?", start)) != std::string::npos) {
123 size_t end = pos + 1;
124 while (end < content.length() && std::isspace(content[end])) {
129 std::string sentence = content.substr(start, end - start);
130 if (!sentence.empty()) {
131 chunks.push_back(sentence);
138 if (start < content.length()) {
139 chunks.push_back(content.substr(start));
170 while ((
static_cast<float>(rand()) / RAND_MAX) <
p && lvl <
max_level) {
192 std::vector<std::shared_ptr<Node>> update(
max_level);
196 while (current->forward[i] && current->forward[i]->value < value) {
197 current = current->forward[i];
210 auto new_node = std::make_shared<Node>(value, new_level);
211 for (
int i = 0; i < new_level; i++) {
212 new_node->forward[i] = update[i]->forward[i];
213 update[i]->forward[i] = new_node;
225 while (current->forward[i] && current->forward[i]->value < value) {
226 current = current->forward[i];
229 current = current->forward[0];
230 return current && current->value == value;
264 if (
root->keys.empty()) {
265 root->keys.push_back(key);
270 auto new_root = std::make_shared<Node>(
false);
271 new_root->children.push_back(
root);
292 auto child = parent->children[index];
293 auto new_child = std::make_shared<Node>(child->is_leaf);
295 parent->keys.insert(parent->keys.begin() + index, child->keys[
ORDER / 2 - 1]);
297 parent->children.insert(parent->children.begin() + index + 1, new_child);
299 new_child->keys.assign(child->keys.begin() +
ORDER / 2, child->keys.end());
301 child->keys.resize(
ORDER / 2 - 1);
303 if (!child->is_leaf) {
304 new_child->children.assign(child->children.begin() +
ORDER / 2, child->children.end());
305 child->children.resize(
ORDER / 2);
310 int i = node->keys.size() - 1;
313 node->keys.push_back(T());
314 while (i >= 0 && key < node->keys[i]) {
315 node->keys[i + 1] = node->keys[i];
318 node->keys[i + 1] = key;
320 while (i >= 0 && key < node->keys[i]) {
325 if (node->children[i]->keys.size() ==
ORDER - 1) {
327 if (key > node->keys[i]) {
335 bool search_node(
const std::shared_ptr<Node>& node,
const T& key)
const {
337 while (i < node->keys.size() && key > node->keys[i]) {
341 if (i < node->keys.size() && key == node->keys[i]) {
364 data_.push_back(value);
368 data_.push_front(value);
372 T value =
data_.back();
378 T value =
data_.front();
388 return data_.empty();
407 T value =
data_.top();
417 return data_.empty();
441 auto new_root = node->left;
442 node->left = new_root->right;
443 new_root->right = node;
448 auto new_root = node->right;
449 node->right = new_root->left;
450 new_root->left = node;
454 std::shared_ptr<Node>
insert(std::shared_ptr<Node> node, T value,
int priority) {
456 return std::make_shared<Node>(value, priority);
459 if (value < node->value) {
460 node->left =
insert(node->left, value, priority);
461 if (node->left->priority > node->priority) {
465 node->right =
insert(node->right, value, priority);
466 if (node->right->priority > node->priority) {
473 bool search(
const std::shared_ptr<Node>& node, T value)
const {
477 if (node->value == value) {
480 if (value < node->value) {
481 return search(node->left, value);
483 return search(node->right, value);
491 int priority = std::uniform_int_distribution<>(1, 100)(
gen);
512 std::vector<std::vector<T>>
chunk(
const std::vector<T>& data) {
513 std::vector<std::vector<T>> result;
517 std::vector<T> current_chunk;
518 for (
const auto& item : data) {
519 current_chunk.push_back(item);
521 result.push_back(current_chunk);
522 current_chunk.clear();
526 if (!current_chunk.empty()) {
527 result.push_back(current_chunk);
535 return chunk.size() >= 3;
553 std::vector<std::vector<T>>
chunk(
const std::vector<T>& data) {
554 std::vector<std::vector<T>> result;
558 std::vector<T> current_chunk;
559 for (
const auto& item : data) {
560 current_chunk.push_back(item);
562 result.push_back(current_chunk);
563 current_chunk.clear();
567 if (!current_chunk.empty()) {
568 result.push_back(current_chunk);
595 std::vector<std::vector<T>>
chunk(
const std::vector<T>& data) {
596 std::vector<std::vector<T>> result;
600 std::vector<T> current_chunk;
601 for (
const auto& item : data) {
602 current_chunk.push_back(item);
604 result.push_back(current_chunk);
605 current_chunk.clear();
609 if (!current_chunk.empty()) {
610 result.push_back(current_chunk);
618 return chunk.size() >= 4;
629 using Graph = boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS>;
635 std::vector<std::vector<T>>
chunk(
const std::vector<T>& data) {
636 std::vector<std::vector<T>> result;
640 Graph g(data.size());
643 std::vector<int> components(data.size());
644 int num_components = boost::connected_components(g, &components[0]);
646 result.resize(num_components);
647 for (
size_t i = 0; i < data.size(); ++i) {
648 result[components[i]].push_back(data[i]);
657 for (
size_t i = 1; i < data.size(); ++i) {
658 boost::add_edge(i - 1, i, g);
677 std::vector<std::shared_ptr<Level>>
levels;
683 if (level_idx >=
levels.size() - 1) {
688 auto& current_level =
levels[level_idx]->data;
689 auto& next_level =
levels[level_idx + 1]->data;
692 std::vector<T> merged;
693 std::merge(current_level.begin(), current_level.end(), next_level.begin(), next_level.end(),
694 std::back_inserter(merged));
697 current_level.clear();
698 next_level = std::move(merged);
701 if (
levels[level_idx + 1]->is_full()) {
718 std::vector<T> merged;
720 std::back_inserter(merged));
722 levels[0]->data = std::move(merged);
726 if (
levels[0]->is_full()) {
750 for (
const auto& level :
levels) {
751 if (std::binary_search(level->data.begin(), level->data.end(), value)) {
Bloom filter-based chunking implementation.
size_t num_hash_functions
virtual bool shouldSplit(const std::vector< T > &chunk)
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
BloomFilterChunk(size_t size=1024, size_t num_funcs=3)
std::vector< bool > filter
A B+ tree implementation for chunk indexing.
void insert(const T &key)
Inserts a key into the B+ tree.
std::shared_ptr< Node > root
void insert_non_full(std::shared_ptr< Node > node, const T &key)
void split_child(std::shared_ptr< Node > parent, int index)
bool search(const T &key) const
Searches for a key in the B+ tree.
static constexpr int ORDER
ChunkBPlusTree()
Constructs a new ChunkBPlusTree object.
bool search_node(const std::shared_ptr< Node > &node, const T &key) const
A deque-based chunk structure for double-ended operations.
void push_back(const T &value)
void push_front(const T &value)
bool search(const T &value) const
ChunkLSMTree(size_t memtable_limit=1024, size_t ratio=4)
std::vector< std::shared_ptr< Level > > levels
void compact_level(size_t level_idx)
size_t memtable_size_limit
std::vector< T > memtable
void insert(const T &value)
A skip list implementation for efficient chunk searching.
ChunkSkipList(int max_lvl=16, float prob=0.5)
Constructs a new ChunkSkipList object.
int random_level()
Generates a random level for node insertion.
void insert(const T &value)
Inserts a value into the skip list.
std::shared_ptr< Node > head
bool search(const T &value) const
Searches for a value in the skip list.
A stack-based chunk structure for LIFO operations.
void push(const T &value)
A treap implementation for efficient chunk searching and manipulation.
std::shared_ptr< Node > root
bool search(T value) const
std::shared_ptr< Node > insert(std::shared_ptr< Node > node, T value, int priority)
std::shared_ptr< Node > rotate_right(std::shared_ptr< Node > node)
std::shared_ptr< Node > rotate_left(std::shared_ptr< Node > node)
bool search(const std::shared_ptr< Node > &node, T value) const
Fractal pattern-based chunking implementation.
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
virtual bool hasPattern(const std::vector< T > &chunk)
FractalPatternsChunk(size_t size=3, double threshold=0.8)
double similarity_threshold
Graph-based chunking implementation.
GraphBasedChunk(double threshold=0.5)
boost::adjacency_list< boost::vecS, boost::vecS, boost::undirectedS > Graph
virtual void buildGraph(const std::vector< T > &data, Graph &g)
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
Semantic boundaries-based chunking implementation.
SemanticBoundariesChunk(double threshold=0.5)
double boundary_threshold
std::vector< std::vector< T > > chunk(const std::vector< T > &data)
virtual bool isBoundary(const std::vector< T > &chunk)
ModelType model
NLP model instance.
std::vector< std::string > chunk(const std::string &content)
Chunk string content into semantic segments.
SemanticChunker(double threshold=0.7, ModelType custom_model=ModelType())
Construct a new string-specialized Semantic Chunker.
double similarity_threshold
Similarity threshold for chunking.
Template class for semantic-based content chunking.
ModelType model
NLP model instance.
std::vector< ContentType > chunk(const ContentType &content)
Chunk content based on semantic boundaries.
SemanticChunker(double threshold=0.7, ModelType custom_model=ModelType())
Construct a new Semantic Chunker.
double similarity_threshold
Threshold for determining chunk boundaries.
void setModel(ModelType new_model)
Set a new NLP model.
void setSimilarityThreshold(double threshold)
Set new similarity threshold.
std::shared_ptr< Node > next
std::vector< std::shared_ptr< Node > > children
std::vector< std::shared_ptr< Node > > forward
std::shared_ptr< Node > right
std::shared_ptr< Node > left
double calculateSimilarity(const ContentType &content1, const ContentType &content2)