Advanced Chunk Processing Library 0.2.0
A comprehensive C++ library for advanced data chunking strategies and processing operations
Loading...
Searching...
No Matches
sophisticated_chunking::MutualInformationChunking< T > Class Template Reference

Information theory based chunking using mutual information. More...

#include <sophisticated_chunking.hpp>

Public Member Functions

 MutualInformationChunking (size_t context_size=5, double mi_threshold=0.3)
 Constructor for mutual information based chunking.
 
std::vector< std::vector< T > > chunk (const std::vector< T > &data) const
 Chunk data based on mutual information analysis.
 
size_t get_context_size () const
 Get the size of context window.
 
double get_mi_threshold () const
 Get the threshold for mutual information.
 
void set_context_size (size_t size)
 Set the size of context window.
 
void set_mi_threshold (double threshold)
 Set the threshold for mutual information.
 

Private Member Functions

double calculateMutualInformation (const std::vector< T > &segment1, const std::vector< T > &segment2) const
 Calculate mutual information between adjacent segments.
 

Private Attributes

size_t context_size_
 
double mi_threshold_
 

Detailed Description

template<typename T>
class sophisticated_chunking::MutualInformationChunking< T >

Information theory based chunking using mutual information.

Template Parameters
T: The type of elements to be chunked

Definition at line 175 of file sophisticated_chunking.hpp.

Constructor & Destructor Documentation

◆ MutualInformationChunking()

template<typename T >
sophisticated_chunking::MutualInformationChunking< T >::MutualInformationChunking ( size_t  context_size = 5,
double  mi_threshold = 0.3 
)
inline

Constructor for mutual information based chunking.

Parameters
context_size: Size of the context window
mi_threshold: Threshold for mutual information

Definition at line 230 of file sophisticated_chunking.hpp.

Member Function Documentation

◆ calculateMutualInformation()

template<typename T >
double sophisticated_chunking::MutualInformationChunking< T >::calculateMutualInformation ( const std::vector< T > &  segment1,
const std::vector< T > &  segment2 
) const
inline, private

Calculate mutual information between adjacent segments.

Parameters
segment1: First segment
segment2: Second segment
Returns
Mutual information value

Definition at line 186 of file sophisticated_chunking.hpp.

187 {
188 if (segment1.empty() || segment2.empty()) {
189 return 0.0;
190 }
191
192 // Calculate frequency distributions
193 std::map<T, double> p1, p2;
194 std::map<std::pair<T, T>, double> p12;
195
196 for (const auto& val : segment1) {
197 p1[val] += 1.0 / segment1.size();
198 }
199
200 for (const auto& val : segment2) {
201 p2[val] += 1.0 / segment2.size();
202 }
203
204 // Calculate joint distribution
205 size_t min_size = std::min(segment1.size(), segment2.size());
206 for (size_t i = 0; i < min_size; ++i) {
207 p12[{segment1[i], segment2[i]}] += 1.0 / min_size;
208 }
209
210 // Calculate mutual information
211 double mi = 0.0;
212 for (const auto& [val1, prob1] : p1) {
213 for (const auto& [val2, prob2] : p2) {
214 auto joint_prob = p12[{val1, val2}];
215 if (joint_prob > 0) {
216 mi += joint_prob * std::log2(joint_prob / (prob1 * prob2));
217 }
218 }
219 }
220
221 return mi;
222 }

Referenced by sophisticated_chunking::MutualInformationChunking< T >::chunk().

◆ chunk()

template<typename T >
std::vector< std::vector< T > > sophisticated_chunking::MutualInformationChunking< T >::chunk ( const std::vector< T > &  data) const
inline

Chunk data based on mutual information analysis.

Parameters
data: Input data to be chunked
Returns
Vector of chunks

Definition at line 238 of file sophisticated_chunking.hpp.

238 {
239 if (data.size() < 2 * context_size_) {
240 return {data};
241 }
242
243 std::vector<std::vector<T>> chunks;
244 std::vector<T> current_chunk;
245
246 for (size_t i = 0; i < data.size(); ++i) {
247 current_chunk.push_back(data[i]);
248
249 if (current_chunk.size() >= context_size_ && i + context_size_ < data.size()) {
250 std::vector<T> next_segment(data.begin() + i + 1,
251 data.begin() +
252 std::min(i + 1 + context_size_, data.size()));
253
254 double mi = calculateMutualInformation(current_chunk, next_segment);
255
256 if (mi < mi_threshold_) {
257 chunks.push_back(current_chunk);
258 current_chunk.clear();
259 }
260 }
261 }
262
263 if (!current_chunk.empty()) {
264 chunks.push_back(current_chunk);
265 }
266
267 return chunks;
268 }
double calculateMutualInformation(const std::vector< T > &segment1, const std::vector< T > &segment2) const
Calculate mutual information between adjacent segments.

References sophisticated_chunking::MutualInformationChunking< T >::calculateMutualInformation(), sophisticated_chunking::MutualInformationChunking< T >::context_size_, and sophisticated_chunking::MutualInformationChunking< T >::mi_threshold_.

Referenced by demonstrate_mutual_information_chunking(), TEST_F(), and TEST_F().

◆ get_context_size()

template<typename T >
size_t sophisticated_chunking::MutualInformationChunking< T >::get_context_size ( ) const
inline

Get the size of context window.

Returns
Size of context window

Definition at line 274 of file sophisticated_chunking.hpp.

274 {
275 return context_size_;
276 }

References sophisticated_chunking::MutualInformationChunking< T >::context_size_.

◆ get_mi_threshold()

template<typename T >
double sophisticated_chunking::MutualInformationChunking< T >::get_mi_threshold ( ) const
inline

Get the threshold for mutual information.

Returns
Threshold for mutual information

Definition at line 282 of file sophisticated_chunking.hpp.

282 {
283 return mi_threshold_;
284 }

References sophisticated_chunking::MutualInformationChunking< T >::mi_threshold_.

◆ set_context_size()

template<typename T >
void sophisticated_chunking::MutualInformationChunking< T >::set_context_size ( size_t  size)
inline

Set the size of context window.

Parameters
size: Size of the context window; must be non-zero, otherwise std::invalid_argument is thrown

Definition at line 290 of file sophisticated_chunking.hpp.

290 {
291 if (size == 0)
292 throw std::invalid_argument("Context size cannot be zero");
293 context_size_ = size;
294 }

References sophisticated_chunking::MutualInformationChunking< T >::context_size_.

◆ set_mi_threshold()

template<typename T >
void sophisticated_chunking::MutualInformationChunking< T >::set_mi_threshold ( double  threshold)
inline

Set the threshold for mutual information.

Parameters
threshold: Threshold for mutual information

Definition at line 300 of file sophisticated_chunking.hpp.

300 {
301 mi_threshold_ = threshold;
302 }

References sophisticated_chunking::MutualInformationChunking< T >::mi_threshold_.

Member Data Documentation

◆ context_size_

◆ mi_threshold_


The documentation for this class was generated from the following file: