TinyLlama.cpp 1.0
A lightweight C++ implementation of the TinyLlama language model
Loading...
Searching...
No Matches
safetensors_loader.h
Go to the documentation of this file.
1#ifndef SAFETENSORS_LOADER_H
2#define SAFETENSORS_LOADER_H
3
4#ifdef _WIN32
5#include <windows.h>
6#else
7#include <fcntl.h>
8#include <sys/mman.h>
9#include <unistd.h>
10#endif
11
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <filesystem> // For directory operations, C++17
#include <functional>
#include <future>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

#include <nlohmann/json.hpp>
24
25#include "logger.h" // Assuming Logger is accessible
26
27struct ModelConfig; // Forward declaration
28
34class ThreadPool; // Forward declaration
35
/**
 * @brief Represents a memory-mapped SafeTensors file (shard).
 *
 * Owns the memory mapping and the underlying file handle. Movable but not
 * copyable: a copy would alias the mapping and release it twice.
 */
struct Shard {
  /// Path to the shard file.
  std::string file_path;

  /// Pointer to the memory-mapped data.
  void* mapped_data = nullptr;

  /// Size of the mapped file in bytes.
  size_t file_size = 0;

  /// Size of the metadata block in bytes.
  uint64_t metadata_size = 0;

  /// Pointer to the start of the metadata block.
  const uint8_t* metadata_ptr = nullptr;

  /// Pointer to the start of the tensor data block.
  const uint8_t* tensor_data_block_ptr = nullptr;

#ifdef _WIN32
  HANDLE file_handle_ = INVALID_HANDLE_VALUE;
  HANDLE mapping_handle_ = NULL;
#else
  int fd_ = -1;
#endif

  /**
   * @brief Opens and memory-maps the shard file at @p fp.
   */
  explicit Shard(const std::string& fp);

  /// Destructor. Cleans up memory mapping and file handles.
  ~Shard();

  // Copy operations are already implicitly deleted by the user-declared move
  // operations; spelling the deletion out documents the intent and matches
  // SafeTensorsLoader, which deletes its copies explicitly.
  Shard(const Shard&) = delete;
  Shard& operator=(const Shard&) = delete;

  /// Move constructor. Transfers ownership of the mapping and handles.
  Shard(Shard&& other) noexcept;

  /// Move assignment operator. Transfers ownership of the mapping and handles.
  Shard& operator=(Shard&& other) noexcept;

  /**
   * @brief Get a pointer to the raw tensor data within this shard.
   * @param local_offset Byte offset within this shard (presumably relative to
   *        the tensor data block — confirm against the .cpp implementation).
   * @param n_bytes Number of bytes the caller intends to read.
   * @return Pointer into the memory-mapped region; not owned by the caller.
   */
  const uint8_t* get_tensor_raw_data(size_t local_offset, size_t n_bytes) const;
};
111
112
121 public:
    // Information about a tensor stored in the SafeTensors file(s).
125 struct TensorInfo {
    // Tensor name as it appears in the safetensors JSON metadata.
126 std::string name;
    // Dtype string recorded in the metadata; convert_tensor_data() below
    // converts raw bytes to FP32 based on this.
127 std::string dtype;
    // Tensor dimensions.
128 std::vector<size_t> shape;
    // Byte offset of the tensor's data — presumably relative to the owning
    // shard's tensor data block; TODO confirm against parse_shard_metadata().
129 size_t data_offset;
    // Size of the tensor's raw data in bytes.
130 size_t nbytes;
    // Key identifying which loaded shard holds this tensor's data.
131 std::string shard_key;
132 };
133
    // Constructs the loader for a .safetensors file or a model directory.
143 explicit SafeTensorsLoader(const std::string& model_load_path);
144
    // NOTE(review): the two collapsed lines below are the destructor (cleans
    // up all memory-mapped shards) and the deleted copy constructor/assignment
    // — this class owns mmapped shards and is intentionally non-copyable (see
    // the member listing rendered at the bottom of this page).
149
152
    // Get a list of all tensor names available in the loaded model.
157 std::vector<std::string> tensor_names() const;
158
    // Get the raw bytes for a tensor, converting to FP32 if needed.
165 std::vector<uint8_t> get_tensor_bytes(const std::string& name) const;
166
    // Get information about a specific tensor.
173 const TensorInfo& get_tensor_info(const std::string& name) const;
174
    // Load all tensors in parallel; returns name -> FP32-converted bytes.
179 std::map<std::string, std::vector<uint8_t>> load_all_tensors_parallel() const;
180
    // Loads model configuration from a JSON file corresponding to a
    // .safetensors model path or directory.
191 static bool load_model_config_from_json(const std::string& model_path_or_dir, ModelConfig& config_to_populate);
192
193 private:
    // Path passed to the constructor (file or directory).
194 std::string model_load_path_;
    // True when the model is split across multiple shard files.
195 bool is_sharded_ = false;
    // All known tensors, keyed by tensor name.
197 std::map<std::string, TensorInfo> tensors_;
    // Memory-mapped shards, keyed by shard key; unique_ptr because Shard is
    // move-only and owns the mapping.
198 std::map<std::string, std::unique_ptr<Shard>> loaded_shards_;
200 // If sharded via an index file, this maps tensor names directly to their shard key.
201 // If not sharded or sharded by pattern, this might be populated differently or less used.
202 std::map<std::string, std::string> tensor_name_to_shard_key_map_;
203
    // Load tensors from a directory, handling index files and multiple shards.
211 void load_from_directory(const std::string& directory_path);
212
    // Load a single .safetensors file as a shard.
220 void load_single_file(const std::string& file_path, const std::string& shard_key_override = "");
221
    // Parse the metadata of a shard and populate tensor information.
229 void parse_shard_metadata(Shard& shard, const std::string& shard_key);
230
    // Convert raw tensor data to FP32 if needed.
240 std::vector<uint8_t> convert_tensor_data(const uint8_t* data, size_t size,
241 const std::string& dtype) const;
242
    // Get the Shard object for a given tensor name.
251 const Shard* get_shard_for_tensor(const std::string& tensor_name) const;
252};
253
261 public:
266 explicit ThreadPool(size_t num_threads);
267
271 ~ThreadPool();
272
281 template <class F, class... Args>
282 std::future<typename std::result_of<F(Args...)>::type> submit(F&& f,
283 Args&&... args);
284 private:
285 std::vector<std::thread> workers_;
286 std::queue<std::function<void()>> tasks_;
287 std::mutex queue_mutex_;
288 std::condition_variable condition_;
289 bool stop_ = false;
290};
291
292// Template implementation for ThreadPool::submit
293template <class F, class... Args>
294std::future<typename std::result_of<F(Args...)>::type> ThreadPool::submit(
295 F&& f, Args&&... args) {
296 using return_type = typename std::result_of<F(Args...)>::type;
297
298 auto task = std::make_shared<std::packaged_task<return_type()>>(
299 std::bind(std::forward<F>(f), std::forward<Args>(args)...));
300
301 std::future<return_type> res = task->get_future();
302 {
303 std::unique_lock<std::mutex> lock(queue_mutex_);
304 if (stop_) throw std::runtime_error("submit on stopped ThreadPool");
305 tasks_.emplace([task]() { (*task)(); });
306 }
307 condition_.notify_one();
308 return res;
309}
310
311#endif // SAFETENSORS_LOADER_H
Main class for loading tensors from SafeTensors-format files (single or sharded).
std::map< std::string, std::unique_ptr< Shard > > loaded_shards_
std::map< std::string, TensorInfo > tensors_
SafeTensorsLoader(const SafeTensorsLoader &)=delete
const Shard * get_shard_for_tensor(const std::string &tensor_name) const
Get the Shard object for a given tensor name.
void load_from_directory(const std::string &directory_path)
Load tensors from a directory, handling index files and multiple shards.
static bool load_model_config_from_json(const std::string &model_path_or_dir, ModelConfig &config_to_populate)
Loads model configuration from a JSON file corresponding to a .safetensors model path.
std::map< std::string, std::string > tensor_name_to_shard_key_map_
void load_single_file(const std::string &file_path, const std::string &shard_key_override="")
Load a single .safetensors file as a shard.
std::map< std::string, std::vector< uint8_t > > load_all_tensors_parallel() const
Load all tensors in parallel.
std::vector< std::string > tensor_names() const
Get a list of all tensor names available in the loaded model.
const TensorInfo & get_tensor_info(const std::string &name) const
Get information about a specific tensor.
std::vector< uint8_t > convert_tensor_data(const uint8_t *data, size_t size, const std::string &dtype) const
Convert raw tensor data to FP32 if needed.
std::vector< uint8_t > get_tensor_bytes(const std::string &name) const
Get the raw bytes for a tensor, converting to FP32 if needed.
~SafeTensorsLoader()
Destructor. Cleans up all memory-mapped shards.
SafeTensorsLoader & operator=(const SafeTensorsLoader &)=delete
void parse_shard_metadata(Shard &shard, const std::string &shard_key)
Parse the metadata of a shard and populate tensor information.
Thread pool for parallel tensor loading operations.
~ThreadPool()
Destructor that ensures proper cleanup of threads.
std::future< typename std::result_of< F(Args...)>::type > submit(F &&f, Args &&... args)
Submits a task to the thread pool.
std::queue< std::function< void()> > tasks_
std::vector< std::thread > workers_
std::condition_variable condition_
std::mutex queue_mutex_
Logging utilities for the TinyLlama implementation.
Model configuration structure holding architecture and hyperparameters.
Definition model.h:80
Information about a tensor stored in the SafeTensors file(s)
Represents a memory-mapped SafeTensors file (shard).
~Shard()
Destructor. Cleans up memory mapping and file handles.
uint64_t metadata_size
Size of the metadata block in bytes.
const uint8_t * tensor_data_block_ptr
Pointer to the start of the tensor data block.
void * mapped_data
Pointer to the memory-mapped data.
std::string file_path
Path to the shard file.
const uint8_t * metadata_ptr
Pointer to the start of the metadata block.
Shard & operator=(Shard &&other) noexcept
Move assignment operator.
size_t file_size
Size of the mapped file in bytes.
const uint8_t * get_tensor_raw_data(size_t local_offset, size_t n_bytes) const
Get a pointer to the raw tensor data within this shard.