diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee202062b..246f2875b93 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -78,7 +78,7 @@ extern "C" {
 
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+    GGML_API struct gguf_context * gguf_init_from_buffer(const void * buffer, size_t size, struct gguf_init_params params);
 
     GGML_API void gguf_free(struct gguf_context * ctx);
 
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 8cc4ef1cf44..6c033cfc314 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -217,13 +217,25 @@ struct gguf_context {
 };
 
 struct gguf_reader {
-    FILE * file;
+    FILE * file = nullptr;
+    const uint8_t * buffer = nullptr;
+    size_t buffer_size = 0;
+    mutable size_t buffer_pos = 0;
 
     gguf_reader(FILE * file) : file(file) {}
+    gguf_reader(const void * buffer, size_t size) : buffer((const uint8_t *)buffer), buffer_size(size) {}
 
     template <typename T>
     bool read(T & dst) const {
-        return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+        if (file) {
+            return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+        }
+        if (buffer_pos + sizeof(T) > buffer_size) {
+            return false;
+        }
+        memcpy(&dst, buffer + buffer_pos, sizeof(T));
+        buffer_pos += sizeof(T);
+        return true;
     }
 
     template <typename T>
@@ -278,11 +290,27 @@ struct gguf_reader {
             return false;
         }
         dst.resize(size);
-        return fread(dst.data(), 1, dst.length(), file) == dst.length();
+        if (file) {
+            return fread(dst.data(), 1, dst.length(), file) == dst.length();
+        }
+        if (buffer_pos + dst.length() > buffer_size) {
+            return false;
+        }
+        memcpy(dst.data(), buffer + buffer_pos, dst.length());
+        buffer_pos += dst.length();
+        return true;
     }
 
     bool read(void * dst, const size_t size) const {
-        return fread(dst, 1, size, file) == size;
+        if (file) {
+            return fread(dst, 1, size, file) == size;
+        }
+        if (buffer_pos + size > buffer_size) {
+            return false;
+        }
+        memcpy(dst, buffer + buffer_pos, size);
+        buffer_pos += size;
+        return true;
     }
 };
 
@@ -316,8 +344,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector
     GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
 
     // we require the data section to be aligned, so take into account any padding
-    if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
-        GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
-        gguf_free(ctx);
-        return nullptr;
+    const size_t offset_curr    = gr.file ? ftell(gr.file) : gr.buffer_pos;
+    const size_t offset_aligned = GGML_PAD(offset_curr, ctx->alignment);
+
+    if (gr.file) {
+        if (fseek(gr.file, offset_aligned, SEEK_SET) != 0) {
+            GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+    } else {
+        if (offset_aligned > gr.buffer_size) {
+            GGML_LOG_ERROR("%s: buffer overflow when seeking to data section\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+        gr.buffer_pos = offset_aligned;
     }
 
     // store the current file offset - this is where the data section starts
-    ctx->offset = ftell(file);
+    ctx->offset = offset_aligned;
 
     // compute the total size of the data section, taking into account the alignment
     {
@@ -730,6 +769,16 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     return ctx;
 }
 
+struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
+    const struct gguf_reader gr(file);
+    return gguf_init_from_reader_impl(gr, params);
+}
+
+struct gguf_context * gguf_init_from_buffer(const void * buffer, size_t size, struct gguf_init_params params) {
+    const struct gguf_reader gr(buffer, size);
+    return gguf_init_from_reader_impl(gr, params);
+}
+
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
     FILE * file = ggml_fopen(fname, "rb");
 
diff --git a/include/llama.h b/include/llama.h
index b52eaacfa7e..627101407db 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -438,6 +438,11 @@ extern "C" {
             const char * path_model,
             struct llama_model_params params);
 
+    LLAMA_API struct llama_model * llama_model_load_from_buffer(
+            const void * buffer,
+            size_t size,
+            struct llama_model_params params);
+
     // Load the model from multiple splits (support custom naming scheme)
     // The paths must be in the correct order
     LLAMA_API struct llama_model * llama_model_load_from_splits(
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 47497cf953f..305d8f55109 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -169,7 +169,14 @@ struct llama_file::impl {
         seek(0, SEEK_SET);
     }
 
+    impl(const void * buffer, size_t size) : buffer(buffer), size(size) {
+        fp = NULL;
+    }
+
     size_t tell() const {
+        if (buffer) {
+            return buffer_pos;
+        }
 // TODO: this ifdef is never true?
 #ifdef _WIN32
         __int64 ret = _ftelli64(fp);
@@ -184,6 +191,16 @@ struct llama_file::impl {
     }
 
     void seek(size_t offset, int whence) const {
+        if (buffer) {
+            if (whence == SEEK_SET) {
+                buffer_pos = offset;
+            } else if (whence == SEEK_CUR) {
+                buffer_pos += offset;
+            } else if (whence == SEEK_END) {
+                buffer_pos = size + offset;
+            }
+            return;
+        }
 // TODO: this ifdef is never true?
 #ifdef _WIN32
         int ret = _fseeki64(fp, (__int64) offset, whence);
@@ -199,6 +216,14 @@ struct llama_file::impl {
         if (len == 0) {
             return;
         }
+        if (buffer) {
+            if (buffer_pos + len > size) {
+                throw std::runtime_error("unexpectedly reached end of buffer");
+            }
+            memcpy(ptr, (const char *)buffer + buffer_pos, len);
+            buffer_pos += len;
+            return;
+        }
         errno = 0;
         std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
@@ -219,6 +244,9 @@ struct llama_file::impl {
         if (len == 0) {
             return;
         }
+        if (buffer) {
+            throw std::runtime_error("write to buffer not supported");
+        }
         errno = 0;
         size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
@@ -239,13 +267,17 @@ struct llama_file::impl {
 
     FILE * fp;
     size_t size;
+    const void * buffer = nullptr;
+    mutable size_t buffer_pos = 0;
 };
 
 llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+llama_file::llama_file(const void * buffer, size_t size) : pimpl(std::make_unique<impl>(buffer, size)) {}
 llama_file::~llama_file() = default;
 
 size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }
+const void * llama_file::buffer() const { return pimpl->buffer; }
 
 int llama_file::file_id() const {
 #ifdef _WIN32
@@ -272,9 +304,15 @@ void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
 
 struct llama_mmap::impl {
 #ifdef _POSIX_MAPPED_FILES
     std::vector<std::pair<size_t, size_t>> mapped_fragments;
+    bool is_buffer = false;
 
     impl(struct llama_file * file, size_t prefetch, bool numa) {
         size = file->size();
+        if (file->buffer()) {
+            addr = const_cast<void *>(file->buffer());
+            is_buffer = true;
+            return;
+        }
         int fd = file->file_id();
         int flags = MAP_SHARED;
         if (numa) { prefetch = 0; }
@@ -319,6 +357,9 @@ struct llama_mmap::impl {
     }
 
     void unmap_fragment(size_t first, size_t last) {
+        if (is_buffer) {
+            return;
+        }
         int page_size = sysconf(_SC_PAGESIZE);
         align_range(&first, &last, page_size);
         size_t len = last - first;
@@ -355,6 +396,9 @@ struct llama_mmap::impl {
     }
 
     ~impl() {
+        if (is_buffer) {
+            return;
+        }
         for (const auto & frag : mapped_fragments) {
             if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
                 LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index 4e5aec3f440..acd9754278b 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -14,11 +14,14 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 struct llama_file {
     llama_file(const char * fname, const char * mode);
+    llama_file(const void * buffer, size_t size);
     ~llama_file();
 
     size_t tell() const;
     size_t size() const;
 
+    const void * buffer() const;
+
     int file_id() const; // fileno overload
 
     void seek(size_t offset, int whence) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index aa3a65f87a5..6aaec7771b0 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -468,6 +468,174 @@ namespace GGUFMeta {
 
 template bool llama_model_loader::get_key_or_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512> & result, uint32_t n, bool required);
 
+llama_model_loader::llama_model_loader(
+        const void * buffer,
+        size_t size,
+        bool use_mmap,
+        bool check_tensors,
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
+    int trace = 0;
+    if (getenv("LLAMA_TRACE")) {
+        trace = atoi(getenv("LLAMA_TRACE"));
+    }
+
+    if (param_overrides_p != nullptr) {
+        for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
+            kv_overrides.insert({std::string(p->key), *p});
+        }
+    }
+
+    tensor_buft_overrides = param_tensor_buft_overrides_p;
+
+    // Load the main GGUF
+    struct ggml_context * ctx = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx,
+    };
+
+    meta.reset(gguf_init_from_buffer(buffer, size, params));
+    if (!meta) {
+        throw std::runtime_error(format("%s: failed to load model from buffer", __func__));
+    }
+
+    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+    llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
+    files.emplace_back(new llama_file(buffer, size));
+    contexts.emplace_back(ctx);
+
+    // Save tensors data offset
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string tensor_name = std::string(cur->name);
+        // make sure there is no duplicated tensor names
+        if (weights_map.find(tensor_name) != weights_map.end()) {
+            throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+        }
+        n_elements += ggml_nelements(cur);
+        n_bytes    += ggml_nbytes(cur);
+        weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
+    }
+
+    n_kv      = gguf_get_n_kv(meta.get());
+    n_tensors = weights_map.size();
+
+    fver = (enum llama_fver) gguf_get_version(meta.get());
+
+    LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from buffer (version %s)\n",
+            __func__, n_kv, n_tensors, llama_file_version_name(fver));
+
+    // determine file type based on the number of tensors for each quantization and print meta data
+    // TODO: make optional
+    {
+        std::map<enum ggml_type, uint32_t> n_type;
+
+        uint32_t n_type_max = 0;
+        enum ggml_type type_max = GGML_TYPE_F32;
+
+        for (const auto & it : weights_map) {
+            const llama_tensor_weight & w = it.second;
+            const ggml_tensor * tensor = w.tensor;
+
+            enum ggml_type type = tensor->type;
+
+            n_type[type]++;
+
+            if (n_type_max < n_type[type]) {
+                n_type_max = n_type[type];
+                type_max   = type;
+            }
+
+            if (trace > 0) {
+                const uint16_t sid = w.idx;
+                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
+                        sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
+                        ggml_nbytes(tensor)/1024.0f/1024.0f);
+            }
+        }
+
+        switch (type_max) {
+            case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
+            case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
+            case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
+            case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
+            case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
+            case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
+            case GGML_TYPE_Q5_1:    ftype = LLAMA_FTYPE_MOSTLY_Q5_1;    break;
+            case GGML_TYPE_Q8_0:    ftype = LLAMA_FTYPE_MOSTLY_Q8_0;    break;
+            case GGML_TYPE_Q2_K:    ftype = LLAMA_FTYPE_MOSTLY_Q2_K;    break;
+            case GGML_TYPE_Q3_K:    ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M;  break;
+            case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
+            case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
+            case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
+            case GGML_TYPE_TQ1_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ1_0;   break;
+            case GGML_TYPE_TQ2_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ2_0;   break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
+            case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
+            case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+            case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
+            case GGML_TYPE_IQ1_M:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;   break;
+            case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
+            case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
+            case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
+            default:
+                {
+                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                    ftype = LLAMA_FTYPE_ALL_F32;
+                } break;
+        }
+
+        // this is a way to mark that we have "guessed" the file type
+        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+        {
+            uint32_t ftype_val = 0;
+            if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
+                ftype = (llama_ftype) ftype_val;
+            }
+        }
+
+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
+        for (int i = 0; i < n_kv; i++) {
+            const char * name           = gguf_get_key(meta.get(), i);
+            const enum gguf_type type   = gguf_get_kv_type(meta.get(), i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(meta.get(), i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
+
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        }
+
+        // print type counts
+        for (auto & kv : n_type) {
+            if (kv.second == 0) {
+                continue;
+            }
+
+            LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+        }
+    }
+
+    if (!llama_mmap::SUPPORTED) {
+        LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
+        use_mmap = false;
+    }
+
+    this->use_mmap = use_mmap;
+    this->check_tensors = check_tensors;
+}
+
 llama_model_loader::llama_model_loader(
         const std::string & fname,
         std::vector<std::string> & splits,
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index c9189f6cb44..c2ee092e0a0 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -100,6 +100,14 @@ struct llama_model_loader {
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 
+    llama_model_loader(
+        const void * buffer,
+        size_t size,
+        bool use_mmap,
+        bool check_tensors,
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
+
     template <typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
     get_arr_n(const std::string & key, T & result, bool required = true);
diff --git a/src/llama.cpp b/src/llama.cpp
index ab2e9868af4..312b2cfea75 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -99,6 +99,59 @@ int64_t llama_time_us(void) {
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const void * buffer, size_t size, llama_model & model, llama_model_params & params) {
+    // loading time will be recalculated after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = 0;
+    time_meas tm(model.t_load_us);
+
+    model.t_start_us = tm.t_start_us;
+
+    try {
+        llama_model_loader ml(buffer, size, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
+
+        ml.print_info();
+
+        model.hparams.vocab_only = params.vocab_only;
+
+        try {
+            model.load_arch(ml);
+        } catch(const std::exception & e) {
throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + model.load_hparams(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + if (model.arch == LLM_ARCH_CLIP) { + throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead"); + } + try { + model.load_vocab(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); + } + + model.load_stats(ml); + model.print_info(); + + if (params.vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return 0; + } + + if (!model.load_tensors(ml)) { + return -2; + } + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); + return -1; + } + + return 0; +} + static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration @@ -292,6 +345,153 @@ static struct llama_model * llama_model_load_from_file_impl( return model; } +static struct llama_model * llama_model_load_from_buffer_impl( + const void * buffer, + size_t size, + struct llama_model_params params) { + ggml_time_init(); + + if (!params.vocab_only && ggml_backend_reg_count() == 0) { + LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__); + return nullptr; + } + + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + LLAMA_LOG_CONT("."); + if (percentage >= 100) { + LLAMA_LOG_CONT("\n"); + } + } + return true; + }; + } + + llama_model * model = new llama_model(params); + + // create list of devices to use with this model + if (params.devices) { + for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { + model->devices.push_back(*dev); + } + } else { + // default device selection + + // build list of available devices + std::vector gpus; + std::vector igpus; + std::vector rpc_servers; + + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + switch (ggml_backend_dev_type(dev)) { + case GGML_BACKEND_DEVICE_TYPE_CPU: + case GGML_BACKEND_DEVICE_TYPE_ACCEL: + // skip CPU backends since they are handled separately + break; + + case GGML_BACKEND_DEVICE_TYPE_GPU: { + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (ggml_backend_reg_name(reg) == std::string("RPC")) { + rpc_servers.push_back(dev); + } else { + // check if there is already a GPU with the same device id + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) { + ggml_backend_dev_props d_props; + ggml_backend_dev_get_props(d, &d_props); + if (props.device_id && d_props.device_id) { + return strcmp(props.device_id, d_props.device_id) == 0; + } + return false; + }); + + if (it != gpus.end()) { + LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - 
already using device %s (%s) with the same id\n", + __func__, + ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), + props.device_id ? props.device_id : "unknown id", + ggml_backend_dev_name(*it), ggml_backend_dev_description(*it)); + } else { + gpus.push_back(dev); + } + } + break; + } + + case GGML_BACKEND_DEVICE_TYPE_IGPU: + igpus.push_back(dev); + break; + } + } + + // add RPC servers at the front of the list to minimize network transfers + model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end()); + + // add GPUs + model->devices.insert(model->devices.end(), gpus.begin(), gpus.end()); + + // add integrated GPUs only if no other devices were found + if (model->devices.empty()) { + model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); + } + } + + // if using single GPU mode, remove all except the main GPU + if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { + if (params.main_gpu < 0) { + model->devices.clear(); + } else { + if (params.main_gpu >= (int)model->devices.size()) { + LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size()); + llama_model_free(model); + return nullptr; + } + ggml_backend_dev_t main_gpu = model->devices[params.main_gpu]; + model->devices.clear(); + model->devices.push_back(main_gpu); + } + } + + for (auto * dev : model->devices) { + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__, + ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), + props.device_id ? props.device_id : "unknown id", + props.memory_free/1024/1024); + } + + const int status = llama_model_load(buffer, size, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } + + llama_model_free(model); + return nullptr; + } + + return model; +} + +struct llama_model * llama_model_load_from_buffer( + const void * buffer, + size_t size, + struct llama_model_params params) { + return llama_model_load_from_buffer_impl(buffer, size, params); +} + // deprecated struct llama_model * llama_load_model_from_file( const char * path_model,
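
Usage sketch (not part of the diff): a minimal example of how the new buffer-based entry point could be exercised once this change is applied, assuming the whole GGUF file has already been read into memory. The "model.gguf" path, the plain C file I/O, and the error handling are illustrative only; llama_model_default_params(), llama_model_free() and ggml_backend_load_all() are existing llama.cpp/ggml APIs, while llama_model_load_from_buffer() is the function added by this diff.

#include <cstdint>
#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    // make sure at least one ggml backend is registered before loading;
    // the new loader bails out otherwise (see llama_model_load_from_buffer_impl)
    ggml_backend_load_all();

    // read the whole GGUF file into memory ("model.gguf" is a placeholder path)
    std::FILE * f = std::fopen("model.gguf", "rb");
    if (f == nullptr) {
        return 1;
    }
    std::fseek(f, 0, SEEK_END);
    const long file_size = std::ftell(f);
    std::fseek(f, 0, SEEK_SET);

    std::vector<uint8_t> buf(file_size > 0 ? (size_t) file_size : 0);
    const bool read_ok = std::fread(buf.data(), 1, buf.size(), f) == buf.size();
    std::fclose(f);
    if (!read_ok) {
        return 1;
    }

    // construct the model directly from the in-memory buffer
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_buffer(buf.data(), buf.size(), mparams);
    if (model == nullptr) {
        return 1;
    }

    // ... create a context, run inference, etc. ...

    llama_model_free(model);
    return 0;
}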