llama : refactor src/llama.cpp (ggerganov#10902)
* llama : scatter llama.cpp into multiple modules (wip)

* llama : control-vector -> adapter

* llama : arch

* llama : mmap

ggml-ci

* ci : remove BUILD_SHARED_LIBS=OFF

ggml-ci

* llama : arch (cont)

ggml-ci

* llama : chat

ggml-ci

* llama : model

ggml-ci

* llama : hparams

ggml-ci

* llama : adapter

ggml-ci

* examples : fix

ggml-ci

* rebase

ggml-ci

* minor

* llama : kv cache

ggml-ci

* llama : impl

ggml-ci

* llama : batch

ggml-ci

* cont

ggml-ci

* llama : context

ggml-ci

* minor

* llama : context (cont)

ggml-ci

* llama : model loader

ggml-ci

* common : update lora

ggml-ci

* llama : quant

ggml-ci

* llama : quant (cont)

ggml-ci

* minor [no ci]
ggerganov authored Jan 3, 2025
1 parent 2f0ee84 commit f66f582
Showing 61 changed files with 20,419 additions and 19,875 deletions.
28 changes: 13 additions & 15 deletions .github/workflows/build.yml
@@ -60,8 +60,7 @@ jobs:
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
- -DGGML_RPC=ON \
- -DBUILD_SHARED_LIBS=OFF
+ -DGGML_RPC=ON
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -123,8 +122,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
- -DGGML_RPC=ON \
- -DBUILD_SHARED_LIBS=OFF
+ -DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -181,7 +179,7 @@ jobs:
run: |
mkdir build
cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -651,23 +649,23 @@ jobs:
matrix:
include:
- build: 'noavx-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- build: 'avx2-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
- build: 'avx-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
- build: 'avx512-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
- build: 'openblas-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
- build: 'vulkan-x64'
- defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=O'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

@@ -914,7 +912,7 @@ jobs:
shell: cmd
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
- cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+ cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
4 changes: 2 additions & 2 deletions common/arg.cpp
@@ -1512,15 +1512,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--lora"}, "FNAME",
"path to LoRA adapter (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0 });
+ params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"--lora-scaled"}, "FNAME", "SCALE",
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale) });
+ params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
25 changes: 13 additions & 12 deletions common/common.cpp
@@ -922,20 +922,21 @@ struct common_init_result common_init_from_params(common_params & params) {

// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
- common_lora_adapter_container loaded_la;
- loaded_la.path = la.path;
- loaded_la.scale = la.scale;
- loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
- if (loaded_la.adapter == nullptr) {
+ llama_lora_adapter_ptr lora;
+ lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+ if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
return iparams;
}
- iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+ la.ptr = lora.get();
+ iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}

if (!params.lora_init_without_apply) {
- common_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, params.lora_adapters);
}

if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -996,17 +997,17 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_perf_context_reset(lctx);
}

- iparams.model = model;
- iparams.context = lctx;
+ iparams.model.reset(model);
+ iparams.context.reset(lctx);

return iparams;
}

- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
llama_lora_adapter_clear(ctx);
- for (auto & la : lora_adapters) {
+ for (auto & la : lora) {
if (la.scale != 0.0f) {
- llama_lora_adapter_set(ctx, la.adapter, la.scale);
+ llama_lora_adapter_set(ctx, la.ptr, la.scale);
}
}
}
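Read together with the common.h hunks that follow, the new LoRA flow is: callers describe adapters up front in params.lora_adapters (the trailing ptr field starts out as nullptr), common_init_from_params loads each adapter, keeps the owning llama_lora_adapter_ptr in iparams.lora and fills in la.ptr, and the same params.lora_adapters list can later be re-applied with different scales. A minimal sketch, assuming placeholder model/adapter paths and that the model path is set via params.model as in the existing examples:

```cpp
#include "common.h"
#include "llama.h"

int main() {
    llama_backend_init();

    common_params params;
    params.model = "models/base.gguf";                                 // placeholder path
    params.lora_adapters.push_back({ "style.gguf", 1.0f, nullptr });   // ptr is filled in by init
    params.lora_adapters.push_back({ "tone.gguf",  0.5f, nullptr });

    // loads the model, context and adapters; applies the adapters unless
    // params.lora_init_without_apply is set
    common_init_result init = common_init_from_params(params);
    llama_context * ctx = init.context.get();

    // later: disable one adapter and re-apply the whole list
    params.lora_adapters[1].scale = 0.0f;   // a scale of 0.0f means "skip this adapter"
    common_lora_adapters_apply(ctx, params.lora_adapters);

    llama_backend_free();
    return 0;
}
```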
26 changes: 15 additions & 11 deletions common/common.h
@@ -2,7 +2,7 @@

#pragma once

#include "llama.h"
#include "llama-cpp.h"

#include <string>
#include <vector>
@@ -27,10 +27,8 @@
struct common_lora_adapter_info {
std::string path;
float scale;
- };
-
- struct common_lora_adapter_container : common_lora_adapter_info {
- struct llama_lora_adapter * adapter;
+
+ struct llama_lora_adapter * ptr;
};

using llama_tokens = std::vector<llama_token>;
@@ -478,10 +476,12 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//

+ // note: defines object's lifetime
struct common_init_result {
- struct llama_model * model = nullptr;
- struct llama_context * context = nullptr;
- std::vector<common_lora_adapter_container> lora_adapters;
+ llama_model_ptr model;
+ llama_context_ptr context;
+
+ std::vector<llama_lora_adapter_ptr> lora;
};

struct common_init_result common_init_from_params(common_params & params);
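Because model and context are now owning smart pointers (the *_ptr types come from llama-cpp.h), the common_init_result object itself controls teardown; the example programs later in this diff switch to .get() for borrowed raw pointers and drop their manual llama_free()/llama_free_model() calls. A minimal sketch of the pattern (the wrapper function and error handling here are illustrative, not taken from the diff):

```cpp
#include "common.h"

static int run(common_params & params) {
    common_init_result llama_init = common_init_from_params(params);

    llama_model   * model = llama_init.model.get();   // borrowed, not owned
    llama_context * ctx   = llama_init.context.get(); // borrowed, not owned

    if (model == nullptr || ctx == nullptr) {
        return 1; // init failed
    }

    // ... use model and ctx ...

    return 0;
}   // llama_init goes out of scope here: adapters, context and model are
    // released automatically, in that order
```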
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

//
// Batch utils
@@ -640,6 +640,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
// Split utils
//

- static const char * const LLM_KV_SPLIT_NO = "split.no";
- static const char * const LLM_KV_SPLIT_COUNT = "split.count";
- static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }
10 changes: 5 additions & 5 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -434,12 +434,12 @@ static void print_matrix(struct ggml_tensor * probs) {
}
}

- struct llama_file {
+ struct my_llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;

- llama_file(const char * fname, const char * mode) {
+ my_llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
@@ -500,15 +500,15 @@ struct llama_file {
return std::string(chars.data(), len);
}

- ~llama_file() {
+ ~my_llama_file() {
if (fp) {
std::fclose(fp);
}
}
};

static bool is_ggml_file(const char * filename) {
- llama_file file(filename, "rb");
+ my_llama_file file(filename, "rb");
if (file.size < 4) {
return false;
}
@@ -576,7 +576,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
} else {
// assume llama2.c vocabulary
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
- llama_file file(filename, "rb");
+ my_llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
7 changes: 3 additions & 4 deletions examples/cvector-generator/cvector-generator.cpp
@@ -415,12 +415,13 @@ int main(int argc, char ** argv) {
// load the model to get hparams
common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();

// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model);
int n_embd = llama_n_embd(model);

// get model hint param (a.k.a model arch name)
char model_hint[128];
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@@ -474,8 +475,6 @@

// done with the model, we can now free it to make gain some memory
printf("Done evaluate prompts, unload model...\n");
- llama_free(ctx);
- llama_free_model(model);

bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

7 changes: 3 additions & 4 deletions examples/embedding/embedding.cpp
@@ -97,8 +97,9 @@ int main(int argc, char ** argv) {
// load the model
common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
@@ -316,8 +317,6 @@

// clean up
llama_batch_free(batch);
- llama_free(ctx);
- llama_free_model(model);
llama_backend_free();

return 0;
8 changes: 3 additions & 5 deletions examples/eval-callback/eval-callback.cpp
@@ -162,8 +162,9 @@ int main(int argc, char ** argv) {
// init
common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__);
return 1;
@@ -184,9 +185,6 @@
LOG("\n");
llama_perf_context_print(ctx);

- llama_free(ctx);
- llama_free_model(model);
-
llama_backend_free();

return 0;
7 changes: 3 additions & 4 deletions examples/gguf-split/gguf-split.cpp
@@ -2,15 +2,14 @@
#include "common.h"

#include <algorithm>
- #include <cmath>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>
-
- #include <stdio.h>
- #include <string.h>
#include <climits>
+
+ #include <cstdio>
+ #include <cstring>
#include <stdexcept>

#if defined(_WIN32)
11 changes: 5 additions & 6 deletions examples/imatrix/imatrix.cpp
@@ -430,9 +430,10 @@ static void process_logits(

static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx);

+ GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+
auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__);

@@ -618,8 +619,9 @@ int main(int argc, char ** argv) {
// init
common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__);
return 1;
@@ -655,9 +657,6 @@
LOG("\n");
llama_perf_context_print(ctx);

- llama_free(ctx);
- llama_free_model(model);
-
llama_backend_free();

return 0;
7 changes: 2 additions & 5 deletions examples/infill/infill.cpp
@@ -131,8 +131,8 @@ int main(int argc, char ** argv) {
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
common_init_result llama_init = common_init_from_params(params);

- model = llama_init.model;
- ctx = llama_init.context;
+ model = llama_init.model.get();
+ ctx = llama_init.context.get();

if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__);
@@ -581,9 +581,6 @@
LOG("\n");
common_perf_print(ctx, smpl);

- llama_free(ctx);
- llama_free_model(model);
-
common_sampler_free(smpl);
llama_backend_free();
