
Commit 860bf07 ("First version")
Parent: 9361750

12 files changed: +833 -0 lines

.gitignore (+11)

@@ -1,3 +1,13 @@
+# Folders
+build/
+tmp/
+
+# Visual Studio Code
+.vscode
+
+# MacOS
+.DS_Store
+
 # Prerequisites
 *.d
 
@@ -30,3 +40,4 @@
 *.exe
 *.out
 *.app
+

.gitmodules (new file, +3)

[submodule "llama.cpp"]
	path = llama.cpp
	url = https://github.com/ggerganov/llama.cpp

CMakeLists.txt (new file, +63)

cmake_minimum_required (VERSION 3.0)

if(APPLE)
    # Build a Universal binary on macOS
    set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
endif()

project(llama-chat VERSION 0.1.0)

set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")

# options
option(LLAMA_AVX         "llama: enable AVX"         ON)
option(LLAMA_AVX2        "llama: enable AVX2"        OFF)
option(LLAMA_AVX512      "llama: enable AVX512"      OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA         "llama: enable FMA"         OFF)

# sanitizers
#set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE)
set(BUILD_SHARED_LIBS ON FORCE)

if (GGML_SANITIZE_THREAD)
    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
endif()

if (GGML_SANITIZE_ADDRESS)
    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=address -fno-omit-frame-pointer")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
endif()

if (GGML_SANITIZE_UNDEFINED)
    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=undefined")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
endif()

#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")

# dependencies

set(CMAKE_C_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)

find_package(Threads REQUIRED)

# main

if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
endif ()

add_subdirectory(llama.cpp)
add_subdirectory(src)

llama.cpp (submodule added at 4afcc37)

llm/llamamodel.cpp (new file, +162)

#include "llamamodel.h"

#include "../llama.cpp/examples/common.h"
#include "../llama.cpp/llama.h"
#include "../llama.cpp/ggml.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
#include <random>
#include <thread>

struct LLamaPrivate {
    const std::string modelPath;
    bool modelLoaded;
    llama_context *ctx = nullptr;
    llama_context_params params;
    int64_t n_threads = 0;
};

LLamaModel::LLamaModel()
    : d_ptr(new LLamaPrivate) {

    d_ptr->modelLoaded = false;
}

bool LLamaModel::loadModel(const std::string &modelPath, std::istream &fin)
{
    std::cerr << "LLAMA ERROR: loading llama model from stream unsupported!\n";
    return false;
}

bool LLamaModel::loadModel(const std::string &modelPath)
{
    // load the model
    d_ptr->params = llama_context_default_params();

    gpt_params params;
    d_ptr->params.n_ctx     = 2048;
    d_ptr->params.n_parts   = params.n_parts;
    d_ptr->params.seed      = params.seed;
    d_ptr->params.f16_kv    = params.memory_f16;
    d_ptr->params.use_mmap  = true;//params.use_mmap;
    d_ptr->params.use_mlock = true;//params.use_mlock;

    d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
    if (!d_ptr->ctx) {
        std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
        return false;
    }

    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    d_ptr->modelLoaded = true;
    return true;
}

void LLamaModel::setThreadCount(int32_t n_threads) {
    d_ptr->n_threads = n_threads;
}

int32_t LLamaModel::threadCount() {
    return d_ptr->n_threads;
}

LLamaModel::~LLamaModel()
{
}

bool LLamaModel::isModelLoaded() const
{
    return d_ptr->modelLoaded;
}

void LLamaModel::prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
    PromptContext &promptCtx, int32_t n_predict, int32_t top_k, float top_p, float temp, int32_t n_batch) {

    if (!isModelLoaded()) {
        std::cerr << "LLAMA ERROR: prompt won't work with an unloaded model!\n";
        return;
    }

    gpt_params params;
    params.prompt = prompt;

    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');

    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false);
    const int n_ctx = llama_n_ctx(d_ptr->ctx);

    if ((int) embd_inp.size() > n_ctx - 4) {
        std::cerr << "LLAMA ERROR: prompt is too long\n";
        return;
    }

    n_predict = std::min(n_predict, n_ctx - (int) embd_inp.size());
    promptCtx.n_past = std::min(promptCtx.n_past, n_ctx);

    // number of tokens to keep when resetting context
    params.n_keep = (int)embd_inp.size();

    // process the prompt in batches
    size_t i = 0;
    const int64_t t_start_prompt_us = ggml_time_us();
    while (i < embd_inp.size()) {
        size_t batch_end = std::min(i + n_batch, embd_inp.size());
        std::vector<llama_token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);

        // Check if the context has run out...
        if (promptCtx.n_past + batch.size() > n_ctx) {
            // FIXME: will produce gibberish after this
            promptCtx.n_past = std::min(promptCtx.n_past, int(n_ctx - batch.size()));
            std::cerr << "LLAMA WARNING: reached the end of the context window!\n";
        }

        if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) {
            std::cerr << "LLAMA ERROR: Failed to process prompt\n";
            return;
        }

        // We pass a null string for each token to see if the user has asked us to stop...
        size_t tokens = batch_end - i;
        for (size_t t = 0; t < tokens; ++t)
            if (!response(""))
                return;
        promptCtx.n_past += batch.size();
        i = batch_end;
    }

    // predict next tokens
    int32_t totalPredictions = 0;
    for (int i = 0; i < n_predict; i++) {
        // sample next token
        llama_token id = llama_sample_top_p_top_k(d_ptr->ctx, {}, 0, top_k, top_p, temp, 1.0f);

        // Check if the context has run out...
        if (promptCtx.n_past + 1 > n_ctx) {
            // FIXME: will produce gibberish after this
            promptCtx.n_past = std::min(promptCtx.n_past, n_ctx - 1);
            std::cerr << "LLAMA WARNING: reached the end of the context window!\n";
        }

        if (llama_eval(d_ptr->ctx, &id, 1, promptCtx.n_past, d_ptr->n_threads)) {
            std::cerr << "LLAMA ERROR: Failed to predict next token\n";
            return;
        }

        promptCtx.n_past += 1;
        // display text
        ++totalPredictions;
        if (id == llama_token_eos() || !response(llama_token_to_str(d_ptr->ctx, id)))
            return;
    }
}
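For context, here is a minimal sketch (not part of this commit) of how a caller might drive LLamaModel::prompt(): load a ggml model file, then stream generated text through the response callback until generation finishes or the callback returns false. The model path, the prompt string, and the 500-piece stop limit below are hypothetical.

// Hypothetical caller exercising LLamaModel::prompt(); not part of this commit.
// <cstdint> and <iostream> are pulled in first so the int32_t and std::istream
// names used in the headers of this commit are fully declared.
#include <cstdint>
#include <iostream>
#include <string>

#include "llamamodel.h"

int main()
{
    LLamaModel model;

    // Placeholder path; any llama.cpp-compatible ggml model file would do.
    if (!model.loadModel("./models/ggml-model-q4_0.bin")) {
        std::cerr << "failed to load model\n";
        return 1;
    }

    LLModel::PromptContext ctx;
    int pieces = 0;

    // The callback receives each generated piece of text. It is also invoked with
    // empty strings while the prompt itself is being processed, and returning
    // false at any point stops generation early.
    auto onResponse = [&](const std::string &piece) -> bool {
        std::cout << piece << std::flush;
        return ++pieces < 500; // arbitrary safety stop
    };

    model.prompt("Why is the sky blue?", onResponse, ctx);
    std::cout << std::endl;
    return 0;
}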

llm/llamamodel.h (new file, +28)

#ifndef LLAMAMODEL_H
#define LLAMAMODEL_H

#include <string>
#include <functional>
#include <vector>
#include "llmodel.h"

class LLamaPrivate;
class LLamaModel : public LLModel {
public:
    LLamaModel();
    ~LLamaModel();

    bool loadModel(const std::string &modelPath) override;
    bool loadModel(const std::string &modelPath, std::istream &fin) override;
    bool isModelLoaded() const override;
    void prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
        PromptContext &ctx, int32_t n_predict = 200, int32_t top_k = 50400, float top_p = 1.0f,
        float temp = 0.0f, int32_t n_batch = 9) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() override;

private:
    LLamaPrivate *d_ptr;
};

#endif // LLAMAMODEL_H

llm/llmodel.h (new file, +27)

#ifndef LLMODEL_H
#define LLMODEL_H

#include <string>
#include <functional>
#include <vector>

class LLModel {
public:
    explicit LLModel() {}
    virtual ~LLModel() {}

    virtual bool loadModel(const std::string &modelPath) = 0;
    virtual bool loadModel(const std::string &modelPath, std::istream &fin) = 0;
    virtual bool isModelLoaded() const = 0;
    struct PromptContext {
        std::vector<float> logits;
        int32_t n_past = 0; // number of tokens in past conversation
    };
    virtual void prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
        PromptContext &ctx, int32_t n_predict = 200, int32_t top_k = 40, float top_p = 0.9f,
        float temp = 0.9f, int32_t n_batch = 9) = 0;
    virtual void setThreadCount(int32_t n_threads) {}
    virtual int32_t threadCount() { return 1; }
};

#endif // LLMODEL_H
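LLModel is the abstract interface the chat front end programs against, so adding another backend comes down to implementing these virtuals. Below is a purely illustrative stub (not part of this commit; the class name and its echo behaviour are invented) that shows the minimal surface a backend has to cover.

// Illustrative backend stub; not part of this commit.
// llmodel.h uses std::istream and int32_t, so declare those first.
#include <cstdint>
#include <istream>

#include "llmodel.h"

class EchoModel : public LLModel {
public:
    bool loadModel(const std::string &modelPath) override { m_loaded = true; return true; }
    bool loadModel(const std::string &modelPath, std::istream &fin) override { m_loaded = true; return true; }
    bool isModelLoaded() const override { return m_loaded; }

    // "Generates" by echoing the prompt back through the response callback.
    void prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
        PromptContext &ctx, int32_t n_predict = 200, int32_t top_k = 40, float top_p = 0.9f,
        float temp = 0.9f, int32_t n_batch = 9) override
    {
        if (!m_loaded)
            return;
        if (response(prompt))
            ctx.n_past += 1; // pretend a single token was consumed
    }

private:
    bool m_loaded = false;
};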

src/CMakeLists.txt (new file, +5)

add_executable(chat chat.cpp header.h utils.h parse_json.h ../llm/llamamodel.h ../llm/llamamodel.cpp ../llama.cpp/examples/common.cpp ../llm/llmodel.h)
target_link_libraries(chat PRIVATE llama)
