First version

kuvaus · kuvaus · commit 860bf079cfbe · 2023-04-26T13:34:33.000+03:00
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,13 @@
+# Folders
+build/
+tmp/
+
+# Visual Studio Code
+.vscode
+
+# macOS
+.DS_Store
+
 # Prerequisites
 *.d
 
@@ -30,3 +40,4 @@
 *.exe
 *.out
 *.app
+
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "llama.cpp"]
+	path = llama.cpp
+	url = https://github.com/ggerganov/llama.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,63 @@
+# CMake 3.12 is the first release that accepts CMAKE_CXX_STANDARD 20, which
+# this file sets further down. A minimum below 3.5 is additionally rejected
+# outright by CMake 4.x, so the old "3.0" floor could not be configured there.
+cmake_minimum_required (VERSION 3.12)
+
+if(APPLE)
+  # Build a Universal binary on macOS.
+  # Set before project() so the value is in place when the compilers are probed.
+  set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
+endif()
+
+project(llama-chat VERSION 0.1.0)
+
+# Write compile_commands.json into the build tree for editors and tooling.
+set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
+# Collect every built executable in <build>/bin.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+# Installed binaries search <prefix>/lib for their shared libraries.
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+
+
+# options
+# CPU instruction-set switches. Only AVX defaults to ON; every other switch
+# defaults to OFF. They are declared before add_subdirectory(llama.cpp);
+# presumably the llama.cpp build reads the same names - TODO confirm against
+# the submodule's CMakeLists.txt.
+option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
+option(LLAMA_AVX2                   "llama: enable AVX2"                                    OFF)
+option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
+option(LLAMA_AVX512_VBMI            "llama: enable AVX512-VBMI"                             OFF)
+option(LLAMA_AVX512_VNNI            "llama: enable AVX512-VNNI"                             OFF)
+option(LLAMA_FMA                    "llama: enable FMA"                                     OFF)
+
+
+# sanitizers
+#set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE)
+# Always build the libraries of this project as shared libraries.
+# FORCE is only meaningful together with CACHE: the plain form
+# set(BUILD_SHARED_LIBS ON FORCE) stores the two-element list "ON;FORCE" in a
+# normal variable. Writing a real cache entry keeps the value a proper boolean
+# and makes it visible to any option(BUILD_SHARED_LIBS ...) in a subproject.
+set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared libraries" FORCE)
+
+
+# Sanitizer switches. All default to OFF; enable one at configure time with
+# e.g. -DGGML_SANITIZE_ADDRESS=ON. Declaring them as options makes them show
+# up in the cache (cmake-gui / ccmake) instead of being silent, undeclared
+# variables.
+option(GGML_SANITIZE_THREAD    "llama-chat: enable thread sanitizer"    OFF)
+option(GGML_SANITIZE_ADDRESS   "llama-chat: enable address sanitizer"   OFF)
+option(GGML_SANITIZE_UNDEFINED "llama-chat: enable undefined sanitizer" OFF)
+
+# The flags are appended to the global C and C++ flags, so they apply to every
+# target configured after this point.
+if (GGML_SANITIZE_THREAD)
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -fsanitize=thread")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+endif()
+
+if (GGML_SANITIZE_ADDRESS)
+    # Keep frame pointers so the sanitizer reports readable stack traces.
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+endif()
+
+if (GGML_SANITIZE_UNDEFINED)
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -fsanitize=undefined")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+endif()
+
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
+
+# dependencies
+
+# Default language standards for the targets created below.
+set(CMAKE_C_STANDARD   17)
+set(CMAKE_CXX_STANDARD 20)
+
+# Fails the configure step when no thread library is found. The targets
+# visible in this project do not link Threads::Threads themselves.
+find_package(Threads REQUIRED)
+
+# main
+
+# Default to a Release build, but only for single-config generators and only
+# when the user did not pick a build type.
+if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    # Offer the usual choices in cmake-gui / ccmake.
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
+endif ()
+
+# The llama.cpp submodule is configured first, then the chat sources in src/.
+add_subdirectory(llama.cpp)
+add_subdirectory(src)
diff --git a/llama.cpp b/llama.cpp
@@ -0,0 +1 @@
+Subproject commit 4afcc378698e057fcde64e23eb664e5af8dd6956
diff --git a/llm/llamamodel.cpp b/llm/llamamodel.cpp
@@ -0,0 +1,162 @@
+#include "llamamodel.h"
+
+#include "../llama.cpp/examples/common.h"
+#include "../llama.cpp/llama.h"
+#include "../llama.cpp/ggml.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <unistd.h>
+
+// Private implementation state of LLamaModel, reached through LLamaModel::d_ptr.
+struct LLamaPrivate {
+    // Never assigned in this file; stays an empty string.
+    const std::string modelPath;
+    // No default here: the LLamaModel constructor sets it to false and
+    // loadModel() sets it to true after a successful load.
+    bool modelLoaded;
+    // Context returned by llama_init_from_file(); nullptr until a model is loaded.
+    llama_context *ctx = nullptr;
+    // Context parameters, filled in by loadModel() before the context is created.
+    llama_context_params params;
+    // Thread count handed to llama_eval().
+    int64_t n_threads = 0;
+};
+
+// Allocates the private state object and marks the model as not loaded.
+LLamaModel::LLamaModel()
+    : d_ptr(nullptr)
+{
+    LLamaPrivate *state = new LLamaPrivate;
+    state->modelLoaded = false;
+    d_ptr = state;
+}
+
+// Stream-based loading is not implemented for llama models: both arguments
+// are ignored, an error is written to stderr and false is returned.
+bool LLamaModel::loadModel(const std::string & /*modelPath*/, std::istream & /*fin*/)
+{
+    static const char *const message = "LLAMA ERROR: loading llama model from stream unsupported!\n";
+    std::cerr << message;
+    return false;
+}
+
+// Loads the llama model stored at `modelPath` and prepares it for prompt().
+//
+// The context is created with a fixed window of 2048 tokens and with mmap and
+// mlock both enabled; the remaining parameters come from the gpt_params
+// defaults. Returns true on success. On failure an error is written to stderr,
+// false is returned and the model is reported as not loaded.
+bool LLamaModel::loadModel(const std::string &modelPath)
+{
+    // Release a context left over from an earlier successful load so that a
+    // second loadModel() call does not leak it, and never report "loaded"
+    // while no valid context exists.
+    if (d_ptr->ctx) {
+        llama_free(d_ptr->ctx);
+        d_ptr->ctx = nullptr;
+    }
+    d_ptr->modelLoaded = false;
+
+    // load the model
+    d_ptr->params = llama_context_default_params();
+
+    gpt_params params;
+    d_ptr->params.n_ctx      = 2048;
+    d_ptr->params.n_parts    = params.n_parts;
+    d_ptr->params.seed       = params.seed;
+    d_ptr->params.f16_kv     = params.memory_f16;
+    d_ptr->params.use_mmap   = true;//params.use_mmap;
+    d_ptr->params.use_mlock  = true;//params.use_mlock;
+
+    d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
+    if (!d_ptr->ctx) {
+        std::cerr << "LLAMA ERROR: failed to load model from " <<  modelPath << std::endl;
+        return false;
+    }
+
+    // Use at most 4 threads. hardware_concurrency() may return 0 when the
+    // value cannot be determined, so never go below one thread.
+    const int32_t hw_threads = (int32_t) std::thread::hardware_concurrency();
+    d_ptr->n_threads = std::max(1, std::min(4, hw_threads));
+    d_ptr->modelLoaded = true;
+    return true;
+}
+
+// Stores the number of threads that later llama_eval() calls will use.
+void LLamaModel::setThreadCount(int32_t n_threads)
+{
+    LLamaPrivate &state = *d_ptr;
+    state.n_threads = n_threads;
+}
+
+// Returns the stored thread count (kept as int64_t internally, narrowed to
+// int32_t for the caller).
+int32_t LLamaModel::threadCount()
+{
+    const int64_t stored = d_ptr->n_threads;
+    return static_cast<int32_t>(stored);
+}
+
+// Releases the llama context (when a model was loaded) and the private state
+// object allocated by the constructor.
+// NOTE(review): LLamaModel still has implicit copy operations; a copied
+// instance would share d_ptr and free it twice - confirm no caller copies it.
+LLamaModel::~LLamaModel()
+{
+    if (d_ptr) {
+        if (d_ptr->ctx)
+            llama_free(d_ptr->ctx);
+        delete d_ptr;
+        d_ptr = nullptr;
+    }
+}
+
+// Reports whether loadModel() has completed successfully.
+bool LLamaModel::isModelLoaded() const
+{
+    const LLamaPrivate *state = d_ptr;
+    return state->modelLoaded;
+}
+
+// Evaluates `prompt` with the loaded model and streams generated text back
+// through `response`.
+//
+//   prompt     text to evaluate; one space is prepended before tokenizing.
+//   response   callback. It is called with "" once per prompt token after the
+//              token's batch has been evaluated, and with the token text for
+//              every generated token. Returning false stops processing.
+//   promptCtx  conversation state; n_past advances by one per evaluated token.
+//   n_predict  maximum number of tokens to generate, clamped to the room left
+//              in the context window after the prompt.
+//   top_k, top_p, temp  forwarded to llama_sample_top_p_top_k().
+//   n_batch    prompt tokens per llama_eval() call; values below 1 are
+//              treated as 1 so that the batching loop always advances.
+//
+// Errors (model not loaded, prompt too long, evaluation failure) are reported
+// on stderr and end the call early.
+void LLamaModel::prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
+        PromptContext &promptCtx, int32_t n_predict, int32_t top_k, float top_p, float temp, int32_t n_batch) {
+
+    if (!isModelLoaded()) {
+        std::cerr << "LLAMA ERROR: prompt won't work with an unloaded model!\n";
+        return;
+    }
+
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    const std::string text = " " + prompt;
+
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(d_ptr->ctx, text, false);
+    const int n_ctx = llama_n_ctx(d_ptr->ctx);
+    const int n_prompt = (int) embd_inp.size();
+
+    if (n_prompt > n_ctx - 4) {
+        std::cerr << "LLAMA ERROR: prompt is too long\n";
+        return;
+    }
+
+    n_predict = std::min(n_predict, n_ctx - n_prompt);
+    promptCtx.n_past = std::min(promptCtx.n_past, n_ctx);
+
+    // A batch size of 0 would leave the loop below spinning on an empty batch.
+    const size_t batch_size = n_batch < 1 ? 1 : (size_t) n_batch;
+
+    // process the prompt in batches
+    size_t pos = 0;
+    while (pos < embd_inp.size()) {
+        const size_t batch_end = std::min(pos + batch_size, embd_inp.size());
+        std::vector<llama_token> batch(embd_inp.begin() + pos, embd_inp.begin() + batch_end);
+        const int n_tokens = (int) batch.size();
+
+        // Check if the context has run out...
+        if (promptCtx.n_past + n_tokens > n_ctx) {
+            // FIXME: will produce gibberish after this
+            promptCtx.n_past = std::min(promptCtx.n_past, n_ctx - n_tokens);
+            std::cerr << "LLAMA WARNING: reached the end of the context window!\n";
+        }
+
+        if (llama_eval(d_ptr->ctx, batch.data(), n_tokens, promptCtx.n_past, d_ptr->n_threads)) {
+            std::cerr << "LLAMA ERROR: Failed to process prompt\n";
+            return;
+        }
+
+        // We pass a null string for each token to see if the user has asked us to stop...
+        for (int t = 0; t < n_tokens; ++t)
+            if (!response(""))
+                return;
+        promptCtx.n_past += n_tokens;
+        pos = batch_end;
+    }
+
+    // predict next tokens
+    for (int step = 0; step < n_predict; step++) {
+        // sample next token
+        llama_token id = llama_sample_top_p_top_k(d_ptr->ctx, {}, 0, top_k, top_p, temp, 1.0f);
+
+        // Check if the context has run out...
+        if (promptCtx.n_past + 1 > n_ctx) {
+            // FIXME: will produce gibberish after this
+            promptCtx.n_past = std::min(promptCtx.n_past, n_ctx - 1);
+            std::cerr << "LLAMA WARNING: reached the end of the context window!\n";
+        }
+
+        if (llama_eval(d_ptr->ctx, &id, 1, promptCtx.n_past, d_ptr->n_threads)) {
+            std::cerr << "LLAMA ERROR: Failed to predict next token\n";
+            return;
+        }
+
+        promptCtx.n_past += 1;
+        // display text; stop on end-of-stream or when the callback asks to stop
+        if (id == llama_token_eos() || !response(llama_token_to_str(d_ptr->ctx, id)))
+            return;
+    }
+}
diff --git a/llm/llamamodel.h b/llm/llamamodel.h
@@ -0,0 +1,28 @@
+#ifndef LLAMAMODEL_H
+#define LLAMAMODEL_H
+
+#include <string>
+#include <functional>
+#include <vector>
+#include "llmodel.h"
+
+// llamamodel.cpp defines LLamaPrivate as a struct; forward-declare it with
+// the same class-key to avoid mismatched-tag warnings.
+struct LLamaPrivate;
+
+// LLModel implementation that drives a model through the llama.cpp API.
+class LLamaModel : public LLModel {
+public:
+    LLamaModel();
+    ~LLamaModel();
+
+    // Loads the model file at modelPath; returns false on failure.
+    bool loadModel(const std::string &modelPath) override;
+    // Stream loading is not supported for llama models; always returns false.
+    bool loadModel(const std::string &modelPath, std::istream &fin) override;
+    // True once loadModel(path) has succeeded.
+    bool isModelLoaded() const override;
+    // Evaluates prompt and streams the output through response.
+    // NOTE(review): these defaults (top_k 50400, top_p 1.0, temp 0.0) differ
+    // from LLModel::prompt (40, 0.9, 0.9). Default arguments are chosen by the
+    // static type of the call expression, so a call through LLModel& uses the
+    // base-class values - confirm that this difference is intended.
+    void prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
+        PromptContext &ctx, int32_t n_predict = 200, int32_t top_k = 50400, float top_p = 1.0f,
+        float temp = 0.0f, int32_t n_batch = 9) override;
+    // Number of threads used for evaluation.
+    void setThreadCount(int32_t n_threads) override;
+    int32_t threadCount() override;
+
+private:
+    LLamaPrivate *d_ptr; // private implementation state, allocated in the constructor
+};
+
+#endif // LLAMAMODEL_H
diff --git a/llm/llmodel.h b/llm/llmodel.h
@@ -0,0 +1,27 @@
+#ifndef LLMODEL_H
+#define LLMODEL_H
+
+#include <string>
+#include <functional>
+#include <vector>
+
+// Abstract model interface; LLamaModel derives from it.
+class LLModel {
+public:
+    // Conversation state carried between successive prompt() calls.
+    struct PromptContext {
+        std::vector<float> logits;
+        int32_t n_past = 0; // number of tokens in past conversation
+    };
+
+    explicit LLModel() {}
+    virtual ~LLModel() = default;
+
+    // Model loading, either from a path or from an already opened stream.
+    virtual bool loadModel(const std::string &modelPath) = 0;
+    virtual bool loadModel(const std::string &modelPath, std::istream &fin) = 0;
+    virtual bool isModelLoaded() const = 0;
+
+    // Evaluates prompt and reports output through the response callback.
+    virtual void prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
+        PromptContext &ctx, int32_t n_predict = 200, int32_t top_k = 40, float top_p = 0.9f,
+        float temp = 0.9f, int32_t n_batch = 9) = 0;
+
+    // Thread-count hooks; the defaults ignore the setting and report one thread.
+    virtual void setThreadCount(int32_t /*n_threads*/) {}
+    virtual int32_t threadCount() { return 1; }
+};
+
+#endif // LLMODEL_H
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -0,0 +1,5 @@
+
+# The chat executable: the front-end sources in this directory, the LLamaModel
+# wrapper from ../llm and llama.cpp's example helper compiled in directly.
+add_executable(chat
+    chat.cpp
+    header.h
+    utils.h
+    parse_json.h
+    ../llm/llamamodel.h
+    ../llm/llamamodel.cpp
+    ../llama.cpp/examples/common.cpp
+    ../llm/llmodel.h
+)
+target_link_libraries(chat PRIVATE llama)
+
+
diff --git a/src/chat.cpp b/src/chat.cpp
diff --git a/src/header.h b/src/header.h
diff --git a/src/parse_json.h b/src/parse_json.h
diff --git a/src/utils.h b/src/utils.h

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[submodule "llama.cpp"]`
	`2`	`+ path = llama.cpp`
	`3`	`+ url = https://github.com/ggerganov/llama.cpp`