
Context sensitive help #3556


Closed
pudepiedj wants to merge 37 commits into ggerganov:master from pudepiedj:context-sensitive-help

Commits (37)
5a5a71d
Starting gpt4all
pudepiedj Oct 4, 2023
1bb192f
Add cmap_example.cpp
pudepiedj Oct 5, 2023
aea055d
Update cmap_example
pudepiedj Oct 5, 2023
eb939e0
Update cmap-example
pudepiedj Oct 5, 2023
e3c8f7b
Update cmap-example
pudepiedj Oct 5, 2023
73fa2de
Update cmap-example
pudepiedj Oct 5, 2023
9dad8b8
Update cmap-example
pudepiedj Oct 5, 2023
7804fe0
Update cmap-example
pudepiedj Oct 5, 2023
317d195
Update cmap-examples
pudepiedj Oct 5, 2023
380a10f
Update cmap-example
pudepiedj Oct 5, 2023
275d56e
Update cmap-example
pudepiedj Oct 5, 2023
297b7b6
Automation
pudepiedj Oct 5, 2023
739d6d3
Automatic helper dev
pudepiedj Oct 6, 2023
7a4dcff
Update contextual help dev
pudepiedj Oct 6, 2023
0d70518
Update contextual help
pudepiedj Oct 6, 2023
9c5d6f0
Update helper dev
pudepiedj Oct 7, 2023
982c908
Update contextual help
pudepiedj Oct 8, 2023
32bdf0e
Final reconciliation
pudepiedj Oct 9, 2023
2e17fcf
Comment in common.cpp
pudepiedj Oct 9, 2023
3e4de67
Update find_implemented_args.py
pudepiedj Oct 9, 2023
990e8cb
New comment
pudepiedj Oct 9, 2023
f6e92a8
Merge branch 'master' into context-sensitive-help
pudepiedj Oct 9, 2023
51446bf
Naming convention
pudepiedj Oct 9, 2023
3f07ed9
Added prompt-file to hep
pudepiedj Oct 9, 2023
49244be
Merge branch 'ggerganov:master' into context-sensitive-help
pudepiedj Oct 9, 2023
094d6d6
Add help list
pudepiedj Oct 9, 2023
7636c34
Merge branch 'context-sensitive-help' of https://github.com/pudepiedj…
pudepiedj Oct 9, 2023
9abc925
Remove trailing ws
pudepiedj Oct 9, 2023
6189a9e
One more trailing ws
pudepiedj Oct 9, 2023
53dbefa
Added final newline (2)
pudepiedj Oct 9, 2023
ba32402
creadcommonh
pudepiedj Oct 9, 2023
47675b9
experiment in C
pudepiedj Oct 9, 2023
bddd099
experimental
pudepiedj Oct 9, 2023
301f13c
Compile new cmap-example
pudepiedj Oct 9, 2023
759973b
printf format
pudepiedj Oct 9, 2023
37050ba
Updated cmap-example
pudepiedj Oct 10, 2023
37ae96c
Correct bug
pudepiedj Oct 10, 2023
3 changes: 3 additions & 0 deletions Makefile
@@ -569,6 +569,9 @@ perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

cmap-example: examples/cmap-example/cmap-example.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

19 changes: 17 additions & 2 deletions common/common.cpp
@@ -626,6 +626,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return true;
}

// Some items were missing from this list of help messages, so the wording needs checking (they were all inserted at the end, so they also need repositioning):
// --embedding, --beams, --ppl-stride, --ppl-output-type, --memory-f32, --no-mmap, --mlock, --use-color, --nprobs, --alias, --infill, --prompt-file
// some corresponding changes to the sequence of fprintf() calls may be needed
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf("usage: %s [options]\n", argv[0]);
printf("\n");
@@ -672,7 +675,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
printf(" -l T, --logit-bias T T = TOKEN_ID(plus/minus)BIAS\n");
printf(" modifies the likelihood of token appearing in the completion,\n");
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
@@ -687,7 +690,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --no-penalize-nl do not penalize newline token (default is DO penalise nl token)\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
@@ -734,6 +737,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --ppl-stride stride for ppl calcs. 0 (default): the pre-existing approach will be used.\n");
printf(" --ppl-output-type 0 (default): ppl output as usual, 1: ppl output num_tokens, one per line\n");
printf(" --embedding 0 (default): get only sentence embedding\n");
printf(" --beams N 0 (default): if non-zero use beam search of given width N.\n");
printf(" --memory-f32 0 (default): if true (= 1) disable f16 memory.\n");
printf(" --no-mmap 0 (default): if true use mmap for faster loads.\n");
printf(" --mlock 0 (default): if true keep model in memory.\n");
printf(" --use-color 0 (default): use color to distinguish generations from inputs\n");
printf(" --nprobs N if > 0 output the probabilities of the top N tokens\n");
printf(" --alias model alias (default: 'unknown')\n");
printf(" --infill 0 (defaut) use infill mode\n");
printf(" --prompt-file name of external prompt file\n");
printf("\n");
}

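As an aside on the reworded -l/--logit-bias entry above: the TOKEN_ID(+/-)BIAS syntax packs a token id and a signed bias into one argument. A minimal sketch of parsing it follows (illustrative only; the real parsing lives in gpt_params_parse, which is not part of this diff, and the parse_logit_bias helper below is hypothetical, not code from the PR):

#include <cstdlib>
#include <string>

// Parse "15043+1" or "15043-1" into a token id and a bias value;
// '+' raises the token's likelihood, '-' lowers it.
bool parse_logit_bias(const std::string & arg, int & token_id, float & bias) {
    size_t sep = arg.find_first_of("+-");
    if (sep == std::string::npos || sep == 0) {
        return false; // no sign found, or no token id before it
    }
    token_id = std::atoi(arg.substr(0, sep).c_str());
    bias     = std::strtof(arg.substr(sep).c_str(), nullptr); // keeps the sign
    return true;
}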
25 changes: 13 additions & 12 deletions common/common.h
@@ -35,21 +35,21 @@ int32_t get_num_physical_cores();

struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads = get_num_physical_cores(); // user-defined or num of internal physical cores
int32_t n_threads_batch = -1; // num threads for batch proc (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_batch = 512; // batch size for prompt proc (>=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t n_gpu_layers = -1; // num layers stored in VRAM (-1 for default)
int32_t n_gpu_layers_draft = -1; // num layers stored in VRAM for draft model (-1 for default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_probs = 0; // if > 0, output probabilities of top n_probs tokens.
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
@@ -61,7 +61,7 @@ struct gpt_params {
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable, -1 = ctx size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
@@ -75,10 +75,11 @@ struct gpt_params {
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance

std::string help = ""; // universal help parameter
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string prompt = ""; // user-provided single prompt
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
@@ -90,11 +91,11 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter

int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
int ppl_stride = 0; // stride for ppl calcs. 0: the pre-existing approach will be used.
int ppl_output_type = 0; // 0: ppl output as usual, 1: ppl output = num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
bool hellaswag = false; // compute HellaSwag score from datafile given in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
@@ -109,7 +110,7 @@ struct gpt_params {
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool simple_io = false; // improves compat'y with subprocs and ltd consoles
bool cont_batching = false; // insert new sequences for decoding on-the-fly

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
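The new universal help field above is the hook for the context-sensitive behaviour. A minimal sketch of the idea, assuming a hypothetical lookup table and helper (neither is code from this PR; the PR's actual wiring is in gpt_params_parse and the example program below):

#include <cstdio>
#include <map>
#include <string>

// Hypothetical table: option flag -> one-line description, as harvested
// from the usage strings in common.cpp.
static const std::map<std::string, std::string> k_option_help = {
    { "--temp",     "temperature (default: 0.8)" },
    { "--mirostat", "use Mirostat sampling (0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)" },
};

// Print help for a single topic (e.g. params.help) instead of the full usage dump.
void print_contextual_help(const std::string & topic) {
    auto it = k_option_help.find(topic);
    if (it != k_option_help.end()) {
        printf("%s  %s\n", it->first.c_str(), it->second.c_str());
    } else {
        printf("unknown option '%s'; use --help for the full list\n", topic.c_str());
    }
}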
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -30,6 +30,7 @@ else()
add_subdirectory(embd-input)
add_subdirectory(llama-bench)
add_subdirectory(beam-search)
add_subdirectory(cmap-example)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
5 changes: 5 additions & 0 deletions examples/cmap-example/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET cmap-example)
add_executable(${TARGET} cmap-example.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
124 changes: 124 additions & 0 deletions examples/cmap-example/cmap-example.cpp
@@ -0,0 +1,124 @@
// example of a C/C++ equivalent data structure to the python dict in readcommonh.py

#include <map>
#include <list>
#include <string>
#include <bitset>
#include <vector>
#include <cstdio>
#include <cmath>
#include <fstream>
#include <sstream>
#include <regex>
// there may be good reasons not to sort the parameters, but here we use std::map
// (already included above), which keeps its keys in sorted order
#include <numeric>

std::vector<std::string> split_string(const std::string& str, const std::string& delimiter) {
std::vector<std::string> tokens;
std::size_t start = 0, end = 0;
bool inside_tags = false; // flag to track if we are inside "<>"

while ((end = str.find(delimiter, start)) != std::string::npos) {
std::string token = str.substr(start, end - start);

if (!inside_tags && !token.empty()) { // exclude empty substrings and anything inside "<>"
tokens.push_back(token);
}
// deal with cases where the split character occurs inside <>
// Update inside_tags flag based on "<>"
size_t open_tag_pos = str.find("<", start);
size_t close_tag_pos = str.find(">", start);
if (open_tag_pos != std::string::npos && close_tag_pos != std::string::npos && open_tag_pos < end) {
inside_tags = true;
} else if (close_tag_pos != std::string::npos && close_tag_pos < end) {
inside_tags = false;
}
start = end + delimiter.length();
}
tokens.push_back(str.substr(start));
return tokens;
}
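// Quick illustration of split_string (hypothetical input, not from a real run):
//   split_string("int32_t n_ctx = 512; // context size", " ")
//   -> {"int32_t", "n_ctx", "=", "512;", "//", "context", "size"}
// The !token.empty() test above is what collapses the runs of spaces used
// to align the trailing comments in common.h.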

void print_parameters(const std::map<std::string, std::vector<std::string>>& parameters) {
for (const auto& pair : parameters) {
const std::string& key = pair.first;
const std::vector<std::string>& value = pair.second; // usually has multiple elements
printf("key: %25s: values: ", key.c_str());
for (const std::string& element : value) {
printf("%s ", element.c_str());
}
printf("\n");
}
}
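// Illustrative output shape for one entry (assuming the n_batch line from
// common.h above; not captured from a real run):
//   key:                   n_batch: values: int32_t n_batch = 512; // batch size for prompt proc (>=32 to use BLAS)
// The %25s format right-aligns each key in a 25-character field.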

std::map<std::string, std::vector<std::string>> extract_parameters() {
std::ifstream file("common/common.h");
std::string line;
std::vector<std::string> lines;
while (std::getline(file, line)) {
lines.push_back(line);
}

std::map<std::string, std::vector<std::string>> parameters;
// logit_bias and lora_adapter fail to parse cleanly because their template types contain spaces;
// the spurious entries they create are removed below

// are we inside gpt_params?
// this for loop finds all the params inside struct gpt-params
bool inside = false;
for (const std::string& line : lines) {
std::vector<std::string> nws_elements = split_string(line, " ");
printf("nwe = ");
for (const std::string& element : nws_elements) {
printf("%s ", element.c_str());
}
printf("\n");

if (nws_elements.size() > 1 && nws_elements[0] == "struct" && nws_elements[1] == "gpt_params") {
inside = true;
}

if (nws_elements.size() > 2 && inside) {
// cannot use nws_elements[0] (the type) as the key because types are not unique, so later entries would overwrite earlier ones;
// the full token list (including the name) is kept as the value so individual entries can be adjusted later
parameters[nws_elements[1]] = nws_elements; // the assignment copies the vector

// Remove spurious entry caused by eccentric status of logit_bias
if (parameters.count("float>") && parameters["float>"][2] == "logit_bias;") {
parameters.erase("float>");
}
// Remove spurious entry caused by eccentric status of lora_adapter
if (parameters.count("float>>") && parameters["float>>"][2] == "lora_adapter;") {
parameters.erase("float>>");
}
}

// Terminate the harvest; TODO: not robust; need better terminator; this just a crude hack for now
if (nws_elements.size() > 2 && nws_elements[2] == "infill") {
inside = false;
break;
}
}
// now display them (unnecessary operationally; here for development)
print_parameters(parameters);

// return the results (will eventually become a void function)
return parameters;
}

int main() {

// replicate the parameter harvest done by readcommonh.py;
// extract_parameters() already prints the collected map (a development aid),
// so there is no need to print it a second time here
std::map<std::string, std::vector<std::string>> parameters = extract_parameters();
(void) parameters; // silence unused-variable warnings until the map is used

return 0;
}
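With the Makefile rule and the CMake target added above, the example builds from the repository root as `make cmap-example` (or through the usual CMake build) and runs as `./cmap-example`. Note that the path "common/common.h" is hard-coded in extract_parameters(), so the binary needs to be run from a directory where that relative path resolves.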