
Commit d29e769

llama : unified KV cache + batch inference API

1 parent fad5693

10 files changed: +315, -236 lines

common/common.cpp

Lines changed: 1 addition & 5 deletions

@@ -436,8 +436,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--numa") {
             params.numa = true;
-        } else if (arg == "--export") {
-            params.export_cgraph = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -685,7 +683,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    printf(" --export export the computation graph to 'llama.ggml'\n");
     printf(" --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
@@ -782,7 +779,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     {
         LOG("warming up the model with an empty run\n");

-        const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+        std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
         llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
         llama_reset_timings(lctx);
     }
@@ -1182,7 +1179,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
     fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
     dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());

common/common.h

Lines changed: 0 additions & 1 deletion

@@ -111,7 +111,6 @@ struct gpt_params {
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool numa = false; // attempt optimizations that help on some NUMA systems
-    bool export_cgraph = false; // export the computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
 };


examples/beam-search/beam-search.cpp

Lines changed: 2 additions & 1 deletion

@@ -158,7 +158,8 @@ int main(int argc, char ** argv)
     }
     std::cout << std::flush;

-    int n_past = llama_get_kv_cache_token_count(ctx);
+    int n_past = 0;
+
     if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
     {
         fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
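
With the unified KV cache, the examples stop reading the current position back from the context via llama_get_kv_cache_token_count() and keep n_past on the caller side instead. A minimal sketch of that pattern, assuming the same llama_eval signature used in the diffs above (error handling trimmed):

    // prompt evaluation starts at position 0; the caller owns the position counter
    int n_past = 0;

    if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads)) {
        fprintf(stderr, "%s : failed to eval prompt.\n", __func__);
        return 1;
    }

    // advance the caller-side position by the number of tokens just evaluated
    n_past += (int) tokens_list.size();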

examples/main/main.cpp

Lines changed: 0 additions & 9 deletions

@@ -198,15 +198,6 @@ int main(int argc, char ** argv) {
             params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }

-    // export the cgraph and exit
-    if (params.export_cgraph) {
-        llama_eval_export(ctx, "llama.ggml");
-        llama_free(ctx);
-        llama_free_model(model);
-
-        return 0;
-    }
-
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 1 deletion

@@ -400,7 +400,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
     return {tokens, ppl, logit_history, prob_history};
 }

-std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int> & tokens, int n_past, int n_batch,
                                              int n_vocab, int n_thread) {
     std::vector<float> result;
     result.reserve(tokens.size() * n_vocab);

examples/simple/simple.cpp

Lines changed: 4 additions & 2 deletions

@@ -73,10 +73,12 @@ int main(int argc, char ** argv) {

     const int n_gen = std::min(32, max_context_size);

-    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
+    int n_cur = 0;
+
+    while (n_cur < n_gen) {
         // evaluate the transformer

-        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), n_cur, params.n_threads)) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }
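
The same caller-tracked position drives the generation loop in simple.cpp: n_cur starts at 0 and is advanced by the number of tokens handed to each llama_eval call rather than being read back from the KV cache. A hedged sketch of the loop shape, with sampling elided (n_gen and tokens_list as in the example above):

    int n_cur = 0;

    while (n_cur < n_gen) {
        // evaluate the pending tokens at the current position
        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), n_cur, params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        // the evaluated tokens are now in the KV cache; advance the position
        n_cur += (int) tokens_list.size();

        // ... sample the next token here and replace tokens_list with it ...
    }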

ggml.c

Lines changed: 2 additions & 4 deletions

@@ -12462,13 +12462,11 @@ static void ggml_compute_forward_alibi_f16(
         return;
     }

-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

-    assert(n_past >= 0);
-
     const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
     const int ne1 = src0->ne[1]; // seq_len_without_past
     const int ne2 = src0->ne[2]; // n_head -> this is k
@@ -12483,7 +12481,7 @@ static void ggml_compute_forward_alibi_f16(
     //const int nb3 = src0->nb[3];

     GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
+    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
     GGML_ASSERT(n_head == ne2);

     // add alibi to src0 (KQ_scaled)
