server: cancel prompt processing & non-streamed requests when connection closed #9679

Status: Closed · wants to merge 19 commits
7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -1879,6 +1879,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.slot_prompt_similarity = std::stof(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg(
{"--testing-sampler-delay-millis"}, "N",
format("for tests: delay in milliseconds to add to each sampling (default: %d)", params.testing_sampler_delay_millis),
[](gpt_params & params, int value) {
params.testing_sampler_delay_millis = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg(
{"--lora-init-without-apply"},
format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
2 changes: 2 additions & 0 deletions common/common.h
@@ -299,6 +299,8 @@ struct gpt_params {

float slot_prompt_similarity = 0.5f;

int testing_sampler_delay_millis = 0;

// batched-bench params
bool is_pp_shared = false;

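Note: the two hunks above only add the plumbing for a test-only knob. A minimal, self-contained sketch of how the delay is meant to be consumed (the actual call site is in the sampling loop shown in the `server.cpp` diff below; `sample_next_token` and `test_params` here are made-up stand-ins for the real server code):

```cpp
// Illustrative sketch only: an artificial per-token delay so that tests have
// time to disconnect while a request is still in flight. The parameter name
// matches the diff; everything else is a stand-in.
#include <chrono>
#include <cstdio>
#include <thread>

struct test_params {
    int testing_sampler_delay_millis = 0; // 0 = disabled (the default)
};

static int sample_next_token(const test_params & params) {
    if (params.testing_sampler_delay_millis > 0) {
        std::this_thread::sleep_for(
            std::chrono::milliseconds(params.testing_sampler_delay_millis));
    }
    return 42; // placeholder for the real sampler result
}

int main() {
    test_params params;
    params.testing_sampler_delay_millis = 300; // e.g. --testing-sampler-delay-millis 300
    std::printf("token = %d (after ~300 ms)\n", sample_next_token(params));
    return 0;
}
```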
17 changes: 17 additions & 0 deletions examples/server/httplib.h
@@ -590,6 +590,7 @@ struct Response {
Headers headers;
std::string body;
std::string location; // Redirect location
std::function<bool()> is_alive;

bool has_header(const std::string &key) const;
std::string get_header_value(const std::string &key, size_t id = 0) const;
@@ -639,6 +640,7 @@ class Stream {

virtual bool is_readable() const = 0;
virtual bool is_writable() const = 0;
virtual bool is_alive() const = 0;

virtual ssize_t read(char *ptr, size_t size) = 0;
virtual ssize_t write(const char *ptr, size_t size) = 0;
@@ -2135,6 +2137,7 @@ class BufferStream final : public Stream {

bool is_readable() const override;
bool is_writable() const override;
bool is_alive() const override;
ssize_t read(char *ptr, size_t size) override;
ssize_t write(const char *ptr, size_t size) override;
void get_remote_ip_and_port(std::string &ip, int &port) const override;
@@ -2945,6 +2948,7 @@ class SocketStream final : public Stream {

bool is_readable() const override;
bool is_writable() const override;
bool is_alive() const override;
ssize_t read(char *ptr, size_t size) override;
ssize_t write(const char *ptr, size_t size) override;
void get_remote_ip_and_port(std::string &ip, int &port) const override;
@@ -2975,6 +2979,7 @@ class SSLSocketStream final : public Stream {

bool is_readable() const override;
bool is_writable() const override;
bool is_alive() const override;
ssize_t read(char *ptr, size_t size) override;
ssize_t write(const char *ptr, size_t size) override;
void get_remote_ip_and_port(std::string &ip, int &port) const override;
@@ -4279,6 +4284,7 @@ inline bool redirect(T &cli, Request &req, Response &res,
}

Response new_res;
new_res.is_alive = res.is_alive;

auto ret = cli.send(new_req, new_res, error);
if (ret) {
@@ -5484,6 +5490,10 @@ inline bool SocketStream::is_writable() const {
is_socket_alive(sock_);
}

inline bool SocketStream::is_alive() const {
return is_socket_alive(sock_);
}

inline ssize_t SocketStream::read(char *ptr, size_t size) {
#ifdef _WIN32
size =
@@ -5558,6 +5568,8 @@ inline bool BufferStream::is_readable() const { return true; }

inline bool BufferStream::is_writable() const { return true; }

inline bool BufferStream::is_alive() const { return true; }

inline ssize_t BufferStream::read(char *ptr, size_t size) {
#if defined(_MSC_VER) && _MSC_VER < 1910
auto len_read = buffer._Copy_s(ptr, size, size, position);
@@ -6634,6 +6646,7 @@ Server::process_request(Stream &strm, bool close_connection,
Request req;

Response res;
res.is_alive = [&strm]() { return strm.is_alive(); };
res.version = "HTTP/1.1";
res.headers = default_headers_;

@@ -8348,6 +8361,10 @@ inline bool SSLSocketStream::is_writable() const {
is_socket_alive(sock_);
}

inline bool SSLSocketStream::is_alive() const {
return is_socket_alive(sock_);
}

inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
if (SSL_pending(ssl_) > 0) {
return SSL_read(ssl_, ptr, static_cast<int>(size));
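Note: reduced to its essentials, the `httplib.h` change gives every `Stream` an `is_alive()` probe (backed by `is_socket_alive(sock_)` for plain and SSL sockets, always true for `BufferStream`) and copies it into the `Response` as a `std::function`, so whatever holds the response can poll whether the client is still connected. A compact sketch with stand-in types (only the `is_alive` member and the lambda wiring come from the diff; `FakeStream`/`FakeResponse` are invented for illustration):

```cpp
// Sketch of the httplib.h plumbing with stand-in types: the stream exposes an
// is_alive() probe, the server copies it into the Response as a std::function,
// and a long-running handler can later poll it to detect a disconnect.
#include <functional>
#include <string>

struct FakeStream {                    // stand-in for httplib::Stream
    bool socket_alive = true;
    bool is_alive() const { return socket_alive; }
};

struct FakeResponse {                  // stand-in for httplib::Response
    std::string body;
    std::function<bool()> is_alive;    // empty until the server fills it in
};

// Mirrors what Server::process_request now does before invoking the handler.
static void process_request(FakeStream & strm, FakeResponse & res) {
    res.is_alive = [&strm]() { return strm.is_alive(); };
}

int main() {
    FakeStream strm;
    FakeResponse res;
    process_request(strm, res);

    strm.socket_alive = false;         // client hangs up
    // A handler that is still working can notice the disconnect and stop early.
    if (res.is_alive && !res.is_alive()) {
        res.body.clear();              // abandon the now pointless response
    }
    return 0;
}
```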
33 changes: 27 additions & 6 deletions examples/server/server.cpp
@@ -104,6 +104,7 @@ struct server_task {
json data;

server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
std::function<bool()> is_alive;

// utility function
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
@@ -173,7 +174,7 @@ struct server_slot {
std::vector<completion_token_output> generated_token_probs;

server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;

std::function<bool()> is_alive;
bool has_next_token = true;
bool truncated = false;
bool stopped_eos = false;
@@ -876,6 +877,7 @@ struct server_context {
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
auto default_sparams = params.sparams;
const auto & data = task.data;
slot.is_alive = task.is_alive;

if (data.count("__oaicompat") != 0) {
slot.oaicompat = true;
@@ -1117,6 +1119,13 @@
}

bool process_token(completion_token_output & result, server_slot & slot) {
if (slot.is_alive && !slot.is_alive()) {
slot.truncated = false;
slot.has_next_token = false;

SLT_DBG(slot, "stopped by client disconnection, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
return slot.has_next_token;
}
// remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
slot.sampled = result.tok;
@@ -1461,13 +1470,14 @@ struct server_context {
// Functions to create new task(s) and receive result(s)
//

std::vector<server_task> create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type) {
std::vector<server_task> create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type, const std::function<bool()> & is_alive) {
std::vector<server_task> tasks;
auto create_task = [&](json & task_data, bool replace_prompt, json prompt) {
server_task task;
task.id = queue_tasks.get_new_id();
task.cmpl_type = cmpl_type;
task.type = SERVER_TASK_TYPE_COMPLETION;
task.is_alive = is_alive;
if (replace_prompt) {
task.data = task_data;
task.data["prompt"] = std::move(prompt);
@@ -1866,6 +1876,13 @@ struct server_context {
system_prompt_update();
}

for (auto & slot : slots) {
if (slot.is_processing() && slot.is_alive && !slot.is_alive()) {
SLT_WRN(slot, "%s", "slot connection died\n");
slot.release();
}
}

// check if all slots are idle
{
bool all_idle = true;
@@ -2337,6 +2354,10 @@ struct server_context {
}

completion_token_output result;
if (params.testing_sampler_delay_millis > 0) {
SRV_DBG("sleeping for %dms before sampling (for tests!)\n", params.testing_sampler_delay_millis);
std::this_thread::sleep_for(std::chrono::milliseconds(params.testing_sampler_delay_millis));
}
const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);

gpt_sampler_accept(slot.smpl, id, true);
@@ -2893,7 +2914,7 @@ int main(int argc, char ** argv) {
return;
}

std::vector<server_task> tasks = ctx_server.create_tasks_cmpl(data, cmpl_type);
std::vector<server_task> tasks = ctx_server.create_tasks_cmpl(data, cmpl_type, res.is_alive);
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(tasks);

@@ -2956,7 +2977,7 @@ int main(int argc, char ** argv) {

json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);

std::vector<server_task> tasks = ctx_server.create_tasks_cmpl(data, SERVER_TASK_CMPL_TYPE_NORMAL);
std::vector<server_task> tasks = ctx_server.create_tasks_cmpl(data, SERVER_TASK_CMPL_TYPE_NORMAL, res.is_alive);
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(tasks);

@@ -3099,7 +3120,7 @@ int main(int argc, char ** argv) {
json responses = json::array();
bool error = false;
{
std::vector<server_task> tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_EMBEDDING);
std::vector<server_task> tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_EMBEDDING, res.is_alive);
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(tasks);

@@ -3176,7 +3197,7 @@ int main(int argc, char ** argv) {
json responses = json::array();
bool error = false;
{
std::vector<server_task> tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_RERANK);
std::vector<server_task> tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_RERANK, res.is_alive);
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(tasks);

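Note: the `server.cpp` changes thread the same callback from the HTTP layer into the task queue: `res.is_alive` is attached to each `server_task`, copied onto the slot when the task is launched, checked once per sampled token in `process_token`, and swept once per iteration of the server's slot-update loop so that slots still busy with prompt processing (or serving non-streamed requests) are released when the client disconnects. A condensed sketch of that flow with simplified stand-in types (`Slot`, `sweep_dead_connections`, and `main` are illustrative; only the `is_alive` checks and `release()` mirror the diff):

```cpp
// Condensed view of the cancellation path: the liveness callback originates
// from the HTTP layer (res.is_alive), rides along on the task into the slot,
// and is polled per sampled token and per slot-update pass.
#include <functional>

struct Slot {
    std::function<bool()> is_alive;    // copied from the task when the slot is launched
    bool has_next_token = true;
    bool processing     = true;
    void release() { processing = false; }
};

// Per-token check, as in process_token(): stop generating once the client is gone.
static bool process_token(Slot & slot) {
    if (slot.is_alive && !slot.is_alive()) {
        slot.has_next_token = false;   // ends this generation
    }
    return slot.has_next_token;
}

// Per-iteration sweep, as in the slot-update loop: free slots whose connection
// died, which also covers requests still stuck in prompt processing.
static void sweep_dead_connections(Slot * slots, int n) {
    for (int i = 0; i < n; ++i) {
        Slot & slot = slots[i];
        if (slot.processing && slot.is_alive && !slot.is_alive()) {
            slot.release();
        }
    }
}

int main() {
    bool connected = true;
    Slot slot;
    slot.is_alive = [&connected]() { return connected; };

    process_token(slot);               // still connected: keeps generating
    connected = false;                 // client disconnects
    Slot slots[] = { slot };
    sweep_dead_connections(slots, 1);  // slot is released on the next pass
    return 0;
}
```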
57 changes: 57 additions & 0 deletions examples/server/tests/features/cancel.feature
@@ -0,0 +1,57 @@
@llama.cpp
@server
Feature: Cancellation of llama.cpp server requests

Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
And BOS token is 1
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
And 256 KV cache size
And 32 as batch size
And 2 slots
And 64 server max tokens to predict
And prometheus compatible metrics exposed
And 300 milliseconds delay in sampler for testing
Then the server is starting
Then the server is healthy


Scenario Outline: Cancelling an OAI chat completion request frees up slot (streaming <enable_streaming>)
Given a model llama-2
And a user prompt Once upon a time
And a system prompt You tell lengthy stories
And 256 max tokens to predict
And 256 server max tokens to predict
And streaming is <enable_streaming>
And disconnect after 100 milliseconds
Given concurrent OAI completions requests
And wait for 700 milliseconds
Then all slots are idle

Examples: Prompts
| enable_streaming |
| disabled |
| enabled |


Scenario Outline: Cancelling a completion request frees up slot (streaming <enable_streaming>)
Given a model llama-2
Given a prompt Once upon a time
And 256 max tokens to predict
And 256 server max tokens to predict
And streaming is <enable_streaming>
And disconnect after 100 milliseconds
Given a completion request with no api error
And wait for 700 milliseconds
Then all slots are idle

Examples: Prompts
| enable_streaming |
| disabled |
| enabled |