From c653c975847f9f6a81382033a9c8f5bd81bf70f2 Mon Sep 17 00:00:00 2001
From: aresnow1 <109642806+aresnow1@users.noreply.github.com>
Date: Thu, 22 Feb 2024 15:40:53 +0800
Subject: [PATCH] FEAT: Support gemma series model (#1024)

---
 doc/source/models/builtin/llm/gemma-it.rst    | 45 +++++++++++++++++++
 doc/source/models/builtin/llm/index.rst       |  7 +++
 .../models/builtin/llm/llama-2-chat.rst       |  2 +-
 .../models/builtin/llm/qwen1.5-chat.rst       | 38 ++++++++--------
 xinference/deploy/docker/Dockerfile           |  2 +-
 xinference/model/llm/llm_family.json          | 45 +++++++++++++++++++
 xinference/model/llm/utils.py                 |  9 ++++
 7 files changed, 127 insertions(+), 21 deletions(-)
 create mode 100644 doc/source/models/builtin/llm/gemma-it.rst

diff --git a/doc/source/models/builtin/llm/gemma-it.rst b/doc/source/models/builtin/llm/gemma-it.rst
new file mode 100644
index 0000000000..676e91b4a4
--- /dev/null
+++ b/doc/source/models/builtin/llm/gemma-it.rst
@@ -0,0 +1,45 @@
+.. _models_llm_gemma-it:
+
+========================================
+gemma-it
+========================================
+
+- **Context Length:** 8192
+- **Model Name:** gemma-it
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 2
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** google/gemma-2b-it
+- **Model Hubs**: `Hugging Face <https://huggingface.co/google/gemma-2b-it>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name gemma-it --size-in-billions 2 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** google/gemma-7b-it
+- **Model Hubs**: `Hugging Face <https://huggingface.co/google/gemma-7b-it>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name gemma-it --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 78bcf3ba8a..b9a881279a 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -96,6 +96,11 @@ The following is a list of built-in LLM in Xinference:
      - 2048
      - Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting.
 
+   * - :ref:`gemma-it <models_llm_gemma-it>`
+     - chat
+     - 8192
+     - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.
+
    * - :ref:`glaive-coder <models_llm_glaive-coder>`
      - chat
      - 100000
@@ -358,6 +363,8 @@ The following is a list of built-in LLM in Xinference:
 
    falcon-instruct
 
+   gemma-it
+
    glaive-coder
 
    gorilla-openfunctions-v1
diff --git a/doc/source/models/builtin/llm/llama-2-chat.rst b/doc/source/models/builtin/llm/llama-2-chat.rst
index 730509d47e..af6e2d914f 100644
--- a/doc/source/models/builtin/llm/llama-2-chat.rst
+++ b/doc/source/models/builtin/llm/llama-2-chat.rst
@@ -139,7 +139,7 @@ Model Spec 9 (ggufv2, 70 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 70
-- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0
+- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M
 - **Model ID:** TheBloke/Llama-2-70B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGUF>`__
 
diff --git a/doc/source/models/builtin/llm/qwen1.5-chat.rst b/doc/source/models/builtin/llm/qwen1.5-chat.rst
index 8ba5a44c1e..ae99e2ab89 100644
--- a/doc/source/models/builtin/llm/qwen1.5-chat.rst
+++ b/doc/source/models/builtin/llm/qwen1.5-chat.rst
@@ -284,49 +284,49 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format awq --quantization ${quantization}
 
-Model Spec 19 (ggufv2, 1_8 Billion)
+Model Spec 19 (ggufv2, 0_5 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** ggufv2
-- **Model Size (in billions):** 1_8
-- **Quantizations:** q8_0
+- **Model Size (in billions):** 0_5
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
 - **Model ID:** Qwen/Qwen1.5-0.5B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GGUF>`__
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format ggufv2 --quantization ${quantization}
+   xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format ggufv2 --quantization ${quantization}
 
 
-Model Spec 20 (ggufv2, 4 Billion)
+Model Spec 20 (ggufv2, 1_8 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** ggufv2
-- **Model Size (in billions):** 4
-- **Quantizations:** q8_0
-- **Model ID:** Qwen/Qwen1.5-4B-Chat-GGUF
-- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GGUF>`__
+- **Model Size (in billions):** 1_8
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-GGUF
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GGUF>`__
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format ggufv2 --quantization ${quantization}
+   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format ggufv2 --quantization ${quantization}
 
 
-Model Spec 21 (ggufv2, 7 Billion)
+Model Spec 21 (ggufv2, 4 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** ggufv2
-- **Model Size (in billions):** 7
-- **Quantizations:** q5_k_m
-- **Model ID:** Qwen/Qwen1.5-7B-Chat-GGUF
-- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GGUF>`__
+- **Model Size (in billions):** 4
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Model ID:** Qwen/Qwen1.5-4B-Chat-GGUF
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GGUF>`__
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format ggufv2 --quantization ${quantization}
 
 
 Model Spec 22 (ggufv2, 7 Billion)
@@ -334,7 +334,7 @@ Model Spec 22 (ggufv2, 7 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 7
-- **Quantizations:** q5_k_m
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
 - **Model ID:** Qwen/Qwen1.5-7B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GGUF>`__
 
@@ -349,7 +349,7 @@ Model Spec 23 (ggufv2, 14 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 14
-- **Quantizations:** q5_k_m
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
 - **Model ID:** Qwen/Qwen1.5-14B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GGUF>`__
 
@@ -364,7 +364,7 @@ Model Spec 24 (ggufv2, 72 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 72
-- **Quantizations:** q2_k
+- **Quantizations:** q2_k, q3_k_m
 - **Model ID:** Qwen/Qwen1.5-72B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GGUF>`__
 
diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile
index be08b02859..6ef45acf5b 100644
--- a/xinference/deploy/docker/Dockerfile
+++ b/xinference/deploy/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
+FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel
 
 COPY . /opt/inference
 
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index cdf6aeafcd..724d6d06ef 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -3753,5 +3753,50 @@
         "<|im_sep|>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-7b-it"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<start_of_turn>",
+        "<end_of_turn>"
+      ]
+    }
   }
 ]
diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
index d5e6df175c..fcdbf8f785 100644
--- a/xinference/model/llm/utils.py
+++ b/xinference/model/llm/utils.py
@@ -402,6 +402,15 @@ def get_role(role_name: str):
             else:
                 ret += role + ": "
             return ret
+        elif prompt_style.style_name == "gemma":
+            ret = ""
+            for message in chat_history:
+                content = message["content"]
+                role = get_role(message["role"])
+                ret += "<start_of_turn>" + role + "\n"
+                if content:
+                    ret += content + "<end_of_turn>\n"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
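
Review note: the prompt layout produced by the new ``gemma`` branch in
``xinference/model/llm/utils.py`` is easy to sanity-check standalone. The
sketch below mirrors that branch under stated assumptions: ``ROLES`` and
``build_gemma_prompt`` are illustrative stand-ins for ``get_role`` and the
``ChatModelMixin`` plumbing, and the chat history is made up::

    # Minimal sketch of the string the new "gemma" prompt style assembles.
    # ROLES approximates get_role(); the real mapping comes from
    # prompt_style.roles = ["user", "model"] in llm_family.json.
    ROLES = {"user": "user", "assistant": "model"}

    def build_gemma_prompt(chat_history):
        ret = ""
        for message in chat_history:
            content = message["content"]
            role = ROLES[message["role"]]
            ret += "<start_of_turn>" + role + "\n"
            if content:
                ret += content + "<end_of_turn>\n"
        return ret

    history = [
        {"role": "user", "content": "What is Gemma?"},
        {"role": "assistant", "content": ""},  # empty turn: where generation starts
    ]
    print(build_gemma_prompt(history), end="")
    # <start_of_turn>user
    # What is Gemma?<end_of_turn>
    # <start_of_turn>model

The trailing open ``<start_of_turn>model`` turn is what prompts the model to
reply, and the ``stop`` tokens registered in ``llm_family.json`` cut
generation off at the next turn marker.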
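
For an end-to-end check once the patch is applied, a hedged smoke test
against a locally running Xinference server; the endpoint, quantization
choice, and prompt here are assumptions, while ``Client.launch_model``,
``get_model``, and ``chat`` are the existing client API::

    from xinference.client import Client

    # Assumes a local server started with `xinference-local`; adjust the
    # endpoint for your deployment.
    client = Client("http://127.0.0.1:9997")

    # Launch the 2B instruction-tuned Gemma registered by this patch.
    model_uid = client.launch_model(
        model_name="gemma-it",
        model_format="pytorch",
        size_in_billions=2,
        quantization="none",
    )

    model = client.get_model(model_uid)
    response = model.chat(
        prompt="Say hello in one sentence.",
        generate_config={"max_tokens": 64},
    )
    print(response["choices"][0]["message"]["content"])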