From c653c975847f9f6a81382033a9c8f5bd81bf70f2 Mon Sep 17 00:00:00 2001
From: aresnow1 <109642806+aresnow1@users.noreply.github.com>
Date: Thu, 22 Feb 2024 15:40:53 +0800
Subject: [PATCH] FEAT: Support gemma series model (#1024)

---
 doc/source/models/builtin/llm/gemma-it.rst    | 45 +++++++++++++++++++
 doc/source/models/builtin/llm/index.rst       |  7 +++
 .../models/builtin/llm/llama-2-chat.rst       |  2 +-
 .../models/builtin/llm/qwen1.5-chat.rst       | 38 ++++++++--------
 xinference/deploy/docker/Dockerfile           |  2 +-
 xinference/model/llm/llm_family.json          | 45 +++++++++++++++++++
 xinference/model/llm/utils.py                 |  9 ++++
 7 files changed, 127 insertions(+), 21 deletions(-)
 create mode 100644 doc/source/models/builtin/llm/gemma-it.rst

diff --git a/doc/source/models/builtin/llm/gemma-it.rst b/doc/source/models/builtin/llm/gemma-it.rst
new file mode 100644
index 0000000000..676e91b4a4
--- /dev/null
+++ b/doc/source/models/builtin/llm/gemma-it.rst
@@ -0,0 +1,45 @@
+.. _models_llm_gemma-it:
+
+========================================
+gemma-it
+========================================
+
+- **Context Length:** 8192
+- **Model Name:** gemma-it
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 2
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** google/gemma-2b-it
+- **Model Hubs**: `Hugging Face <https://huggingface.co/google/gemma-2b-it>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name gemma-it --size-in-billions 2 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** google/gemma-7b-it
+- **Model Hubs**: `Hugging Face <https://huggingface.co/google/gemma-7b-it>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name gemma-it --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 78bcf3ba8a..b9a881279a 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -96,6 +96,11 @@ The following is a list of built-in LLM in Xinference:
      - 2048
      - Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting.
 
+   * - :ref:`gemma-it <models_llm_gemma-it>`
+     - chat
+     - 8192
+     - Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.
+
    * - :ref:`glaive-coder <models_llm_glaive-coder>`
      - chat
      - 100000
@@ -358,6 +363,8 @@ The following is a list of built-in LLM in Xinference:
 
    falcon-instruct
 
+   gemma-it
+
    glaive-coder
 
    gorilla-openfunctions-v1
diff --git a/doc/source/models/builtin/llm/llama-2-chat.rst b/doc/source/models/builtin/llm/llama-2-chat.rst
index 730509d47e..af6e2d914f 100644
--- a/doc/source/models/builtin/llm/llama-2-chat.rst
+++ b/doc/source/models/builtin/llm/llama-2-chat.rst
@@ -139,7 +139,7 @@ Model Spec 9 (ggufv2, 70 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 70
-- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0
+- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M
 - **Model ID:** TheBloke/Llama-2-70B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGUF>`__
 
diff --git a/doc/source/models/builtin/llm/qwen1.5-chat.rst b/doc/source/models/builtin/llm/qwen1.5-chat.rst
index 8ba5a44c1e..ae99e2ab89 100644
--- a/doc/source/models/builtin/llm/qwen1.5-chat.rst
+++ b/doc/source/models/builtin/llm/qwen1.5-chat.rst
@@ -284,49 +284,49 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen1.5-chat --size-in-billions 72 --model-format awq --quantization ${quantization}
 
-Model Spec 19 (ggufv2, 1_8 Billion)
+Model Spec 19 (ggufv2, 0_5 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** ggufv2
-- **Model Size (in billions):** 1_8
-- **Quantizations:** q8_0
+- **Model Size (in billions):** 0_5
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
 - **Model ID:** Qwen/Qwen1.5-0.5B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GGUF>`__
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format ggufv2 --quantization ${quantization}
+   xinference launch --model-name qwen1.5-chat --size-in-billions 0_5 --model-format ggufv2 --quantization ${quantization}
 
 
-Model Spec 20 (ggufv2, 4 Billion)
+Model Spec 20 (ggufv2, 1_8 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** ggufv2
-- **Model Size (in billions):** 4
-- **Quantizations:** q8_0
-- **Model ID:** Qwen/Qwen1.5-4B-Chat-GGUF
-- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GGUF>`__
+- **Model Size (in billions):** 1_8
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Model ID:** Qwen/Qwen1.5-1.8B-Chat-GGUF
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GGUF>`__
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format ggufv2 --quantization ${quantization}
+   xinference launch --model-name qwen1.5-chat --size-in-billions 1_8 --model-format ggufv2 --quantization ${quantization}
 
 
-Model Spec 21 (ggufv2, 7 Billion)
+Model Spec 21 (ggufv2, 4 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** ggufv2
-- **Model Size (in billions):** 7
-- **Quantizations:** q5_k_m
-- **Model ID:** Qwen/Qwen1.5-7B-Chat-GGUF
-- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GGUF>`__
+- **Model Size (in billions):** 4
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Model ID:** Qwen/Qwen1.5-4B-Chat-GGUF
+- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GGUF>`__
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen1.5-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+   xinference launch --model-name qwen1.5-chat --size-in-billions 4 --model-format ggufv2 --quantization ${quantization}
 
 
 Model Spec 22 (ggufv2, 7 Billion)
@@ -334,7 +334,7 @@ Model Spec 22 (ggufv2, 7 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 7
-- **Quantizations:** q5_k_m
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
 - **Model ID:** Qwen/Qwen1.5-7B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GGUF>`__
 
@@ -349,7 +349,7 @@ Model Spec 23 (ggufv2, 14 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 14
-- **Quantizations:** q5_k_m
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
 - **Model ID:** Qwen/Qwen1.5-14B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GGUF>`__
 
@@ -364,7 +364,7 @@ Model Spec 24 (ggufv2, 72 Billion)
 
 - **Model Format:** ggufv2
 - **Model Size (in billions):** 72
-- **Quantizations:** q2_k
+- **Quantizations:** q2_k, q3_k_m
 - **Model ID:** Qwen/Qwen1.5-72B-Chat-GGUF
 - **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GGUF>`__
 
diff --git a/xinference/deploy/docker/Dockerfile b/xinference/deploy/docker/Dockerfile
index be08b02859..6ef45acf5b 100644
--- a/xinference/deploy/docker/Dockerfile
+++ b/xinference/deploy/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
+FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel
 
 COPY . /opt/inference
 
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index cdf6aeafcd..724d6d06ef 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -3753,5 +3753,50 @@
         "<|im_sep|>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-7b-it"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<start_of_turn>",
+        "<end_of_turn>"
+      ]
+    }
   }
 ]
diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
index d5e6df175c..fcdbf8f785 100644
--- a/xinference/model/llm/utils.py
+++ b/xinference/model/llm/utils.py
@@ -402,6 +402,15 @@ def get_role(role_name: str):
             else:
                 ret += role + ": "
             return ret
+        elif prompt_style.style_name == "gemma":
+            ret = ""
+            for message in chat_history:
+                content = message["content"]
+                role = get_role(message["role"])
+                ret += "<start_of_turn>" + role + "\n"
+                if content:
+                    ret += content + "<end_of_turn>\n"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
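
Review note: the prompt layout produced by the new ``gemma`` branch in
``xinference/model/llm/utils.py`` is easy to sanity-check standalone. The
sketch below mirrors that branch under stated assumptions: ``ROLES`` and
``build_gemma_prompt`` are illustrative stand-ins for ``get_role`` and the
``ChatModelMixin`` plumbing, and the chat history is made up::

    # Minimal sketch of the string the new "gemma" prompt style assembles.
    # ROLES approximates get_role(); the real mapping comes from
    # prompt_style.roles = ["user", "model"] in llm_family.json.
    ROLES = {"user": "user", "assistant": "model"}

    def build_gemma_prompt(chat_history):
        ret = ""
        for message in chat_history:
            content = message["content"]
            role = ROLES[message["role"]]
            ret += "<start_of_turn>" + role + "\n"
            if content:
                ret += content + "<end_of_turn>\n"
        return ret

    history = [
        {"role": "user", "content": "What is Gemma?"},
        {"role": "assistant", "content": ""},  # empty turn: where generation starts
    ]
    print(build_gemma_prompt(history), end="")
    # <start_of_turn>user
    # What is Gemma?<end_of_turn>
    # <start_of_turn>model

The trailing open ``<start_of_turn>model`` turn is what prompts the model to
reply, and the ``stop`` tokens registered in ``llm_family.json`` cut
generation off at the next turn marker.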
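
For an end-to-end check once the patch is applied, a hedged smoke test
against a locally running Xinference server; the endpoint, quantization
choice, and prompt here are assumptions, while ``Client.launch_model``,
``get_model``, and ``chat`` are the existing client API::

    from xinference.client import Client

    # Assumes a local server started with `xinference-local`; adjust the
    # endpoint for your deployment.
    client = Client("http://127.0.0.1:9997")

    # Launch the 2B instruction-tuned Gemma registered by this patch.
    model_uid = client.launch_model(
        model_name="gemma-it",
        model_format="pytorch",
        size_in_billions=2,
        quantization="none",
    )

    model = client.get_model(model_uid)
    response = model.chat(
        prompt="Say hello in one sentence.",
        generate_config={"max_tokens": 64},
    )
    print(response["choices"][0]["message"]["content"])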