
Commit

Merge remote-tracking branch 'refs/remotes/upstream/main'
amumu96 committed Nov 25, 2024
2 parents 6a1a865 + e8c480b commit 67cfbd1
Showing 96 changed files with 6,127 additions and 499 deletions.
20 changes: 0 additions & 20 deletions .github/workflows/docker-cd.yaml
@@ -73,26 +73,6 @@ jobs:
echo "XINFERENCE_GIT_TAG=${GIT_TAG}" >> $GITHUB_ENV
fi
- name: Log in to Aliyun Docker Hub
uses: docker/login-action@v1
with:
registry: registry.cn-hangzhou.aliyuncs.com
username: ${{ secrets.DOCKERHUB_ALIYUN_USERNAME }}
password: ${{ secrets.DOCKERHUB_ALIYUN_PASSWORD }}

- name: Push docker image to Aliyun
shell: bash
if: ${{ github.repository == 'xorbitsai/inference' }}
env:
DOCKER_ORG: registry.cn-hangzhou.aliyuncs.com/xprobe_xinference
run: |
if [[ -n "$XINFERENCE_GIT_TAG" ]]; then
docker tag "xprobe/xinference:${XINFERENCE_GIT_TAG}" "$DOCKER_ORG/xinference:latest"
docker push "$DOCKER_ORG/xinference:latest"
docker tag "xprobe/xinference:${XINFERENCE_GIT_TAG}-cpu" "$DOCKER_ORG/xinference:latest-cpu"
docker push "$DOCKER_ORG/xinference:latest-cpu"
fi
- name: Clean docker image cache
shell: bash
if: ${{ github.repository == 'xorbitsai/inference' }}
33 changes: 24 additions & 9 deletions .github/workflows/python.yaml
@@ -74,13 +74,13 @@ jobs:
fail-fast: false
matrix:
os: [ "ubuntu-latest", "macos-12", "windows-latest" ]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
module: [ "xinference" ]
exclude:
- { os: macos-12, python-version: 3.9 }
- { os: macos-12, python-version: 3.10 }
- { os: windows-latest, python-version: 3.9 }
- { os: macos-12, python-version: 3.11 }
- { os: windows-latest, python-version: 3.10 }
- { os: windows-latest, python-version: 3.11 }
include:
- { os: self-hosted, module: gpu, python-version: 3.9}
- { os: macos-latest, module: metal, python-version: "3.10" }
@@ -99,6 +99,12 @@ jobs:
python-version: ${{ matrix.python-version }}
activate-environment: ${{ env.CONDA_ENV }}

# Important for python == 3.12
- name: Update pip and setuptools
if: ${{ matrix.python-version == '3.12' }}
run: |
python -m pip install -U pip setuptools
- name: Install dependencies
env:
MODULE: ${{ matrix.module }}
@@ -112,14 +118,15 @@
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
fi
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install mlx-lm
pip install mlx-whisper
fi
pip install "llama-cpp-python==0.2.77" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
pip install transformers
pip install attrdict
pip install "timm>=0.9.16"
pip install torch
pip install torchvision
pip install torch torchvision
pip install accelerate
pip install sentencepiece
pip install transformers_stream_generator
@@ -133,7 +140,6 @@ jobs:
pip install -e ".[dev]"
pip install "jinja2==3.1.2"
pip install tensorizer
pip install eva-decord
pip install jj-pytorchvideo
pip install qwen-vl-utils
pip install datamodel_code_generator
@@ -162,7 +168,7 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip install -U WeTextProcessing<1.0.4
${{ env.SELF_HOST_PYTHON }} -m pip install -U librosa
${{ env.SELF_HOST_PYTHON }} -m pip install -U xxhash
${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2.1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U HyperPyYAML
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y matcha-tts
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxruntime-gpu==1.16.0; sys_platform == 'linux'
@@ -176,6 +182,12 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper"
${{ env.SELF_HOST_PYTHON }} -m pip install -U accelerate
${{ env.SELF_HOST_PYTHON }} -m pip install -U verovio
${{ env.SELF_HOST_PYTHON }} -m pip install -U cachetools
${{ env.SELF_HOST_PYTHON }} -m pip install -U silero-vad
${{ env.SELF_HOST_PYTHON }} -m pip install -U pydantic
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
--disable-warnings \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/core/tests/test_continuous_batching.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_stable_diffusion.py && \
@@ -200,13 +212,16 @@ jobs:
elif [ "$MODULE" == "metal" ]; then
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_mlx.py
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_mlx.py && \
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper_mlx.py
else
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/client/tests/test_client.py
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/image/tests/test_got_ocr2.py --ignore xinference/model/audio/tests xinference
--cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/core/tests/test_continuous_batching.py --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/image/tests/test_got_ocr2.py --ignore xinference/model/audio/tests xinference
fi
working-directory: .
18 changes: 18 additions & 0 deletions README.md
@@ -180,6 +180,24 @@ Once Xinference is running, there are multiple ways you can try it: via the web
| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | Collaborating with other Xorbits users. |
| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. |

## Citation

If this work is helpful, please kindly cite as:

```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```

## Contributors

<a href="https://github.com/xorbitsai/inference/graphs/contributors">
18 changes: 18 additions & 0 deletions README_ja_JP.md
@@ -104,6 +104,24 @@ Xinferenceが実行されると、Web UI、cURL、コマンドライン、また
| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | 他のXorbitsユーザーとの協力。 |
| [Twitter](https://twitter.com/xorbitsio) | 新機能に関する最新情報の入手。 |

## 引用

この仕事が役立つ場合は、以下のように引用してください:

```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```

## 寄稿者

<a href="https://github.com/xorbitsai/inference/graphs/contributors">
18 changes: 18 additions & 0 deletions README_zh_CN.md
@@ -164,6 +164,24 @@ $ xinference-local
| [微信社群](https://xorbits.cn/assets/images/wechat_work_qr.png) | 与其他 Xorbits 用户交流。 |
| [知乎](https://zhihu.com/org/xorbits) | 了解团队最新的进展。 |

## 引用

如果您觉得此项目有帮助,请以如下格式引用我们:

```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```

## 贡献者

<a href="https://github.com/xorbitsai/inference/graphs/contributors">
2 changes: 1 addition & 1 deletion doc/source/conf.py
@@ -121,7 +121,7 @@
"type": "fontawesome",
}])
html_theme_options["external_links"] = [
{"name": "产品官网", "url": "https://xorbits.cn/inference"},
{"name": "产品官网", "url": "https://xorbits.cn"},
]

html_favicon = "_static/favicon.svg"
2 changes: 1 addition & 1 deletion doc/source/models/builtin/embedding/gte-qwen2.rst
@@ -11,7 +11,7 @@ gte-Qwen2
Specifications
^^^^^^^^^^^^^^

- **Dimensions:** 4096
- **Dimensions:** 3584
- **Max Tokens:** 32000
- **Model ID:** Alibaba-NLP/gte-Qwen2-7B-instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct>`__, `ModelScope <https://modelscope.cn/models/iic/gte_Qwen2-7B-instruct>`__
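
As a quick sanity check of the corrected dimensionality, here is a minimal sketch that queries a launched gte-Qwen2 model through Xinference's OpenAI-compatible embeddings endpoint; the host, port, and model identifier below are placeholders:

```python
# Minimal sketch, assuming gte-Qwen2 has already been launched and the server
# exposes its default OpenAI-compatible endpoint; the address and model
# identifier are placeholders, not values taken from the docs above.
from openai import OpenAI

client = OpenAI(api_key="not-needed", base_url="http://127.0.0.1:9997/v1")
resp = client.embeddings.create(model="gte-Qwen2", input=["hello world"])
print(len(resp.data[0].embedding))  # expected: 3584 for gte-Qwen2-7B-instruct
```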
10 changes: 10 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -240,6 +240,16 @@ The following is a list of built-in LLM in Xinference:
- chat, tools
- 131072
- The Llama 3.1 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..

* - :ref:`llama-3.2-vision <models_llm_llama-3.2-vision>`
- generate, vision
- 131072
- The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a collection of pretrained and instruction-tuned image reasoning generative models in 11B and 90B sizes (text + images in / text out)...

* - :ref:`llama-3.2-vision-instruct <models_llm_llama-3.2-vision-instruct>`
- chat, vision
- 131072
- The Llama 3.2-Vision-instruct instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. The models outperform many of the available open source and closed multimodal models on common industry benchmarks...

* - :ref:`minicpm-2b-dpo-bf16 <models_llm_minicpm-2b-dpo-bf16>`
- chat
47 changes: 47 additions & 0 deletions doc/source/models/builtin/llm/llama-3.2-vision-instruct.rst
@@ -0,0 +1,47 @@
.. _models_llm_llama-3.2-vision-instruct:

========================================
llama-3.2-vision-instruct
========================================

- **Context Length:** 131072
- **Model Name:** llama-3.2-vision-instruct
- **Languages:** en, de, fr, it, pt, hi, es, th
- **Abilities:** chat, vision
- **Description:** The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. The models outperform many of the available open source and closed multimodal models on common industry benchmarks...

Specifications
^^^^^^^^^^^^^^

Model Spec 1 (pytorch, 11 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 11
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-11B-Vision-Instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-11B-Vision-Instruct>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-11B-Vision-Instruct>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision-instruct --size-in-billions 11 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision-instruct --size-in-billions 11 --model-format pytorch

Model Spec 2 (pytorch, 90 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 90
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-90B-Vision-Instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-90B-Vision-Instruct>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-90B-Vision-Instruct>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision-instruct --size-in-billions 90 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision-instruct --size-in-billions 90 --model-format pytorch
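
For reference, a minimal sketch of chatting with a launched llama-3.2-vision-instruct model through Xinference's OpenAI-compatible endpoint; the host, port, model identifier, and image URL below are placeholders:

```python
# Minimal sketch, assuming the model has been launched as shown above and
# Xinference is serving its OpenAI-compatible API on the default port;
# host, port, model identifier, and image URL are placeholders.
from openai import OpenAI

client = OpenAI(api_key="not-needed", base_url="http://127.0.0.1:9997/v1")
response = client.chat.completions.create(
    model="llama-3.2-vision-instruct",  # or the model UID returned by `xinference launch`
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
)
print(response.choices[0].message.content)
```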

47 changes: 47 additions & 0 deletions doc/source/models/builtin/llm/llama-3.2-vision.rst
@@ -0,0 +1,47 @@
.. _models_llm_llama-3.2-vision:

================
llama-3.2-vision
================

- **Context Length:** 131072
- **Model Name:** llama-3.2-vision
- **Languages:** en, de, fr, it, pt, hi, es, th
- **Abilities:** generate, vision
- **Description:** The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. The models outperform many of the available open source and closed multimodal models on common industry benchmarks...

Specifications
^^^^^^^^^^^^^^

Model Spec 1 (pytorch, 11 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 11
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-11B-Vision
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-11B-Vision>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-11B-Vision>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision --size-in-billions 11 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision --size-in-billions 11 --model-format pytorch

Model Spec 2 (pytorch, 90 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 90
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-90B-Vision
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-90B-Vision>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-90B-Vision>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision --size-in-billions 90 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision --size-in-billions 90 --model-format pytorch

33 changes: 32 additions & 1 deletion doc/source/models/model_abilities/audio.rst
@@ -331,7 +331,7 @@ Clone voice, launch model ``CosyVoice-300M``.
zero_shot_prompt_text = ""
# The zero shot prompt file is the voice file
# the words said in the file shoule be identical to zero_shot_prompt_text
# the words said in the file should be identical to zero_shot_prompt_text
with open(zero_shot_prompt_file, "rb") as f:
zero_shot_prompt = f.read()
@@ -379,3 +379,34 @@ Instruction based, launch model ``CosyVoice-300M-Instruct``.
)
More instructions and examples can be found at https://fun-audio-llm.github.io/.


FishSpeech Usage
~~~~~~~~~~~~~~~~

Basic usage, refer to :ref:`audio speech usage <audio_speech>`.

Clone voice, launch model ``FishSpeech-1.4``. Please use `prompt_speech` instead of `reference_audio`
to provide the reference audio to the FishSpeech model.

.. code-block::
from xinference.client import Client
client = Client("http://<XINFERENCE_HOST>:<XINFERENCE_PORT>")
model = client.get_model("<MODEL_UID>")
reference_text = ""
# The reference audio file is the voice file
# the words said in the file should be identical to reference_text
with open(reference_audio_file, "rb") as f:
reference_audio = f.read()
speech_bytes = model.speech(
"<The text to generate audio for>",
reference_text=reference_text,
prompt_speech=reference_audio,
enable_reference_audio=True,
)
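
The `speech_bytes` returned above is raw audio data; a minimal follow-up sketch is to write it straight to disk (the file name and extension are assumptions, so match them to the output format you requested):

```python
# Continues the snippet above: persist the audio returned by model.speech().
# "cloned_voice.mp3" is a placeholder; use the extension matching your output format.
with open("cloned_voice.mp3", "wb") as f:
    f.write(speech_bytes)
```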
2 changes: 2 additions & 0 deletions doc/source/models/model_abilities/vision.rst
@@ -31,6 +31,8 @@ The ``vision`` ability is supported with the following models in Xinference:
* :ref:`MiniCPM-Llama3-V 2.6 <models_llm_minicpm-v-2.6>`
* :ref:`internvl2 <models_llm_internvl2>`
* :ref:`qwen2-vl-instruct <models_llm_qwen2-vl-instruct>`
* :ref:`llama-3.2-vision <models_llm_llama-3.2-vision>`
* :ref:`llama-3.2-vision-instruct <models_llm_llama-3.2-vision-instruct>`


Quickstart