
Commit

Merge remote-tracking branch 'refs/remotes/upstream/main'
amumu96 committed Nov 25, 2024
2 parents 6a1a865 + e8c480b commit 67cfbd1
Showing 96 changed files with 6,127 additions and 499 deletions.
20 changes: 0 additions & 20 deletions .github/workflows/docker-cd.yaml
@@ -73,26 +73,6 @@ jobs:
echo "XINFERENCE_GIT_TAG=${GIT_TAG}" >> $GITHUB_ENV
fi
- name: Log in to Aliyun Docker Hub
uses: docker/login-action@v1
with:
registry: registry.cn-hangzhou.aliyuncs.com
username: ${{ secrets.DOCKERHUB_ALIYUN_USERNAME }}
password: ${{ secrets.DOCKERHUB_ALIYUN_PASSWORD }}

- name: Push docker image to Aliyun
shell: bash
if: ${{ github.repository == 'xorbitsai/inference' }}
env:
DOCKER_ORG: registry.cn-hangzhou.aliyuncs.com/xprobe_xinference
run: |
if [[ -n "$XINFERENCE_GIT_TAG" ]]; then
docker tag "xprobe/xinference:${XINFERENCE_GIT_TAG}" "$DOCKER_ORG/xinference:latest"
docker push "$DOCKER_ORG/xinference:latest"
docker tag "xprobe/xinference:${XINFERENCE_GIT_TAG}-cpu" "$DOCKER_ORG/xinference:latest-cpu"
docker push "$DOCKER_ORG/xinference:latest-cpu"
fi
- name: Clean docker image cache
shell: bash
if: ${{ github.repository == 'xorbitsai/inference' }}
33 changes: 24 additions & 9 deletions .github/workflows/python.yaml
@@ -74,13 +74,13 @@ jobs:
fail-fast: false
matrix:
os: [ "ubuntu-latest", "macos-12", "windows-latest" ]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
module: [ "xinference" ]
exclude:
- { os: macos-12, python-version: 3.9 }
- { os: macos-12, python-version: 3.10 }
- { os: windows-latest, python-version: 3.9 }
- { os: macos-12, python-version: 3.11 }
- { os: windows-latest, python-version: 3.10 }
- { os: windows-latest, python-version: 3.11 }
include:
- { os: self-hosted, module: gpu, python-version: 3.9}
- { os: macos-latest, module: metal, python-version: "3.10" }
@@ -99,6 +99,12 @@ jobs:
python-version: ${{ matrix.python-version }}
activate-environment: ${{ env.CONDA_ENV }}

# Important for python == 3.12
- name: Update pip and setuptools
if: ${{ matrix.python-version == '3.12' }}
run: |
python -m pip install -U pip setuptools
- name: Install dependencies
env:
MODULE: ${{ matrix.module }}
@@ -112,14 +118,15 @@
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
fi
if [ "$MODULE" == "metal" ]; then
conda install -c conda-forge "ffmpeg<7"
pip install mlx-lm
pip install mlx-whisper
fi
pip install "llama-cpp-python==0.2.77" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
pip install transformers
pip install attrdict
pip install "timm>=0.9.16"
pip install torch
pip install torchvision
pip install torch torchvision
pip install accelerate
pip install sentencepiece
pip install transformers_stream_generator
@@ -133,7 +140,6 @@ jobs:
pip install -e ".[dev]"
pip install "jinja2==3.1.2"
pip install tensorizer
pip install eva-decord
pip install jj-pytorchvideo
pip install qwen-vl-utils
pip install datamodel_code_generator
@@ -162,7 +168,7 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip install -U WeTextProcessing<1.0.4
${{ env.SELF_HOST_PYTHON }} -m pip install -U librosa
${{ env.SELF_HOST_PYTHON }} -m pip install -U xxhash
${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2.1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U HyperPyYAML
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y matcha-tts
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxruntime-gpu==1.16.0; sys_platform == 'linux'
@@ -176,6 +182,12 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper"
${{ env.SELF_HOST_PYTHON }} -m pip install -U accelerate
${{ env.SELF_HOST_PYTHON }} -m pip install -U verovio
${{ env.SELF_HOST_PYTHON }} -m pip install -U cachetools
${{ env.SELF_HOST_PYTHON }} -m pip install -U silero-vad
${{ env.SELF_HOST_PYTHON }} -m pip install -U pydantic
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
--disable-warnings \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/core/tests/test_continuous_batching.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_stable_diffusion.py && \
@@ -200,13 +212,16 @@ jobs:
elif [ "$MODULE" == "metal" ]; then
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_mlx.py
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/llm/mlx/tests/test_mlx.py && \
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper_mlx.py
else
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/client/tests/test_client.py
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/image/tests/test_got_ocr2.py --ignore xinference/model/audio/tests xinference
--cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/core/tests/test_continuous_batching.py --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/image/tests/test_got_ocr2.py --ignore xinference/model/audio/tests xinference
fi
working-directory: .
18 changes: 18 additions & 0 deletions README.md
@@ -180,6 +180,24 @@ Once Xinference is running, there are multiple ways you can try it: via the web
| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | Collaborating with other Xorbits users. |
| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. |

## Citation

If this work is helpful, please kindly cite as:

```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```

## Contributors

<a href="https://github.com/xorbitsai/inference/graphs/contributors">
18 changes: 18 additions & 0 deletions README_ja_JP.md
@@ -104,6 +104,24 @@ Xinferenceが実行されると、Web UI、cURL、コマンドライン、また
| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | 他のXorbitsユーザーとの協力。 |
| [Twitter](https://twitter.com/xorbitsio) | 新機能に関する最新情報の入手。 |

## 引用

この仕事が役立つ場合は、以下のように引用してください:

```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```

## 寄稿者

<a href="https://github.com/xorbitsai/inference/graphs/contributors">
18 changes: 18 additions & 0 deletions README_zh_CN.md
@@ -164,6 +164,24 @@ $ xinference-local
| [微信社群](https://xorbits.cn/assets/images/wechat_work_qr.png) | 与其他 Xorbits 用户交流。 |
| [知乎](https://zhihu.com/org/xorbits) | 了解团队最新的进展。 |

## 引用

如果您觉得此项目有帮助,请以如下格式引用我们:

```bibtex
@inproceedings{lu2024xinference,
title = "Xinference: Making Large Model Serving Easy",
author = "Lu, Weizheng and Xiong, Lingfeng and Zhang, Feng and Qin, Xuye and Chen, Yueguo",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.30",
pages = "291--300",
}
```

## 贡献者

<a href="https://github.com/xorbitsai/inference/graphs/contributors">
2 changes: 1 addition & 1 deletion doc/source/conf.py
@@ -121,7 +121,7 @@
"type": "fontawesome",
}])
html_theme_options["external_links"] = [
{"name": "产品官网", "url": "https://xorbits.cn/inference"},
{"name": "产品官网", "url": "https://xorbits.cn"},
]

html_favicon = "_static/favicon.svg"
2 changes: 1 addition & 1 deletion doc/source/models/builtin/embedding/gte-qwen2.rst
@@ -11,7 +11,7 @@ gte-Qwen2
Specifications
^^^^^^^^^^^^^^

- **Dimensions:** 4096
- **Dimensions:** 3584
- **Max Tokens:** 32000
- **Model ID:** Alibaba-NLP/gte-Qwen2-7B-instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct>`__, `ModelScope <https://modelscope.cn/models/iic/gte_Qwen2-7B-instruct>`__
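
As a quick sanity check of the corrected dimensionality, here is a minimal sketch that queries a launched gte-Qwen2 model through Xinference's OpenAI-compatible embeddings endpoint; the host, port, and model identifier below are placeholders:

```python
# Minimal sketch, assuming gte-Qwen2 has already been launched and the server
# exposes its default OpenAI-compatible endpoint; the address and model
# identifier are placeholders, not values taken from the docs above.
from openai import OpenAI

client = OpenAI(api_key="not-needed", base_url="http://127.0.0.1:9997/v1")
resp = client.embeddings.create(model="gte-Qwen2", input=["hello world"])
print(len(resp.data[0].embedding))  # expected: 3584 for gte-Qwen2-7B-instruct
```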
10 changes: 10 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -240,6 +240,16 @@ The following is a list of built-in LLM in Xinference:
- chat, tools
- 131072
- The Llama 3.1 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..

* - :ref:`llama-3.2-vision <models_llm_llama-3.2-vision>`
- generate, vision
- 131072
- The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a collection of pretrained and instruction-tuned image reasoning generative models in 11B and 90B sizes (text + images in / text out)...

* - :ref:`llama-3.2-vision-instruct <models_llm_llama-3.2-vision-instruct>`
- chat, vision
- 131072
- The Llama 3.2-Vision-instruct instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. The models outperform many of the available open source and closed multimodal models on common industry benchmarks...

* - :ref:`minicpm-2b-dpo-bf16 <models_llm_minicpm-2b-dpo-bf16>`
- chat
47 changes: 47 additions & 0 deletions doc/source/models/builtin/llm/llama-3.2-vision-instruct.rst
@@ -0,0 +1,47 @@
.. _models_llm_llama-3.2-vision-instruct:

========================================
llama-3.2-vision-instruct
========================================

- **Context Length:** 131072
- **Model Name:** llama-3.2-vision-instruct
- **Languages:** en, de, fr, it, pt, hi, es, th
- **Abilities:** chat, vision
- **Description:** The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. The models outperform many of the available open source and closed multimodal models on common industry benchmarks...

Specifications
^^^^^^^^^^^^^^

Model Spec 1 (pytorch, 11 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 11
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-11B-Vision-Instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-11B-Vision-Instruct>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-11B-Vision-Instruct>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision-instruct --size-in-billions 11 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision-instruct --size-in-billions 11 --model-format pytorch

Model Spec 2 (pytorch, 90 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 90
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-90B-Vision-Instruct
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-90B-Vision-Instruct>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-90B-Vision-Instruct>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision-instruct --size-in-billions 90 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision-instruct --size-in-billions 90 --model-format pytorch
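
For reference, a minimal sketch of chatting with a launched llama-3.2-vision-instruct model through Xinference's OpenAI-compatible endpoint; the host, port, model identifier, and image URL below are placeholders:

```python
# Minimal sketch, assuming the model has been launched as shown above and
# Xinference is serving its OpenAI-compatible API on the default port;
# host, port, model identifier, and image URL are placeholders.
from openai import OpenAI

client = OpenAI(api_key="not-needed", base_url="http://127.0.0.1:9997/v1")
response = client.chat.completions.create(
    model="llama-3.2-vision-instruct",  # or the model UID returned by `xinference launch`
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
)
print(response.choices[0].message.content)
```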

47 changes: 47 additions & 0 deletions doc/source/models/builtin/llm/llama-3.2-vision.rst
@@ -0,0 +1,47 @@
.. _models_llm_llama-3.2-vision:

================
llama-3.2-vision
================

- **Context Length:** 131072
- **Model Name:** llama-3.2-vision
- **Languages:** en, de, fr, it, pt, hi, es, th
- **Abilities:** generate, vision
- **Description:** The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. The models outperform many of the available open source and closed multimodal models on common industry benchmarks...

Specifications
^^^^^^^^^^^^^^

Model Spec 1 (pytorch, 11 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 11
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-11B-Vision
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-11B-Vision>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-11B-Vision>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision --size-in-billions 11 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision --size-in-billions 11 --model-format pytorch

Model Spec 2 (pytorch, 90 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 90
- **Quantizations:** none
- **Engines**: vLLM, Transformers
- **Model ID:** meta-llama/Meta-Llama-3.2-90B-Vision
- **Model Hubs**: `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-90B-Vision>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Meta-Llama-3.2-90B-Vision>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine transformers --model-name llama-3.2-vision --size-in-billions 90 --model-format pytorch --quantization ${quantization}
xinference launch --model-engine vllm --enforce_eager --max_num_seqs 16 --model-name llama-3.2-vision --size-in-billions 90 --model-format pytorch

33 changes: 32 additions & 1 deletion doc/source/models/model_abilities/audio.rst
@@ -331,7 +331,7 @@ Clone voice, launch model ``CosyVoice-300M``.
zero_shot_prompt_text = ""
# The zero shot prompt file is the voice file
# the words said in the file shoule be identical to zero_shot_prompt_text
# the words said in the file should be identical to zero_shot_prompt_text
with open(zero_shot_prompt_file, "rb") as f:
zero_shot_prompt = f.read()
@@ -379,3 +379,34 @@ Instruction based, launch model ``CosyVoice-300M-Instruct``.
)
More instructions and examples can be found at https://fun-audio-llm.github.io/.


FishSpeech Usage
~~~~~~~~~~~~~~~~

Basic usage, refer to :ref:`audio speech usage <audio_speech>`.

Clone voice, launch model ``FishSpeech-1.4``. Please use `prompt_speech` instead of `reference_audio`
to provide the reference audio to the FishSpeech model.

.. code-block::
from xinference.client import Client
client = Client("http://<XINFERENCE_HOST>:<XINFERENCE_PORT>")
model = client.get_model("<MODEL_UID>")
reference_text = ""
# The reference audio file is the voice file
# the words said in the file should be identical to reference_text
with open(reference_audio_file, "rb") as f:
reference_audio = f.read()
speech_bytes = model.speech(
"<The text to generate audio for>",
reference_text=reference_text,
prompt_speech=reference_audio,
enable_reference_audio=True,
)
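
The `speech_bytes` returned above is raw audio data; a minimal follow-up sketch is to write it straight to disk (the file name and extension are assumptions, so match them to the output format you requested):

```python
# Continues the snippet above: persist the audio returned by model.speech().
# "cloned_voice.mp3" is a placeholder; use the extension matching your output format.
with open("cloned_voice.mp3", "wb") as f:
    f.write(speech_bytes)
```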
2 changes: 2 additions & 0 deletions doc/source/models/model_abilities/vision.rst
@@ -31,6 +31,8 @@ The ``vision`` ability is supported with the following models in Xinference:
* :ref:`MiniCPM-Llama3-V 2.6 <models_llm_minicpm-v-2.6>`
* :ref:`internvl2 <models_llm_internvl2>`
* :ref:`qwen2-vl-instruct <models_llm_qwen2-vl-instruct>`
* :ref:`llama-3.2-vision <models_llm_llama-3.2-vision>`
* :ref:`llama-3.2-vision-instruct <models_llm_llama-3.2-vision-instruct>`


Quickstart