diff --git a/.github/workflows/docker-cd.yaml b/.github/workflows/docker-cd.yaml
index e5ef82e27f..5048a8910c 100644
--- a/.github/workflows/docker-cd.yaml
+++ b/.github/workflows/docker-cd.yaml
@@ -14,6 +14,7 @@ concurrency:
jobs:
build:
+ timeout-minutes: 120
runs-on: self-hosted
strategy:
matrix:
@@ -85,10 +86,6 @@ jobs:
env:
DOCKER_ORG: registry.cn-hangzhou.aliyuncs.com/xprobe_xinference
run: |
- docker tag "xprobe/xinference:${XINFERENCE_IMAGE_TAG}" "${DOCKER_ORG}/xinference:${XINFERENCE_IMAGE_TAG}"
- docker push "${DOCKER_ORG}/xinference:${XINFERENCE_IMAGE_TAG}"
- docker tag "xprobe/xinference:${XINFERENCE_IMAGE_TAG}-cpu" "${DOCKER_ORG}/xinference:${XINFERENCE_IMAGE_TAG}-cpu"
- docker push "${DOCKER_ORG}/xinference:${XINFERENCE_IMAGE_TAG}-cpu"
if [[ -n "$XINFERENCE_GIT_TAG" ]]; then
docker tag "xprobe/xinference:${XINFERENCE_GIT_TAG}" "$DOCKER_ORG/xinference:latest"
docker push "$DOCKER_ORG/xinference:latest"
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index c097314a7a..81a16122c5 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -135,6 +135,9 @@ jobs:
pip install tensorizer
pip install eva-decord
pip install jj-pytorchvideo
+ pip install qwen-vl-utils
+ pip install datamodel_code_generator
+ pip install jsonschema
working-directory: .
- name: Test with pytest
@@ -142,7 +145,7 @@ jobs:
MODULE: ${{ matrix.module }}
run: |
if [ "$MODULE" == "gpu" ]; then
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U "openai>1,<1.40"
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U "openai>1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U modelscope
${{ env.SELF_HOST_PYTHON }} -m pip install -U sse_starlette
${{ env.SELF_HOST_PYTHON }} -m pip install -U xoscar
@@ -154,37 +157,43 @@ jobs:
${{ env.SELF_HOST_CONDA }} install -c conda-forge pynini=2.1.5
${{ env.SELF_HOST_CONDA }} install -c conda-forge "ffmpeg<7"
${{ env.SELF_HOST_PYTHON }} -m pip install -U funasr
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U nemo_text_processing
+          ${{ env.SELF_HOST_PYTHON }} -m pip install -U "nemo_text_processing<1.1.0"
${{ env.SELF_HOST_PYTHON }} -m pip install -U omegaconf~=2.3.0
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U WeTextProcessing
+          ${{ env.SELF_HOST_PYTHON }} -m pip install -U "WeTextProcessing<1.0.4"
${{ env.SELF_HOST_PYTHON }} -m pip install -U librosa
${{ env.SELF_HOST_PYTHON }} -m pip install -U xxhash
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>0.1"
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2"
${{ env.SELF_HOST_PYTHON }} -m pip install -U HyperPyYAML
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U matcha-tts
+ ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y matcha-tts
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxruntime-gpu==1.16.0; sys_platform == 'linux'
${{ env.SELF_HOST_PYTHON }} -m pip install -U openai-whisper
${{ env.SELF_HOST_PYTHON }} -m pip install -U "torch==2.3.1" "torchaudio==2.3.1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "loguru"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "natsort"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "loralib"
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U "opencc==1.1.6"
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U "faster_whisper"
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ormsgpack"
+ ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y opencc
+ ${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y "faster_whisper"
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U accelerate
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U verovio
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
- --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_stable_diffusion.py
+ --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_stable_diffusion.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
- --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper.py
+ --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/image/tests/test_got_ocr2.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
- --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_funasr.py
+ --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
- --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_chattts.py
+ --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_funasr.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
- --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_cosyvoice.py
+ --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_chattts.py && \
+ ${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
+ -W ignore::PendingDeprecationWarning \
+ --cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_cosyvoice.py && \
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_fish_speech.py
@@ -198,6 +207,6 @@ jobs:
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/client/tests/test_client.py
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
- --cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/audio/tests xinference
+ --cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/image/tests/test_got_ocr2.py --ignore xinference/model/audio/tests xinference
fi
working-directory: .
diff --git a/MANIFEST.in b/MANIFEST.in
index ea0460dc63..2649794924 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -12,4 +12,5 @@ global-exclude conftest.py
include xinference/locale/*.json
include xinference/model/llm/*.json
include xinference/model/embedding/*.json
+graft xinference/thirdparty
global-include xinference/web/ui/build/**/*
\ No newline at end of file
diff --git a/README.md b/README.md
index fbee3a41e5..d63cbb42ff 100644
--- a/README.md
+++ b/README.md
@@ -3,13 +3,25 @@
# Xorbits Inference: Model Serving Made Easy 🤖
+
+ Xinference Cloud ·
+ Xinference Enterprise ·
+ Self-hosting ·
+ Documentation
+
+
[](https://pypi.org/project/xinference/)
[](https://github.com/xorbitsai/inference/blob/main/LICENSE)
[](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main)
[](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg)
[](https://twitter.com/xorbitsio)
-English | [中文介绍](README_zh_CN.md) | [日本語](README_ja_JP.md)
+
+
+
+
+
+
@@ -34,14 +46,14 @@ potential of cutting-edge AI models.
- Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929)
- Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906)
### New Models
+- Built-in support for [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325)
+- Built-in support for [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295)
+- Built-in support for [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292)
+- Built-in support for [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271)
+- Built-in support for [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205)
+- Built-in support for [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263)
- Built-in support for [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049)
- Built-in support for [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007)
-- Built-in support for [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031)
-- Built-in support for [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028)
-- Built-in support for [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008)
-- Built-in support for [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944)
-- Built-in support for [llama3.1](https://ai.meta.com/blog/meta-llama-3-1/): [#1932](https://github.com/xorbitsai/inference/pull/1932)
-- Built-in support for [Mistral Nemo](https://mistral.ai/news/mistral-nemo/): [#1936](https://github.com/xorbitsai/inference/pull/1936)
### Integrations
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
- [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
@@ -85,9 +97,25 @@ with popular third-party libraries including [LangChain](https://python.langchai
| Audio Models | ✅ | ❌ | ❌ | ❌ |
| More OpenAI Functionalities (Function Calling) | ✅ | ❌ | ❌ | ❌ |
-## Getting Started
+## Using Xinference
+
+- **Cloud**
+We host a [Xinference Cloud](https://inference.top) service for anyone to try with zero setup.
+
+- **Self-hosting Xinference Community Edition**
+Quickly get Xinference running in your environment with this [starter guide](#getting-started).
+Use our [documentation](https://inference.readthedocs.io/) for further references and more in-depth instructions.
+
+- **Xinference for enterprise / organizations**
+We provide additional enterprise-centric features. [Send us an email](mailto:business@xprobe.io?subject=[GitHub]Business%20License%20Inquiry) to discuss enterprise needs.
-**Please give us a star before you begin, and you'll receive instant notifications for every new release on GitHub!**
+## Staying Ahead
+
+Star Xinference on GitHub and be instantly notified of new releases.
+
+
+
+## Getting Started
* [Docs](https://inference.readthedocs.io/en/latest/index.html)
* [Built-in Models](https://inference.readthedocs.io/en/latest/models/builtin/index.html)
@@ -157,3 +185,7 @@ Once Xinference is running, there are multiple ways you can try it: via the web
+
+## Star History
+
+[](https://star-history.com/#xorbitsai/inference&Date)
\ No newline at end of file
diff --git a/README_ja_JP.md b/README_ja_JP.md
index f5cafc4150..c80601a9c7 100644
--- a/README_ja_JP.md
+++ b/README_ja_JP.md
@@ -9,7 +9,11 @@
[](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg)
[](https://twitter.com/xorbitsio)
-[English](README.md) | [中文介绍](README_zh_CN.md) | 日本語
+
+
+
+
+
diff --git a/README_zh_CN.md b/README_zh_CN.md
index 34b9c4621e..2df28e2632 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -3,13 +3,24 @@
# Xorbits Inference:模型推理, 轻而易举 🤖
+
+ Xinference 云服务 ·
+ Xinference 企业版 ·
+ 自托管 ·
+ 文档
+
+
[](https://pypi.org/project/xinference/)
[](https://github.com/xorbitsai/inference/blob/main/LICENSE)
[](https://actions-badge.atrox.dev/xorbitsai/inference/goto?ref=main)
[](https://xorbits.cn/assets/images/wechat_work_qr.png)
[](https://www.zhihu.com/org/xorbits)
-[English](README.md) | 中文介绍 | [日本語](README_ja_JP.md)
+
+
+
+
+
@@ -31,14 +42,14 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布
- 支持语音识别模型: [#929](https://github.com/xorbitsai/inference/pull/929)
- 增加 Metrics 统计信息: [#906](https://github.com/xorbitsai/inference/pull/906)
### 新模型
+- 内置 [Qwen 2.5 Series](https://qwenlm.github.io/blog/qwen2.5/): [#2325](https://github.com/xorbitsai/inference/pull/2325)
+- 内置 [Fish Speech V1.4](https://huggingface.co/fishaudio/fish-speech-1.4): [#2295](https://github.com/xorbitsai/inference/pull/2295)
+- 内置 [DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5): [#2292](https://github.com/xorbitsai/inference/pull/2292)
+- 内置 [Qwen2-Audio](https://github.com/QwenLM/Qwen2-Audio): [#2271](https://github.com/xorbitsai/inference/pull/2271)
+- 内置 [Qwen2-vl-instruct](https://github.com/QwenLM/Qwen2-VL): [#2205](https://github.com/xorbitsai/inference/pull/2205)
+- 内置 [MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B): [#2263](https://github.com/xorbitsai/inference/pull/2263)
- 内置 [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049)
- 内置 [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007)
-- 内置 [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031)
-- 内置 [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028)
-- 内置 [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008)
-- 内置 [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944)
-- 内置 [llama3.1](https://ai.meta.com/blog/meta-llama-3-1/): [#1932](https://github.com/xorbitsai/inference/pull/1932)
-- 内置 [Mistral Nemo](https://mistral.ai/news/mistral-nemo/): [#1936](https://github.com/xorbitsai/inference/pull/1936)
### 集成
- [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/):一个基于 LLM 大模型的开源 AI 知识库构建平台。提供了开箱即用的数据处理、模型调用、RAG 检索、可视化 AI 工作流编排等能力,帮助您轻松实现复杂的问答场景。
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。
@@ -72,10 +83,26 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布
| 语音识别模型 | ✅ | ❌ | ❌ | ❌ |
| 更多 OpenAI 功能 (函数调用) | ✅ | ❌ | ❌ | ❌ |
+## 使用 Xinference
-## 入门指南
+- **云**
+我们提供 [Xinference 云服务](https://inference.top),无需任何设置。
+
+- **自托管 Xinference 社区版**
+使用 [入门指南](#getting-started) 快速在你自己的环境中运行 Xinference。
+参考 [文档](https://inference.readthedocs.io/zh-cn) 以获得参考和更多说明。
+
+- **面向企业/组织的 Xinference 版本**
+我们提供额外的面向企业的功能。 [通过企业微信联系](https://xorbits.cn/assets/images/wechat_work_qr.png)
+或 [提交表单](https://w8v6grm432.feishu.cn/share/base/form/shrcn9u1EBXQxmGMqILEjguuGoh) 讨论企业需求。
+
+## 保持领先
-**在开始之前,请给我们一个星标,这样你就可以在 GitHub 上及时收到每个新版本的通知!**
+在 GitHub 上给 Xinference Star,并立即收到新版本的通知。
+
+
+
+## 入门指南
* [文档](https://inference.readthedocs.io/zh-cn/latest/index.html)
* [内置模型](https://inference.readthedocs.io/zh-cn/latest/models/builtin/index.html)
@@ -141,4 +168,8 @@ $ xinference-local
-
\ No newline at end of file
+
+
+## Star 历史
+
+[](https://star-history.com/#xorbitsai/inference&Date)
\ No newline at end of file
diff --git a/assets/stay_ahead.gif b/assets/stay_ahead.gif
new file mode 100644
index 0000000000..fe148b6417
Binary files /dev/null and b/assets/stay_ahead.gif differ
diff --git a/benchmark/README.md b/benchmark/README.md
index 4c0ffc2ebd..a24fd292f9 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -38,3 +38,10 @@ python benchmark/benchmark_long.py --context-length ${context_length} --tokenize
--model-uid ${model_uid} \
--num-prompts 32 -c 16
```
+
+## Common Options for Benchmarking Tools
+- `--stream`: Enable streaming responses. This is useful for real-time processing, because data is received incrementally as it is generated instead of waiting for the entire response to complete.
+
+- `--print-error`: Print a detailed error message for each request that fails, which is helpful for troubleshooting.
+
+These options are available in all benchmarking tools provided in this suite.
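+
+For example, either flag can simply be appended to one of the commands above. Below is a hypothetical `benchmark_long.py` run (other required arguments, such as the tokenizer path, are omitted here; see `--help` for the full list):
+
+```
+python benchmark/benchmark_long.py \
+    --model-uid ${model_uid} \
+    --num-prompts 32 -c 16 \
+    --stream \
+    --print-error
+```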
diff --git a/benchmark/benchmark_latency.py b/benchmark/benchmark_latency.py
index 3ae8125436..ac109ebb48 100644
--- a/benchmark/benchmark_latency.py
+++ b/benchmark/benchmark_latency.py
@@ -59,6 +59,7 @@ def main(args: argparse.Namespace):
input_requests,
args.stream,
args.api_key,
+ args.print_error,
)
asyncio.run(benchmark.run())
@@ -96,6 +97,10 @@ def main(args: argparse.Namespace):
default=None,
help="Authorization api key",
)
-
+ parser.add_argument(
+ "--print-error",
+ action="store_true",
+        help="Print detailed error messages if any errors are encountered."
+ )
args = parser.parse_args()
main(args)
diff --git a/benchmark/benchmark_long.py b/benchmark/benchmark_long.py
index 75a0d43530..d19e142850 100644
--- a/benchmark/benchmark_long.py
+++ b/benchmark/benchmark_long.py
@@ -79,6 +79,7 @@ def main(args: argparse.Namespace):
args.stream,
concurrency=args.concurrency,
api_key=args.api_key,
+ print_error=args.print_error,
)
asyncio.run(benchmark.run())
@@ -120,5 +121,10 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--stream", action="store_true", help="Enable streaming responses."
)
+ parser.add_argument(
+ "--print-error",
+ action="store_true",
+        help="Print detailed error messages if any errors are encountered."
+ )
args = parser.parse_args()
main(args)
diff --git a/benchmark/benchmark_rerank.py b/benchmark/benchmark_rerank.py
index 09e87d8758..765d5cf4c6 100644
--- a/benchmark/benchmark_rerank.py
+++ b/benchmark/benchmark_rerank.py
@@ -38,6 +38,7 @@ def __init__(
top_n: int,
concurrency: int,
api_key: Optional[str] = None,
+ print_error: bool = False,
):
super().__init__(
api_url,
@@ -46,6 +47,7 @@ def __init__(
stream,
concurrency,
api_key,
+ print_error,
)
self.top_n = top_n
@@ -127,6 +129,7 @@ def main(args: argparse.Namespace):
top_n=args.top_n,
concurrency=args.concurrency,
api_key=args.api_key,
+ print_error=args.print_error,
)
asyncio.run(benchmark.run())
@@ -182,5 +185,10 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--api-key", type=str, default=None, help="Authorization api key",
)
+ parser.add_argument(
+ "--print-error",
+ action="store_true",
+        help="Print detailed error messages if any errors are encountered."
+ )
args = parser.parse_args()
main(args)
diff --git a/benchmark/benchmark_runner.py b/benchmark/benchmark_runner.py
index 78bcff0ecc..dd4a6fb143 100644
--- a/benchmark/benchmark_runner.py
+++ b/benchmark/benchmark_runner.py
@@ -54,7 +54,8 @@ def __init__(
model_uid: str,
input_requests: List[Tuple[str, int, int]],
stream: bool,
- api_key: Optional[str]=None,
+ api_key: Optional[str] = None,
+ print_error: bool = False,
):
self.api_url = api_url
self.model_uid = model_uid
@@ -63,6 +64,7 @@ def __init__(
self.benchmark_time = None
self.stream = stream
self.api_key = api_key
+ self.print_error = print_error
async def run(self):
await self.warm_up()
@@ -361,6 +363,17 @@ def print_stats(self):
print(f"Total time: {total_time:.2f} s")
print(f"Throughput: {len(self.outputs) / total_time:.2f} requests/s")
+ if completed < len(self.input_requests):
+ if self.print_error:
+ logger.info("Errors encountered during benchmark:")
+ for output in self.outputs:
+ if not output.success:
+ print(f"Error for prompt with length {output.prompt_len}: {output.error}")
+ else:
+ logger.info(
+ "Errors were encountered during the benchmark. Run with --print-error to see detailed error messages."
+ )
+
class ConcurrentBenchmarkRunner(BenchmarkRunner):
def __init__(
@@ -370,9 +383,17 @@ def __init__(
input_requests: List[Tuple[str, int, int]],
stream: bool,
concurrency: int,
- api_key: Optional[str]=None,
+ api_key: Optional[str] = None,
+ print_error: bool = False,
):
- super().__init__(api_url, model_uid, input_requests, stream, api_key)
+ super().__init__(
+ api_url,
+ model_uid,
+ input_requests,
+ stream,
+ api_key,
+ print_error,
+ )
self.concurrency = concurrency
self.left = len(input_requests)
diff --git a/benchmark/benchmark_serving.py b/benchmark/benchmark_serving.py
index 105cce0976..cc56750f27 100644
--- a/benchmark/benchmark_serving.py
+++ b/benchmark/benchmark_serving.py
@@ -38,6 +38,7 @@ def __init__(
concurrency: int,
request_rate: float,
api_key: Optional[str] = None,
+ print_error: bool = False,
):
super().__init__(
api_url,
@@ -46,6 +47,7 @@ def __init__(
stream,
concurrency,
api_key,
+ print_error,
)
self.request_rate = request_rate
self.queue = None # delay the creation of the queue
@@ -118,6 +120,7 @@ def main(args: argparse.Namespace):
request_rate=args.request_rate,
concurrency=args.concurrency,
api_key=args.api_key,
+ print_error=args.print_error,
)
asyncio.run(benchmark.run())
@@ -174,5 +177,10 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--stream", action="store_true", help="Enable streaming responses."
)
+ parser.add_argument(
+ "--print-error",
+ action="store_true",
+        help="Print detailed error messages if any errors are encountered."
+ )
args = parser.parse_args()
main(args)
diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst
index 9dd563a0fc..bbb6e89e57 100644
--- a/doc/source/getting_started/installation.rst
+++ b/doc/source/getting_started/installation.rst
@@ -44,8 +44,10 @@ Currently, supported models include:
- ``codestral-v0.1``
- ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k``
- ``code-llama``, ``code-llama-python``, ``code-llama-instruct``
-- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``
+- ``deepseek``, ``deepseek-coder``, ``deepseek-chat``, ``deepseek-coder-instruct``, ``deepseek-v2-chat``, ``deepseek-v2-chat-0628``, ``deepseek-v2.5``
+- ``yi-coder``, ``yi-coder-chat``
- ``codeqwen1.5``, ``codeqwen1.5-chat``
+- ``qwen2.5``, ``qwen2.5-coder``, ``qwen2.5-instruct``, ``qwen2.5-coder-instruct``
- ``baichuan-2-chat``
- ``internlm2-chat``
- ``internlm2.5-chat``, ``internlm2.5-chat-1m``
@@ -102,7 +104,7 @@ SGLang has a high-performance inference runtime with RadixAttention. It signific
Initial setup::
- pip install 'xinference[sglang]'
+ pip install "xinference[sglang]"
# For CUDA 12.4 & torch 2.4 to support sliding window attention for gemma 2 and llama 3.1 style rope
pip install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4
@@ -115,7 +117,7 @@ MLX-lm is designed for Apple silicon users to run LLM efficiently.
Initial setup::
- pip install 'xinference[mlx]'
+ pip install "xinference[mlx]"
Other Platforms
~~~~~~~~~~~~~~~
diff --git a/doc/source/getting_started/installation_npu.rst b/doc/source/getting_started/installation_npu.rst
index 8202661487..786ffdba34 100644
--- a/doc/source/getting_started/installation_npu.rst
+++ b/doc/source/getting_started/installation_npu.rst
@@ -6,6 +6,13 @@ Installation Guide for Ascend NPU
=================================
Xinference can run on Ascend NPU, follow below instructions to install.
+.. warning::
+
+ The open-source version relies on Transformers for inference,
+ which can be slow on chips like 310p3. We provide an enterprise version that supports the MindIE engine,
+ offering better performance and compatibility for Ascend NPU.
+ Refer to `Xinference Enterprise `_
+
Installing PyTorch and Ascend extension for PyTorch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/getting_started/using_xinference.rst b/doc/source/getting_started/using_xinference.rst
index b8cc47458a..af8071b3b6 100644
--- a/doc/source/getting_started/using_xinference.rst
+++ b/doc/source/getting_started/using_xinference.rst
@@ -243,11 +243,11 @@ or via Xinference's python client:
from xinference.client import RESTfulClient
client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("my-llama-2")
- print(model.chat(
- prompt="What is the largest animal?",
- system_prompt="You are a helpful assistant.",
- chat_history=[]
- ))
+ model.chat(
+ messages=[
+ {"role": "user", "content": "Who won the world series in 2020?"}
+ ]
+ )
.. code-tab:: json output
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 926cb8dca0..270f71e565 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -35,14 +35,13 @@ Developing Real-world AI Applications with Xinference
# Chat to LLM
model.chat(
- prompt="What is the largest animal?",
- system_prompt="You are a helpful assistant",
+ messages=[{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "What is the largest animal?"}],
generate_config={"max_tokens": 1024}
)
# Chat to VL model
model.chat(
- chat_history=[
+ messages=[
{
"role": "user",
"content": [
diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/getting_started/installation_npu.po b/doc/source/locale/zh_CN/LC_MESSAGES/getting_started/installation_npu.po
index 85657a774b..4a27241727 100644
--- a/doc/source/locale/zh_CN/LC_MESSAGES/getting_started/installation_npu.po
+++ b/doc/source/locale/zh_CN/LC_MESSAGES/getting_started/installation_npu.po
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2024-07-30 17:00+0800\n"
+"POT-Creation-Date: 2024-10-25 15:13+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language: zh_CN\n"
@@ -28,52 +28,65 @@ msgid "Xinference can run on Ascend NPU, follow below instructions to install."
msgstr "Xinference 能在昇腾 NPU 上运行,使用如下命令安装。"
#: ../../source/getting_started/installation_npu.rst:11
+msgid ""
+"The open-source version relies on Transformers for inference, which can "
+"be slow on chips like 310p3. We provide an enterprise version that "
+"supports the MindIE engine, offering better performance and compatibility"
+" for Ascend NPU. Refer to `Xinference Enterprise "
+"`_"
+msgstr ""
+"开源版本依赖 Transformers 进行推理,在 310p3 等芯片上会存在运行慢的问题。"
+"我们提供了支持 MindIE 引擎,性能更为强大,兼容性更好的企业版本来支持 "
+"Ascend NPU。详细参考 `Xinference 企业版 `_"
+
+#: ../../source/getting_started/installation_npu.rst:18
msgid "Installing PyTorch and Ascend extension for PyTorch"
msgstr "安装 PyTorch 和昇腾扩展"
-#: ../../source/getting_started/installation_npu.rst:12
+#: ../../source/getting_started/installation_npu.rst:19
msgid "Install PyTorch CPU version and corresponding Ascend extension."
msgstr "安装 PyTorch CPU 版本和相应的昇腾扩展。"
-#: ../../source/getting_started/installation_npu.rst:14
+#: ../../source/getting_started/installation_npu.rst:21
msgid "Take PyTorch v2.1.0 as example."
msgstr "以 PyTorch v2.1.0 为例。"
-#: ../../source/getting_started/installation_npu.rst:20
+#: ../../source/getting_started/installation_npu.rst:27
msgid ""
"Then install `Ascend extension for PyTorch "
"`_."
-msgstr ""
-"接着安装 `昇腾 PyTorch 扩展 "
-"`_."
+msgstr "接着安装 `昇腾 PyTorch 扩展 `_."
-#: ../../source/getting_started/installation_npu.rst:28
+#: ../../source/getting_started/installation_npu.rst:35
msgid "Running below command to see if it correctly prints the Ascend NPU count."
msgstr "运行如下命令查看,如果正常运行,会打印昇腾 NPU 的个数。"
-#: ../../source/getting_started/installation_npu.rst:35
+#: ../../source/getting_started/installation_npu.rst:42
msgid "Installing Xinference"
msgstr "安装 Xinference"
-#: ../../source/getting_started/installation_npu.rst:41
+#: ../../source/getting_started/installation_npu.rst:48
msgid ""
"Now you can use xinference according to :ref:`doc `. "
"``Transformers`` backend is the only available engine supported for "
"Ascend NPU for open source version."
msgstr ""
-"现在你可以参考 :ref:`文档 ` 来使用 Xinference。"
-"``Transformers`` 是开源唯一支持的昇腾 NPU 的引擎。"
+"现在你可以参考 :ref:`文档 ` 来使用 Xinference。``"
+"Transformers`` 是开源唯一支持的昇腾 NPU 的引擎。"
-#: ../../source/getting_started/installation_npu.rst:45
+#: ../../source/getting_started/installation_npu.rst:52
msgid "Enterprise Support"
msgstr "企业支持"
-#: ../../source/getting_started/installation_npu.rst:46
+#: ../../source/getting_started/installation_npu.rst:53
msgid ""
"If you encounter any performance or other issues for Ascend NPU, please "
"reach out to us via `link `_."
msgstr ""
-"如果你在昇腾 NPU 遇到任何性能和其他问题,欢迎垂询 Xinference 企业版,"
-"在 `这里 `_ 可以找到我们,亦可以 "
-"`填写表单 `_ 申请企业版试用。"
+"如果你在昇腾 NPU 遇到任何性能和其他问题,欢迎垂询 Xinference 企业版,在 `"
+"这里 `_ 可以找到我们,亦可以 `填写表单 <"
+"https://w8v6grm432.feishu.cn/share/base/form/shrcn9u1EBXQxmGMqILEjguuGoh>"
+"`_ 申请企业版试用。"
diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/models/custom.po b/doc/source/locale/zh_CN/LC_MESSAGES/models/custom.po
index 03a7e356cd..878e084003 100644
--- a/doc/source/locale/zh_CN/LC_MESSAGES/models/custom.po
+++ b/doc/source/locale/zh_CN/LC_MESSAGES/models/custom.po
@@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2024-08-15 11:39+0800\n"
+"POT-Creation-Date: 2024-09-05 13:08+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language: zh_CN\n"
@@ -16,7 +16,7 @@ msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.11.0\n"
+"Generated-By: Babel 2.16.0\n"
#: ../../source/models/custom.rst:5
msgid "Custom Models"
@@ -70,8 +70,8 @@ msgstr "定义自定义大语言模型"
msgid "Define a custom LLM model based on the following template:"
msgstr "基于以下模板定义一个自定义大语言模型:"
-#: ../../source/models/custom.rst:96 ../../source/models/custom.rst:225
-#: ../../source/models/custom.rst:248
+#: ../../source/models/custom.rst:95 ../../source/models/custom.rst:127
+#: ../../source/models/custom.rst:150
msgid ""
"model_name: A string defining the name of the model. The name must start "
"with a letter or a digit and can only contain letters, digits, "
@@ -80,7 +80,7 @@ msgstr ""
"model_name: 模型名称。名称必须以字母或数字开头,且只能包含字母、数字、"
"下划线或短划线。"
-#: ../../source/models/custom.rst:97
+#: ../../source/models/custom.rst:96
msgid ""
"context_length: context_length: An optional integer that specifies the "
"maximum context size the model was trained to accommodate, encompassing "
@@ -90,7 +90,7 @@ msgstr ""
"context_length: 一个可选的整数,模型支持的最大上下文长度,包括输入和输出"
"长度。如果未定义,默认值为2048个token(约1,500个词)。"
-#: ../../source/models/custom.rst:98
+#: ../../source/models/custom.rst:97
msgid ""
"model_lang: A list of strings representing the supported languages for "
"the model. Example: [\"en\"], which means that the model supports "
@@ -99,7 +99,7 @@ msgstr ""
"model_lang: 一个字符串列表,表示模型支持的语言。例如:['en'],表示该模型"
"支持英语。"
-#: ../../source/models/custom.rst:99
+#: ../../source/models/custom.rst:98
msgid ""
"model_ability: A list of strings defining the abilities of the model. It "
"could include options like \"embed\", \"generate\", and \"chat\". In this"
@@ -108,40 +108,32 @@ msgstr ""
"model_ability: 一个字符串列表,定义模型的能力。它可以包括像 'embed'、'"
"generate' 和 'chat' 这样的选项。示例表示模型具有 'generate' 的能力。"
-#: ../../source/models/custom.rst:100
+#: ../../source/models/custom.rst:99
msgid ""
"model_family: A required string representing the family of the model you "
-"want to register. The optional values are the model names of all :ref"
-":`built-in models `. If the model family you register "
-"is not among the built-in models in Xinference, please fill in ``other``."
-" Note that you should choose the model family based on the ability of the"
-" model you want to register. For example, if you want to register the "
-"``llama-2`` model, do not fill in ``llama-2-chat`` as the model family."
+"want to register. This parameter must not conflict with any builtin model"
+" names."
msgstr ""
-"model_family: 必需字段,表示你要注册的模型的家族(类别)。可选值来自于 "
-"Xinference :ref:`所有内置模型的模型名 `。如果你要注册的"
-"模型不在其中,填入 ``other`` 。注意,此字段的值必须根据模型能力填入。例如"
-",如果你注册的是自定义 ``llama-2`` 模型,千万不要填入 ``llama-2-chat`` 。"
-#: ../../source/models/custom.rst:106
+#: ../../source/models/custom.rst:100
msgid ""
"model_specs: An array of objects defining the specifications of the "
"model. These include:"
msgstr "model_specs: 一个包含定义模型规格的对象数组。这些规格包括:"
-#: ../../source/models/custom.rst:102
+#: ../../source/models/custom.rst:101
msgid ""
"model_format: A string that defines the model format, like \"pytorch\" or"
" \"ggufv2\"."
msgstr "model_format: 一个定义模型格式的字符串,可以是 'pytorch' 或 'ggufv2'。"
-#: ../../source/models/custom.rst:103
+#: ../../source/models/custom.rst:102
msgid ""
"model_size_in_billions: An integer defining the size of the model in "
"billions of parameters."
msgstr "model_size_in_billions: 一个整数,定义模型的参数量,以十亿为单位。"
-#: ../../source/models/custom.rst:104
+#: ../../source/models/custom.rst:103
msgid ""
"quantizations: A list of strings defining the available quantizations for"
" the model. For PyTorch models, it could be \"4-bit\", \"8-bit\", or "
@@ -152,7 +144,7 @@ msgstr ""
"可以是 \"4-bit\"、\"8-bit\" 或 \"none\"。对于 ggufv2 模型,量化方式应与 `"
"`model_file_name_template`` 中的值对应。"
-#: ../../source/models/custom.rst:105
+#: ../../source/models/custom.rst:104
msgid ""
"model_id: A string representing the model ID, possibly referring to an "
"identifier used by Hugging Face. **If model_uri is missing, Xinference "
@@ -163,7 +155,7 @@ msgstr ""
"如果 model_uri 字段缺失,Xinference 将尝试从此id指示的HuggingFace仓库下载"
"该模型。"
-#: ../../source/models/custom.rst:106
+#: ../../source/models/custom.rst:105
msgid ""
"model_uri: A string representing the URI where the model can be loaded "
"from, such as \"file:///path/to/llama-2-7b\". **When the model format is "
@@ -173,11 +165,11 @@ msgid ""
"the model from Hugging Face with the model ID."
msgstr ""
"model_uri:表示模型文件位置的字符串,例如本地目录:\"file:///path/to/"
-"llama-2-7b\"。当 model_format 是 ggufv2 ,此字段必须是具体的"
-"模型文件路径。而当 model_format 是 pytorch 时,此字段必须是一个包含所有"
-"模型文件的目录。"
+"llama-2-7b\"。当 model_format 是 ggufv2 ,此字段必须是具体的模型文件路径"
+"。而当 model_format 是 pytorch 时,此字段必须是一个包含所有模型文件的目录"
+"。"
-#: ../../source/models/custom.rst:107
+#: ../../source/models/custom.rst:106
msgid ""
"model_file_name_template: Required by gguf models. An f-string template "
"used for defining the model file name based on the quantization. **Note "
@@ -187,73 +179,57 @@ msgstr ""
"model_file_name_template: gguf 模型所需。一个 f-string 模板,用于根据量化"
"定义模型文件名。注意,这里不要填入文件的路径。"
-#: ../../source/models/custom.rst:108
+#: ../../source/models/custom.rst:107
msgid ""
-"prompt_style: If the ``model_family`` field is not ``other``, this field "
-"does not need to be filled in. ``prompt_style`` is an optional field that"
-" could be required by ``chat`` models to define the style of prompts. The"
-" given example has this set to None, but additional details could be "
-"found in a referenced file xinference/model/llm/tests/test_utils.py. You "
-"can also specify this field as a string, which will use the builtin "
-"prompt style in Xinference. For example:"
-msgstr ""
-"prompt_style: 如果上述 ``model_family`` 字段不是 ``other`` ,则无需设置"
-"此字段。 ``prompt_style`` 是一个可选字段,表示 ``chat`` 模型需要的提示词"
-"样式。给定的示例将其设置为 None,但可以在引用的文件 xinference/model/llm/"
-"tests/test_utils.py 中找到更多详细信息。你也可以指定一个字符串,以使用"
-"内置模型的提示词样式。"
-
-#: ../../source/models/custom.rst:117
-msgid "Xinference supports these builtin prompt styles in common usage:"
-msgstr "Xinference 支持这些内置、常用的提示词样式:"
-
-#: ../../source/models/custom.rst:121
-msgid "baichuan-chat"
-msgstr ""
-
-#: ../../source/models/custom.rst:140
-msgid "chatglm3"
-msgstr ""
-
-#: ../../source/models/custom.rst:153
-msgid "qwen-chat"
-msgstr ""
-
-#: ../../source/models/custom.rst:170
-msgid "llama-2-chat"
+"chat_template: If ``model_ability`` includes ``chat`` , you must "
+"configure this option to generate the correct full prompt during chat. "
+"This is a Jinja template string. Usually, you can find it in the "
+"``tokenizer_config.json`` file within the model directory."
msgstr ""
+"chat_template:如果 ``model_ability`` 中包含 ``chat`` ,那么此选项必须配置以生成合适的完整提示词。这是一个 Jinja 模版字符串。"
+"通常,你可以在模型目录的 ``tokenizer_config.json`` 文件中找到。"
-#: ../../source/models/custom.rst:191
-msgid "vicuna-v1.5"
+#: ../../source/models/custom.rst:108
+msgid ""
+"stop_token_ids: If ``model_ability`` includes ``chat`` , you can "
+"configure this option to control when the model stops during chat. This "
+"is a list of integers, and you can typically extract the corresponding "
+"values from the ``generation_config.json`` or ``tokenizer_config.json`` "
+"file in the model directory."
msgstr ""
+"stop_token_ids:如果 ``model_ability`` 中包含 ``chat`` ,那么推荐配置此选项以合理控制对话的停止。这是一个包含整数的列表,你可以"
+"在模型目录的 ``generation_config.json`` 和 ``tokenizer_config.json`` 文件中提取相应的值。"
-#: ../../source/models/custom.rst:206
+#: ../../source/models/custom.rst:109
msgid ""
-"The above lists some commonly used built-in prompt styles. The full list "
-"of supported prompt styles can be found on the Xinference web UI."
+"stop: If ``model_ability`` includes ``chat`` , you can configure this "
+"option to control when the model stops during chat. This is a list of "
+"strings, and you can typically extract the corresponding values from the "
+"``generation_config.json`` or ``tokenizer_config.json`` file in the model"
+" directory."
msgstr ""
-"以上列举出了最常使用的提示词样式。完整的支持列表可以通过 Xinference 页面"
-"的 register model 面板查看。"
+"stop:如果 ``model_ability`` 中包含 ``chat`` ,那么推荐配置此选项以合理控制对话的停止。这是一个包含字符串的列表,"
+"你可以在模型目录的 ``tokenizer_config.json`` 文件中找到 token 值对应的字符串。"
-#: ../../source/models/custom.rst:210
+#: ../../source/models/custom.rst:112
msgid "Define a custom embedding model"
msgstr "定义自定义 embedding 模型"
-#: ../../source/models/custom.rst:212
+#: ../../source/models/custom.rst:114
msgid "Define a custom embedding model based on the following template:"
msgstr "基于以下模板定义一个自定义 embedding 模型:"
-#: ../../source/models/custom.rst:226
+#: ../../source/models/custom.rst:128
msgid "dimensions: A integer that specifies the embedding dimensions."
msgstr "dimensions: 表示 embedding 维度的整型值。"
-#: ../../source/models/custom.rst:227
+#: ../../source/models/custom.rst:129
msgid ""
"max_tokens: A integer that represents the max sequence length that the "
"embedding model supports."
msgstr "max_tokens: 表示 embedding 模型支持的最大输入序列长度的整型值。"
-#: ../../source/models/custom.rst:228 ../../source/models/custom.rst:250
+#: ../../source/models/custom.rst:130 ../../source/models/custom.rst:152
msgid ""
"language: A list of strings representing the supported languages for the "
"model. Example: [\"en\"], which means that the model supports English."
@@ -261,7 +237,7 @@ msgstr ""
"model_lang: 一个字符串列表,表示模型支持的语言。例如:['en'],表示该模型"
"支持英语。"
-#: ../../source/models/custom.rst:229 ../../source/models/custom.rst:251
+#: ../../source/models/custom.rst:131 ../../source/models/custom.rst:153
msgid ""
"model_id: A string representing the model ID, possibly referring to an "
"identifier used by Hugging Face."
@@ -269,7 +245,7 @@ msgstr ""
"model_id: 一个表示模型标识的字符串,类似 HuggingFace 或 ModelScope 使用的"
"标识符。"
-#: ../../source/models/custom.rst:230 ../../source/models/custom.rst:252
+#: ../../source/models/custom.rst:132 ../../source/models/custom.rst:154
msgid ""
"model_uri: A string representing the URI where the model can be loaded "
"from, such as \"file:///path/to/your_model\". If model URI is absent, "
@@ -280,15 +256,15 @@ msgstr ""
"如果模型 URI 不存在,Xinference 将尝试使用 model_id 从 HuggingFace 或 "
"ModelScope 下载模型。"
-#: ../../source/models/custom.rst:234
+#: ../../source/models/custom.rst:136
msgid "Define a custom Rerank model"
msgstr "定义自定义 rerank 模型"
-#: ../../source/models/custom.rst:236
+#: ../../source/models/custom.rst:138
msgid "Define a custom rerank model based on the following template:"
msgstr "基于以下模板定义一个自定义大语言模型:"
-#: ../../source/models/custom.rst:249
+#: ../../source/models/custom.rst:151
msgid ""
"type: A string defining the type of the model, including ``normal``, "
"``LLM-based`` and ``LLM-based layerwise``."
@@ -296,20 +272,20 @@ msgstr ""
"type: 表示模型的类型,可选值包括 ``normal``、``LLM-based`` 和 ``LLM-based"
" layerwise``。"
-#: ../../source/models/custom.rst:256
+#: ../../source/models/custom.rst:158
msgid "Register a Custom Model"
msgstr "注册一个自定义模型"
-#: ../../source/models/custom.rst:258
+#: ../../source/models/custom.rst:160
msgid "Register a custom model programmatically:"
msgstr "以代码的方式注册自定义模型"
-#: ../../source/models/custom.rst:273 ../../source/models/custom.rst:291
-#: ../../source/models/custom.rst:306 ../../source/models/custom.rst:361
+#: ../../source/models/custom.rst:175 ../../source/models/custom.rst:193
+#: ../../source/models/custom.rst:208 ../../source/models/custom.rst:263
msgid "Or via CLI:"
msgstr "以命令行的方式"
-#: ../../source/models/custom.rst:279
+#: ../../source/models/custom.rst:181
msgid ""
"Note that replace the ```` above with ``LLM``, ``embedding`` "
"or ``rerank``. The same as below."
@@ -317,43 +293,43 @@ msgstr ""
"注意将以下部分的 ```` 替换为 ``LLM``、``embedding`` 或 ``"
"rerank`` 。"
-#: ../../source/models/custom.rst:283
+#: ../../source/models/custom.rst:185
msgid "List the Built-in and Custom Models"
msgstr "列举内置和自定义模型"
-#: ../../source/models/custom.rst:285
+#: ../../source/models/custom.rst:187
msgid "List built-in and custom models programmatically:"
msgstr "以代码的方式列举内置和自定义模型"
-#: ../../source/models/custom.rst:298
+#: ../../source/models/custom.rst:200
msgid "Launch the Custom Model"
msgstr "启动自定义模型"
-#: ../../source/models/custom.rst:300
+#: ../../source/models/custom.rst:202
msgid "Launch the custom model programmatically:"
msgstr "以代码的方式启动自定义模型"
-#: ../../source/models/custom.rst:313
+#: ../../source/models/custom.rst:215
msgid "Interact with the Custom Model"
msgstr "使用自定义模型"
-#: ../../source/models/custom.rst:315
+#: ../../source/models/custom.rst:217
msgid "Invoke the model programmatically:"
msgstr "以代码的方式调用模型"
-#: ../../source/models/custom.rst:322
+#: ../../source/models/custom.rst:224
msgid "Result:"
msgstr "结果为:"
-#: ../../source/models/custom.rst:346
+#: ../../source/models/custom.rst:248
msgid "Or via CLI, replace ``${UID}`` with real model UID:"
msgstr "或者以命令行的方式,用实际的模型 UID 替换 ``${UID}``:"
-#: ../../source/models/custom.rst:353
+#: ../../source/models/custom.rst:255
msgid "Unregister the Custom Model"
msgstr "注销自定义模型"
-#: ../../source/models/custom.rst:355
+#: ../../source/models/custom.rst:257
msgid "Unregister the custom model programmatically:"
msgstr "以代码的方式注销自定义模型"
diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po b/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po
index 66b4935516..e73ba213b0 100644
--- a/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po
+++ b/doc/source/locale/zh_CN/LC_MESSAGES/models/model_abilities/image.po
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2024-08-09 19:13+0800\n"
+"POT-Creation-Date: 2024-10-30 07:49+0000\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language: zh_CN\n"
@@ -17,7 +17,7 @@ msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.14.0\n"
+"Generated-By: Babel 2.16.0\n"
#: ../../source/models/model_abilities/image.rst:5
msgid "Images"
@@ -143,17 +143,20 @@ msgid ""
" move a model component onto the GPU when it needs to be executed, while "
"keeping the remaining components on the CPU."
msgstr ""
-"``--cpu_offload True``:指定 ``True`` 会在推理过程中将模型的组件卸载到 CPU 上以节省内存,"
-"这会导致推理延迟略有增加。模型卸载仅会在需要执行时将模型组件移动到 GPU 上,同时保持其余组件在 CPU 上"
+"``--cpu_offload True``:指定 ``True`` 会在推理过程中将模型的组件卸载到 "
+"CPU 上以节省内存,这会导致推理延迟略有增加。模型卸载仅会在需要执行时将"
+"模型组件移动到 GPU 上,同时保持其余组件在 CPU 上"
#: ../../source/models/model_abilities/image.rst:117
msgid ""
"``--quantize_text_encoder ``: We leveraged the "
"``bitsandbytes`` library to load and quantize the T5-XXL text encoder to "
-"8-bit precision. This allows you to keep using all text encoders "
-"while only slightly impacting performance."
-msgstr "``--quantize_text_encoder ``:我们利用 ``bitsandbytes`` 库"
-"加载并量化 T5-XXL 文本编码器至8位精度。这使得你能够在仅轻微影响性能的情况下继续使用全部文本编码器。"
+"8-bit precision. This allows you to keep using all text encoders while "
+"only slightly impacting performance."
+msgstr ""
+"``--quantize_text_encoder ``:我们利用 ``bitsandbytes"
+"`` 库加载并量化 T5-XXL 文本编码器至8位精度。这使得你能够在仅轻微影响性能"
+"的情况下继续使用全部文本编码器。"
#: ../../source/models/model_abilities/image.rst:120
msgid ""
@@ -161,16 +164,18 @@ msgid ""
"4.7B parameter T5-XXL text encoder during inference can significantly "
"decrease the memory requirements with only a slight loss in performance."
msgstr ""
-"``--text_encoder_3 None``,对于 sd3-medium,"
-"移除在推理过程中内存密集型的47亿参数T5-XXL文本编码器可以显著降低内存需求,而仅造成性能上的轻微损失。"
+"``--text_encoder_3 None``,对于 sd3-medium,移除在推理过程中内存密集型的"
+"47亿参数T5-XXL文本编码器可以显著降低内存需求,而仅造成性能上的轻微损失。"
#: ../../source/models/model_abilities/image.rst:124
msgid ""
"If you are trying to run large image models liek sd3-medium or FLUX.1 "
"series on GPU card that has less memory than 24GB, you may encounter OOM "
"when launching or inference. Try below solutions."
-msgstr "如果你试图在显存小于24GB的GPU上运行像sd3-medium或FLUX.1系列这样的大型图像模型,"
-"你在启动或推理过程中可能会遇到显存溢出(OOM)的问题。尝试以下解决方案。"
+msgstr ""
+"如果你试图在显存小于24GB的GPU上运行像sd3-medium或FLUX.1系列这样的大型图像"
+"模型,你在启动或推理过程中可能会遇到显存溢出(OOM)的问题。尝试以下"
+"解决方案。"
#: ../../source/models/model_abilities/image.rst:128
msgid "For FLUX.1 series, try to apply quantization."
@@ -200,4 +205,15 @@ msgstr ""
msgid "Learn from a Stable Diffusion ControlNet example"
msgstr "学习一个 Stable Diffusion 控制网络的示例"
+#: ../../source/models/model_abilities/image.rst:160
+msgid "OCR"
+msgstr ""
+
+#: ../../source/models/model_abilities/image.rst:162
+msgid "The OCR API accepts image bytes and returns the OCR text."
+msgstr "OCR API 接受图像字节并返回 OCR 文本。"
+
+#: ../../source/models/model_abilities/image.rst:164
+msgid "We can try OCR API out either via cURL, or Xinference's python client:"
+msgstr "可以通过 cURL 或 Xinference 的 Python 客户端来尝试 OCR API。"
diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/continuous_batching.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/continuous_batching.po
index 427e855a09..4505a7fa2a 100644
--- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/continuous_batching.po
+++ b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/continuous_batching.po
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: Xinference \n"
"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2024-07-04 16:08+0800\n"
+"POT-Creation-Date: 2024-10-17 18:49+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME \n"
"Language-Team: LANGUAGE \n"
@@ -18,8 +18,8 @@ msgstr ""
"Generated-By: Babel 2.11.0\n"
#: ../../source/user_guide/continuous_batching.rst:5
-msgid "Continuous Batching (experimental)"
-msgstr "连续批处理(实验性质)"
+msgid "Continuous Batching"
+msgstr "连续批处理"
#: ../../source/user_guide/continuous_batching.rst:7
msgid ""
@@ -35,11 +35,15 @@ msgstr ""
msgid "Usage"
msgstr "使用方式"
-#: ../../source/user_guide/continuous_batching.rst:12
+#: ../../source/user_guide/continuous_batching.rst:14
+msgid "LLM"
+msgstr "大语言模型"
+
+#: ../../source/user_guide/continuous_batching.rst:15
msgid "Currently, this feature can be enabled under the following conditions:"
msgstr "当前,此功能在满足以下条件时开启:"
-#: ../../source/user_guide/continuous_batching.rst:14
+#: ../../source/user_guide/continuous_batching.rst:17
msgid ""
"First, set the environment variable "
"``XINFERENCE_TRANSFORMERS_ENABLE_BATCHING`` to ``1`` when starting "
@@ -48,13 +52,22 @@ msgstr ""
"首先,启动 Xinference 时需要将环境变量 ``XINFERENCE_TRANSFORMERS_ENABLE_"
"BATCHING`` 置为 ``1`` 。"
-#: ../../source/user_guide/continuous_batching.rst:21
+#: ../../source/user_guide/continuous_batching.rst:25
+msgid ""
+"Since ``v0.16.0``, this feature is turned on by default and is no longer "
+"required to set the ``XINFERENCE_TRANSFORMERS_ENABLE_BATCHING`` "
+"environment variable. This environment variable has been removed."
+msgstr ""
+"自 ``v0.16.0`` 开始,此功能默认开启,不再需要设置 ``XINFERENCE_TRANSFORMERS_ENABLE_BATCHING`` 环境变量,"
+"且该环境变量已被移除。"
+
+#: ../../source/user_guide/continuous_batching.rst:30
msgid ""
"Then, ensure that the ``transformers`` engine is selected when launching "
"the model. For example:"
msgstr "然后,启动 LLM 模型时选择 ``transformers`` 推理引擎。例如:"
-#: ../../source/user_guide/continuous_batching.rst:57
+#: ../../source/user_guide/continuous_batching.rst:66
msgid ""
"Once this feature is enabled, all requests for LLMs will be managed by "
"continuous batching, and the average throughput of requests made to a "
@@ -64,57 +77,92 @@ msgstr ""
"一旦此功能开启,LLM 模型的所有接口将被此功能接管。所有接口的使用方式没有"
"任何变化。"
-#: ../../source/user_guide/continuous_batching.rst:63
+#: ../../source/user_guide/continuous_batching.rst:71
+msgid "Image Model"
+msgstr "图像模型"
+
+#: ../../source/user_guide/continuous_batching.rst:72
+msgid ""
+"Currently, for image models, only the ``text_to_image`` interface is "
+"supported for ``FLUX.1`` series models."
+msgstr ""
+"当前只有 ``FLUX.1`` 系列模型的 ``text_to_image`` (文生图)接口支持此功能。"
+
+#: ../../source/user_guide/continuous_batching.rst:74
+msgid ""
+"Enabling this feature requires setting the environment variable "
+"``XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE``, which indicates the ``size`` "
+"of the generated images."
+msgstr ""
+"图像模型开启此功能需要在启动 xinference 时指定 ``XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE`` 环境变量,"
+"表示生成图片的大小。"
+
+#: ../../source/user_guide/continuous_batching.rst:76
+msgid "For example, starting xinference like this:"
+msgstr ""
+"例如,像这样启动 xinference:"
+
+#: ../../source/user_guide/continuous_batching.rst:83
+msgid ""
+"Then just use the ``text_to_image`` interface as before, and nothing else"
+" needs to be changed."
+msgstr ""
+"接下来正常使用 ``text_to_image`` 接口即可,其他什么都不需要改变。"
+
+#: ../../source/user_guide/continuous_batching.rst:86
msgid "Abort your request"
msgstr "中止请求"
-#: ../../source/user_guide/continuous_batching.rst:64
+#: ../../source/user_guide/continuous_batching.rst:87
msgid "In this mode, you can abort requests that are in the process of inference."
-msgstr ""
-"此功能中,你可以优雅地中止正在推理中的请求。"
+msgstr "此功能中,你可以优雅地中止正在推理中的请求。"
-#: ../../source/user_guide/continuous_batching.rst:66
+#: ../../source/user_guide/continuous_batching.rst:89
msgid "First, add ``request_id`` option in ``generate_config``. For example:"
-msgstr ""
-"首先,在推理请求的 ``generate_config`` 中指定 ``request_id`` 选项。例如:"
+msgstr "首先,在推理请求的 ``generate_config`` 中指定 ``request_id`` 选项。例如:"
-#: ../../source/user_guide/continuous_batching.rst:75
+#: ../../source/user_guide/continuous_batching.rst:98
msgid ""
"Then, abort the request using the ``request_id`` you have set. For "
"example:"
-msgstr ""
-"接着,带着你指定的 ``request_id`` 去中止该请求。例如:"
+msgstr "接着,带着你指定的 ``request_id`` 去中止该请求。例如:"
-#: ../../source/user_guide/continuous_batching.rst:83
+#: ../../source/user_guide/continuous_batching.rst:106
msgid ""
"Note that if your request has already finished, aborting the request will"
-" be a no-op."
-msgstr ""
-"注意,如果你的请求已经结束,那么此操作将什么都不做。"
+" be a no-op. Image models also support this feature."
+msgstr "注意,如果你的请求已经结束,那么此操作将什么都不做。"
-#: ../../source/user_guide/continuous_batching.rst:86
+#: ../../source/user_guide/continuous_batching.rst:110
msgid "Note"
msgstr "注意事项"
-#: ../../source/user_guide/continuous_batching.rst:88
+#: ../../source/user_guide/continuous_batching.rst:112
msgid ""
-"Currently, this feature only supports the ``generate``, ``chat`` and "
-"``vision`` tasks for ``LLM`` models. The ``tool call`` tasks are not "
-"supported."
+"Currently, for ``LLM`` models, this feature only supports the "
+"``generate``, ``chat``, ``tool call`` and ``vision`` tasks."
msgstr ""
-"当前,此功能仅支持 LLM 模型的 ``generate``, ``chat`` 和 ``vision`` (多"
-"模态) 功能。``tool call`` (工具调用)暂时不支持。"
+"当前,此功能仅支持 LLM 模型的 ``generate``, ``chat``, ``tool call`` (工具调用)和 ``vision`` (多"
+"模态) 功能。"
-#: ../../source/user_guide/continuous_batching.rst:90
+#: ../../source/user_guide/continuous_batching.rst:114
msgid ""
-"For ``vision`` tasks, currently only ``qwen-vl-chat``, ``cogvlm2``, and "
-"``glm-4v`` models are supported. More models will be supported in the "
-"future. Please let us know your requirements."
+"Currently, for ``image`` models, this feature only supports the "
+"``text_to_image`` tasks. Only ``FLUX.1`` series models are supported."
msgstr ""
-"对于多模态任务,当前支持 ``qwen-vl-chat`` ,``cogvlm2`` 和 ``glm-4v`` "
-"模型。未来将加入更多模型,敬请期待。"
+"当前,对于图像模型,仅支持 `FLUX.1`` 系列模型的 ``text_to_image`` (文生图)功能。"
-#: ../../source/user_guide/continuous_batching.rst:92
+#: ../../source/user_guide/continuous_batching.rst:116
+msgid ""
+"For ``vision`` tasks, currently only ``qwen-vl-chat``, ``cogvlm2``, "
+"``glm-4v`` and ``MiniCPM-V-2.6`` (only for image tasks) models are "
+"supported. More models will be supported in the future. Please let us "
+"know your requirements."
+msgstr ""
+"对于多模态任务,当前支持 ``qwen-vl-chat`` ,``cogvlm2``, ``glm-4v`` 和 `"
+"`MiniCPM-V-2.6`` (仅对于图像任务)模型。未来将加入更多模型,敬请期待。"
+
+#: ../../source/user_guide/continuous_batching.rst:118
msgid ""
"If using GPU inference, this method will consume more GPU memory. Please "
"be cautious when increasing the number of concurrent requests to the same"
@@ -126,17 +174,3 @@ msgstr ""
"请求量。``launch_model`` 接口提供可选参数 ``max_num_seqs`` 用于调整并发度"
",默认值为 ``16`` 。"
-#: ../../source/user_guide/continuous_batching.rst:95
-msgid ""
-"This feature is still in the experimental stage, and we welcome your "
-"active feedback on any issues."
-msgstr "此功能仍处于实验阶段,欢迎反馈任何问题。"
-
-#: ../../source/user_guide/continuous_batching.rst:97
-msgid ""
-"After a period of testing, this method will remain enabled by default, "
-"and the original inference method will be deprecated."
-msgstr ""
-"一段时间的测试之后,此功能将代替原来的 transformers 推理逻辑成为默认行为"
-"。原来的推理逻辑将被摒弃。"
-
diff --git a/doc/source/models/builtin/audio/cosyvoice-300m-instruct.rst b/doc/source/models/builtin/audio/cosyvoice-300m-instruct.rst
index 9e438f04d5..eff5788cf0 100644
--- a/doc/source/models/builtin/audio/cosyvoice-300m-instruct.rst
+++ b/doc/source/models/builtin/audio/cosyvoice-300m-instruct.rst
@@ -12,7 +12,7 @@ CosyVoice-300M-Instruct
Specifications
^^^^^^^^^^^^^^
-- **Model ID:** model-scope/CosyVoice-300M-Instruct
+- **Model ID:** FunAudioLLM/CosyVoice-300M-Instruct
Execute the following command to launch the model::
diff --git a/doc/source/models/builtin/audio/cosyvoice-300m-sft.rst b/doc/source/models/builtin/audio/cosyvoice-300m-sft.rst
index 4aa6864d31..b903b9118b 100644
--- a/doc/source/models/builtin/audio/cosyvoice-300m-sft.rst
+++ b/doc/source/models/builtin/audio/cosyvoice-300m-sft.rst
@@ -12,7 +12,7 @@ CosyVoice-300M-SFT
Specifications
^^^^^^^^^^^^^^
-- **Model ID:** model-scope/CosyVoice-300M-SFT
+- **Model ID:** FunAudioLLM/CosyVoice-300M-SFT
Execute the following command to launch the model::
diff --git a/doc/source/models/builtin/audio/cosyvoice-300m.rst b/doc/source/models/builtin/audio/cosyvoice-300m.rst
index f667546dbd..da04f444a2 100644
--- a/doc/source/models/builtin/audio/cosyvoice-300m.rst
+++ b/doc/source/models/builtin/audio/cosyvoice-300m.rst
@@ -12,7 +12,7 @@ CosyVoice-300M
Specifications
^^^^^^^^^^^^^^
-- **Model ID:** model-scope/CosyVoice-300M
+- **Model ID:** FunAudioLLM/CosyVoice-300M
Execute the following command to launch the model::
diff --git a/doc/source/models/builtin/audio/fishspeech-1.4.rst b/doc/source/models/builtin/audio/fishspeech-1.4.rst
new file mode 100644
index 0000000000..c256495d67
--- /dev/null
+++ b/doc/source/models/builtin/audio/fishspeech-1.4.rst
@@ -0,0 +1,19 @@
+.. _models_builtin_fishspeech-1.4:
+
+==============
+FishSpeech-1.4
+==============
+
+- **Model Name:** FishSpeech-1.4
+- **Model Family:** FishAudio
+- **Abilities:** text-to-audio
+- **Multilingual:** True
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** fishaudio/fish-speech-1.4
+
+Execute the following command to launch the model::
+
+ xinference launch --model-name FishSpeech-1.4 --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/audio/index.rst b/doc/source/models/builtin/audio/index.rst
index 8959b2b94f..b89eaf41f6 100644
--- a/doc/source/models/builtin/audio/index.rst
+++ b/doc/source/models/builtin/audio/index.rst
@@ -25,7 +25,7 @@ The following is a list of built-in audio models in Xinference:
cosyvoice-300m-sft
- fishspeech-1.2-sft
+ fishspeech-1.4
sensevoicesmall
@@ -35,6 +35,8 @@ The following is a list of built-in audio models in Xinference:
whisper-large-v3
+ whisper-large-v3-turbo
+
whisper-medium
whisper-medium.en
diff --git a/doc/source/models/builtin/audio/whisper-large-v3-turbo.rst b/doc/source/models/builtin/audio/whisper-large-v3-turbo.rst
new file mode 100644
index 0000000000..dbaecc0c6d
--- /dev/null
+++ b/doc/source/models/builtin/audio/whisper-large-v3-turbo.rst
@@ -0,0 +1,19 @@
+.. _models_builtin_whisper-large-v3-turbo:
+
+======================
+whisper-large-v3-turbo
+======================
+
+- **Model Name:** whisper-large-v3-turbo
+- **Model Family:** whisper
+- **Abilities:** audio-to-text
+- **Multilingual:** True
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** openai/whisper-large-v3-turbo
+
+Execute the following command to launch the model::
+
+ xinference launch --model-name whisper-large-v3-turbo --model-type audio
\ No newline at end of file
diff --git a/doc/source/models/builtin/embedding/gte-qwen2.rst b/doc/source/models/builtin/embedding/gte-qwen2.rst
index a88fdece9d..85eeeac39a 100644
--- a/doc/source/models/builtin/embedding/gte-qwen2.rst
+++ b/doc/source/models/builtin/embedding/gte-qwen2.rst
@@ -11,11 +11,11 @@ gte-Qwen2
Specifications
^^^^^^^^^^^^^^
-- **Dimensions:** 3584
+- **Dimensions:** 4096
- **Max Tokens:** 32000
- **Model ID:** Alibaba-NLP/gte-Qwen2-7B-instruct
- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model::
- xinference launch --model-name gte-Qwen2 --model-type embedding
\ No newline at end of file
+ xinference launch --model-name gte-Qwen2 --model-type embedding
diff --git a/doc/source/models/builtin/embedding/index.rst b/doc/source/models/builtin/embedding/index.rst
index 5afa52c21d..4422b10977 100644
--- a/doc/source/models/builtin/embedding/index.rst
+++ b/doc/source/models/builtin/embedding/index.rst
@@ -53,6 +53,8 @@ The following is a list of built-in embedding models in Xinference:
jina-embeddings-v2-small-en
+ jina-embeddings-v3
+
m3e-base
m3e-large
diff --git a/doc/source/models/builtin/embedding/jina-embeddings-v3.rst b/doc/source/models/builtin/embedding/jina-embeddings-v3.rst
new file mode 100644
index 0000000000..59e7f3577c
--- /dev/null
+++ b/doc/source/models/builtin/embedding/jina-embeddings-v3.rst
@@ -0,0 +1,21 @@
+.. _models_builtin_jina-embeddings-v3:
+
+==================
+jina-embeddings-v3
+==================
+
+- **Model Name:** jina-embeddings-v3
+- **Languages:** zh, en
+- **Abilities:** embed
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Dimensions:** 1024
+- **Max Tokens:** 8192
+- **Model ID:** jinaai/jina-embeddings-v3
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model::
+
+ xinference launch --model-name jina-embeddings-v3 --model-type embedding
\ No newline at end of file
diff --git a/doc/source/models/builtin/image/flux.1-dev.rst b/doc/source/models/builtin/image/flux.1-dev.rst
index 829bcbfd75..3a16cfe0a7 100644
--- a/doc/source/models/builtin/image/flux.1-dev.rst
+++ b/doc/source/models/builtin/image/flux.1-dev.rst
@@ -6,7 +6,7 @@ FLUX.1-dev
- **Model Name:** FLUX.1-dev
- **Model Family:** stable_diffusion
-- **Abilities:** text2image
+- **Abilities:** text2image, image2image, inpainting
- **Available ControlNet:** None
Specifications
diff --git a/doc/source/models/builtin/image/flux.1-schnell.rst b/doc/source/models/builtin/image/flux.1-schnell.rst
index 268f5a1720..df82d2069f 100644
--- a/doc/source/models/builtin/image/flux.1-schnell.rst
+++ b/doc/source/models/builtin/image/flux.1-schnell.rst
@@ -6,7 +6,7 @@ FLUX.1-schnell
- **Model Name:** FLUX.1-schnell
- **Model Family:** stable_diffusion
-- **Abilities:** text2image
+- **Abilities:** text2image, image2image, inpainting
- **Available ControlNet:** None
Specifications
diff --git a/doc/source/models/builtin/image/got-ocr2_0.rst b/doc/source/models/builtin/image/got-ocr2_0.rst
new file mode 100644
index 0000000000..994b0deae4
--- /dev/null
+++ b/doc/source/models/builtin/image/got-ocr2_0.rst
@@ -0,0 +1,19 @@
+.. _models_builtin_got-ocr2_0:
+
+==========
+GOT-OCR2_0
+==========
+
+- **Model Name:** GOT-OCR2_0
+- **Model Family:** ocr
+- **Abilities:** ocr
+- **Available ControlNet:** None
+
+Specifications
+^^^^^^^^^^^^^^
+
+- **Model ID:** stepfun-ai/GOT-OCR2_0
+
+Execute the following command to launch the model::
+
+ xinference launch --model-name GOT-OCR2_0 --model-type image
\ No newline at end of file
diff --git a/doc/source/models/builtin/image/index.rst b/doc/source/models/builtin/image/index.rst
index 5bc8744338..bf4efdab86 100644
--- a/doc/source/models/builtin/image/index.rst
+++ b/doc/source/models/builtin/image/index.rst
@@ -15,6 +15,8 @@ The following is a list of built-in image models in Xinference:
flux.1-schnell
+ got-ocr2_0
+
kolors
sd-turbo
diff --git a/doc/source/models/builtin/image/sd3-medium.rst b/doc/source/models/builtin/image/sd3-medium.rst
index c69b4a708b..953a3eca32 100644
--- a/doc/source/models/builtin/image/sd3-medium.rst
+++ b/doc/source/models/builtin/image/sd3-medium.rst
@@ -6,7 +6,7 @@ sd3-medium
- **Model Name:** sd3-medium
- **Model Family:** stable_diffusion
-- **Abilities:** text2image, image2image
+- **Abilities:** text2image, image2image, inpainting
- **Available ControlNet:** None
Specifications
diff --git a/doc/source/models/builtin/llm/chatglm3-128k.rst b/doc/source/models/builtin/llm/chatglm3-128k.rst
deleted file mode 100644
index 410669fd83..0000000000
--- a/doc/source/models/builtin/llm/chatglm3-128k.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. _models_llm_chatglm3-128k:
-
-========================================
-chatglm3-128k
-========================================
-
-- **Context Length:** 131072
-- **Model Name:** chatglm3-128k
-- **Languages:** en, zh
-- **Abilities:** chat
-- **Description:** ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.
-
-Specifications
-^^^^^^^^^^^^^^
-
-
-Model Spec 1 (pytorch, 6 Billion)
-++++++++++++++++++++++++++++++++++++++++
-
-- **Model Format:** pytorch
-- **Model Size (in billions):** 6
-- **Quantizations:** 4-bit, 8-bit, none
-- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
-- **Model ID:** THUDM/chatglm3-6b-128k
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
-
-Execute the following command to launch the model, remember to replace ``${quantization}`` with your
-chosen quantization method from the options listed above::
-
- xinference launch --model-engine ${engine} --model-name chatglm3-128k --size-in-billions 6 --model-format pytorch --quantization ${quantization}
-
diff --git a/doc/source/models/builtin/llm/chatglm3-32k.rst b/doc/source/models/builtin/llm/chatglm3-32k.rst
deleted file mode 100644
index b728ce3fad..0000000000
--- a/doc/source/models/builtin/llm/chatglm3-32k.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. _models_llm_chatglm3-32k:
-
-========================================
-chatglm3-32k
-========================================
-
-- **Context Length:** 32768
-- **Model Name:** chatglm3-32k
-- **Languages:** en, zh
-- **Abilities:** chat
-- **Description:** ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.
-
-Specifications
-^^^^^^^^^^^^^^
-
-
-Model Spec 1 (pytorch, 6 Billion)
-++++++++++++++++++++++++++++++++++++++++
-
-- **Model Format:** pytorch
-- **Model Size (in billions):** 6
-- **Quantizations:** 4-bit, 8-bit, none
-- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
-- **Model ID:** THUDM/chatglm3-6b-32k
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
-
-Execute the following command to launch the model, remember to replace ``${quantization}`` with your
-chosen quantization method from the options listed above::
-
- xinference launch --model-engine ${engine} --model-name chatglm3-32k --size-in-billions 6 --model-format pytorch --quantization ${quantization}
-
diff --git a/doc/source/models/builtin/llm/chatglm3.rst b/doc/source/models/builtin/llm/chatglm3.rst
deleted file mode 100644
index baf7a0fa08..0000000000
--- a/doc/source/models/builtin/llm/chatglm3.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. _models_llm_chatglm3:
-
-========================================
-chatglm3
-========================================
-
-- **Context Length:** 8192
-- **Model Name:** chatglm3
-- **Languages:** en, zh
-- **Abilities:** chat, tools
-- **Description:** ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.
-
-Specifications
-^^^^^^^^^^^^^^
-
-
-Model Spec 1 (pytorch, 6 Billion)
-++++++++++++++++++++++++++++++++++++++++
-
-- **Model Format:** pytorch
-- **Model Size (in billions):** 6
-- **Quantizations:** 4-bit, 8-bit, none
-- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
-- **Model ID:** THUDM/chatglm3-6b
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
-
-Execute the following command to launch the model, remember to replace ``${quantization}`` with your
-chosen quantization method from the options listed above::
-
- xinference launch --model-engine ${engine} --model-name chatglm3 --size-in-billions 6 --model-format pytorch --quantization ${quantization}
-
diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst
new file mode 100644
index 0000000000..d6e91cb248
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2-chat-0628.rst
@@ -0,0 +1,31 @@
+.. _models_llm_deepseek-v2-chat-0628:
+
+========================================
+deepseek-v2-chat-0628
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2-chat-0628
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2-Chat-0628
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2-chat-0628 --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/deepseek-v2-chat.rst b/doc/source/models/builtin/llm/deepseek-v2-chat.rst
new file mode 100644
index 0000000000..84595c2bbb
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2-chat.rst
@@ -0,0 +1,47 @@
+.. _models_llm_deepseek-v2-chat:
+
+========================================
+deepseek-v2-chat
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2-chat
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 16 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 16
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2-Lite-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 16 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2-Chat
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2-chat --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
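+The commands above also contain an ``${engine}`` placeholder. As an illustration only (using the
+engine names from the **Engines** lists above, and noting that vLLM is only available for the
+``none`` quantization), the 16-billion spec could be launched with::
+
+ xinference launch --model-engine vLLM --model-name deepseek-v2-chat --size-in-billions 16 --model-format pytorch --quantization none
+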
diff --git a/doc/source/models/builtin/llm/deepseek-v2.5.rst b/doc/source/models/builtin/llm/deepseek-v2.5.rst
new file mode 100644
index 0000000000..5f5b9475d4
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2.5.rst
@@ -0,0 +1,31 @@
+.. _models_llm_deepseek-v2.5:
+
+========================================
+deepseek-v2.5
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2.5
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** deepseek-ai/DeepSeek-V2.5
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2.5 --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/deepseek-v2.rst b/doc/source/models/builtin/llm/deepseek-v2.rst
new file mode 100644
index 0000000000..4102b9568c
--- /dev/null
+++ b/doc/source/models/builtin/llm/deepseek-v2.rst
@@ -0,0 +1,47 @@
+.. _models_llm_deepseek-v2:
+
+========================================
+deepseek-v2
+========================================
+
+- **Context Length:** 128000
+- **Model Name:** deepseek-v2
+- **Languages:** en, zh
+- **Abilities:** generate
+- **Description:** DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 16 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 16
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: Transformers
+- **Model ID:** deepseek-ai/DeepSeek-V2-Lite
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 16 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 236 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 236
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: Transformers
+- **Model ID:** deepseek-ai/DeepSeek-V2
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name deepseek-v2 --size-in-billions 236 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/gorilla-openfunctions-v1.rst b/doc/source/models/builtin/llm/gorilla-openfunctions-v1.rst
deleted file mode 100644
index d7ea21418d..0000000000
--- a/doc/source/models/builtin/llm/gorilla-openfunctions-v1.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-.. _models_llm_gorilla-openfunctions-v1:
-
-========================================
-gorilla-openfunctions-v1
-========================================
-
-- **Context Length:** 4096
-- **Model Name:** gorilla-openfunctions-v1
-- **Languages:** en
-- **Abilities:** chat
-- **Description:** OpenFunctions is designed to extend Large Language Model (LLM) Chat Completion feature to formulate executable APIs call given natural language instructions and API context.
-
-Specifications
-^^^^^^^^^^^^^^
-
-
-Model Spec 1 (pytorch, 7 Billion)
-++++++++++++++++++++++++++++++++++++++++
-
-- **Model Format:** pytorch
-- **Model Size (in billions):** 7
-- **Quantizations:** 4-bit, 8-bit, none
-- **Engines**: Transformers
-- **Model ID:** gorilla-llm/gorilla-openfunctions-v1
-- **Model Hubs**: `Hugging Face `__
-
-Execute the following command to launch the model, remember to replace ``${quantization}`` with your
-chosen quantization method from the options listed above::
-
- xinference launch --model-engine ${engine} --model-name gorilla-openfunctions-v1 --size-in-billions 7 --model-format pytorch --quantization ${quantization}
-
-
-Model Spec 2 (ggufv2, 7 Billion)
-++++++++++++++++++++++++++++++++++++++++
-
-- **Model Format:** ggufv2
-- **Model Size (in billions):** 7
-- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0
-- **Engines**: llama.cpp
-- **Model ID:** TheBloke/gorilla-openfunctions-v1-GGUF
-- **Model Hubs**: `Hugging Face `__
-
-Execute the following command to launch the model, remember to replace ``${quantization}`` with your
-chosen quantization method from the options listed above::
-
- xinference launch --model-engine ${engine} --model-name gorilla-openfunctions-v1 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
-
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 75745ffdc6..3ff3c4b4f9 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -46,21 +46,6 @@ The following is a list of built-in LLM in Xinference:
- 131072
- C4AI Command-R(+) is a research release of a 35 and 104 billion parameter highly performant generative model.
- * - :ref:`chatglm3 `
- - chat, tools
- - 8192
- - ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.
-
- * - :ref:`chatglm3-128k `
- - chat
- - 131072
- - ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.
-
- * - :ref:`chatglm3-32k `
- - chat
- - 32768
- - ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.
-
* - :ref:`code-llama `
- generate
- 100000
@@ -141,6 +126,26 @@ The following is a list of built-in LLM in Xinference:
- 16384
- deepseek-coder-instruct is a model initialized from deepseek-coder-base and fine-tuned on 2B tokens of instruction data.
+ * - :ref:`deepseek-v2 `
+ - generate
+ - 128000
+ - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+ * - :ref:`deepseek-v2-chat `
+ - chat
+ - 128000
+ - DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference.
+
+ * - :ref:`deepseek-v2-chat-0628 `
+ - chat
+ - 128000
+ - DeepSeek-V2-Chat-0628 is an improved version of DeepSeek-V2-Chat.
+
+ * - :ref:`deepseek-v2.5 `
+ - chat
+ - 128000
+ - DeepSeek-V2.5 is an upgraded version that combines DeepSeek-V2-Chat and DeepSeek-Coder-V2-Instruct. The new model integrates the general and coding abilities of the two previous versions.
+
* - :ref:`deepseek-vl-chat `
- chat, vision
- 4096
@@ -171,11 +176,6 @@ The following is a list of built-in LLM in Xinference:
- 1048576
- GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.
- * - :ref:`gorilla-openfunctions-v1 `
- - chat
- - 4096
- - OpenFunctions is designed to extend Large Language Model (LLM) Chat Completion feature to formulate executable APIs call given natural language instructions and API context.
-
* - :ref:`gorilla-openfunctions-v2 `
- chat
- 4096
@@ -237,7 +237,7 @@ The following is a list of built-in LLM in Xinference:
- Llama 3.1 is an auto-regressive language model that uses an optimized transformer architecture
* - :ref:`llama-3.1-instruct `
- - chat
+ - chat, tools
- 131072
- The Llama 3.1 instruction-tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks.
@@ -276,6 +276,11 @@ The following is a list of built-in LLM in Xinference:
- 32768
- MiniCPM-V 2.6 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Qwen2-7B with a total of 8B parameters.
+ * - :ref:`minicpm3-4b `
+ - chat
+ - 32768
+ - MiniCPM3-4B is the 3rd generation of the MiniCPM series. The overall performance of MiniCPM3-4B surpasses Phi-3.5-mini-Instruct and GPT-3.5-Turbo-0125, and is comparable to many recent 7B~9B models.
+
* - :ref:`mistral-instruct-v0.1 `
- chat
- 8192
@@ -367,7 +372,7 @@ The following is a list of built-in LLM in Xinference:
- Platypus-70B-instruct is a merge of garage-bAInd/Platypus2-70B and upstage/Llama-2-70b-instruct-v2.
* - :ref:`qwen-chat `
- - chat, tools
+ - chat
- 32768
- Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.
@@ -386,6 +391,16 @@ The following is a list of built-in LLM in Xinference:
- 32768
- Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.
+ * - :ref:`qwen2-audio `
+ - chat, audio
+ - 32768
+ - Qwen2-Audio: A large-scale audio-language model capable of accepting various audio signal inputs and performing audio analysis or generating direct textual responses to speech instructions.
+
+ * - :ref:`qwen2-audio-instruct `
+ - chat, audio
+ - 32768
+ - Qwen2-Audio: A large-scale audio-language model capable of accepting various audio signal inputs and performing audio analysis or generating direct textual responses to speech instructions.
+
* - :ref:`qwen2-instruct `
- chat, tools
- 32768
@@ -396,6 +411,31 @@ The following is a list of built-in LLM in Xinference:
- 32768
- Qwen2 is the new series of Qwen large language models.
+ * - :ref:`qwen2-vl-instruct `
+ - chat, vision
+ - 32768
+ - Qwen2-VL: To See the World More Clearly. Qwen2-VL is the latest version of the vision-language models in the Qwen model family.
+
+ * - :ref:`qwen2.5 `
+ - generate
+ - 32768
+ - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
+
+ * - :ref:`qwen2.5-coder `
+ - generate
+ - 32768
+ - Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).
+
+ * - :ref:`qwen2.5-coder-instruct `
+ - chat, tools
+ - 32768
+ - Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).
+
+ * - :ref:`qwen2.5-instruct `
+ - chat, tools
+ - 32768
+ - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
+
* - :ref:`seallm_v2 `
- generate
- 8192
@@ -481,21 +521,21 @@ The following is a list of built-in LLM in Xinference:
- 4096
- The Yi series models are large language models trained from scratch by developers at 01.AI.
+ * - :ref:`yi-coder `
+ - generate
+ - 131072
+ - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels at long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
+ * - :ref:`yi-coder-chat `
+ - chat
+ - 131072
+ - Yi-Coder is a series of open-source code language models that delivers state-of-the-art coding performance with fewer than 10 billion parameters. It excels at long-context understanding with a maximum context length of 128K tokens and supports 52 major programming languages, including popular ones such as Java, Python, JavaScript, and C++.
+
* - :ref:`yi-vl-chat `
- chat, vision
- 4096
- Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images.
- * - :ref:`zephyr-7b-alpha `
- - chat
- - 8192
- - Zephyr-7B-α is the first model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1.
-
- * - :ref:`zephyr-7b-beta `
- - chat
- - 8192
- - Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1
-
.. toctree::
:maxdepth: 3
@@ -513,12 +553,6 @@ The following is a list of built-in LLM in Xinference:
c4ai-command-r-v01
- chatglm3
-
- chatglm3-128k
-
- chatglm3-32k
-
code-llama
code-llama-instruct
@@ -551,6 +585,14 @@ The following is a list of built-in LLM in Xinference:
deepseek-coder-instruct
+ deepseek-v2
+
+ deepseek-v2-chat
+
+ deepseek-v2-chat-0628
+
+ deepseek-v2.5
+
deepseek-vl-chat
gemma-2-it
@@ -563,8 +605,6 @@ The following is a list of built-in LLM in Xinference:
glm4-chat-1m
- gorilla-openfunctions-v1
-
gorilla-openfunctions-v2
gpt-2
@@ -605,6 +645,8 @@ The following is a list of built-in LLM in Xinference:
minicpm-v-2.6
+ minicpm3-4b
+
mistral-instruct-v0.1
mistral-instruct-v0.2
@@ -649,10 +691,24 @@ The following is a list of built-in LLM in Xinference:
qwen1.5-moe-chat
+ qwen2-audio
+
+ qwen2-audio-instruct
+
qwen2-instruct
qwen2-moe-instruct
+ qwen2-vl-instruct
+
+ qwen2.5
+
+ qwen2.5-coder
+
+ qwen2.5-coder-instruct
+
+ qwen2.5-instruct
+
seallm_v2
seallm_v2.5
@@ -687,10 +743,10 @@ The following is a list of built-in LLM in Xinference:
yi-chat
- yi-vl-chat
+ yi-coder
- zephyr-7b-alpha
+ yi-coder-chat
- zephyr-7b-beta
+ yi-vl-chat
diff --git a/doc/source/models/builtin/llm/internvl2.rst b/doc/source/models/builtin/llm/internvl2.rst
index cf74863d96..e83d989707 100644
--- a/doc/source/models/builtin/llm/internvl2.rst
+++ b/doc/source/models/builtin/llm/internvl2.rst
@@ -38,7 +38,7 @@ Model Spec 2 (pytorch, 2 Billion)
- **Quantizations:** 4-bit, 8-bit, none
- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
- **Model ID:** OpenGVLab/InternVL2-2B
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
@@ -54,7 +54,7 @@ Model Spec 3 (awq, 2 Billion)
- **Quantizations:** Int4
- **Engines**:
- **Model ID:** OpenGVLab/InternVL2-2B-AWQ
-- **Model Hubs**: `Hugging Face `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
@@ -78,36 +78,36 @@ chosen quantization method from the options listed above::
xinference launch --model-engine ${engine} --model-name internvl2 --size-in-billions 4 --model-format pytorch --quantization ${quantization}
-Model Spec 5 (awq, 4 Billion)
+Model Spec 5 (pytorch, 8 Billion)
++++++++++++++++++++++++++++++++++++++++
-- **Model Format:** awq
-- **Model Size (in billions):** 4
-- **Quantizations:** Int4
-- **Engines**:
-- **Model ID:** OpenGVLab/InternVL2-8B-AWQ
-- **Model Hubs**: `Hugging Face `__
+- **Model Format:** pytorch
+- **Model Size (in billions):** 8
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** OpenGVLab/InternVL2-8B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
- xinference launch --model-engine ${engine} --model-name internvl2 --size-in-billions 4 --model-format awq --quantization ${quantization}
+ xinference launch --model-engine ${engine} --model-name internvl2 --size-in-billions 8 --model-format pytorch --quantization ${quantization}
-Model Spec 6 (pytorch, 8 Billion)
+Model Spec 6 (awq, 8 Billion)
++++++++++++++++++++++++++++++++++++++++
-- **Model Format:** pytorch
+- **Model Format:** awq
- **Model Size (in billions):** 8
-- **Quantizations:** 4-bit, 8-bit, none
-- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
-- **Model ID:** OpenGVLab/InternVL2-8B
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+- **Quantizations:** Int4
+- **Engines**:
+- **Model ID:** OpenGVLab/InternVL2-8B-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
- xinference launch --model-engine ${engine} --model-name internvl2 --size-in-billions 8 --model-format pytorch --quantization ${quantization}
+ xinference launch --model-engine ${engine} --model-name internvl2 --size-in-billions 8 --model-format awq --quantization ${quantization}
Model Spec 7 (pytorch, 26 Billion)
@@ -118,7 +118,7 @@ Model Spec 7 (pytorch, 26 Billion)
- **Quantizations:** 4-bit, 8-bit, none
- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
- **Model ID:** OpenGVLab/InternVL2-26B
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
@@ -134,7 +134,7 @@ Model Spec 8 (awq, 26 Billion)
- **Quantizations:** Int4
- **Engines**:
- **Model ID:** OpenGVLab/InternVL2-26B-AWQ
-- **Model Hubs**: `Hugging Face `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
@@ -150,7 +150,7 @@ Model Spec 9 (pytorch, 40 Billion)
- **Quantizations:** 4-bit, 8-bit, none
- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
- **Model ID:** OpenGVLab/InternVL2-40B
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
@@ -166,7 +166,7 @@ Model Spec 10 (awq, 40 Billion)
- **Quantizations:** Int4
- **Engines**:
- **Model ID:** OpenGVLab/InternVL2-40B-AWQ
-- **Model Hubs**: `Hugging Face `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
@@ -182,7 +182,7 @@ Model Spec 11 (pytorch, 76 Billion)
- **Quantizations:** 4-bit, 8-bit, none
- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
- **Model ID:** OpenGVLab/InternVL2-Llama3-76B
-- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
@@ -198,7 +198,7 @@ Model Spec 12 (awq, 76 Billion)
- **Quantizations:** Int4
- **Engines**:
- **Model ID:** OpenGVLab/InternVL2-Llama3-76B-AWQ
-- **Model Hubs**: `Hugging Face `__
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
diff --git a/doc/source/models/builtin/llm/llama-2-chat.rst b/doc/source/models/builtin/llm/llama-2-chat.rst
index ebcc14f7ca..c5899c2116 100644
--- a/doc/source/models/builtin/llm/llama-2-chat.rst
+++ b/doc/source/models/builtin/llm/llama-2-chat.rst
@@ -84,7 +84,7 @@ Model Spec 5 (gptq, 7 Billion)
- **Model Format:** gptq
- **Model Size (in billions):** 7
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-7B-Chat-GPTQ
- **Model Hubs**: `Hugging Face `__
@@ -100,7 +100,7 @@ Model Spec 6 (gptq, 70 Billion)
- **Model Format:** gptq
- **Model Size (in billions):** 70
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-70B-Chat-GPTQ
- **Model Hubs**: `Hugging Face `__
@@ -116,7 +116,7 @@ Model Spec 7 (awq, 70 Billion)
- **Model Format:** awq
- **Model Size (in billions):** 70
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-70B-Chat-AWQ
- **Model Hubs**: `Hugging Face `__
@@ -132,7 +132,7 @@ Model Spec 8 (awq, 7 Billion)
- **Model Format:** awq
- **Model Size (in billions):** 7
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-7B-Chat-AWQ
- **Model Hubs**: `Hugging Face `__
@@ -164,7 +164,7 @@ Model Spec 10 (gptq, 13 Billion)
- **Model Format:** gptq
- **Model Size (in billions):** 13
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-13B-chat-GPTQ
- **Model Hubs**: `Hugging Face `__
@@ -180,7 +180,7 @@ Model Spec 11 (awq, 13 Billion)
- **Model Format:** awq
- **Model Size (in billions):** 13
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-13B-chat-AWQ
- **Model Hubs**: `Hugging Face `__
diff --git a/doc/source/models/builtin/llm/llama-2.rst b/doc/source/models/builtin/llm/llama-2.rst
index 11c634b467..0a34f17fdb 100644
--- a/doc/source/models/builtin/llm/llama-2.rst
+++ b/doc/source/models/builtin/llm/llama-2.rst
@@ -36,7 +36,7 @@ Model Spec 2 (gptq, 7 Billion)
- **Model Format:** gptq
- **Model Size (in billions):** 7
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-7B-GPTQ
- **Model Hubs**: `Hugging Face `__
@@ -52,7 +52,7 @@ Model Spec 3 (awq, 7 Billion)
- **Model Format:** awq
- **Model Size (in billions):** 7
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-7B-AWQ
- **Model Hubs**: `Hugging Face `__
@@ -132,7 +132,7 @@ Model Spec 8 (gptq, 13 Billion)
- **Model Format:** gptq
- **Model Size (in billions):** 13
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-13B-GPTQ
- **Model Hubs**: `Hugging Face `__
@@ -148,7 +148,7 @@ Model Spec 9 (awq, 13 Billion)
- **Model Format:** awq
- **Model Size (in billions):** 13
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-13B-AWQ
- **Model Hubs**: `Hugging Face `__
@@ -180,7 +180,7 @@ Model Spec 11 (gptq, 70 Billion)
- **Model Format:** gptq
- **Model Size (in billions):** 70
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-70B-GPTQ
- **Model Hubs**: `Hugging Face `__
@@ -196,7 +196,7 @@ Model Spec 12 (awq, 70 Billion)
- **Model Format:** awq
- **Model Size (in billions):** 70
- **Quantizations:** Int4
-- **Engines**: vLLM, SGLang
+- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** TheBloke/Llama-2-70B-AWQ
- **Model Hubs**: `Hugging Face `__
diff --git a/doc/source/models/builtin/llm/minicpm3-4b.rst b/doc/source/models/builtin/llm/minicpm3-4b.rst
new file mode 100644
index 0000000000..868175aba9
--- /dev/null
+++ b/doc/source/models/builtin/llm/minicpm3-4b.rst
@@ -0,0 +1,47 @@
+.. _models_llm_minicpm3-4b:
+
+========================================
+minicpm3-4b
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** minicpm3-4b
+- **Languages:** zh
+- **Abilities:** chat
+- **Description:** MiniCPM3-4B is the 3rd generation of the MiniCPM series. The overall performance of MiniCPM3-4B surpasses Phi-3.5-mini-Instruct and GPT-3.5-Turbo-0125, and is comparable to many recent 7B~9B models.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 4 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 4
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** openbmb/MiniCPM3-4B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name minicpm3-4b --size-in-billions 4 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (gptq, 4 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 4
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** openbmb/MiniCPM3-4B-GPTQ-Int4
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name minicpm3-4b --size-in-billions 4 --model-format gptq --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen-chat.rst b/doc/source/models/builtin/llm/qwen-chat.rst
index d0b6ddcfdc..b3f8230307 100644
--- a/doc/source/models/builtin/llm/qwen-chat.rst
+++ b/doc/source/models/builtin/llm/qwen-chat.rst
@@ -7,7 +7,7 @@ qwen-chat
- **Context Length:** 32768
- **Model Name:** qwen-chat
- **Languages:** en, zh
-- **Abilities:** chat, tools
+- **Abilities:** chat
- **Description:** Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.
Specifications
diff --git a/doc/source/models/builtin/llm/qwen2-audio-instruct.rst b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst
new file mode 100644
index 0000000000..2d126a387e
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2-audio-instruct.rst
@@ -0,0 +1,31 @@
+.. _models_llm_qwen2-audio-instruct:
+
+========================================
+qwen2-audio-instruct
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2-audio-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, audio
+- **Description:** Qwen2-Audio: A large-scale audio-language model capable of accepting various audio signal inputs and performing audio analysis or generating direct textual responses to speech instructions.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-Audio-7B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-audio-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen2-audio.rst b/doc/source/models/builtin/llm/qwen2-audio.rst
new file mode 100644
index 0000000000..2973390c44
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2-audio.rst
@@ -0,0 +1,31 @@
+.. _models_llm_qwen2-audio:
+
+========================================
+qwen2-audio
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2-audio
+- **Languages:** en, zh
+- **Abilities:** chat, audio
+- **Description:** Qwen2-Audio: A large-scale audio-language model capable of accepting various audio signal inputs and performing audio analysis or generating direct textual responses to speech instructions.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-Audio-7B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-audio --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen2-vl-instruct.rst b/doc/source/models/builtin/llm/qwen2-vl-instruct.rst
new file mode 100644
index 0000000000..0872ea0168
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2-vl-instruct.rst
@@ -0,0 +1,191 @@
+.. _models_llm_qwen2-vl-instruct:
+
+========================================
+qwen2-vl-instruct
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2-vl-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, vision
+- **Description:** Qwen2-VL: To See the World More Clearly. Qwen2-VL is the latest version of the vision-language models in the Qwen model family.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 2
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-2B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 2 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (gptq, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 2
+- **Quantizations:** Int8
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 2 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 3 (gptq, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 2
+- **Quantizations:** Int4
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 2 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 4 (awq, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 2
+- **Quantizations:** Int4
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-2B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 2 --model-format awq --quantization ${quantization}
+
+
+Model Spec 5 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-7B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 6 (gptq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int8
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 7 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 7 (gptq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 7 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 8 (awq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-7B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 7 --model-format awq --quantization ${quantization}
+
+
+Model Spec 9 (pytorch, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 72
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-72B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 10 (awq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-72B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 72 --model-format awq --quantization ${quantization}
+
+
+Model Spec 11 (gptq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4, Int8
+- **Engines**: Transformers
+- **Model ID:** Qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2-vl-instruct --size-in-billions 72 --model-format gptq --quantization ${quantization}
+
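+Note that in Model Spec 11 the ``{quantization}`` placeholder in the model ID resolves to the
+quantization you choose, e.g. ``Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4`` for ``Int4``. A fully
+resolved launch command for that case (illustrative only; engine name taken from the **Engines**
+list above) would be::
+
+ xinference launch --model-engine Transformers --model-name qwen2-vl-instruct --size-in-billions 72 --model-format gptq --quantization Int4
+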
diff --git a/doc/source/models/builtin/llm/qwen2.5-coder-instruct.rst b/doc/source/models/builtin/llm/qwen2.5-coder-instruct.rst
new file mode 100644
index 0000000000..74614b4f0b
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2.5-coder-instruct.rst
@@ -0,0 +1,79 @@
+.. _models_llm_qwen2.5-coder-instruct:
+
+========================================
+qwen2.5-coder-instruct
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2.5-coder-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, tools
+- **Description:** Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-Coder-1.5B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-coder-instruct --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-Coder-7B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-coder-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 3 (ggufv2, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 1_5
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-coder-instruct --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 4 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-Coder-7B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-coder-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
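+For the GGUF specs, quantization names are lower-case (for example ``q4_k_m``). An illustrative,
+fully resolved command for the 7-billion GGUF spec with the llama.cpp engine would be::
+
+ xinference launch --model-engine llama.cpp --model-name qwen2.5-coder-instruct --size-in-billions 7 --model-format ggufv2 --quantization q4_k_m
+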
diff --git a/doc/source/models/builtin/llm/qwen2.5-coder.rst b/doc/source/models/builtin/llm/qwen2.5-coder.rst
new file mode 100644
index 0000000000..8ae4709930
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2.5-coder.rst
@@ -0,0 +1,47 @@
+.. _models_llm_qwen2.5-coder:
+
+========================================
+qwen2.5-coder
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2.5-coder
+- **Languages:** en, zh
+- **Abilities:** generate
+- **Description:** Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-Coder-1.5B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-coder --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-Coder-7B
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-coder --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen2.5-instruct.rst b/doc/source/models/builtin/llm/qwen2.5-instruct.rst
new file mode 100644
index 0000000000..a214dcdd23
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwen2.5-instruct.rst
@@ -0,0 +1,799 @@
+.. _models_llm_qwen2.5-instruct:
+
+========================================
+qwen2.5-instruct
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** qwen2.5-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, tools
+- **Description:** Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 0_5
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_5
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 3 (pytorch, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 3
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 4 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang are available only when quantization is none)
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
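+Once launched, the model can be called through Xinference's OpenAI-compatible API. A minimal
+sketch, assuming the default endpoint ``http://127.0.0.1:9997`` and a model UID of
+``qwen2.5-instruct`` (pass ``--model-uid`` at launch time, or substitute the UID reported by
+``xinference launch``)::
+
+   curl http://127.0.0.1:9997/v1/chat/completions \
+     -H "Content-Type: application/json" \
+     -d '{
+           "model": "qwen2.5-instruct",
+           "messages": [{"role": "user", "content": "Hello!"}]
+         }'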
+
+Model Spec 5 (pytorch, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 14
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang are available only when quantization is none)
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 6 (pytorch, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 32
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang are available only when quantization is none)
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 7 (pytorch, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 72
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang are available only when quantization is none)
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization}
+
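+In half precision the 72B variant needs far more memory than a single GPU provides, so it usually
+has to be sharded across devices. A sketch, assuming the ``--n-gpu`` option of ``xinference launch``
+and four local GPUs (adjust to your hardware)::
+
+   xinference launch --model-engine vLLM --model-name qwen2.5-instruct --size-in-billions 72 --model-format pytorch --quantization none --n-gpu 4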
+
+Model Spec 8 (gptq, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 0_5
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 9 (gptq, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 1_5
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 10 (gptq, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 3
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 11 (gptq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format gptq --quantization ${quantization}
+
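+For example, serving the Int4 GPTQ build with vLLM::
+
+   xinference launch --model-engine vLLM --model-name qwen2.5-instruct --size-in-billions 7 --model-format gptq --quantization Int4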
+
+Model Spec 12 (gptq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 13 (gptq, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 32
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 14 (gptq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4, Int8
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GPTQ-{quantization}
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 15 (awq, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 0_5
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format awq --quantization ${quantization}
+
+
+Model Spec 16 (awq, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 1_5
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format awq --quantization ${quantization}
+
+
+Model Spec 17 (awq, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 3
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format awq --quantization ${quantization}
+
+
+Model Spec 18 (awq, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 7
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format awq --quantization ${quantization}
+
+
+Model Spec 19 (awq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format awq --quantization ${quantization}
+
+
+Model Spec 20 (awq, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 32
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format awq --quantization ${quantization}
+
+
+Model Spec 21 (awq, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 72
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-AWQ
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format awq --quantization ${quantization}
+
+
+Model Spec 22 (ggufv2, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 0_5
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-0.5B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 23 (ggufv2, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 1_5
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-1.5B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 24 (ggufv2, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 3
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-3B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 25 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-7B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
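+For example, to serve the ``q4_k_m`` file with llama.cpp (the engine name is passed as listed
+above)::
+
+   xinference launch --model-engine llama.cpp --model-name qwen2.5-instruct --size-in-billions 7 --model-format ggufv2 --quantization q4_k_m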
+
+Model Spec 26 (ggufv2, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 14
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-14B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 27 (ggufv2, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 32
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-32B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 28 (ggufv2, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 72
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** Qwen/Qwen2.5-72B-Instruct-GGUF
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 72 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 29 (mlx, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 0_5
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-0.5B-Instruct-4bit
+- **Model Hubs**: `Hugging Face `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 30 (mlx, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 0_5
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-0.5B-Instruct-8bit
+- **Model Hubs**: `Hugging Face `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 31 (mlx, 0_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 0_5
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-0.5B-Instruct-bf16
+- **Model Hubs**: `Hugging Face `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 0_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 32 (mlx, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 1_5
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-1.5B-Instruct-4bit
+- **Model Hubs**: `Hugging Face `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 33 (mlx, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 1_5
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-1.5B-Instruct-8bit
+- **Model Hubs**: `Hugging Face `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 34 (mlx, 1_5 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 1_5
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-1.5B-Instruct-bf16
+- **Model Hubs**: `Hugging Face `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 1_5 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 35 (mlx, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 3
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-3B-Instruct-4bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 36 (mlx, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 3
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-3B-Instruct-8bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 37 (mlx, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 3
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-3B-Instruct-bf16
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 3 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 38 (mlx, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-7B-Instruct-4bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format mlx --quantization ${quantization}
+
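+For example, on Apple silicon the 4-bit MLX build can be launched with::
+
+   xinference launch --model-engine MLX --model-name qwen2.5-instruct --size-in-billions 7 --model-format mlx --quantization 4-bit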
+
+Model Spec 39 (mlx, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 7
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-7B-Instruct-8bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 40 (mlx, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 7
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-7B-Instruct-bf16
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 7 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 41 (mlx, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 14
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-14B-Instruct-4bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 42 (mlx, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 14
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-14B-Instruct-8bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 43 (mlx, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 14
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-14B-Instruct-bf16
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 14 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 44 (mlx, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 32
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-32B-Instruct-4bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 45 (mlx, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 32
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-32B-Instruct-8bit
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 46 (mlx, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 32
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-32B-Instruct-bf16
+- **Model Hubs**: `Hugging Face `__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with an
+engine listed above and ``${quantization}`` with your chosen quantization method::
+
+ xinference launch --model-engine ${engine} --model-name qwen2.5-instruct --size-in-billions 32 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 47 (mlx, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 72
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen2.5-72B-Instruct-4bit
+- **Model Hubs**: `Hugging Face