Commit e4d7a67

Merge branch 'main' into feature/get-trace-id-from-req-headers

2 parents: ba72c18 + bf59118

191 files changed: +2814 / -75673 lines

.github/workflows/build.yaml

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ jobs:
         uses: docker/[email protected]
         with:
           flavor: |
-            latest=auto
+            latest=false
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ repos:
       hooks:
         - id: check-yaml
         - id: end-of-file-fixer
+          exclude: crate-hashes.json
         - id: trailing-whitespace
           exclude: docs/source/reference/launcher.md
   - repo: https://github.com/psf/black

Cargo.lock

Lines changed: 31 additions & 7 deletions
(generated file; diff not rendered)

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ default-members = [
 resolver = "2"

 [workspace.package]
-version = "2.4.2-dev0"
+version = "3.0.2-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

Dockerfile_intel

Lines changed: 5 additions & 11 deletions
@@ -45,7 +45,7 @@ RUN cargo build --profile release-opt --frozen

 # Text Generation Inference base image for Intel

-FROM intel/intel-extension-for-pytorch:2.3.110-xpu AS xpu
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS xpu

 USER root

@@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/

 RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d

-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit=2024.2.1-98 xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9

 # Text Generation Inference base env
 ENV HF_HOME=/data \
@@ -114,15 +114,8 @@ RUN cd server && \
     pip install -r requirements_intel.txt && \
     pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

-ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
-ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
-ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
-ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
-ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
-ENV PATH=/opt/conda/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
 ENV CCL_ZE_IPC_EXCHANGE=sockets
-ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
-ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
 #ENV TORCH_LLM_ALLREDUCE=1
 #ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0

@@ -197,9 +190,10 @@ RUN pip install triton py-libnuma

 WORKDIR /usr/src

-RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 2e1c98f74ec1b35ad8dd1ebe7dd4b25470f2fd41
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout b7b552baf64283b594665b8687430fe92990e497
 RUN git clone https://github.com/intel/torch-ccl.git && cd torch-ccl && git checkout v2.4.0+cpu+rc0

+RUN sed -i 's/VERSION_MINOR 6/VERSION_MINOR 5/' intel-extension-for-pytorch/version.txt
 RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update --init --recursive && python setup.py install

 RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .

README.md

Lines changed: 17 additions & 5 deletions
@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
 volume=$PWD/data

 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
+    3.0.0 ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
 ```

 And then you can make requests like
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \

 **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1-rocm --model-id $model` instead of the command above.
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0-rocm --model-id $model` instead of the command above.

 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
@@ -151,7 +151,7 @@ model=meta-llama/Meta-Llama-3.1-8B-Instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>

-docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
 ```

 ### A note on Shared Memory (shm)
@@ -196,14 +196,26 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T

 You can also opt to install `text-generation-inference` locally.

-First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
-Python 3.9, e.g. using `conda`:
+First clone the repository and change directoy into it:
+
+```shell
+git clone https://github.com/huggingface/text-generation-inference
+cd text-generation-inference
+```
+
+Then [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
+Python 3.9, e.g. using `conda` or `python venv`:

 ```shell
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+
+#using conda
 conda create -n text-generation-inference python=3.11
 conda activate text-generation-inference
+
+#using pyton venv
+python3 -m venv .venv
+source .venv/bin/activate
 ```

 You may also need to install Protoc.

assets/v3_benchmarks.png

209 KB (binary file added; not rendered)

backends/v2/src/backend.rs

Lines changed: 4 additions & 0 deletions
@@ -104,6 +104,10 @@ impl Backend for BackendV2 {
         }
         .is_ok()
     }
+
+    fn start_health(&self) -> bool {
+        true
+    }
 }

 /// Batching logic
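The `start_health` override added here (and mirrored in `backends/v3/src/backend.rs` below) follows a common trait-hook pattern: the router asks each backend whether a health probe should run at startup. The following is a minimal, self-contained sketch of that pattern; the trait shape, the `false` default, and the simplified `health` signature are assumptions for illustration, not the router's actual code (only the override returning `true` comes from the diff):

```rust
// Hypothetical, simplified sketch of the startup-health-check hook.
trait Backend {
    // Simplified stand-in for the real (async) health check.
    fn health(&self) -> bool;

    // Backends opt in to a health probe at router startup by overriding
    // this; the default value here is an assumption.
    fn start_health(&self) -> bool {
        false
    }
}

struct BackendV2;

impl Backend for BackendV2 {
    fn health(&self) -> bool {
        true // the real implementation queries the shard; simplified here
    }

    // Mirrors the override added in this commit.
    fn start_health(&self) -> bool {
        true
    }
}

fn main() {
    let backend = BackendV2;
    // Router-side logic: probe health at startup only if the backend asks.
    let ran_startup_probe = backend.start_health() && backend.health();
    println!("startup probe ran and passed: {ran_startup_probe}");
}
```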

backends/v2/src/queue.rs

Lines changed: 1 addition & 0 deletions
@@ -436,6 +436,7 @@ mod tests {
             stopping_parameters: ValidStoppingParameters {
                 ignore_eos_token: false,
                 max_new_tokens: 1,
+                max_total_new_tokens: 1024,
                 stop_sequences: vec![],
             },
             top_n_tokens: 0,
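The fixture change above threads a new `max_total_new_tokens` field through `ValidStoppingParameters`, presumably an overall token cap alongside the per-request `max_new_tokens`. A stand-alone mock of the struct as it appears in the fixture (field names and values come from the diff; the types and any other fields of the real struct are guesses):

```rust
// Mock of `ValidStoppingParameters` matching only the fields visible in the
// test fixture above; types are assumed.
#[derive(Debug, Clone, PartialEq)]
struct ValidStoppingParameters {
    ignore_eos_token: bool,
    max_new_tokens: u32,
    // New field this commit adds: a total cap that can exceed the
    // per-request `max_new_tokens` (semantics assumed from the name).
    max_total_new_tokens: u32,
    stop_sequences: Vec<String>,
}

fn main() {
    // Same values as the updated fixture.
    let params = ValidStoppingParameters {
        ignore_eos_token: false,
        max_new_tokens: 1,
        max_total_new_tokens: 1024,
        stop_sequences: vec![],
    };
    // The total cap is independent of, and here larger than, the step cap.
    assert!(params.max_total_new_tokens >= params.max_new_tokens);
    println!("{params:?}");
}
```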

backends/v3/src/backend.rs

Lines changed: 4 additions & 0 deletions
@@ -111,6 +111,10 @@ impl Backend for BackendV3 {
         }
         .is_ok()
     }
+
+    fn start_health(&self) -> bool {
+        true
+    }
 }

 /// Batching logic

0 commit comments
