
Commit 14e8ca5

Merge branch 'main' into feature/get-trace-id-from-req-headers

2 parents 64b0337 + ba5fc7d

139 files changed, +5267 −4283 lines changed


Cargo.lock

Lines changed: 17 additions & 51 deletions
Some generated files are not rendered by default.

Dockerfile.trtllm

Lines changed: 0 additions & 23 deletions
This file was deleted.

Dockerfile_amd

Lines changed: 2 additions & 1 deletion
@@ -327,7 +327,8 @@ ENV ROCM_USE_CUSTOM_PAGED_ATTN=1
 ENV PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0
 ENV VLLM_MOE_PADDING=0
 ENV ATTENTION=paged
-ENV USE_PREFIX_CACHING=0
+ENV PREFIX_CACHING=0
+ENV PREFILL_CHUNKING=0
 ENV ROCM_USE_SKINNY_GEMM=1
 
 COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh

Dockerfile_intel

Lines changed: 2 additions & 1 deletion
@@ -218,7 +218,8 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/lo
 
 FROM ${PLATFORM} AS final
 ENV ATTENTION=paged
-ENV USE_PREFIX_CACHING=0
+ENV PREFIX_CACHING=0
+ENV PREFILL_CHUNKING=0
 ENV CUDA_GRAPHS=0
 ENTRYPOINT ["text-generation-launcher"]
 CMD ["--json-output"]

backends/trtllm/Dockerfile renamed to Dockerfile_trtllm

Lines changed: 11 additions & 4 deletions
@@ -10,7 +10,7 @@ COPY . .
 RUN cargo chef prepare --recipe-path recipe.json
 
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder
+FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
@@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     ninja-build \
     pkg-config \
     python3 \
+    python3-dev \
     python3-setuptools \
     tar \
     wget
@@ -42,7 +43,7 @@ RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILE
     mkdir /usr/src/mpi && \
     tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
     cd /usr/src/mpi && \
-    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda && \
+    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
     make -j all && \
     make install && \
     rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"
@@ -82,10 +83,16 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
     cd backends/trtllm && \
     CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
 
-FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
+FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
+RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
+    rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
+    python3 -m pip install transformers tokenizers
+
 WORKDIR /usr/local/tgi/bin
 
-ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV TOKENIZERS_PARALLELISM=false
+ENV OMPI_MCA_plm_rsh_agent=""
 
 COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt

README.md

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ curl 127.0.0.1:8080/generate_stream \
 You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain Open AI Chat Completion API compatible responses.
 
 ```bash
-curl localhost:3000/v1/chat/completions \
+curl localhost:8080/v1/chat/completions \
     -X POST \
     -d '{
   "model": "tgi",
backends/client/src/v3/client.rs

Lines changed: 8 additions & 2 deletions
@@ -158,7 +158,8 @@ impl Client {
             // Blocks and slots will be set on the server side if we use paged attention
             blocks: vec![],
             slots: vec![],
-            prefix_len: 0,
+            cache_len: 0,
+            chunk_len: None,
             // Set sampling parameters to also take these ops into account in the max memory
             parameters: Some(NextTokenChooserParameters {
                 temperature: 0.9,
@@ -217,8 +218,13 @@
     pub async fn prefill(
         &mut self,
         batch: Batch,
+        cached_batch: Option<CachedBatch>,
     ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
-        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
+        let request = tonic::Request::new(PrefillRequest {
+            batch: Some(batch),
+            cached_batch,
+        })
+        .inject_context();
         let response = self.stub.prefill(request).await?.into_inner();
         Ok((
             response.generations,
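
The added `cached_batch` argument lets a caller thread the server-side prefill cache through successive `prefill` calls. Below is a minimal sketch of how the updated signature might be used; the import paths and the `run_prefill` helper are assumptions for illustration, not part of this commit.

```rust
// Sketch only: type paths and the helper name are assumed.
use text_generation_client::v3::{Batch, CachedBatch, Client};
use text_generation_client::Result;

async fn run_prefill(client: &mut Client, first: Batch, next: Batch) -> Result<()> {
    // Fresh batch: nothing is cached on the server yet, so pass `None`.
    let (_generations, cached, _timings) = client.prefill(first, None).await?;

    // A follow-up prefill chunk hands the returned `CachedBatch` back so the
    // server can continue from the already-computed prefix.
    let (_generations, _cached, _timings) = client.prefill(next, cached).await?;
    Ok(())
}
```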

backends/client/src/v3/sharded_client.rs

Lines changed: 5 additions & 3 deletions
@@ -134,11 +134,12 @@ impl ShardedClient {
     pub async fn prefill(
         &mut self,
         batch: Batch,
+        cached_batch: Option<CachedBatch>,
     ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
         let futures: Vec<_> = self
             .clients
             .iter_mut()
-            .map(|client| Box::pin(client.prefill(batch.clone())))
+            .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone())))
             .collect();
         #[allow(clippy::type_complexity)]
         let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
@@ -245,7 +246,8 @@ impl Health for ShardedClient {
             // Block 0 is reserved for health checks
             blocks: vec![0],
             slots: (0..16).collect(),
-            prefix_len: 0,
+            cache_len: 0,
+            chunk_len: None,
             adapter_id: None,
         };
         let batch = Batch {
@@ -255,7 +257,7 @@
             max_tokens: 2,
             max_blocks: 1,
         };
-        self.clone().prefill(batch).await?;
+        self.clone().prefill(batch, None).await?;
         Ok(())
     }
 }
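
For context, the fan-out above sends the same batch and the same optional cached batch to every shard and awaits all of them. The following self-contained sketch illustrates that pattern only; the types are placeholders and the merging of per-shard results is simplified relative to the real `ShardedClient`.

```rust
// Simplified illustration of the per-shard fan-out; types are placeholders.
use futures::future::join_all;

#[derive(Clone)]
struct Batch;
#[derive(Clone)]
struct CachedBatch;

async fn shard_prefill(_shard: usize, _b: Batch, _c: Option<CachedBatch>) -> Result<u32, String> {
    Ok(0) // stand-in for (generations, cached_batch, timings)
}

async fn fan_out(
    shards: usize,
    batch: Batch,
    cached_batch: Option<CachedBatch>,
) -> Result<Vec<u32>, String> {
    // Each shard gets its own clone of both arguments, mirroring
    // `client.prefill(batch.clone(), cached_batch.clone())` above.
    let futures: Vec<_> = (0..shards)
        .map(|shard| shard_prefill(shard, batch.clone(), cached_batch.clone()))
        .collect();
    join_all(futures).await.into_iter().collect()
}
```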

backends/trtllm/CMakeLists.txt

Lines changed: 13 additions & 1 deletion
@@ -1,5 +1,17 @@
 cmake_minimum_required(VERSION 3.20)
 
+if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+    find_program(CCACHE_EXECUTABLE "ccache")
+    if (CCACHE_EXECUTABLE)
+        message(STATUS "Using ccache")
+        set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
+    endif ()
+endif ()
+
+if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
+    cmake_policy(SET CMP0135 NEW)
+endif ()
+
 project(tgi-trtllm-backend VERSION 1.0.0)
 set(CMAKE_CXX_STANDARD 20)
 
@@ -14,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include"
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
 
 # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
-find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
+find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
 
 #### External dependencies ####
 include(cmake/fmt.cmake)
backends/trtllm/Cargo.toml

Lines changed: 6 additions & 5 deletions
@@ -10,16 +10,17 @@ async-trait = "0.1"
 async-stream = "0.3"
 clap = { version = "4.5", features = ["derive"] }
 cxx = "1.0"
+hashbrown = "0.14"
+hf-hub = { workspace = true }
 log = { version = "0.4", features = [] }
 text-generation-router = { path = "../../router" }
-tokenizers = { version = "0.19", features = ["hf-hub"] }
-tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tokenizers = { workspace = true }
+tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.15"
-thiserror = "1.0.62"
+thiserror = "1.0.63"
 tracing = "0.1"
-tracing-opentelemetry = "0.24"
+tracing-opentelemetry = "0.25"
 tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
-parking_lot = "0.12"
 
 [build-dependencies]
 cmake = "0.1"