 RUN cd /workspace/trtis-kaldi-backend && wget https://github.com/NVIDIA/tensorrt-inference-server/releases/download/v1.9.0/v1.9.0_ubuntu1804.custombackend.tar.gz -O custom-backend-sdk.tar.gz && tar -xzf custom-backend-sdk.tar.gz
-RUN cd /workspace/trtis-kaldi-backend/ && make && cp libkaldi-trtisbackend.so /workspace/model-repo/kaldi_online/1/ && cd - && rm -r /workspace/trtis-kaldi-backend
Kaldi/SpeechRecognition/README.md (+26 −28)
@@ -46,15 +46,18 @@ A reference model is used by all test scripts and benchmarks presented in this r
 Details about parameters can be found in the [Parameters](#parameters) section.

 * `model path`: Configured to use the pretrained LibriSpeech model.
+* `use_tensor_cores`: 1
+* `main_q_capacity`: 30000
+* `aux_q_capacity`: 400000
 * `beam`: 10
+* `num_channels`: 4000
 * `lattice_beam`: 7
 * `max_active`: 10,000
 * `frame_subsampling_factor`: 3
 * `acoustic_scale`: 1.0
-* `num_worker_threads`: 20
-* `max_execution_batch_size`: 256
-* `max_batch_size`: 4096
-* `instance_group.count`: 2
+* `num_worker_threads`: 40
+* `max_batch_size`: 400
+* `instance_group.count`: 1

 ## Setup

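For orientation, the decoder parameters listed in this hunk are passed to the custom Kaldi backend through the `parameters` map of `model-repo/kaldi_online/config.pbtxt`. The snippet below is a minimal sketch of how the updated values could be written in the TRTIS model-configuration text format; it is an illustration only, and the authoritative key list is the config file shipped in the repository.

```
# Sketch of decoder-related entries in model-repo/kaldi_online/config.pbtxt.
# Values follow the updated defaults listed above; key names mirror the
# parameter names and may differ from the shipped config file.
parameters [
  {
    key: "beam"
    value: { string_value: "10" }
  },
  {
    key: "lattice_beam"
    value: { string_value: "7" }
  },
  {
    key: "max_active"
    value: { string_value: "10000" }
  },
  {
    key: "num_channels"
    value: { string_value: "4000" }
  },
  {
    key: "use_tensor_cores"
    value: { string_value: "1" }
  },
  {
    key: "num_worker_threads"
    value: { string_value: "40" }
  }
]
```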
@@ -134,9 +137,8 @@ The model configuration parameters are passed to the model and have an impact o

 The inference engine configuration parameters configure the inference engine. They impact performance, but not accuracy.

-* `max_batch_size`: The maximum number of inference channels opened at a given time. If set to `4096`, then one instance will handle at most 4096 concurrent users.
+* `max_batch_size`: The size of one execution batch on the GPU. This parameter should be set as large as necessary to saturate the GPU, but not bigger. Larger batches will lead to a higher throughput, smaller batches to lower latency.
 * `num_worker_threads`: The number of CPU threads for the postprocessing CPU tasks, such as lattice determinization and text generation from the lattice.
-* `max_execution_batch_size`: The size of one execution batch on the GPU. This parameter should be set as large as necessary to saturate the GPU, but not bigger. Larger batches will lead to a higher throughput, smaller batches to lower latency.
 * `input.WAV_DATA.dims`: The maximum number of samples per chunk. The value must be a multiple of `frame_subsampling_factor * chunks_per_frame`.

 ### Inference process
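The engine-level settings described above (`max_batch_size`, `instance_group.count`, `input.WAV_DATA.dims`) are regular fields of the same `config.pbtxt` rather than entries in the `parameters` map. A minimal sketch follows, with the `data_type` and `dims` values assumed for illustration rather than taken from the shipped file:

```
# Sketch of engine-level fields in model-repo/kaldi_online/config.pbtxt.
# max_batch_size and instance_group.count use the updated values above;
# the WAV_DATA data_type and dims shown here are illustrative only.
name: "kaldi_online"
max_batch_size: 400
input [
  {
    name: "WAV_DATA"
    data_type: TYPE_FP32
    dims: [ 8160 ]   # must be a multiple of frame_subsampling_factor * chunks_per_frame
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
```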
@@ -156,7 +158,7 @@ The client can be configured through a set of parameters that define its behavio
 -u <URL for inference service and its gRPC port>
 -o : Only feed each channel at realtime speed. Simulates online clients.
 -p : Print text outputs
-
+-b : Print partial (best path) text outputs
 ```

 ### Input/Output
@@ -187,13 +189,8 @@ Even if only the best path is used, we are still generating a full lattice for b

 Support for Kaldi ASR models that are different from the provided LibriSpeech model is experimental. However, it is possible to modify the [Model Path](#model-path) section of the config file `model-repo/kaldi_online/config.pbtxt` to set up your own model.

-The models and Kaldi allocators are currently not shared between instances. This means that if your model is large, you may end up with not enough memory on the GPU to store two different instances. If that's the case,
-you can set `count` to `1` in the [`instance_group` section](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/model_configuration.html#instance-groups) of the config file.
-
 ## Performance

-The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
-

 ### Metrics

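To run a different Kaldi model, the path entries referenced by the [Model Path](#model-path) section of `model-repo/kaldi_online/config.pbtxt` are what need to change. The sketch below only illustrates the pattern; the key names are placeholders, not the keys used by the shipped backend, and the paths are hypothetical.

```
# Illustrative only: key names below are placeholders for the path entries
# found in the Model Path section of the shipped config.pbtxt. Replace the
# string_value paths with the files of your own Kaldi model.
parameters [
  {
    key: "nnet3_rxfilename"        # acoustic model (placeholder key)
    value: { string_value: "/data/models/my_model/final.mdl" }
  },
  {
    key: "fst_rxfilename"          # decoding graph HCLG.fst (placeholder key)
    value: { string_value: "/data/models/my_model/HCLG.fst" }
  },
  {
    key: "word_syms_rxfilename"    # word symbol table (placeholder key)
    value: { string_value: "/data/models/my_model/words.txt" }
  }
]
```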
@@ -207,8 +204,7 @@ Latency is defined as the delay between the availability of the last chunk of au
 4. *Server:* Compute inference of last chunk
 5. *Server:* Generate the raw lattice for the full utterance
 6. *Server:* Determinize the raw lattice
-7. *Server:* Generate the text output associated with the best path in the determinized lattice
-8. *Client:* Receive text output
+8. *Client:* Receive lattice output
 9. *Client:* Call callback with output
 10. ***t1** <- Current time*

@@ -219,20 +215,18 @@ The latency is defined such as `latency = t1 - t0`.
 Our results were obtained by:

 1. Building and starting the server as described in [Quick Start Guide](#quick-start-guide).
-2. Running `scripts/run_inference_all_v100.sh` and `scripts/run_inference_all_t4.sh`
-
-
-| GPU | Realtime I/O | Number of parallel audio channels | Throughput (RTFX) | Latency ||||