@@ -495,26 +495,40 @@
 tb_qwen_chat_7b = dict(type=TurboMindModel,
                        abbr='qwen-7b-chat-turbomind',
                        path='Qwen/Qwen-7B-Chat',
-                       engine_config=tb_engine_config_template_max_bs_16,
+                       engine_config=tb_engine_config_template_max_bs_128,
                        gen_config=qwen_gen_config_template,
                        max_out_len=MAX_NEW_TOKENS,
                        max_seq_len=MAX_SESSION_LEN,
-                       batch_size=16,
-                       concurrency=16,
+                       batch_size=128,
+                       concurrency=128,
                        meta_template=qwen_meta_template,
                        run_cfg=run_cfg_tp1_template,
                        end_str='<|im_end|>')
 
+tb_qwen_chat_7b_w4a16 = dict(
+    type=TurboMindModel,
+    abbr='qwen-7b-chat-4bits-turbomind',
+    path='Qwen/Qwen-7B-Chat-inner-4bits',
+    engine_config=tb_awq_engine_config_template_max_bs_128,
+    gen_config=qwen_gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=128,
+    concurrency=128,
+    meta_template=qwen_meta_template,
+    run_cfg=run_cfg_tp1_template,
+    end_str='<|im_end|>')
+
 # config for qwen-chat-7b pytorch
 pt_qwen_chat_7b = dict(type=LmdeployPytorchModel,
                        abbr='qwen-7b-chat-pytorch',
                        path='Qwen/Qwen-7B-Chat',
-                       engine_config=pt_engine_config_template_max_bs_16,
+                       engine_config=pt_engine_config_template_max_bs_64,
                        gen_config=qwen_gen_config_template,
                        max_out_len=MAX_NEW_TOKENS,
                        max_seq_len=MAX_SESSION_LEN,
-                       batch_size=16,
-                       concurrency=16,
+                       batch_size=64,
+                       concurrency=64,
                        meta_template=qwen_meta_template,
                        run_cfg=run_cfg_tp1_template,
                        end_str='<|im_end|>')
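Both qwen entries reference templates defined earlier in the file, outside the diff context. A minimal sketch of what they presumably look like, assuming the plain-dict engine configs that lmdeploy's TurboMind and PyTorch engines accept and OpenCompass's round-based meta-template format; every value below is an assumption, only the names come from the patch:

```python
# Sketch only: the real definitions sit above this hunk in the same file.
# Constants and field values are assumptions; only the template names and
# the *_max_bs_* suffixes come from the patch.
MAX_SESSION_LEN = 2048   # assumed value
MAX_NEW_TOKENS = 1024    # assumed value

tb_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN,
                                            max_batch_size=128)
# 'awq' tells TurboMind to load the 4-bit AWQ weights of the
# *-inner-4bits checkpoints.
tb_awq_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN,
                                                model_format='awq',
                                                max_batch_size=128)
pt_engine_config_template_max_bs_64 = dict(session_len=MAX_SESSION_LEN,
                                           max_batch_size=64)

# Near-greedy generation settings shared by the qwen entries.
qwen_gen_config_template = dict(top_k=1,
                                top_p=0.8,
                                temperature=1.0,
                                max_new_tokens=MAX_NEW_TOKENS)

# Qwen speaks ChatML, consistent with end_str='<|im_end|>' above.
qwen_meta_template = dict(round=[
    dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
    dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>',
         generate=True),
])
```

The intent of the patch is visible in the suffixes: `batch_size`/`concurrency` on the model dicts move from 16 to 128 (64 for the PyTorch engine), and the swapped-in templates raise the engine-side `max_batch_size` in lockstep, since the two limits have to agree for the larger batches to take effect.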
@@ -552,6 +566,21 @@
                          run_cfg=run_cfg_tp1_template,
                          end_str='[INST]')
 
+# config for llama2-chat-7b-w4a16 turbomind
+tb_llama2_chat_7b_w4a16 = dict(
+    type=TurboMindModel,
+    abbr='llama-2-7b-chat-4bits-turbomind',
+    path='meta-llama/Llama-2-7b-chat-hf-inner-4bits',
+    engine_config=tb_awq_engine_config_template_max_bs_128,
+    gen_config=gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=128,
+    concurrency=128,
+    meta_template=llama2_meta_template,
+    run_cfg=run_cfg_tp1_template,
+    end_str='[INST]')
+
 # config for llama2-chat-7b pytorch
 pt_llama2_chat_7b = dict(type=LmdeployPytorchModel,
                          abbr='llama-2-7b-chat-pytorch',
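The llama2 entries likewise reuse `gen_config_template` and `llama2_meta_template` from earlier in the file. A sketch under the same assumptions (all field values illustrative, not taken from the patch):

```python
# Sketch only: real definitions live earlier in the file.
gen_config_template = dict(top_k=1,
                           top_p=0.8,
                           temperature=1.0,
                           max_new_tokens=1024)  # assumed MAX_NEW_TOKENS

# Llama-2 chat wraps each user turn in [INST] ... [/INST]; generation is
# cut off once the model starts a new turn, hence end_str='[INST]' above.
llama2_meta_template = dict(round=[
    dict(role='HUMAN', begin='[INST] ', end=' [/INST] '),
    dict(role='BOT', end='</s>', generate=True),
])
```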
@@ -854,3 +883,41 @@
                          meta_template=llama3_meta_template,
                          run_cfg=run_cfg_tp1_template,
                          end_str='[INST]')
+
+# config for code llama
+tb_codellama_7b_chat = dict(type=TurboMindModel,
+                            abbr='codellama-7b-chat-turbomind',
+                            path='codellama/CodeLlama-7b-Instruct-hf',
+                            engine_config=tb_engine_config_template_max_bs_128,
+                            gen_config=gen_config_template,
+                            max_out_len=MAX_NEW_TOKENS,
+                            max_seq_len=MAX_SESSION_LEN,
+                            batch_size=128,
+                            concurrency=128,
+                            run_cfg=dict(num_gpus=1, num_procs=1),
+                            end_str='</s>')
+
+tb_codellama_7b_chat_w4a16 = dict(
+    type=TurboMindModel,
+    abbr='codellama-7b-chat-4bits-turbomind',
+    path='codellama/CodeLlama-7b-Instruct-hf-inner-4bits',
+    engine_config=tb_awq_engine_config_template_max_bs_128,
+    gen_config=gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=128,
+    concurrency=128,
+    run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='</s>')
+
+pt_codellama_7b_chat = dict(type=LmdeployPytorchModel,
+                            abbr='codellama-7b-chat-pytorch',
+                            path='codellama/CodeLlama-7b-Instruct-hf',
+                            engine_config=pt_engine_config_template_max_bs_128,
+                            gen_config=gen_config_template,
+                            max_out_len=MAX_NEW_TOKENS,
+                            max_seq_len=MAX_SESSION_LEN,
+                            batch_size=128,
+                            concurrency=128,
+                            run_cfg=dict(num_gpus=1, num_procs=1),
+                            end_str='</s>')
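Nothing in this file runs by itself: each dict is a model definition that a benchmark entry point picks up. A hypothetical top-level config pulling the new codellama variants into a run, using mmengine's `read_base` mechanism the way OpenCompass-style configs do; the module path `.eval_opencompass_config` is an assumption for illustration:

```python
# Hypothetical wiring: only the model dict names come from this patch.
from mmengine.config import read_base

with read_base():
    # assumed module path; adjust to wherever this config file lives
    from .eval_opencompass_config import (pt_codellama_7b_chat,
                                          tb_codellama_7b_chat,
                                          tb_codellama_7b_chat_w4a16)

# The runner evaluates every entry in `models`; listing the fp16 and the
# w4a16 variant of the same checkpoint benchmarks them side by side.
models = [tb_codellama_7b_chat, tb_codellama_7b_chat_w4a16,
          pt_codellama_7b_chat]
```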