1 | 1 | #!/bin/bash |
2 | | - |
3 | 2 | set -eux |
4 | 3 |
5 | | -# Script to run vLLM profiling with configurable parameters via environment variables |
6 | | - |
7 | | -# Global variables - set defaults for environment variables |
8 | | -setup_environment() { |
9 | | - export VLLM_USE_V1=${VLLM_USE_V1:-1} |
10 | | - MODEL_NAME=${MODEL_NAME:-facebook/opt-125m} |
11 | | - SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-${MODEL_NAME}} |
12 | | - DATASET_NAME=${DATASET_NAME:-random} |
13 | | - RANDOM_INPUT_LEN=${RANDOM_INPUT_LEN:-750} |
14 | | - RANDOM_OUTPUT_LEN=${RANDOM_OUTPUT_LEN:-75} |
15 | | - ENDPOINT=${ENDPOINT:-/v1/completions} |
16 | | - HOST=${HOST:-localhost} |
17 | | - PORT=${PORT:-8000} |
18 | | - NUM_PROMPTS=${NUM_PROMPTS:-100} |
| 4 | +json2args() { |
| 5 | + # transforms the JSON string into command-line args; '_' in keys is replaced with '-' |
| 6 | + # example: |
| 7 | + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } |
| 8 | + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 |
| 9 | + local json_string=$1 |
| 10 | + local args=$( |
| 11 | + echo "$json_string" | jq -r ' |
| 12 | + to_entries | |
| 13 | + map( |
| 14 | + if .value == "" then "--" + (.key | gsub("_"; "-")) |
| 15 | + else "--" + (.key | gsub("_"; "-")) + " " + (.value | tostring) |
| 16 | + end |
| 17 | + ) | |
| 18 | + join(" ") |
| 19 | + ' |
| 20 | + ) |
| 21 | + echo "$args" |
19 | 22 | } |
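A quick illustration of how json2args treats a flag-style key (an empty string value yields a bare flag), which the example in the comment above does not show; the parameter values here are only illustrative:

    server_params='{"model": "facebook/opt-125m", "disable_log_requests": "", "tensor_parallel_size": 1}'
    json2args "$server_params"
    # prints: --model facebook/opt-125m --disable-log-requests --tensor-parallel-size 1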
20 | 23 |
21 | 24 | print_configuration() { |
22 | | - echo 'Running vLLM profiling with the following configuration:' |
23 | | - echo " Model: ${MODEL_NAME}" |
24 | | - echo " Served Model: ${SERVED_MODEL_NAME}" |
25 | | - echo " Dataset: ${DATASET_NAME}" |
26 | | - echo " Input Length: ${RANDOM_INPUT_LEN}" |
27 | | - echo " Output Length: ${RANDOM_OUTPUT_LEN}" |
28 | | - echo " Endpoint: ${ENDPOINT}" |
29 | | - echo " Host: ${HOST}" |
30 | | - echo " Port: ${PORT}" |
31 | | - echo " Num Prompts: ${NUM_PROMPTS}" |
32 | | - echo " VLLM_USE_V1: ${VLLM_USE_V1}" |
| 25 | + echo 'Running vLLM profiling with the following configuration:' |
| 26 | + echo " Profiler Dir: ${VLLM_TORCH_PROFILER_DIR:-not set}" |
| 27 | + echo " VLLM_USE_V1: ${VLLM_USE_V1:-1}" |
33 | 28 | } |
34 | 29 |
35 | 30 | install_dependencies() { |
36 | | - echo "Installing required dependencies..." |
37 | | - (which curl) || (apt-get update && apt-get install -y curl) |
38 | | - (which lsof) || (apt-get update && apt-get install -y lsof) |
| 31 | + echo "Installing required dependencies..." |
| 32 | + (which curl) || (apt-get update && apt-get install -y curl) |
| 33 | + (which lsof) || (apt-get update && apt-get install -y lsof) |
| 34 | + (which jq) || (apt-get update && apt-get -y install jq) |
39 | 35 | } |
40 | 36 |
41 | 37 | setup_workspace() { |
42 | | - # Ensure we're in the workspace directory, but don't go into vllm source |
43 | | - # The Docker container has vLLM pre-installed, we shouldn't run from source |
44 | | - cd /tmp/workspace |
45 | | - |
46 | | - # Create the profiling directory (no need for tilde expansion now) |
47 | | - echo "Creating profiling directory: ${VLLM_TORCH_PROFILER_DIR}" |
48 | | - mkdir -p "${VLLM_TORCH_PROFILER_DIR}" |
| 38 | + # Ensure we're in the workspace directory, but don't go into vllm source |
| 39 | + cd /tmp/workspace |
49 | 40 |
50 | | - # Ensure the directory is writable |
51 | | - chmod 755 "${VLLM_TORCH_PROFILER_DIR}" |
| 41 | + # Create the profiling directory |
| 42 | + echo "Creating profiling directory: ${VLLM_TORCH_PROFILER_DIR}" |
| 43 | + mkdir -p "${VLLM_TORCH_PROFILER_DIR}" |
| 44 | + chmod 755 "${VLLM_TORCH_PROFILER_DIR}" |
52 | 45 | } |
53 | 46 |
54 | 47 | wait_for_server() { |
55 | | - # Wait for vLLM server to start |
56 | | - # Return 1 if vLLM server crashes |
57 | | - timeout 1200 bash -c " |
58 | | - until curl -s ${HOST}:${PORT}/v1/models > /dev/null; do |
59 | | - sleep 1 |
60 | | - done" && return 0 || return 1 |
| 48 | + # Wait for vLLM server to start |
| 49 | + # Return 1 if the vLLM server crashes or does not come up before the timeout |
| 50 | + local host_port="${1:-localhost:8000}" |
| 51 | + timeout 1200 bash -c " |
| 52 | + until curl -s ${host_port}/v1/models > /dev/null; do |
| 53 | + sleep 1 |
| 54 | + done" && return 0 || return 1 |
61 | 55 | } |
62 | 56 |
63 | 57 | kill_gpu_processes() { |
64 | | - ps -aux |
65 | | - lsof -t -i:8000 | xargs -r kill -9 |
66 | | - pgrep python3 | xargs -r kill -9 |
67 | | - pgrep VLLM | xargs -r kill -9 |
68 | | - |
69 | | - # Wait until GPU memory usage decreases |
70 | | - if command -v nvidia-smi; then |
71 | | - echo "Waiting for GPU memory to clear..." |
72 | | - while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do |
73 | | - sleep 1 |
74 | | - done |
75 | | - fi |
| 58 | + ps -aux |
| 59 | + lsof -t -i:8000 | xargs -r kill -9 |
| 60 | + pgrep python3 | xargs -r kill -9 |
| 61 | + pgrep VLLM | xargs -r kill -9 |
| 62 | + |
| 63 | + # Wait until GPU memory usage decreases |
| 64 | + if command -v nvidia-smi; then |
| 65 | + echo "Waiting for GPU memory to clear..." |
| 66 | + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do |
| 67 | + sleep 1 |
| 68 | + done |
| 69 | + fi |
76 | 70 | } |
77 | 71 |
78 | 72 | start_vllm_server() { |
79 | | - echo "Starting vLLM server..." |
80 | | - |
81 | | - VLLM_USE_V1=${VLLM_USE_V1} python3 -m vllm.entrypoints.openai.api_server \ |
82 | | - --model "${MODEL_NAME}" \ |
83 | | - --swap-space 16 \ |
84 | | - --disable-log-requests \ |
85 | | - --host :: \ |
86 | | - --port "${PORT}" \ |
87 | | - --dtype float16 & |
88 | | - |
89 | | - server_pid=$! |
90 | | - echo "vLLM server started with PID: ${server_pid}" |
91 | | - |
92 | | - # Wait for server to be ready |
93 | | - echo "Waiting for vLLM server to be ready..." |
94 | | - if wait_for_server; then |
95 | | - echo "vLLM server is up and running!" |
96 | | - return 0 |
97 | | - else |
98 | | - echo "vLLM server failed to start within the timeout period." |
99 | | - kill -9 $server_pid 2>/dev/null || true |
100 | | - return 1 |
101 | | - fi |
| 73 | + local server_args="$1" |
| 74 | + |
| 75 | + echo "Starting vLLM server..." |
| 76 | + VLLM_USE_V1=${VLLM_USE_V1:-1} python3 -m vllm.entrypoints.openai.api_server ${server_args} & |
| 77 | + |
| 78 | + server_pid=$! |
| 79 | + echo "vLLM server started with PID: ${server_pid}" |
| 80 | + |
| 81 | + # Wait for server to be ready |
| 82 | + echo "Waiting for vLLM server to be ready..." |
| 83 | + if wait_for_server "${SERVER_HOST}:${SERVER_PORT}"; then |
| 84 | + echo "vLLM server is up and running!" |
| 85 | + return 0 |
| 86 | + else |
| 87 | + echo "vLLM server failed to start within the timeout period." |
| 88 | + kill -9 $server_pid 2>/dev/null || true |
| 89 | + return 1 |
| 90 | + fi |
102 | 91 | } |
103 | 92 |
104 | 93 | run_profiling() { |
105 | | - echo "Starting load generation for profiling..." |
106 | | - |
107 | | - local bench_command="vllm bench serve --dataset-name ${DATASET_NAME} --model ${MODEL_NAME} --served-model-name ${SERVED_MODEL_NAME} --random-input-len ${RANDOM_INPUT_LEN} --random-output-len ${RANDOM_OUTPUT_LEN} --endpoint ${ENDPOINT} --ignore-eos --host ${HOST} --port ${PORT} --num-prompts ${NUM_PROMPTS} --profile" |
108 | | - |
109 | | - echo "Load gen command: ${bench_command}" |
110 | | - |
111 | | - vllm bench serve \ |
112 | | - --dataset-name "${DATASET_NAME}" \ |
113 | | - --model "${MODEL_NAME}" \ |
114 | | - --served-model-name "${SERVED_MODEL_NAME}" \ |
115 | | - --random-input-len "${RANDOM_INPUT_LEN}" \ |
116 | | - --random-output-len "${RANDOM_OUTPUT_LEN}" \ |
117 | | - --endpoint "${ENDPOINT}" \ |
118 | | - --ignore-eos \ |
119 | | - --host "${HOST}" \ |
120 | | - --port "${PORT}" \ |
121 | | - --num-prompts "${NUM_PROMPTS}" \ |
122 | | - --profile |
| 94 | + local client_args="$1" |
| 95 | + |
| 96 | + echo "Starting load generation for profiling..." |
| 97 | + echo "Client command: vllm bench serve ${client_args}" |
| 98 | + |
| 99 | + vllm bench serve ${client_args} |
123 | 100 | } |
124 | 101 |
125 | 102 | cleanup_server() { |
126 | | - echo "Stopping vLLM server..." |
127 | | - kill -9 $server_pid 2>/dev/null || true |
128 | | - kill_gpu_processes |
| 103 | + echo "Stopping vLLM server..." |
| 104 | + kill -9 $server_pid 2>/dev/null || true |
| 105 | + kill_gpu_processes |
| 106 | +} |
| 107 | + |
| 108 | +run_profiling_tests() { |
| 109 | + # run profiling tests using JSON configuration |
| 110 | + local profiling_test_file="$1" |
| 111 | + |
| 112 | + if [[ ! -f "$profiling_test_file" ]]; then |
| 113 | + echo "Error: Profiling test file $profiling_test_file not found!" |
| 114 | + exit 1 |
| 115 | + fi |
| 116 | + |
| 117 | + # Iterate over profiling tests |
| 118 | + jq -c '.[]' "$profiling_test_file" | while read -r params; do |
| 119 | + # Get the test name |
| 120 | + TEST_NAME=$(echo "$params" | jq -r '.test_name') |
| 121 | + echo "Running profiling test case: $TEST_NAME" |
| 122 | + |
| 123 | + |
| 124 | + # Extract server and client parameters |
| 125 | + server_params=$(echo "$params" | jq -r '.server_parameters') |
| 126 | + client_params=$(echo "$params" | jq -r '.client_parameters') |
| 127 | + |
| 128 | + # Convert JSON to command line arguments |
| 129 | + server_args=$(json2args "$server_params") |
| 130 | + client_args=$(json2args "$client_params") |
| 131 | + |
| 132 | + # Extract host and port for server health check |
| 133 | + SERVER_HOST=$(echo "$server_params" | jq -r '.host // "::"') |
| 134 | + SERVER_PORT=$(echo "$server_params" | jq -r '.port // 8000') |
| 135 | + |
| 136 | + # Convert :: to localhost for health check |
| 137 | + if [[ "$SERVER_HOST" == "::" ]]; then |
| 138 | + SERVER_HOST="localhost" |
| 139 | + fi |
| 140 | + |
| 141 | + # Clean up any existing processes first |
| 142 | + kill_gpu_processes |
| 143 | + |
| 144 | + # Run the profiling test |
| 145 | + if start_vllm_server "$server_args"; then |
| 146 | + run_profiling "$client_args" |
| 147 | + cleanup_server |
| 148 | + |
| 149 | + # Debug: Check if profiling files were created |
| 150 | + echo "DEBUG: Checking profiling directory: ${VLLM_TORCH_PROFILER_DIR}" |
| 151 | + if [ -d "${VLLM_TORCH_PROFILER_DIR}" ]; then |
| 152 | + echo "DEBUG: Profiling directory exists for test $TEST_NAME" |
| 153 | + ls -la "${VLLM_TORCH_PROFILER_DIR}" || echo "DEBUG: Directory is empty or inaccessible" |
| 154 | + find "${VLLM_TORCH_PROFILER_DIR}" -type f 2>/dev/null | head -10 | while read -r file; do |
| 155 | + echo "DEBUG: Found profiling file: ${file}" |
| 156 | + done |
| 157 | + else |
| 158 | + echo "DEBUG: Profiling directory does not exist for test $TEST_NAME!" |
| 159 | + fi |
| 160 | + |
| 161 | + echo "Profiling test $TEST_NAME completed successfully." |
| 162 | + else |
| 163 | + echo "Failed to start vLLM server for test $TEST_NAME." |
| 164 | + continue |
| 165 | + fi |
| 166 | + done |
129 | 167 | } |
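The loop above relies on jq -c '.[]' emitting one compact JSON object per line, which the while read -r loop then consumes one test case at a time; because of the pipe, the loop body runs in a subshell, so variables such as SERVER_HOST and SERVER_PORT stay scoped to the loop. A minimal sketch of that mechanism (the test names are made up):

    echo '[{"test_name": "a"}, {"test_name": "b"}]' | jq -c '.[]' | while read -r params; do
        echo "test: $(echo "$params" | jq -r '.test_name')"
    done
    # prints:
    # test: a
    # test: b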
130 | 168 |
131 | 169 | main() { |
132 | | - # Setup phase |
133 | | - setup_environment |
134 | | - print_configuration |
135 | | - install_dependencies |
136 | | - setup_workspace |
137 | | - |
138 | | - # Debug: Show environment variables |
139 | | - echo "DEBUG: VLLM_TORCH_PROFILER_DIR=${VLLM_TORCH_PROFILER_DIR:-not set}" |
140 | | - |
141 | | - # Clean up any existing processes first |
142 | | - kill_gpu_processes |
143 | | - |
144 | | - # Main execution phase |
145 | | - if start_vllm_server; then |
146 | | - run_profiling |
147 | | - cleanup_server |
148 | | - |
149 | | - echo "Profiling completed. Artifacts should be available in ${VLLM_TORCH_PROFILER_DIR:-default profiler directory}." |
150 | | - else |
151 | | - echo "Failed to start vLLM server. Exiting." |
152 | | - exit 1 |
153 | | - fi |
| 170 | + # Set default values |
| 171 | + export VLLM_USE_V1=${VLLM_USE_V1:-1} |
| 172 | + |
| 173 | + # Setup phase |
| 174 | + print_configuration |
| 175 | + install_dependencies |
| 176 | + setup_workspace |
| 177 | + |
| 178 | + # Determine the profiling test file based on device type |
| 179 | + local device_name="${DEVICE_NAME:-cuda}" |
| 180 | + local profiling_test_file="/tmp/workspace/vllm-profiling/${device_name}/profiling-tests.json" |
| 181 | + |
| 182 | + echo "Looking for profiling test file: $profiling_test_file" |
| 183 | + |
| 184 | + if [[ -f "$profiling_test_file" ]]; then |
| 185 | + echo "Found profiling test file: $profiling_test_file" |
| 186 | + run_profiling_tests "$profiling_test_file" |
| 187 | + else |
| 188 | + echo "Error: No profiling test file found at $profiling_test_file" |
| 189 | + echo "Available files in vllm-profiling/:" |
| 190 | + find /tmp/workspace/vllm-profiling/ -name "*.json" 2>/dev/null || echo "No JSON files found" |
| 191 | + exit 1 |
| 192 | + fi |
| 193 | + |
| 194 | + echo "All profiling tests completed. Artifacts should be available in ${VLLM_TORCH_PROFILER_DIR:-default profiler directory}." |
154 | 195 | } |
155 | 196 |
156 | | -# Run the main function |
157 | 197 | main "$@" |
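For reference, a minimal profiling-tests.json (placed at /tmp/workspace/vllm-profiling/cuda/profiling-tests.json for the default DEVICE_NAME=cuda) that this script could consume might look like the sketch below. Only test_name, server_parameters, and client_parameters are read at the top level; the individual parameters simply mirror the defaults of the old environment-variable version and are illustrative rather than required:

    [
        {
            "test_name": "opt125m_random_profile",
            "server_parameters": {
                "model": "facebook/opt-125m",
                "swap_space": 16,
                "disable_log_requests": "",
                "host": "::",
                "port": 8000,
                "dtype": "float16"
            },
            "client_parameters": {
                "dataset_name": "random",
                "model": "facebook/opt-125m",
                "served_model_name": "facebook/opt-125m",
                "random_input_len": 750,
                "random_output_len": 75,
                "endpoint": "/v1/completions",
                "ignore_eos": "",
                "host": "localhost",
                "port": 8000,
                "num_prompts": 100,
                "profile": ""
            }
        }
    ]

With such a file in place, json2args expands each entry into the server command (python3 -m vllm.entrypoints.openai.api_server <server args>) and the client command (vllm bench serve <client args>). VLLM_TORCH_PROFILER_DIR must be exported before running, since setup_workspace uses it unguarded under set -u, and DEVICE_NAME selects which per-device JSON file is loaded.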