[Profiling] Upload vLLM Profiling results to AWS S3 (#81)

namanlalitnyu · web-flow · commit d67c81241b4d · 2025-09-17T13:54:44.000-07:00
* upload profiling results to S3

* fix

* fix repo name

* add model level hierarchy

* review comments

* rename test name
diff --git a/.github/scripts/run_vllm_profiling.sh b/.github/scripts/run_vllm_profiling.sh
@@ -59,6 +59,12 @@ cleanup_server() {
 run_profiling_tests() {
     # run profiling tests using JSON configuration
     local profiling_test_file="$1"
+    local base_profiler_dir="${VLLM_TORCH_PROFILER_DIR:-}"
+
+    if [[ -z "${base_profiler_dir}" ]]; then
+        echo "Error: VLLM_TORCH_PROFILER_DIR is not set."
+        exit 1
+    fi
 
     if [[ ! -f "$profiling_test_file" ]]; then
         echo "Error: Profiling test file $profiling_test_file not found!"
@@ -92,18 +98,29 @@ run_profiling_tests() {
         # Clean up any existing processes first
         kill_gpu_processes
 
+        # Create a profiling sub-directory for each test case to isolate the
+        # generated traces (e.g. using the model name hierarchy)
+        local sanitized_test_name="${TEST_NAME// /_}"
+        local test_name_directory="${base_profiler_dir}/${sanitized_test_name}"
+        mkdir -p "${test_name_directory}"
+        chmod 755 "${test_name_directory}"
+
+        # Override the profiler output directory for this test only
+        export VLLM_TORCH_PROFILER_DIR="${test_name_directory}"
+
         # Run the profiling test
         if start_vllm_server "$server_args"; then
             run_profiling "$client_args"
             cleanup_server
 
             # Debug: Check if profiling files were created
-            echo "DEBUG: Checking profiling directory: ${VLLM_TORCH_PROFILER_DIR}"
-            if [ -d "${VLLM_TORCH_PROFILER_DIR}" ]; then
+            echo "DEBUG: Checking profiling directory: $test_name_directory"
+            if [ -d "$test_name_directory" ]; then
                 echo "DEBUG: Profiling directory exists for test $TEST_NAME"
-                ls -la "${VLLM_TORCH_PROFILER_DIR}" || echo "DEBUG: Directory is empty or inaccessible"
-                find "${VLLM_TORCH_PROFILER_DIR}" -type f 2>/dev/null | head -10 | while read file; do
+                ls -la "$test_name_directory" || echo "DEBUG: Directory is empty or inaccessible"
+                find "$test_name_directory" -type f 2>/dev/null | head -10 | while read file; do
                     echo "DEBUG: Found profiling file: ${file}"
+                    rename_profiling_file "$file" "vllm"
                 done
             else
                 echo "DEBUG: Profiling directory does not exist for test $TEST_NAME!"
@@ -115,6 +132,9 @@ run_profiling_tests() {
             continue
         fi
     done
+
+    # Ensure the profiler directory is restored after processing all tests
+    export VLLM_TORCH_PROFILER_DIR="${base_profiler_dir}"
 }
 
 main() {
diff --git a/.github/scripts/utilities.sh b/.github/scripts/utilities.sh
@@ -133,3 +133,46 @@ check_hf_token() {
         echo "HF_TOKEN is set and valid."
     fi
 }
+
rename_profiling_file() {
    # Rename a profiler trace to a standardized name inside its own directory.
    #
    # Arguments:
    #   $1: path of the file to rename
    #   $2: prefix for the new name (e.g., "vllm", "sglang")
    # Outputs:
    #   DEBUG progress messages on stdout
    # Returns:
    #   0 if the file was renamed, already had the target name, or is not a
    #   .pt.trace.json.gz file; 1 if the path could not be split or mv failed
    local file="$1"
    local prefix_name="$2"

    # Anything that is not a .pt.trace.json.gz trace is left untouched
    if [[ "$file" != *.pt.trace.json.gz ]]; then
        echo "DEBUG: Skipping non-profiling file: ${file}"
        return 0
    fi

    # Declare first, assign second: combining them would hide a failing
    # dirname/basename behind the always-successful `local`
    local dir_path basename_file
    dir_path=$(dirname -- "$file") || return 1
    basename_file=$(basename -- "$file") || return 1

    # Traces whose basename contains ".async_llm." keep that marker
    local new_filename="${prefix_name}.pt.trace.json.gz"
    if [[ "$basename_file" == *".async_llm."* ]]; then
        new_filename="${prefix_name}.async_llm.pt.trace.json.gz"
    fi

    local new_filepath="${dir_path}/${new_filename}"

    # Compare basenames rather than full paths: for a bare "name" dirname
    # yields ".", so "name" and "./name" would otherwise look different and
    # mv would be asked to move the file onto itself
    if [[ "$basename_file" == "$new_filename" ]]; then
        echo "DEBUG: File ${file} already has correct name"
        return 0
    fi

    # The target name depends only on the prefix and the async_llm marker, so
    # several traces in one directory map to the same path and mv replaces the
    # earlier one. Keep that behavior but make it visible in the log.
    if [[ -e "$new_filepath" ]]; then
        echo "DEBUG: Warning: ${new_filepath} already exists and will be overwritten"
    fi

    echo "DEBUG: Renaming ${file} to ${new_filepath}"
    if mv -- "$file" "$new_filepath"; then
        echo "DEBUG: Successfully renamed to ${new_filepath}"
        return 0
    fi

    echo "DEBUG: Failed to rename ${file}"
    return 1
}
diff --git a/.github/workflows/vllm-profiling.yml b/.github/workflows/vllm-profiling.yml
@@ -214,6 +214,25 @@ jobs:
           )
           docker exec -t "${container_name}" bash -c "cd vllm-profiling && bash ../.github/scripts/run_vllm_profiling.sh"
 
+      - name: Prepare S3 upload metadata
+        id: prepare_s3_upload
+        env:
+          REPOSITORY: vllm-project/vllm
+        run: |
+          set -eux
+
+          UPLOAD_DATE=$(date -u +"%Y-%m-%d")
+          echo "upload-date=${UPLOAD_DATE}" >> "${GITHUB_OUTPUT}"
+          echo "s3-prefix=${UPLOAD_DATE}/${REPOSITORY}/${HEAD_SHA}/${GITHUB_RUN_ID}/${GITHUB_JOB}" >> "${GITHUB_OUTPUT}"
+
+      - name: Upload profiling results to S3
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-prefix: ${{ steps.prepare_s3_upload.outputs.s3-prefix }}
+          retention-days: 180
+          path: vllm-profiling/profiling-results
+          if-no-files-found: warn
+
       - uses: actions/upload-artifact@v4
         with:
           name: profiling-results--${{ env.DEVICE_TYPE }}
diff --git a/vllm-profiling/cuda/profiling-tests.json b/vllm-profiling/cuda/profiling-tests.json
@@ -1,6 +1,6 @@
 [
     {
-        "test_name": "profiling_opt_125m_tp1_random",
+        "test_name": "facebook_opt_125m_tp1_random",
         "server_parameters": {
             "model": "facebook/opt-125m",
             "swap_space": 16,

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`[`
`2`	`2`	`{`
`3`		`- "test_name": "profiling_opt_125m_tp1_random",`
	`3`	`+ "test_name": "facebook_opt_125m_tp1_random",`
`4`	`4`	`"server_parameters": {`
`5`	`5`	`"model": "facebook/opt-125m",`
`6`	`6`	`"swap_space": 16,`