Skip to content

Commit d67c812

Browse files
[Profiling] Upload vLLM Profiling results to AWS S3 (#81)
* upload profiling results to S3 * fix * fix repo name * add model level hierarchy * review comments * rename test name
1 parent 4d1bbbf commit d67c812

File tree

4 files changed

+87
-5
lines changed

4 files changed

+87
-5
lines changed

.github/scripts/run_vllm_profiling.sh

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@ cleanup_server() {
5959
run_profiling_tests() {
6060
# run profiling tests using JSON configuration
6161
local profiling_test_file="$1"
62+
local base_profiler_dir="${VLLM_TORCH_PROFILER_DIR:-}"
63+
64+
if [[ -z "${base_profiler_dir}" ]]; then
65+
echo "Error: VLLM_TORCH_PROFILER_DIR is not set."
66+
exit 1
67+
fi
6268

6369
if [[ ! -f "$profiling_test_file" ]]; then
6470
echo "Error: Profiling test file $profiling_test_file not found!"
@@ -92,18 +98,29 @@ run_profiling_tests() {
9298
# Clean up any existing processes first
9399
kill_gpu_processes
94100

101+
# Create a profiling sub-directory for each test case to isolate the
102+
# generated traces (e.g. using the model name hierarchy)
103+
local sanitized_test_name="${TEST_NAME// /_}"
104+
local test_name_directory="${base_profiler_dir}/${sanitized_test_name}"
105+
mkdir -p "${test_name_directory}"
106+
chmod 755 "${test_name_directory}"
107+
108+
# Override the profiler output directory for this test only
109+
export VLLM_TORCH_PROFILER_DIR="${test_name_directory}"
110+
95111
# Run the profiling test
96112
if start_vllm_server "$server_args"; then
97113
run_profiling "$client_args"
98114
cleanup_server
99115

100116
# Debug: Check if profiling files were created
101-
echo "DEBUG: Checking profiling directory: ${VLLM_TORCH_PROFILER_DIR}"
102-
if [ -d "${VLLM_TORCH_PROFILER_DIR}" ]; then
117+
echo "DEBUG: Checking profiling directory: $test_name_directory"
118+
if [ -d "$test_name_directory" ]; then
103119
echo "DEBUG: Profiling directory exists for test $TEST_NAME"
104-
ls -la "${VLLM_TORCH_PROFILER_DIR}" || echo "DEBUG: Directory is empty or inaccessible"
105-
find "${VLLM_TORCH_PROFILER_DIR}" -type f 2>/dev/null | head -10 | while read file; do
120+
ls -la "$test_name_directory" || echo "DEBUG: Directory is empty or inaccessible"
121+
find "$test_name_directory" -type f 2>/dev/null | head -10 | while read file; do
106122
echo "DEBUG: Found profiling file: ${file}"
123+
rename_profiling_file "$file" "vllm"
107124
done
108125
else
109126
echo "DEBUG: Profiling directory does not exist for test $TEST_NAME!"
@@ -115,6 +132,9 @@ run_profiling_tests() {
115132
continue
116133
fi
117134
done
135+
136+
# Ensure the profiler directory is restored after processing all tests
137+
export VLLM_TORCH_PROFILER_DIR="${base_profiler_dir}"
118138
}
119139

120140
main() {

.github/scripts/utilities.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,46 @@ check_hf_token() {
133133
echo "HF_TOKEN is set and valid."
134134
fi
135135
}
136+
137+
rename_profiling_file() {
  # Rename a profiler trace file to a standardized, prefix-based name.
  #
  # Arguments:
  #   $1: file path to rename
  #   $2: prefix name (e.g., "vllm", "sglang")
  # Outputs:
  #   DEBUG progress messages on stdout.
  # Returns:
  #   0 if the file was renamed, already has the target name, or is not a
  #   .pt.trace.json.gz file (skipped); 1 if the rename failed.
  #
  # NOTE(review): every matching trace in one directory maps to the same
  # target name, so a later rename overwrites an earlier one. Confirm that
  # at most one trace of each kind is expected per directory.
  local file="$1"
  local prefix_name="$2"

  # Only .pt.trace.json.gz files are renamed; anything else is left alone.
  if [[ "$file" != *.pt.trace.json.gz ]]; then
    echo "DEBUG: Skipping non-profiling file: ${file}"
    return 0
  fi

  # Declare first, assign second, so a failing dirname/basename is not
  # hidden behind the always-successful `local`.
  local dir_path
  local basename_file
  dir_path=$(dirname -- "$file") || return 1
  basename_file=$(basename -- "$file") || return 1

  # Determine the new filename from the original file name: traces whose
  # name contains ".async_llm." keep that marker in the new name.
  local new_filename
  if [[ "$basename_file" == *".async_llm."* ]]; then
    new_filename="${prefix_name}.async_llm.pt.trace.json.gz"
  else
    new_filename="${prefix_name}.pt.trace.json.gz"
  fi

  local new_filepath="${dir_path}/${new_filename}"

  # Compare base names rather than full paths: a bare relative path such as
  # "vllm.pt.trace.json.gz" resolves to "./vllm.pt.trace.json.gz", which is
  # textually different but the same file, and mv would fail on it.
  if [[ "$basename_file" == "$new_filename" ]]; then
    echo "DEBUG: File ${file} already has correct name"
    return 0
  fi

  echo "DEBUG: Renaming ${file} to ${new_filepath}"
  # `--` keeps a path starting with '-' from being parsed as an option.
  if mv -- "$file" "$new_filepath"; then
    echo "DEBUG: Successfully renamed to ${new_filepath}"
    return 0
  fi

  echo "DEBUG: Failed to rename ${file}"
  return 1
}

.github/workflows/vllm-profiling.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,25 @@ jobs:
214214
)
215215
docker exec -t "${container_name}" bash -c "cd vllm-profiling && bash ../.github/scripts/run_vllm_profiling.sh"
216216
217+
- name: Prepare S3 upload metadata
218+
id: prepare_s3_upload
219+
env:
220+
REPOSITORY: vllm-project/vllm
221+
run: |
222+
set -eux
223+
224+
UPLOAD_DATE=$(date -u +"%Y-%m-%d")
225+
echo "upload-date=${UPLOAD_DATE}" >> "${GITHUB_OUTPUT}"
226+
echo "s3-prefix=${UPLOAD_DATE}/${REPOSITORY}/${HEAD_SHA}/${GITHUB_RUN_ID}/${GITHUB_JOB}" >> "${GITHUB_OUTPUT}"
227+
228+
- name: Upload profiling results to S3
229+
uses: seemethere/upload-artifact-s3@v5
230+
with:
231+
s3-prefix: ${{ steps.prepare_s3_upload.outputs.s3-prefix }}
232+
retention-days: 180
233+
path: vllm-profiling/profiling-results
234+
if-no-files-found: warn
235+
217236
- uses: actions/upload-artifact@v4
218237
with:
219238
name: profiling-results--${{ env.DEVICE_TYPE }}

vllm-profiling/cuda/profiling-tests.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"test_name": "profiling_opt_125m_tp1_random",
3+
"test_name": "facebook_opt_125m_tp1_random",
44
"server_parameters": {
55
"model": "facebook/opt-125m",
66
"swap_space": 16,

0 commit comments

Comments
 (0)