35 changes: 35 additions & 0 deletions .github/workflows/e2e_test-on-change.yml
@@ -0,0 +1,35 @@
name: E2E Test on change

on:
push:
branches:
- main
- 'feature/**'
pull_request:
branches:
- main
- 'feature/**'

jobs:
e2e-tests:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.13']
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Set up PDM
uses: pdm-project/setup-pdm@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pdm sync -d
- name: Run e2e tests
run: |
pdm run test:e2e
10 changes: 9 additions & 1 deletion .github/workflows/format.yml
@@ -6,19 +6,27 @@ on:
- main
- 'feature/**'
pull_request:
branches:
- main
- 'feature/**'

jobs:
format-check:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.13']
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.13'
python-version: ${{ matrix.python-version }}
- name: Set up PDM
uses: pdm-project/setup-pdm@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pdm sync -d
5 changes: 4 additions & 1 deletion .github/workflows/publish-on-release.yml
@@ -60,13 +60,16 @@ jobs:
python-package:
needs: build-and-publish # Run after the release is created
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.13']
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
python-version: ${{ matrix.python-version }}

- name: Install build dependencies
run: |
5 changes: 4 additions & 1 deletion .github/workflows/test-release.yml
@@ -6,13 +6,16 @@ on:
jobs:
test-python-package:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.13']
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
python-version: ${{ matrix.python-version }}

- name: Install build dependencies
run: |
7 changes: 6 additions & 1 deletion .github/workflows/unit_test.yml
@@ -10,15 +10,20 @@ on:
jobs:
format-check:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.13']
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.13'
python-version: ${{ matrix.python-version }}
- name: Set up PDM
uses: pdm-project/setup-pdm@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pdm sync -d
18 changes: 18 additions & 0 deletions e2e/configs/e2e_simple_mock_client.yaml
@@ -0,0 +1,18 @@
data:
type: mock
load:
type: constant
stages:
- rate: 1
duration: 10
num_workers: 2
api:
type: chat
server:
type: mock
base_url: http://0.0.0.0:8000
report:
request_lifecycle:
summary: true
per_stage: true
per_request: true
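The three request_lifecycle flags above line up with the report files asserted by the e2e test further down. As a small illustrative sketch (file names are taken from e2e/tests/test_mock_client.py; the flag-to-file association itself is an assumption, not taken from inference-perf):

# Illustrative only: which report flag is expected to yield which JSON file.
EXPECTED_REPORT_FILES = {
    "summary": "summary_lifecycle_metrics.json",
    "per_stage": "stage_0_lifecycle_metrics.json",       # one file per load stage
    "per_request": "per_request_lifecycle_metrics.json",
}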
Empty file added e2e/conftest.py
Empty file.
22 changes: 22 additions & 0 deletions e2e/tests/test_mock_client.py
@@ -0,0 +1,22 @@
import pytest

from utils.benchmark import run_benchmark_minimal


def test_simple_mock_client_benchmark():
result = run_benchmark_minimal("e2e/configs/e2e_simple_mock_client.yaml", timeout_sec=None)
assert result.success, "Benchmark failed"
assert result.reports, "No reports generated from benchmark"
assert result.reports["per_request_lifecycle_metrics.json"], "Missing requests report"
assert result.reports["stage_0_lifecycle_metrics.json"], "Missing stage report"
assert result.reports["summary_lifecycle_metrics.json"], "Missing summary report"

requests_report = result.reports["per_request_lifecycle_metrics.json"]
stage_report = result.reports["stage_0_lifecycle_metrics.json"]
summary_report = result.reports["summary_lifecycle_metrics.json"]

assert len(requests_report) == 10, "the number of requests should be 10"
assert stage_report["load_summary"]["achieved_rate"] > 1 or stage_report["load_summary"]["achieved_rate"] == pytest.approx(
1, abs=0.2
), "the achieved rate should be close to 1.0"
assert summary_report["successes"]["count"] == 10
115 changes: 115 additions & 0 deletions e2e/utils/benchmark.py
@@ -0,0 +1,115 @@
import json
import os
import shlex
import subprocess
import tempfile
import yaml
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional, List, Union

logger = logging.getLogger(__name__)


@dataclass
class BenchmarkResult:
"""Result of a minimal benchmark run."""

success: bool # True if process exit code == 0 and not timed out
timed_out: bool # True if we hit timeout and killed the process
returncode: int # Raw process return code (or -9/-15 on kill)
stdout: str # Combined stdout/stderr text
work_dir: Path # Working directory used for the run
reports: Optional[Dict[str, Any]] # Parsed json for reports if present


def _process_yaml_config(config: Union[str, Path, Dict[str, Any]], out_dir: Path) -> Path:
out_dir.mkdir(parents=True, exist_ok=True)
cfg_path = out_dir / "config_input.yaml"

if isinstance(config, (str, Path)):
src = Path(config)
if not src.exists():
raise FileNotFoundError(f"Config file not found: {src}")
config = yaml.safe_load(src.read_text(encoding="utf-8"))

# Overwrite the output path so reports land in the temporary work directory
config["storage"] = {"local_storage": {"path": out_dir.as_posix()}}

cfg_path.write_text(
yaml.safe_dump(config, sort_keys=False, default_flow_style=False),
encoding="utf-8",
)
return cfg_path


def _find_report_files(path: Path) -> Optional[List[Path]]:
"""Return the json reports files under path (if any)."""
candidates = list(path.glob("**/*.json"))
if not candidates:
return None
return candidates


def run_benchmark_minimal(
config: Union[str, Path, Dict[str, Any]],
*,
work_dir: Optional[Union[str, Path]] = None,
executable: str = "inference-perf",
timeout_sec: Optional[int] = 300,
extra_env: Optional[Dict[str, str]] = None,
) -> BenchmarkResult:
"""
Minimal wrapper:
- materializes config to YAML in work_dir,
- runs `inference-perf --config_file <config.yaml> --log-level DEBUG`,
- returns success/failure, stdout text, and the parsed JSON reports (if present).
On timeout:
- kills the spawned process,
- marks `timed_out=True`, returns collected stdout up to kill.
"""
wd = Path(work_dir) if work_dir else Path(tempfile.mkdtemp(prefix="inference-perf-e2e-"))
cfg_path = _process_yaml_config(config, wd)

env = os.environ.copy()
if extra_env:
env.update({k: str(v) for k, v in extra_env.items()})

cmd = f"{shlex.quote(executable)} --config_file {shlex.quote(str(cfg_path))} --log-level DEBUG"

timed_out = False
try:
proc = subprocess.run(
cmd,
cwd=str(wd),
env=env,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
timeout=timeout_sec,
)
stdout = proc.stdout
return_code = proc.returncode
except subprocess.TimeoutExpired as e:
timed_out = True
stdout = e.stdout
return_code = -9

success = (return_code == 0) and (not timed_out)

logger.info("Benchmark output:\n%s", stdout)

# Attempt to read any JSON report files (optional)
report_path = _find_report_files(wd)
reports = {report.name: json.loads(report.read_text(encoding="utf-8")) for report in report_path} if report_path else None

return BenchmarkResult(
success=success,
timed_out=timed_out,
returncode=return_code,
stdout=stdout or "",
work_dir=wd,
reports=reports,
)
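Beyond the file-based config used by the test, run_benchmark_minimal also accepts an inline dict, a custom work_dir, and environment overrides. A minimal usage sketch, assuming the inference-perf executable is on PATH and the e2e directory is importable (as in the test above); the EXAMPLE_FLAG variable is purely illustrative:

# Usage sketch for run_benchmark_minimal; the inline config mirrors
# e2e/configs/e2e_simple_mock_client.yaml.
from utils.benchmark import run_benchmark_minimal

config = {
    "data": {"type": "mock"},
    "load": {"type": "constant", "stages": [{"rate": 1, "duration": 10}], "num_workers": 2},
    "api": {"type": "chat"},
    "server": {"type": "mock", "base_url": "http://0.0.0.0:8000"},
    "report": {"request_lifecycle": {"summary": True, "per_stage": True, "per_request": True}},
}

result = run_benchmark_minimal(
    config,                           # dict configs are written out as YAML in the work dir
    timeout_sec=120,                  # the run is killed and timed_out is set after two minutes
    extra_env={"EXAMPLE_FLAG": "1"},  # hypothetical variable, merged into the subprocess environment
)

if result.success and result.reports:
    print(result.reports["summary_lifecycle_metrics.json"]["successes"]["count"])
else:
    print(result.stdout)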
55 changes: 51 additions & 4 deletions inference_perf/client/modelserver/mock_client.py
@@ -16,7 +16,7 @@
from typing import List, Optional
from inference_perf.config import APIConfig, APIType
from inference_perf.apis import InferenceAPIData, InferenceInfo, RequestLifecycleMetric, ErrorResponseInfo
from .base import ModelServerClient
from .base import ModelServerClient, ModelServerPrometheusMetric, PrometheusMetricMetadata
import asyncio
import time
import logging
@@ -29,12 +29,13 @@ def __init__(
self,
metrics_collector: RequestDataCollector,
api_config: APIConfig,
timeout: Optional[int] = None,
mock_latency: float = 3,
timeout: Optional[float] = None,
mock_latency: float = 1,
) -> None:
super().__init__(api_config, timeout)
self.metrics_collector = metrics_collector
self.mock_latency = mock_latency
self.tokenizer = None

async def process_request(self, data: InferenceAPIData, stage_id: int, scheduled_time: float) -> None:
start = time.perf_counter()
@@ -44,7 +45,8 @@ async def process_request(self, data: InferenceAPIData, stage_id: int, scheduled
await asyncio.sleep(self.timeout)
raise asyncio.exceptions.TimeoutError()
else:
await asyncio.sleep(self.mock_latency)
if self.mock_latency > 0:
await asyncio.sleep(self.mock_latency)
self.metrics_collector.record_metric(
RequestLifecycleMetric(
stage_id=stage_id,
@@ -81,3 +83,48 @@ async def process_request(self, data: InferenceAPIData, stage_id: int, scheduled

def get_supported_apis(self) -> List[APIType]:
return [APIType.Completion, APIType.Chat]

def get_prometheus_metric_metadata(self) -> PrometheusMetricMetadata:
mock_prometheus_metric = ModelServerPrometheusMetric(
name="mock_metric",
op="mean",
type="counter",
filters=[],
)
return PrometheusMetricMetadata(
# Throughput
prompt_tokens_per_second=mock_prometheus_metric,
output_tokens_per_second=mock_prometheus_metric,
requests_per_second=mock_prometheus_metric,
# Latency
avg_request_latency=mock_prometheus_metric,
median_request_latency=mock_prometheus_metric,
p90_request_latency=mock_prometheus_metric,
p99_request_latency=mock_prometheus_metric,
# Request
total_requests=mock_prometheus_metric,
avg_prompt_tokens=mock_prometheus_metric,
avg_output_tokens=mock_prometheus_metric,
avg_queue_length=mock_prometheus_metric,
# Others
avg_time_to_first_token=None,
median_time_to_first_token=None,
p90_time_to_first_token=None,
p99_time_to_first_token=None,
avg_time_per_output_token=None,
median_time_per_output_token=None,
p90_time_per_output_token=None,
p99_time_per_output_token=None,
avg_inter_token_latency=None,
median_inter_token_latency=None,
p90_inter_token_latency=None,
p99_inter_token_latency=None,
avg_kv_cache_usage=None,
median_kv_cache_usage=None,
p90_kv_cache_usage=None,
p99_kv_cache_usage=None,
num_preemptions_total=None,
num_requests_swapped=None,
prefix_cache_hits=None,
prefix_cache_queries=None,
)
1 change: 1 addition & 0 deletions inference_perf/config.py
@@ -77,6 +77,7 @@ class ModelServerType(Enum):
VLLM = "vllm"
SGLANG = "sglang"
TGI = "tgi"
MOCK = "mock"


class LoadType(Enum):
2 changes: 1 addition & 1 deletion inference_perf/loadgen/load_generator.py
@@ -62,7 +62,7 @@ def __init__(
finished_requests_counter: "Synchronized[int]",
active_requests_counter: "Synchronized[int]",
):
super().__init__()
super().__init__(daemon=True)  # kill the worker process if the main process exits unexpectedly
self.id = id
self.client = client
self.request_queue = request_queue
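The daemon=True flag passed above relies on standard multiprocessing semantics: daemonic child processes are terminated when the main process exits instead of being left behind as orphans. A standalone sketch of the same pattern (independent of inference-perf):

# Standalone illustration of daemonic worker behaviour; not inference-perf code.
import multiprocessing
import time


def work() -> None:
    while True:  # a worker loop that never returns on its own
        time.sleep(1)


if __name__ == "__main__":
    worker = multiprocessing.Process(target=work, daemon=True)
    worker.start()
    time.sleep(2)
    # When the main process exits here, the daemonic worker is terminated
    # automatically; a non-daemonic worker would keep the program alive.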