
Commit aa5c195

Yijia J committed

testpr

Change to ubuntu-latest since ubuntu-20.04 is deprecated.

Move prefix cache from MaxText (#239)

Retry grpc async request (#240)
The exception raised by the asyncio task was not properly caught. If the server is not ready, the benchmark serving blocks forever without notice. Retry the connection to the server.

Adding PyTests in JetStream unit test workflow for code coverage. (#242)

Supporting Multi-LoRA inferencing via JetStream server (#221)
Supports Multi-LoRA inferencing via the JetStream server, following the [LLM Inference gateway API protocols](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol#inference-api-protocol):
- Implemented an adapter_tensorstore to load, store, manage, and unload the adapter weights.
- Added and exposed the [required metrics](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol#metrics-reporting) at the Prometheus endpoint.
- Added a multi_lora_decoding service with the corresponding APIs, per the [requirement](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol#inference-api-protocol).
- Implemented single-LoRA functionality support.
1 parent a2b3333 commit aa5c195
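
The adapter_tensorstore mentioned in the commit message is not part of the diffs shown below. For orientation only, here is a minimal sketch of what an in-memory store for LoRA adapter weights could look like; the class name, method names, and capacity-based eviction are hypothetical and do not mirror the actual JetStream implementation added in #221.

```python
# Hypothetical sketch only; names and the eviction policy are illustrative and
# are not the adapter_tensorstore API introduced by #221.
import threading
from typing import Any, Dict


class SimpleAdapterStore:
  """Keeps LoRA adapter weights in memory, keyed by adapter id."""

  def __init__(self, capacity: int = 4):
    self._capacity = capacity  # Max adapters resident at once.
    self._adapters: Dict[str, Any] = {}  # adapter_id -> weights (e.g. a pytree).
    self._lock = threading.Lock()

  def load(self, adapter_id: str, weights: Any) -> None:
    """Registers (or refreshes) an adapter, evicting one if at capacity."""
    with self._lock:
      if (adapter_id not in self._adapters
          and len(self._adapters) >= self._capacity):
        self._adapters.pop(next(iter(self._adapters)))  # Naive eviction.
      self._adapters[adapter_id] = weights

  def get(self, adapter_id: str) -> Any:
    """Returns the weights for adapter_id; raises KeyError if not loaded."""
    with self._lock:
      return self._adapters[adapter_id]

  def unload(self, adapter_id: str) -> None:
    """Drops an adapter's weights if present."""
    with self._lock:
      self._adapters.pop(adapter_id, None)
```

A production store would likely also track loading status and a memory budget so adapters can be swapped without blocking decode requests; those concerns are omitted here.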

27 files changed: +5463 −96 lines

.github/workflows/add_label.yaml
Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# Test

 name: Add Label

.github/workflows/release.yaml
Lines changed: 2 additions & 2 deletions

@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-20.04]
+        os: [ubuntu-24.04]
         python-version: ['3.10']
     steps:
       - name: Checkout
@@ -59,7 +59,7 @@ jobs:
     needs: release
     strategy:
       matrix:
-        os: [ubuntu-20.04]
+        os: [ubuntu-24.04]
         python-version: ['3.10']
     environment:
       name: pypi

.github/workflows/unit_tests.yaml
Lines changed: 5 additions & 3 deletions

@@ -31,7 +31,7 @@ jobs:
     name: "Python type/lint/format checks"
     strategy:
       matrix:
-        os: [ubuntu-20.04]
+        os: [ubuntu-24.04]
         python-version: ['3.10']
     runs-on: ${{ matrix.os }}
     steps:
@@ -54,7 +54,7 @@ jobs:
     name: "JetStream unit tests"
     strategy:
       matrix:
-        os: [ubuntu-20.04]
+        os: [ubuntu-24.04]
         python-version: ['3.10']
     runs-on: ${{ matrix.os }}
     steps:
@@ -68,5 +68,7 @@
         run: make install-deps
       - name: Run all unit tests in JetStream (jetstream/tests)
         run: make unit-tests
+      - name: Run all py tests in JetStream (jetstream/tests)
+        run: make py-tests
       - name: Create test coverage report
-        run: make check-test-coverage
+        run: make check-test-coverage

README.md
Lines changed: 3 additions & 0 deletions

@@ -65,6 +65,9 @@ python -m unittest -v jetstream.tests.core.test_orchestrator
 # Test JetStream core server library
 python -m unittest -v jetstream.tests.core.test_server

+# Test JetStream lora adapter tensorstore
+python -m unittest -v jetstream.tests.core.lora.test_adapter_tensorstore
+
 # Test mock JetStream engine implementation
 python -m unittest -v jetstream.tests.engine.test_mock_engine

benchmarks/benchmark_serving.py
Lines changed: 31 additions & 23 deletions

@@ -625,29 +625,37 @@ async def grpc_async_request(
 ) -> tuple[list[int], float, float, float]:
   """Send grpc synchronous request since the current grpc server is sync."""
   options = [("grpc.keepalive_timeout_ms", 10000)]
-  async with grpc.aio.insecure_channel(api_url, options=options) as channel:
-    stub = jetstream_pb2_grpc.OrchestratorStub(channel)
-    request_start_time = time.perf_counter()
-    response = stub.Decode(request)
-    token_list = []
-    ttft = 0
-    ttst = 0
-    stream_resp_cnt = 0
-    async for resp in response:
-      stream_resp_cnt += 1
-      if stream_resp_cnt == 1:
-        await prefill_quota.inc()
-        ttft = time.perf_counter() - request_start_time
-        if ttft > 2.0:
-          print(datetime.now(), f"slow TTFT {ttft:.2f}", prefill_quota.value())
-      elif stream_resp_cnt == 2:
-        ttst = time.perf_counter() - request_start_time
-      resp_tokens = resp.stream_content.samples[0].token_ids
-      token_list.extend(resp_tokens)
-      out_token_cnt.increment(len(resp_tokens))
-    await active_req_quota.inc()
-    req_latency = time.perf_counter() - request_start_time
-    return token_list, ttft, ttst, req_latency
+  # Retry connection while server is not ready.
+  while True:
+    try:
+      async with grpc.aio.insecure_channel(api_url, options=options) as channel:
+        stub = jetstream_pb2_grpc.OrchestratorStub(channel)
+        request_start_time = time.perf_counter()
+        response = stub.Decode(request)
+        token_list = []
+        ttft = 0
+        ttst = 0
+        stream_resp_cnt = 0
+        async for resp in response:
+          stream_resp_cnt += 1
+          if stream_resp_cnt == 1:
+            await prefill_quota.inc()
+            ttft = time.perf_counter() - request_start_time
+            if ttft > 2.0:
+              print(
+                  datetime.now(), f"slow TTFT {ttft:.2f}", prefill_quota.value()
+              )
+          elif stream_resp_cnt == 2:
+            ttst = time.perf_counter() - request_start_time
+          resp_tokens = resp.stream_content.samples[0].token_ids
+          token_list.extend(resp_tokens)
+          out_token_cnt.increment(len(resp_tokens))
+        await active_req_quota.inc()
+        req_latency = time.perf_counter() - request_start_time
+        return token_list, ttft, ttst, req_latency
+    except grpc.aio.AioRpcError as e:
+      print(e)
+      await asyncio.sleep(10)


 async def send_request(
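
A side note on the retry added above: grpc.aio also provides a channel readiness coroutine, so a benchmark client could wait for the server to come up before issuing its first Decode call instead of sleeping a fixed 10 seconds between failed attempts. The sketch below is illustrative only and is not part of this commit; the target address is a placeholder.

```python
# Illustrative alternative, not from this commit: wait for the gRPC channel to
# reach the READY state (with a timeout) before sending any requests.
import asyncio

import grpc


async def wait_until_ready(api_url: str, timeout_s: float = 120.0) -> None:
  """Blocks until the gRPC server at api_url accepts connections."""
  async with grpc.aio.insecure_channel(api_url) as channel:
    # channel_ready() completes once the channel transitions to READY.
    await asyncio.wait_for(channel.channel_ready(), timeout=timeout_s)


if __name__ == "__main__":
  # "localhost:9000" is a placeholder for a locally running JetStream server.
  asyncio.run(wait_until_ready("localhost:9000"))
```

The commit's fixed-interval retry also covers the case where the server accepts connections but still fails early RPCs, so the two approaches are complementary rather than interchangeable.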
