Skip to content

Commit f747a4a

Browse files
authored
fix: Handle multiple rate limit types per descriptor and prevent IndexError (#16039)
* improve descriptor_key handling for multiple and missing rate limit descriptors in parallel request limiter v3
* Add tests for parallel request limiter v3 in proxy hooks
1 parent c1369a0 commit f747a4a

File tree

2 files changed

+161
-7
lines changed

2 files changed

+161
-7
lines changed

litellm/proxy/hooks/parallel_request_limiter_v3.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import binascii
88
import os
99
from datetime import datetime
10-
from math import floor
1110
from typing import (
1211
TYPE_CHECKING,
1312
Any,
@@ -1137,9 +1136,23 @@ async def async_pre_call_hook(
11371136

11381137
if response["overall_code"] == "OVER_LIMIT":
11391138
# Find which descriptor hit the limit
1140-
for i, status in enumerate(response["statuses"]):
1139+
for status in response["statuses"]:
11411140
if status["code"] == "OVER_LIMIT":
1142-
descriptor = descriptors[floor(i / 2)]
1141+
# Use descriptor_key from status instead of index mapping
1142+
descriptor_key = status["descriptor_key"]
1143+
1144+
# Find the corresponding descriptor by key
1145+
matching_descriptor = None
1146+
for desc in descriptors:
1147+
if desc["key"] == descriptor_key:
1148+
matching_descriptor = desc
1149+
break
1150+
1151+
if matching_descriptor is None:
1152+
# Fallback if we can't find the descriptor
1153+
descriptor_value = "unknown"
1154+
else:
1155+
descriptor_value = matching_descriptor["value"]
11431156

11441157
# Calculate reset time (window_start + window_size)
11451158
now = self._get_current_time().timestamp()
@@ -1156,7 +1169,7 @@ async def async_pre_call_hook(
11561169
current_limit = status["current_limit"]
11571170

11581171
detail = (
1159-
f"Rate limit exceeded for {descriptor['key']}: {descriptor['value']}. "
1172+
f"Rate limit exceeded for {descriptor_key}: {descriptor_value}. "
11601173
f"Limit type: {rate_limit_type}. "
11611174
f"Current limit: {current_limit}, Remaining: {remaining_display}. "
11621175
f"Limit resets at: {reset_time_formatted}"
@@ -1476,9 +1489,9 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti
14761489
from litellm.types.caching import RedisPipelineIncrementOperation
14771490

14781491
try:
1479-
litellm_parent_otel_span: Union[Span, None] = (
1480-
_get_parent_otel_span_from_kwargs(kwargs)
1481-
)
1492+
litellm_parent_otel_span: Union[
1493+
Span, None
1494+
] = _get_parent_otel_span_from_kwargs(kwargs)
14821495
litellm_metadata = kwargs["litellm_params"]["metadata"]
14831496
user_api_key = (
14841497
litellm_metadata.get("user_api_key") if litellm_metadata else None

tests/test_litellm/proxy/hooks/test_parallel_request_limiter_v3.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1440,6 +1440,147 @@ async def test_execute_redis_batch_rate_limiter_script_cluster_compatibility():
14401440
assert len(unique_processed_keys) >= 2, "Should have processed at least some keys"
14411441

14421442

1443+
@pytest.mark.asyncio
1444+
async def test_multiple_rate_limits_per_descriptor():
1445+
"""
1446+
Test that the IndexError fix works correctly when a descriptor has multiple rate limit types.
1447+
1448+
This specifically tests the scenario where:
1449+
1. A descriptor has multiple rate limit types (requests, tokens, max_parallel_requests)
1450+
2. Multiple statuses are generated for a single descriptor
1451+
3. The old floor(i / 2) mapping would fail with IndexError
1452+
4. The new descriptor_key-based lookup works correctly
1453+
"""
1454+
_api_key = "sk-12345"
1455+
_api_key_hash = hash_token(_api_key)
1456+
1457+
# Create a user with multiple rate limit types to trigger multiple statuses per descriptor
1458+
user_api_key_dict = UserAPIKeyAuth(
1459+
api_key=_api_key_hash,
1460+
rpm_limit=2, # requests limit
1461+
tpm_limit=10, # tokens limit
1462+
max_parallel_requests=1, # parallel requests limit
1463+
)
1464+
1465+
local_cache = DualCache()
1466+
parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
1467+
internal_usage_cache=InternalUsageCache(local_cache)
1468+
)
1469+
1470+
# Mock should_rate_limit to return a response with multiple statuses where one hits the limit
1471+
# This simulates the case where we have more statuses than descriptors due to multiple rate limit types
1472+
async def mock_should_rate_limit(descriptors, **kwargs):
1473+
# Verify we have one descriptor but will generate multiple statuses
1474+
assert len(descriptors) == 1, "Should have exactly one api_key descriptor"
1475+
assert descriptors[0]["key"] == "api_key", "Descriptor should be for api_key"
1476+
1477+
# Return multiple statuses for the single descriptor (requests OK, tokens OK, parallel OVER_LIMIT)
1478+
return {
1479+
"overall_code": "OVER_LIMIT",
1480+
"statuses": [
1481+
{
1482+
"code": "OK",
1483+
"current_limit": 2,
1484+
"limit_remaining": 1,
1485+
"rate_limit_type": "requests",
1486+
"descriptor_key": "api_key"
1487+
},
1488+
{
1489+
"code": "OK",
1490+
"current_limit": 10,
1491+
"limit_remaining": 8,
1492+
"rate_limit_type": "tokens",
1493+
"descriptor_key": "api_key"
1494+
},
1495+
{
1496+
"code": "OVER_LIMIT",
1497+
"current_limit": 1,
1498+
"limit_remaining": -1,
1499+
"rate_limit_type": "max_parallel_requests",
1500+
"descriptor_key": "api_key"
1501+
}
1502+
]
1503+
}
1504+
1505+
parallel_request_handler.should_rate_limit = mock_should_rate_limit
1506+
1507+
# Test the pre-call hook - this should raise HTTPException but NOT IndexError
1508+
with pytest.raises(HTTPException) as exc_info:
1509+
await parallel_request_handler.async_pre_call_hook(
1510+
user_api_key_dict=user_api_key_dict,
1511+
cache=local_cache,
1512+
data={"model": "gpt-3.5-turbo"},
1513+
call_type="",
1514+
)
1515+
1516+
# Verify the exception details are correct and use the descriptor_key approach
1517+
assert exc_info.value.status_code == 429
1518+
assert "Rate limit exceeded for api_key:" in exc_info.value.detail
1519+
assert "max_parallel_requests" in exc_info.value.detail
1520+
assert "Current limit: 1" in exc_info.value.detail
1521+
assert "Remaining: 0" in exc_info.value.detail # max(0, -1) = 0
1522+
1523+
# Verify headers are set correctly
1524+
assert exc_info.value.headers.get("rate_limit_type") == "max_parallel_requests"
1525+
assert "retry-after" in exc_info.value.headers
1526+
assert "reset_at" in exc_info.value.headers
1527+
1528+
1529+
@pytest.mark.asyncio
1530+
async def test_missing_descriptor_fallback():
1531+
"""
1532+
Test that the fallback works when a descriptor_key cannot be found in the descriptors list.
1533+
1534+
This tests an edge case where somehow the descriptor_key in status doesn't match
1535+
any descriptor key (shouldn't happen in normal operation but good for robustness).
1536+
"""
1537+
_api_key = "sk-12345"
1538+
_api_key_hash = hash_token(_api_key)
1539+
1540+
user_api_key_dict = UserAPIKeyAuth(
1541+
api_key=_api_key_hash,
1542+
rpm_limit=2,
1543+
)
1544+
1545+
local_cache = DualCache()
1546+
parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
1547+
internal_usage_cache=InternalUsageCache(local_cache)
1548+
)
1549+
1550+
# Mock should_rate_limit to return a status with descriptor_key that doesn't match descriptors
1551+
async def mock_should_rate_limit(descriptors, **kwargs):
1552+
# Return a status with a mismatched descriptor_key to test fallback
1553+
return {
1554+
"overall_code": "OVER_LIMIT",
1555+
"statuses": [
1556+
{
1557+
"code": "OVER_LIMIT",
1558+
"current_limit": 2,
1559+
"limit_remaining": -1,
1560+
"rate_limit_type": "requests",
1561+
"descriptor_key": "nonexistent_key" # This won't match any descriptor
1562+
}
1563+
]
1564+
}
1565+
1566+
parallel_request_handler.should_rate_limit = mock_should_rate_limit
1567+
1568+
# Test the pre-call hook - should handle missing descriptor gracefully
1569+
with pytest.raises(HTTPException) as exc_info:
1570+
await parallel_request_handler.async_pre_call_hook(
1571+
user_api_key_dict=user_api_key_dict,
1572+
cache=local_cache,
1573+
data={"model": "gpt-3.5-turbo"},
1574+
call_type="",
1575+
)
1576+
1577+
# Verify the exception uses fallback values
1578+
assert exc_info.value.status_code == 429
1579+
assert "Rate limit exceeded for nonexistent_key: unknown" in exc_info.value.detail
1580+
assert "requests" in exc_info.value.detail
1581+
assert "Current limit: 2" in exc_info.value.detail
1582+
1583+
14431584
@pytest.mark.asyncio
14441585
async def test_execute_token_increment_script_cluster_compatibility():
14451586
"""

0 commit comments

Comments (0)