Skip to content

Commit f747a4a

Browse files
authored
fix: Handle multiple rate limit types per descriptor and prevent IndexError (#16039)
* improve descriptor_key handling for multiple and missing rate limit descriptors in parallel request limiter v3
* Add tests for parallel request limiter v3 in proxy hooks
1 parent c1369a0 commit f747a4a

File tree

2 files changed

+161
-7
lines changed

2 files changed

+161
-7
lines changed

litellm/proxy/hooks/parallel_request_limiter_v3.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import binascii
88
import os
99
from datetime import datetime
10-
from math import floor
1110
from typing import (
1211
TYPE_CHECKING,
1312
Any,
@@ -1137,9 +1136,23 @@ async def async_pre_call_hook(
11371136

11381137
if response["overall_code"] == "OVER_LIMIT":
11391138
# Find which descriptor hit the limit
1140-
for i, status in enumerate(response["statuses"]):
1139+
for status in response["statuses"]:
11411140
if status["code"] == "OVER_LIMIT":
1142-
descriptor = descriptors[floor(i / 2)]
1141+
# Use descriptor_key from status instead of index mapping
1142+
descriptor_key = status["descriptor_key"]
1143+
1144+
# Find the corresponding descriptor by key
1145+
matching_descriptor = None
1146+
for desc in descriptors:
1147+
if desc["key"] == descriptor_key:
1148+
matching_descriptor = desc
1149+
break
1150+
1151+
if matching_descriptor is None:
1152+
# Fallback if we can't find the descriptor
1153+
descriptor_value = "unknown"
1154+
else:
1155+
descriptor_value = matching_descriptor["value"]
11431156

11441157
# Calculate reset time (window_start + window_size)
11451158
now = self._get_current_time().timestamp()
@@ -1156,7 +1169,7 @@ async def async_pre_call_hook(
11561169
current_limit = status["current_limit"]
11571170

11581171
detail = (
1159-
f"Rate limit exceeded for {descriptor['key']}: {descriptor['value']}. "
1172+
f"Rate limit exceeded for {descriptor_key}: {descriptor_value}. "
11601173
f"Limit type: {rate_limit_type}. "
11611174
f"Current limit: {current_limit}, Remaining: {remaining_display}. "
11621175
f"Limit resets at: {reset_time_formatted}"
@@ -1476,9 +1489,9 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti
14761489
from litellm.types.caching import RedisPipelineIncrementOperation
14771490

14781491
try:
1479-
litellm_parent_otel_span: Union[Span, None] = (
1480-
_get_parent_otel_span_from_kwargs(kwargs)
1481-
)
1492+
litellm_parent_otel_span: Union[
1493+
Span, None
1494+
] = _get_parent_otel_span_from_kwargs(kwargs)
14821495
litellm_metadata = kwargs["litellm_params"]["metadata"]
14831496
user_api_key = (
14841497
litellm_metadata.get("user_api_key") if litellm_metadata else None

tests/test_litellm/proxy/hooks/test_parallel_request_limiter_v3.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1440,6 +1440,147 @@ async def test_execute_redis_batch_rate_limiter_script_cluster_compatibility():
14401440
assert len(unique_processed_keys) >= 2, "Should have processed at least some keys"
14411441

14421442

1443+
@pytest.mark.asyncio
1444+
async def test_multiple_rate_limits_per_descriptor():
1445+
"""
1446+
Test that the IndexError fix works correctly when a descriptor has multiple rate limit types.
1447+
1448+
This specifically tests the scenario where:
1449+
1. A descriptor has multiple rate limit types (requests, tokens, max_parallel_requests)
1450+
2. Multiple statuses are generated for a single descriptor
1451+
3. The old floor(i / 2) mapping would fail with IndexError
1452+
4. The new descriptor_key-based lookup works correctly
1453+
"""
1454+
_api_key = "sk-12345"
1455+
_api_key_hash = hash_token(_api_key)
1456+
1457+
# Create a user with multiple rate limit types to trigger multiple statuses per descriptor
1458+
user_api_key_dict = UserAPIKeyAuth(
1459+
api_key=_api_key_hash,
1460+
rpm_limit=2, # requests limit
1461+
tpm_limit=10, # tokens limit
1462+
max_parallel_requests=1, # parallel requests limit
1463+
)
1464+
1465+
local_cache = DualCache()
1466+
parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
1467+
internal_usage_cache=InternalUsageCache(local_cache)
1468+
)
1469+
1470+
# Mock should_rate_limit to return a response with multiple statuses where one hits the limit
1471+
# This simulates the case where we have more statuses than descriptors due to multiple rate limit types
1472+
async def mock_should_rate_limit(descriptors, **kwargs):
1473+
# Verify we have one descriptor but will generate multiple statuses
1474+
assert len(descriptors) == 1, "Should have exactly one api_key descriptor"
1475+
assert descriptors[0]["key"] == "api_key", "Descriptor should be for api_key"
1476+
1477+
# Return multiple statuses for the single descriptor (requests OK, tokens OK, parallel OVER_LIMIT)
1478+
return {
1479+
"overall_code": "OVER_LIMIT",
1480+
"statuses": [
1481+
{
1482+
"code": "OK",
1483+
"current_limit": 2,
1484+
"limit_remaining": 1,
1485+
"rate_limit_type": "requests",
1486+
"descriptor_key": "api_key"
1487+
},
1488+
{
1489+
"code": "OK",
1490+
"current_limit": 10,
1491+
"limit_remaining": 8,
1492+
"rate_limit_type": "tokens",
1493+
"descriptor_key": "api_key"
1494+
},
1495+
{
1496+
"code": "OVER_LIMIT",
1497+
"current_limit": 1,
1498+
"limit_remaining": -1,
1499+
"rate_limit_type": "max_parallel_requests",
1500+
"descriptor_key": "api_key"
1501+
}
1502+
]
1503+
}
1504+
1505+
parallel_request_handler.should_rate_limit = mock_should_rate_limit
1506+
1507+
# Test the pre-call hook - this should raise HTTPException but NOT IndexError
1508+
with pytest.raises(HTTPException) as exc_info:
1509+
await parallel_request_handler.async_pre_call_hook(
1510+
user_api_key_dict=user_api_key_dict,
1511+
cache=local_cache,
1512+
data={"model": "gpt-3.5-turbo"},
1513+
call_type="",
1514+
)
1515+
1516+
# Verify the exception details are correct and use the descriptor_key approach
1517+
assert exc_info.value.status_code == 429
1518+
assert "Rate limit exceeded for api_key:" in exc_info.value.detail
1519+
assert "max_parallel_requests" in exc_info.value.detail
1520+
assert "Current limit: 1" in exc_info.value.detail
1521+
assert "Remaining: 0" in exc_info.value.detail # max(0, -1) = 0
1522+
1523+
# Verify headers are set correctly
1524+
assert exc_info.value.headers.get("rate_limit_type") == "max_parallel_requests"
1525+
assert "retry-after" in exc_info.value.headers
1526+
assert "reset_at" in exc_info.value.headers
1527+
1528+
1529+
@pytest.mark.asyncio
1530+
async def test_missing_descriptor_fallback():
1531+
"""
1532+
Test that the fallback works when a descriptor_key cannot be found in the descriptors list.
1533+
1534+
This tests an edge case where somehow the descriptor_key in status doesn't match
1535+
any descriptor key (shouldn't happen in normal operation but good for robustness).
1536+
"""
1537+
_api_key = "sk-12345"
1538+
_api_key_hash = hash_token(_api_key)
1539+
1540+
user_api_key_dict = UserAPIKeyAuth(
1541+
api_key=_api_key_hash,
1542+
rpm_limit=2,
1543+
)
1544+
1545+
local_cache = DualCache()
1546+
parallel_request_handler = _PROXY_MaxParallelRequestsHandler(
1547+
internal_usage_cache=InternalUsageCache(local_cache)
1548+
)
1549+
1550+
# Mock should_rate_limit to return a status with descriptor_key that doesn't match descriptors
1551+
async def mock_should_rate_limit(descriptors, **kwargs):
1552+
# Return a status with a mismatched descriptor_key to test fallback
1553+
return {
1554+
"overall_code": "OVER_LIMIT",
1555+
"statuses": [
1556+
{
1557+
"code": "OVER_LIMIT",
1558+
"current_limit": 2,
1559+
"limit_remaining": -1,
1560+
"rate_limit_type": "requests",
1561+
"descriptor_key": "nonexistent_key" # This won't match any descriptor
1562+
}
1563+
]
1564+
}
1565+
1566+
parallel_request_handler.should_rate_limit = mock_should_rate_limit
1567+
1568+
# Test the pre-call hook - should handle missing descriptor gracefully
1569+
with pytest.raises(HTTPException) as exc_info:
1570+
await parallel_request_handler.async_pre_call_hook(
1571+
user_api_key_dict=user_api_key_dict,
1572+
cache=local_cache,
1573+
data={"model": "gpt-3.5-turbo"},
1574+
call_type="",
1575+
)
1576+
1577+
# Verify the exception uses fallback values
1578+
assert exc_info.value.status_code == 429
1579+
assert "Rate limit exceeded for nonexistent_key: unknown" in exc_info.value.detail
1580+
assert "requests" in exc_info.value.detail
1581+
assert "Current limit: 2" in exc_info.value.detail
1582+
1583+
14431584
@pytest.mark.asyncio
14441585
async def test_execute_token_increment_script_cluster_compatibility():
14451586
"""

0 commit comments

Comments (0)