Added LiteLLM to the stack
Development/litellm/tests/local_testing/test_router_cooldowns.py | 869 lines (new file)
@@ -0,0 +1,869 @@
#### What this tests ####
# This tests litellm Router deployment cooldown behavior

import asyncio
import os
import random
import sys
import time
import traceback

import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system-path

from unittest.mock import AsyncMock, MagicMock, patch

import httpx
import openai

import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
from litellm.router_utils.cooldown_handlers import (
    _async_get_cooldown_deployments,
    _should_run_cooldown_logic,
)
from litellm.types.router import (
    DeploymentTypedDict,
    LiteLLMParamsTypedDict,
    AllowedFailsPolicy,
)
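
# NOTE: _async_get_cooldown_deployments and _should_run_cooldown_logic are
# internal helpers from litellm.router_utils.cooldown_handlers; the tests below
# use them to inspect and gate the router's cooldown state directly.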


@pytest.mark.asyncio
async def test_cooldown_badrequest_error():
    """
    Test 1. It SHOULD NOT cooldown a deployment on a BadRequestError
    """

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "azure/chatgpt-v-3",
                    "api_key": os.getenv("AZURE_API_KEY"),
                    "api_version": os.getenv("AZURE_API_VERSION"),
                    "api_base": os.getenv("AZURE_API_BASE"),
                },
            }
        ],
        debug_level="DEBUG",
        set_verbose=True,
        cooldown_time=300,
        num_retries=0,
        allowed_fails=0,
    )
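    # With allowed_fails=0 and a long cooldown_time, any cooldown-eligible
    # failure would cool this deployment down on the first error, so the
    # steps below isolate the BadRequestError exemption.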

    # Act & Assert
    try:

        response = await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "gm"}],
            bad_param=200,
        )
    except Exception:
        pass

    await asyncio.sleep(3)  # wait briefly; the deployment should NOT be cooled down

    response = await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "gm"}],
        mock_response="hello",
    )

    assert response is not None

    print(response)


@pytest.mark.asyncio
async def test_dynamic_cooldowns():
    """
    Assert kwargs for completion/embedding have 'cooldown_time' as a litellm_param
    """
    # litellm.set_verbose = True
    tmp_mock = MagicMock()

    litellm.failure_callback = [tmp_mock]

    router = Router(
        model_list=[
            {
                "model_name": "my-fake-model",
                "litellm_params": {
                    "model": "openai/gpt-1",
                    "api_key": "my-key",
                    "mock_response": Exception("this is an error"),
                },
            }
        ],
        cooldown_time=60,
    )

    try:
        _ = router.completion(
            model="my-fake-model",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
            cooldown_time=0,
            num_retries=0,
        )
    except Exception:
        pass

    tmp_mock.assert_called_once()

    print(tmp_mock.call_count)

    assert "cooldown_time" in tmp_mock.call_args[0][0]["litellm_params"]
    assert tmp_mock.call_args[0][0]["litellm_params"]["cooldown_time"] == 0


@pytest.mark.asyncio
async def test_cooldown_time_zero_uses_zero_not_default():
    """
    Test that when cooldown_time=0 is passed, it uses 0 instead of the default cooldown time
    AND that the early exit logic prevents cooldown entirely
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "cooldown_time": 0,
                },
            },
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "gpt-4",
                },
            },
        ],
        cooldown_time=300,  # Default cooldown time is 300 seconds
        num_retries=0,
    )

    # Mock the add_deployment_to_cooldown method to verify it's NOT called
    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown"
    ) as mock_add_cooldown:
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

        # Verify that add_deployment_to_cooldown was NOT called due to early exit
        mock_add_cooldown.assert_not_called()

        # Also verify the deployment is not in cooldown
        cooldown_list = await _async_get_cooldown_deployments(
            litellm_router_instance=router, parent_otel_span=None
        )
        assert len(cooldown_list) == 0

        # Verify the deployment is still healthy and available
        healthy_deployments, _ = await router._async_get_healthy_deployments(
            model="gpt-3.5-turbo", parent_otel_span=None
        )
        assert len(healthy_deployments) == 1


def test_should_run_cooldown_logic_early_exit_on_zero_cooldown():
    """
    Unit test for _should_run_cooldown_logic to verify early exit when time_to_cooldown is 0
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
                "model_info": {
                    "id": "test-deployment-id",
                },
            }
        ],
        cooldown_time=300,
    )

    # Test with time_to_cooldown = 0 - should return False (don't run cooldown logic)
    result = _should_run_cooldown_logic(
        litellm_router_instance=router,
        deployment="test-deployment-id",
        exception_status=429,
        original_exception=litellm.RateLimitError(
            "test error", "openai", "gpt-3.5-turbo"
        ),
        time_to_cooldown=0.0,
    )
    assert result is False, "Should not run cooldown logic when time_to_cooldown is 0"

    # Test with very small time_to_cooldown (effectively 0) - should return False
    result = _should_run_cooldown_logic(
        litellm_router_instance=router,
        deployment="test-deployment-id",
        exception_status=429,
        original_exception=litellm.RateLimitError(
            "test error", "openai", "gpt-3.5-turbo"
        ),
        time_to_cooldown=1e-10,
    )
    assert (
        result is False
    ), "Should not run cooldown logic when time_to_cooldown is effectively 0"

    # Test with None time_to_cooldown - should return True (use default cooldown logic)
    result = _should_run_cooldown_logic(
        litellm_router_instance=router,
        deployment="test-deployment-id",
        exception_status=429,
        original_exception=litellm.RateLimitError(
            "test error", "openai", "gpt-3.5-turbo"
        ),
        time_to_cooldown=None,
    )
    assert result is True, "Should run cooldown logic when time_to_cooldown is None"

    # Test with positive time_to_cooldown - should return True
    result = _should_run_cooldown_logic(
        litellm_router_instance=router,
        deployment="test-deployment-id",
        exception_status=429,
        original_exception=litellm.RateLimitError(
            "test error", "openai", "gpt-3.5-turbo"
        ),
        time_to_cooldown=60.0,
    )
    assert result is True, "Should run cooldown logic when time_to_cooldown is positive"


@pytest.mark.parametrize("num_deployments", [1, 2])
def test_single_deployment_no_cooldowns(num_deployments):
    """
    Do not cooldown on single deployment.

    Cooldown on multiple deployments.
    """
    model_list = []
    for i in range(num_deployments):
        model = DeploymentTypedDict(
            model_name="gpt-3.5-turbo",
            litellm_params=LiteLLMParamsTypedDict(
                model="gpt-3.5-turbo",
            ),
        )
        model_list.append(model)

    router = Router(model_list=model_list, num_retries=0)

    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
    ) as mock_client:
        try:
            router.completion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

        if num_deployments == 1:
            mock_client.assert_not_called()
        else:
            mock_client.assert_called_once()


@pytest.mark.asyncio
async def test_single_deployment_no_cooldowns_test_prod():
    """
    Do not cooldown on single deployment.

    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
        num_retries=0,
    )

    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
    ) as mock_client:
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

        await asyncio.sleep(2)

        mock_client.assert_not_called()


@pytest.mark.asyncio
async def test_single_deployment_cooldown_with_allowed_fails():
    """
    When `allowed_fails` is set, use the allowed_fails to determine cooldown for 1 deployment
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
        allowed_fails=1,
        num_retries=0,
    )
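    # allowed_fails=1: the first timeout is tolerated, so the second timeout
    # below should trigger exactly one cooldown call for the deployment.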

    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
    ) as mock_client:
        for _ in range(2):
            try:
                await router.acompletion(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": "Hey, how's it going?"}],
                    timeout=0.0001,
                )
            except litellm.Timeout:
                pass

        await asyncio.sleep(2)

        mock_client.assert_called_once()


@pytest.mark.asyncio
async def test_single_deployment_cooldown_with_allowed_fail_policy():
    """
    When `allowed_fails_policy` is set, use the allowed_fails_policy to determine cooldown for 1 deployment
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
        allowed_fails_policy=AllowedFailsPolicy(
            TimeoutErrorAllowedFails=1,
        ),
        num_retries=0,
    )

    with patch.object(
        router.cooldown_cache, "add_deployment_to_cooldown", new=MagicMock()
    ) as mock_client:
        for _ in range(2):
            try:
                await router.acompletion(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": "Hey, how's it going?"}],
                    timeout=0.0001,
                )
            except litellm.Timeout:
                pass

        await asyncio.sleep(2)

        mock_client.assert_called_once()


@pytest.mark.asyncio
async def test_single_deployment_no_cooldowns_test_prod_mock_completion_calls():
    """
    Do not cooldown on single deployment.

    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "openai/gpt-5",
                },
            },
            {
                "model_name": "gpt-12",
                "litellm_params": {
                    "model": "openai/gpt-12",
                },
            },
        ],
    )

    for _ in range(20):
        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response="litellm.RateLimitError",
            )
        except litellm.RateLimitError:
            pass

    cooldown_list = await _async_get_cooldown_deployments(
        litellm_router_instance=router, parent_otel_span=None
    )
    assert len(cooldown_list) == 0

    healthy_deployments, _ = await router._async_get_healthy_deployments(
        model="gpt-3.5-turbo", parent_otel_span=None
    )

    print("healthy_deployments: ", healthy_deployments)


"""
E2E - Test router cooldowns

Test 1: 3 deployments, each deployment fails 25% of requests. Assert that no deployments get put into cooldown
Test 2: 3 deployments, 1 deployment fails 6/10 requests, assert that bad deployment gets put into cooldown
Test 3: 3 deployments, 1 deployment has a period of 429 errors. Assert it is put into cooldown and other deployments work

"""


@pytest.mark.asyncio()
async def test_high_traffic_cooldowns_all_healthy_deployments():
    """
    PROD TEST - 3 deployments, each deployment fails 25% of requests. Assert that no deployments get put into cooldown
    """

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com",
                },
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com-2",
                },
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com-3",
                },
            },
        ],
        set_verbose=True,
        debug_level="DEBUG",
    )

    all_deployment_ids = router.get_model_ids()

    import random
    from collections import defaultdict

    # Create a defaultdict to track successes and failures for each model ID
    model_stats = defaultdict(lambda: {"successes": 0, "failures": 0})

    litellm.set_verbose = True
    for _ in range(100):
        try:
            model_id = random.choice(all_deployment_ids)

            num_successes = model_stats[model_id]["successes"]
            num_failures = model_stats[model_id]["failures"]
            total_requests = num_failures + num_successes
            if total_requests > 0:
                print(
                    "num failures= ",
                    num_failures,
                    "num successes= ",
                    num_successes,
                    "num_failures/total = ",
                    num_failures / total_requests,
                )

            if total_requests == 0:
                mock_response = "hi"
            elif num_failures / total_requests <= 0.25:
                # Randomly decide between fail and succeed
                if random.random() < 0.5:
                    mock_response = "hi"
                else:
                    mock_response = "litellm.InternalServerError"
            else:
                mock_response = "hi"

            await router.acompletion(
                model=model_id,
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response=mock_response,
            )
            model_stats[model_id]["successes"] += 1

            await asyncio.sleep(0.0001)
        except litellm.InternalServerError:
            model_stats[model_id]["failures"] += 1
            pass
        except Exception as e:
            print("Failed test model stats=", model_stats)
            raise e
        print("model_stats: ", model_stats)

    cooldown_list = await _async_get_cooldown_deployments(
        litellm_router_instance=router, parent_otel_span=None
    )
    assert len(cooldown_list) == 0


@pytest.mark.asyncio()
async def test_high_traffic_cooldowns_one_bad_deployment():
    """
    PROD TEST - 3 deployments, 1 deployment fails 6/10 requests, assert that bad deployment gets put into cooldown
    """

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com",
                },
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com-2",
                },
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com-3",
                },
            },
        ],
        set_verbose=True,
        debug_level="DEBUG",
    )

    all_deployment_ids = router.get_model_ids()

    import random
    from collections import defaultdict

    # Create a defaultdict to track successes and failures for each model ID
    model_stats = defaultdict(lambda: {"successes": 0, "failures": 0})
    bad_deployment_id = random.choice(all_deployment_ids)
    litellm.set_verbose = True
    for _ in range(100):
        try:
            model_id = random.choice(all_deployment_ids)

            num_successes = model_stats[model_id]["successes"]
            num_failures = model_stats[model_id]["failures"]
            total_requests = num_failures + num_successes
            if total_requests > 0:
                print(
                    "num failures= ",
                    num_failures,
                    "num successes= ",
                    num_successes,
                    "num_failures/total = ",
                    num_failures / total_requests,
                )

            if total_requests == 0:
                mock_response = "hi"
            elif bad_deployment_id == model_id:
                if num_failures / total_requests <= 0.6:

                    mock_response = "litellm.InternalServerError"

            elif num_failures / total_requests <= 0.25:
                # Randomly decide between fail and succeed
                if random.random() < 0.5:
                    mock_response = "hi"
                else:
                    mock_response = "litellm.InternalServerError"
            else:
                mock_response = "hi"

            await router.acompletion(
                model=model_id,
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response=mock_response,
            )
            model_stats[model_id]["successes"] += 1

            await asyncio.sleep(0.0001)
        except litellm.InternalServerError:
            model_stats[model_id]["failures"] += 1
            pass
        except Exception as e:
            print("Failed test model stats=", model_stats)
            raise e
        print("model_stats: ", model_stats)

    cooldown_list = await _async_get_cooldown_deployments(
        litellm_router_instance=router, parent_otel_span=None
    )
    assert len(cooldown_list) == 1


@pytest.mark.asyncio()
async def test_high_traffic_cooldowns_one_rate_limited_deployment():
    """
    PROD TEST - 3 deployments, 1 deployment has a period of 429 (rate limit) errors. Assert that the rate-limited deployment gets put into cooldown
    """

    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com",
                },
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com-2",
                },
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_base": "https://api.openai.com-3",
                },
            },
        ],
        set_verbose=True,
        debug_level="DEBUG",
    )

    all_deployment_ids = router.get_model_ids()

    import random
    from collections import defaultdict

    # Create a defaultdict to track successes and failures for each model ID
    model_stats = defaultdict(lambda: {"successes": 0, "failures": 0})
    bad_deployment_id = random.choice(all_deployment_ids)
    litellm.set_verbose = True
    for _ in range(100):
        try:
            model_id = random.choice(all_deployment_ids)

            num_successes = model_stats[model_id]["successes"]
            num_failures = model_stats[model_id]["failures"]
            total_requests = num_failures + num_successes
            if total_requests > 0:
                print(
                    "num failures= ",
                    num_failures,
                    "num successes= ",
                    num_successes,
                    "num_failures/total = ",
                    num_failures / total_requests,
                )

            if total_requests == 0:
                mock_response = "hi"
            elif bad_deployment_id == model_id:
                if num_failures / total_requests <= 0.6:

                    mock_response = "litellm.RateLimitError"

            elif num_failures / total_requests <= 0.25:
                # Randomly decide between fail and succeed
                if random.random() < 0.5:
                    mock_response = "hi"
                else:
                    mock_response = "litellm.InternalServerError"
            else:
                mock_response = "hi"

            await router.acompletion(
                model=model_id,
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
                mock_response=mock_response,
            )
            model_stats[model_id]["successes"] += 1

            await asyncio.sleep(0.0001)
        except litellm.InternalServerError:
            model_stats[model_id]["failures"] += 1
            pass
        except litellm.RateLimitError:
            model_stats[bad_deployment_id]["failures"] += 1
            pass
        except Exception as e:
            print("Failed test model stats=", model_stats)
            raise e
        print("model_stats: ", model_stats)

    cooldown_list = await _async_get_cooldown_deployments(
        litellm_router_instance=router, parent_otel_span=None
    )
    assert len(cooldown_list) == 1


"""
Unit tests for router set_cooldowns

1. _set_cooldown_deployments() will cooldown a deployment after it fails 50% of requests
"""


def test_router_fallbacks_with_cooldowns_and_model_id():
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {"model": "gpt-3.5-turbo", "rpm": 1},
                "model_info": {
                    "id": "123",
                },
            }
        ],
        routing_strategy="usage-based-routing-v2",
        fallbacks=[{"gpt-3.5-turbo": ["123"]}],
    )
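    # The fallback target is the deployment's model_info id ("123"), so the
    # router falls back to that specific deployment by id after the
    # rate-limit failure triggered below.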

    ## trigger ratelimit
    try:
        router.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
            mock_response="litellm.RateLimitError",
        )
    except litellm.RateLimitError:
        pass

    router.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
    )


@pytest.mark.asyncio()
async def test_router_fallbacks_with_cooldowns_and_dynamic_credentials():
    """
    Ensure cooldown on credential 1 does not affect credential 2
    """
    from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments

    litellm._turn_on_debug()
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {"model": "gpt-3.5-turbo", "rpm": 1},
                "model_info": {
                    "id": "123",
                },
            }
        ]
    )

    ## trigger ratelimit
    try:
        await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
            api_key="my-bad-key-1",
            mock_response="litellm.RateLimitError",
        )
        pytest.fail("Expected RateLimitError")
    except litellm.RateLimitError:
        pass

    await asyncio.sleep(1)

    cooldown_list = await _async_get_cooldown_deployments(
        litellm_router_instance=router, parent_otel_span=None
    )
    print("cooldown_list: ", cooldown_list)
    assert len(cooldown_list) == 1
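    # Credential 2 (a valid OPENAI_API_KEY) should still work even though the
    # request made with credential 1 put its deployment entry into cooldown.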

    await router.acompletion(
        model="gpt-3.5-turbo",
        api_key=os.getenv("OPENAI_API_KEY"),
        messages=[{"role": "user", "content": "hi"}],
    )