Added LiteLLM to the stack

Development/litellm/tests/test_fallbacks.py (new file, 338 lines)
@@ -0,0 +1,338 @@
# What is this?
## This tests if the proxy fallbacks work as expected
import pytest
import asyncio
import aiohttp
from tests.large_text import text
import time
from typing import Optional

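# NOTE: these tests assume a LiteLLM proxy is already running at http://0.0.0.0:4000 with the
# master key "sk-1234". The model aliases used below ("gpt-3.5-turbo", "gpt-instruct",
# "fake-openai-endpoint-4", "fake-openai-endpoint-5", "badly-configured-openai-endpoint") are
# expected to be defined in the proxy's config; the behavior ascribed to them here (a high retry
# count on fake-openai-endpoint-4, a ~1s timeout on fake-openai-endpoint-5, a fallback from
# badly-configured-openai-endpoint to fake-openai-endpoint-5) is inferred from the assertions in
# this file, not from a config checked in alongside it.
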
async def generate_key(
    session,
    i,
    models: list,
    calling_key="sk-1234",
):
    url = "http://0.0.0.0:4000/key/generate"
    headers = {
        "Authorization": f"Bearer {calling_key}",
        "Content-Type": "application/json",
    }
    data = {
        "models": models,
    }

    print(f"data: {data}")

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(f"Response {i} (Status code: {status}):")
        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request {i} did not return a 200 status code: {status}")

        return await response.json()

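# The /key/generate response is expected to be JSON containing at least a "key" field (the
# generated virtual key); the key-access tests below use it as the Bearer token for
# /chat/completions calls.
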
async def chat_completion(
    session,
    key: str,
    model: str,
    messages: list,
    return_headers: bool = False,
    extra_headers: Optional[dict] = None,
    **kwargs,
):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    if extra_headers is not None:
        headers.update(extra_headers)
    data = {"model": model, "messages": messages, **kwargs}

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            if return_headers:
                return None, response.headers
            else:
                raise Exception(f"Request did not return a 200 status code: {status}")

        if return_headers:
            return await response.json(), response.headers
        else:
            return await response.json()

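# Note the asymmetry above: with return_headers=True a non-200 response returns (None, headers)
# instead of raising, which lets the timeout tests below inspect the x-litellm-* response headers
# even when the call itself fails.
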
@pytest.mark.asyncio
async def test_chat_completion():
    """
    Make a chat completion call with a prompt larger than the model's context window.
    Expect it to succeed via the proxy's context-window fallback.
    """
    async with aiohttp.ClientSession() as session:
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        await chat_completion(
            session=session, key="sk-1234", model=model, messages=messages
        )

@pytest.mark.parametrize("has_access", [True, False])
@pytest.mark.asyncio
async def test_chat_completion_client_fallbacks(has_access):
    """
    Make a chat completion call with client-side `fallbacks` in the request body.
    Expect it to succeed only when the calling key has access to the fallback model.
    """

    async with aiohttp.ClientSession() as session:
        models = ["gpt-3.5-turbo"]

        if has_access:
            models.append("gpt-instruct")

        ## CREATE KEY WITH MODELS
        generated_key = await generate_key(session=session, i=0, models=models)
        calling_key = generated_key["key"]
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "user", "content": "Who was Alexander?"},
        ]

        ## CALL PROXY
        try:
            await chat_completion(
                session=session,
                key=calling_key,
                model=model,
                messages=messages,
                mock_testing_fallbacks=True,
                fallbacks=["gpt-instruct"],
            )
            if not has_access:
                pytest.fail(
                    "Expected this to fail, submitted fallback model that key did not have access to"
                )
        except Exception as e:
            if has_access:
                pytest.fail("Expected this to work: {}".format(str(e)))

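# For reference, the same client-side fallback request can be made through the OpenAI SDK (as the
# load-test section at the bottom of this file does for plain completions) by passing the
# LiteLLM-specific params via `extra_body`. Sketch only - not used by the tests:
#
#   client = AsyncOpenAI(api_key=calling_key, base_url="http://0.0.0.0:4000")
#   await client.chat.completions.create(
#       model="gpt-3.5-turbo",
#       messages=[{"role": "user", "content": "Who was Alexander?"}],
#       extra_body={"mock_testing_fallbacks": True, "fallbacks": ["gpt-instruct"]},
#   )
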
@pytest.mark.asyncio
async def test_chat_completion_with_retries():
    """
    Make a chat completion call that triggers a mocked rate-limit error.
    Expect the proxy to retry and to report the retry counts in the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-4"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            mock_testing_rate_limit_error=True,
            return_headers=True,
        )
        print(f"headers: {headers}")
        assert headers["x-litellm-attempted-retries"] == "1"
        assert headers["x-litellm-max-retries"] == "50"

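# The x-litellm-attempted-retries / x-litellm-max-retries response headers are returned by the
# proxy; the expected values ("1" retry attempted, max of "50") reflect how "fake-openai-endpoint-4"
# is presumed to be configured server-side, not anything set in this file.
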
@pytest.mark.asyncio
async def test_chat_completion_with_fallbacks():
    """
    Make a chat completion call against a badly configured deployment.
    Expect the proxy to fall back and to report the attempted fallbacks in the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "badly-configured-openai-endpoint"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            fallbacks=["fake-openai-endpoint-5"],
            return_headers=True,
        )
        print(f"headers: {headers}")
        assert headers["x-litellm-attempted-fallbacks"] == "1"

@pytest.mark.asyncio
async def test_chat_completion_with_timeout():
    """
    Make a chat completion call with a low timeout and `mock_timeout: true`.
    Expect it to fail and the model-level timeout to be reported in the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-5"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        start_time = time.time()
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            num_retries=0,
            mock_timeout=True,
            return_headers=True,
        )
        end_time = time.time()
        print(f"headers: {headers}")
        assert (
            headers["x-litellm-timeout"] == "1.0"
        )  # assert model-specific timeout used

@pytest.mark.asyncio
async def test_chat_completion_with_timeout_from_request():
    """
    Make a chat completion call with a low timeout set via the `x-litellm-timeout` request header
    and `mock_timeout: true`. Expect it to fail and the request-level timeout to be reported in
    the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-5"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        extra_headers = {
            "x-litellm-timeout": "0.001",
        }
        start_time = time.time()
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            num_retries=0,
            mock_timeout=True,
            extra_headers=extra_headers,
            return_headers=True,
        )
        end_time = time.time()
        print(f"headers: {headers}")
        assert (
            headers["x-litellm-timeout"] == "0.001"
        )  # assert request-specified timeout overrides the model-level timeout

@pytest.mark.parametrize("has_access", [True, False])
@pytest.mark.asyncio
async def test_chat_completion_client_fallbacks_with_custom_message(has_access):
    """
    Make a chat completion call with client-side `fallbacks` that carry a custom message.
    Expect it to succeed only when the calling key has access to the fallback model.
    """

    async with aiohttp.ClientSession() as session:
        models = ["gpt-3.5-turbo"]

        if has_access:
            models.append("gpt-instruct")

        ## CREATE KEY WITH MODELS
        generated_key = await generate_key(session=session, i=0, models=models)
        calling_key = generated_key["key"]
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "user", "content": "Who was Alexander?"},
        ]

        ## CALL PROXY
        try:
            await chat_completion(
                session=session,
                key=calling_key,
                model=model,
                messages=messages,
                mock_testing_fallbacks=True,
                fallbacks=[
                    {
                        "model": "gpt-instruct",
                        "messages": [
                            {
                                "role": "assistant",
                                "content": "This is a custom message",
                            }
                        ],
                    }
                ],
            )
            if not has_access:
                pytest.fail(
                    "Expected this to fail, submitted fallback model that key did not have access to"
                )
        except Exception as e:
            if has_access:
                pytest.fail("Expected this to work: {}".format(str(e)))

# The load test below exercises the proxy through the OpenAI SDK instead of raw aiohttp.
# (asyncio and time are already imported at the top of the file.)
from openai import AsyncOpenAI

async def make_request(client: AsyncOpenAI, model: str) -> bool:
    try:
        await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Who was Alexander?"}],
        )
        return True
    except Exception as e:
        print(f"Error with {model}: {str(e)}")
        return False

async def run_good_model_test(client: AsyncOpenAI, num_requests: int) -> bool:
    tasks = [make_request(client, "good-model") for _ in range(num_requests)]
    good_results = await asyncio.gather(*tasks)
    return all(good_results)

@pytest.mark.asyncio
async def test_chat_completion_bad_and_good_model():
    """
    Prod test - ensure even if bad model is down, good model is still working.
    """
    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
    num_requests = 100
    num_iterations = 3

    for iteration in range(num_iterations):
        print(f"\nIteration {iteration + 1}/{num_iterations}")
        start_time = time.time()

        # Fire and forget bad model requests
        for _ in range(num_requests):
            asyncio.create_task(make_request(client, "bad-model"))

        # Wait only for good model requests
        success = await run_good_model_test(client, num_requests)
        print(
            f"Iteration {iteration + 1}: {'✓' if success else '✗'} ({time.time() - start_time:.2f}s)"
        )
        assert success, "Not all good model requests succeeded"
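# To run this file locally, bring up the proxy described at the top of the file first; the exact
# invocation below is illustrative, not prescriptive:
#
#   pytest tests/test_fallbacks.py -x -vv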