# Homelab/Development/litellm/tests/guardrails_tests/test_presidio_pii.py

import sys
import os
import io, asyncio
import pytest
import time
from litellm import mock_completion
from unittest.mock import MagicMock, AsyncMock, patch
sys.path.insert(0, os.path.abspath("../.."))
import litellm
from litellm.proxy.guardrails.guardrail_hooks.presidio import _OPTIONAL_PresidioPIIMasking, PresidioPerRequestConfig
from litellm.types.guardrails import PiiEntityType, PiiAction
from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching.caching import DualCache
from litellm.exceptions import BlockedPiiEntityError
from litellm.types.utils import CallTypes as LitellmCallTypes


@pytest.mark.asyncio
async def test_presidio_with_entities_config():
    """Only entity types listed in ``pii_entities_config`` are requested and masked.

    ``check_pii`` is called against the endpoints named by the
    ``PRESIDIO_ANALYZER_API_BASE`` / ``PRESIDIO_ANONYMIZER_API_BASE``
    environment variables, so this test requires an actual Presidio API.
    """
    litellm._turn_on_debug()

    # Credit cards and e-mail addresses are configured; phone numbers are not.
    masked_entities = {
        PiiEntityType.CREDIT_CARD: PiiAction.MASK,
        PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,
    }
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config=masked_entities,
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    configured_pii = ("4111-1111-1111-1111", "test@example.com")
    unconfigured_pii = "555-123-4567"
    sample = "My credit card number is 4111-1111-1111-1111, my email is test@example.com, and my phone is 555-123-4567"

    # The analyze payload must request exactly the configured entity types.
    payload = guardrail._get_presidio_analyze_request_payload(
        text=sample,
        presidio_config=None,
        request_data={},
    )
    assert "entities" in payload
    assert set(payload["entities"]) == set(masked_entities)

    # This call goes out to the real Presidio service.
    redacted = await guardrail.check_pii(
        text=sample,
        output_parse_pii=True,
        presidio_config=None,
        request_data={},
    )

    # Configured entities disappear from the text ...
    for secret in configured_pii:
        assert secret not in redacted

    # ... while the phone number, which is not in the config, is left alone.
    assert unconfigured_pii in redacted

    # The exact replacement tokens depend on Presidio, so just show them.
    print(f"Redacted text: {redacted}")


@pytest.mark.asyncio
async def test_presidio_apply_guardrail():
    """``apply_guardrail`` with an empty entity config still hides card and e-mail.

    Requires an actual Presidio API reachable through the
    ``PRESIDIO_ANALYZER_API_BASE`` / ``PRESIDIO_ANONYMIZER_API_BASE``
    environment variables.
    """
    litellm._turn_on_debug()

    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config={},
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    masked = await guardrail.apply_guardrail(
        text="My credit card number is 4111-1111-1111-1111 and my email is test@example.com",
        language="en",
    )
    print("response from apply guardrail for presidio: ", masked)

    # Assert the default config masks the credit card and email.
    for secret in ("4111-1111-1111-1111", "test@example.com"):
        assert secret not in masked

@pytest.mark.asyncio
async def test_presidio_with_blocked_entities():
    """``check_pii`` raises ``BlockedPiiEntityError`` for an entity configured as BLOCK.

    Requires an actual Presidio API reachable through the
    ``PRESIDIO_ANALYZER_API_BASE`` / ``PRESIDIO_ANONYMIZER_API_BASE``
    environment variables.
    """
    litellm._turn_on_debug()

    entity_actions = {
        PiiEntityType.CREDIT_CARD: PiiAction.BLOCK,  # must abort the request
        PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,  # would only be masked
    }
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config=entity_actions,
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    # Text containing both the blocked and the masked entity type.
    sample = "My credit card number is 4111-1111-1111-1111 and my email is test@example.com"

    # The analyze payload must request exactly the configured entity types.
    payload = guardrail._get_presidio_analyze_request_payload(
        text=sample,
        presidio_config=None,
        request_data={},
    )
    assert "entities" in payload
    assert set(payload["entities"]) == set(entity_actions)

    # Scanning the text must fail because a credit card number is present.
    with pytest.raises(BlockedPiiEntityError) as blocked:
        await guardrail.check_pii(
            text=sample,
            output_parse_pii=True,
            presidio_config=None,
            request_data={},
        )

    # The error identifies both the offending entity and the guardrail.
    error = blocked.value
    assert error.entity_type == PiiEntityType.CREDIT_CARD
    assert error.guardrail_name == guardrail.guardrail_name


@pytest.mark.asyncio
async def test_presidio_pre_call_hook_with_blocked_entities():
    """The pre-call hook rejects a chat completion request containing a BLOCK entity."""
    entity_actions = {
        PiiEntityType.CREDIT_CARD: PiiAction.BLOCK,  # must abort the request
        PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,  # would only be masked
    }
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config=entity_actions,
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    # Chat completion request whose user message carries PII.
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {
        "role": "user",
        "content": "My credit card is 4111-1111-1111-1111 and my email is test@example.com.",
    }
    request = {
        "messages": [system_message, user_message],
        "model": "gpt-3.5-turbo",
    }

    # The hook is expected to raise instead of returning modified data.
    with pytest.raises(BlockedPiiEntityError) as blocked:
        await guardrail.async_pre_call_hook(
            user_api_key_dict=UserAPIKeyAuth(api_key="test_key"),
            cache=DualCache(),
            data=request,
            call_type="completion",
        )

    print(f"got error: {blocked}")

    # The error identifies both the offending entity and the guardrail.
    error = blocked.value
    assert error.entity_type == PiiEntityType.CREDIT_CARD
    assert error.guardrail_name == guardrail.guardrail_name


@pytest.mark.asyncio
@pytest.mark.parametrize("call_type", ["completion", "acompletion"])
async def test_presidio_pre_call_hook_with_different_call_types(call_type):
    """The pre-call hook masks configured PII for both ``completion`` and ``acompletion``."""
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config={
            PiiEntityType.CREDIT_CARD: PiiAction.MASK,
            PiiEntityType.EMAIL_ADDRESS: PiiAction.MASK,
        },
        presidio_analyzer_api_base=os.environ.get("PRESIDIO_ANALYZER_API_BASE"),
        presidio_anonymizer_api_base=os.environ.get("PRESIDIO_ANONYMIZER_API_BASE"),
    )

    # Request whose user message carries a card number, an e-mail and a phone number.
    system_prompt = "You are a helpful assistant."
    request = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "My credit card is 4111-1111-1111-1111 and my email is test@example.com. My phone number is 555-123-4567"},
        ],
        "model": "gpt-3.5-turbo",
    }

    result = await guardrail.async_pre_call_hook(
        user_api_key_dict=UserAPIKeyAuth(api_key="test_key"),
        cache=DualCache(),
        data=request,
        call_type=call_type,
    )

    returned_system, returned_user = (
        result["messages"][0]["content"],
        result["messages"][1]["content"],
    )

    # The system prompt contains no PII and must come back untouched.
    assert returned_system == system_prompt

    # Configured entities are gone from the user message ...
    for secret in ("4111-1111-1111-1111", "test@example.com"):
        assert secret not in returned_user

    # ... while the phone number, which is not in the config, is left alone.
    assert "555-123-4567" in returned_user

    print(f"Modified user message for call_type={call_type}: {returned_user}")


@pytest.mark.parametrize(
    "base_url",
    [
        "presidio-analyzer-s3pa:10000",
        "https://presidio-analyzer-s3pa:10000",
        "http://presidio-analyzer-s3pa:10000",
    ],
)
def test_validate_environment_missing_http(base_url):
    """``validate_environment`` adds ``http://`` when no scheme is given and a trailing slash."""
    pii_masking = _OPTIONAL_PresidioPIIMasking(mock_testing=True)

    # A URL that already has a scheme is kept; otherwise "http://" is expected.
    if base_url.startswith(("https://", "http://")):
        expected_url = base_url
    else:
        expected_url = "http://" + base_url
    expected_anonymizer = f"{expected_url}/anonymize/"
    expected_analyzer = f"{expected_url}/analyze/"

    # patch.dict restores os.environ on exit, so the override stays local to this test.
    with patch.dict(
        os.environ,
        {
            "PRESIDIO_ANALYZER_API_BASE": f"{base_url}/analyze",
            "PRESIDIO_ANONYMIZER_API_BASE": f"{base_url}/anonymize",
        },
    ):
        pii_masking.validate_environment()

        actual_anonymizer = pii_masking.presidio_anonymizer_api_base
        assert actual_anonymizer == expected_anonymizer, "Got={}, Expected={}".format(
            actual_anonymizer, expected_anonymizer
        )
        assert pii_masking.presidio_analyzer_api_base == expected_analyzer


@pytest.mark.asyncio
async def test_output_parsing():
    """
    - start from an already-masked input message and its token -> value map
    - make a mocked llm completion call whose response contains a masked token
    - have presidio pii masking - output parse the response
    - assert that the masked token in the response is replaced by the original value
    """
    # NOTE: both flags are module-level litellm settings and are not restored here.
    litellm.set_verbose = True
    litellm.output_parse_pii = True
    pii_masking = _OPTIONAL_PresidioPIIMasking(mock_testing=True)

    # Masked form of:
    #   "hello world, my name is Jane Doe. My number is: 034453334"
    filtered_message = [
        {
            "role": "user",
            "content": "hello world, my name is <PERSON>. My number is: <PHONE_NUMBER>",
        }
    ]

    # Mapping the guardrail uses to restore original values in the output.
    pii_masking.pii_tokens = {"<PERSON>": "Jane Doe", "<PHONE_NUMBER>": "034453334"}

    response = mock_completion(
        model="gpt-3.5-turbo",
        messages=filtered_message,
        mock_response="Hello <PERSON>! How can I assist you today?",
    )
    new_response = await pii_masking.async_post_call_success_hook(
        user_api_key_dict=UserAPIKeyAuth(),
        data={
            "messages": [{"role": "system", "content": "You are an helpfull assistant"}]
        },
        response=response,
    )

    # "<PERSON>" in the mocked response must come back as the real name.
    assert (
        new_response.choices[0].message.content
        == "Hello Jane Doe! How can I assist you today?"
    )


# asyncio.run(test_output_parsing())


### UNIT TESTS FOR PRESIDIO PII MASKING ###

# Canned anonymizer-style results handed to the guardrail as
# ``mock_redacted_text``. Each item's [start, end) span locates its
# placeholder inside ``text``.

# Input A: a person name and a phone number were replaced.
input_a_anonymizer_results = dict(
    text="hello world, my name is <PERSON>. My number is: <PHONE_NUMBER>",
    items=[
        dict(
            start=48,
            end=62,
            entity_type="PHONE_NUMBER",
            text="<PHONE_NUMBER>",
            operator="replace",
        ),
        dict(
            start=24,
            end=32,
            entity_type="PERSON",
            text="<PERSON>",
            operator="replace",
        ),
    ],
)

# Input B: only a person name was replaced.
input_b_anonymizer_results = dict(
    text="My name is <PERSON>, who are you? Say my name in your response",
    items=[
        dict(
            start=11,
            end=19,
            entity_type="PERSON",
            text="<PERSON>",
            operator="replace",
        ),
    ],
)


#   Input A: the mocked result contains both a person and a phone placeholder
@pytest.mark.asyncio
async def test_presidio_pii_masking_input_a():
    """
    The pre-call hook returns the mocked redaction for input A, so both
    placeholders must appear in the outgoing message.
    """
    guardrail = _OPTIONAL_PresidioPIIMasking(
        mock_testing=True, mock_redacted_text=input_a_anonymizer_results
    )

    request = {
        "messages": [
            {
                "role": "user",
                "content": "hello world, my name is Jane Doe. My number is: 23r323r23r2wwkl",
            }
        ]
    }

    masked_request = await guardrail.async_pre_call_hook(
        user_api_key_dict=UserAPIKeyAuth(api_key="sk-12345"),
        cache=DualCache(),
        data=request,
        call_type="completion",
    )

    masked_content = masked_request["messages"][0]["content"]
    for placeholder in ("<PERSON>", "<PHONE_NUMBER>"):
        assert placeholder in masked_content


#   Input B: the mocked result contains only a person placeholder (unlike input A)
@pytest.mark.asyncio
async def test_presidio_pii_masking_input_b():
    """
    The pre-call hook returns the mocked redaction for input B: the person
    placeholder is present and no phone placeholder appears.
    """
    guardrail = _OPTIONAL_PresidioPIIMasking(
        mock_testing=True, mock_redacted_text=input_b_anonymizer_results
    )

    request = {
        "messages": [
            {
                "role": "user",
                "content": "My name is Jane Doe, who are you? Say my name in your response",
            }
        ]
    }

    masked_request = await guardrail.async_pre_call_hook(
        user_api_key_dict=UserAPIKeyAuth(api_key="sk-12345"),
        cache=DualCache(),
        data=request,
        call_type="completion",
    )

    masked_content = masked_request["messages"][0]["content"]
    assert "<PERSON>" in masked_content
    assert "<PHONE_NUMBER>" not in masked_content


@pytest.mark.asyncio
async def test_presidio_pii_masking_logging_output_only_no_pre_api_hook():
    """A guardrail built with ``logging_only=True`` must not run for the ``pre_call`` event."""
    from litellm.types.guardrails import GuardrailEventHooks

    pii_masking = _OPTIONAL_PresidioPIIMasking(
        logging_only=True,
        mock_testing=True,
        mock_redacted_text=input_b_anonymizer_results,
    )

    test_messages = [
        {
            "role": "user",
            "content": "My name is Jane Doe, who are you? Say my name in your response",
        }
    ]

    # Only the run/skip decision is checked; no hook is invoked, so no API
    # key or cache object is needed.
    assert (
        pii_masking.should_run_guardrail(
            data={"messages": test_messages},
            event_type=GuardrailEventHooks.pre_call,
        )
        is False
    )


@pytest.mark.asyncio
async def test_presidio_pii_masking_logging_output_only_logged_response_guardrails_config():
    """A ``logging_only`` presidio entry in the guardrails config yields a logging-only callback.

    After ``initialize_guardrails`` the test expects one entry in
    ``litellm.guardrail_name_config_map`` and a ``_OPTIONAL_PresidioPIIMasking``
    instance in ``litellm.callbacks`` whose ``event_hook`` is ``logging_only``.

    NOTE: the test asserts that the global guardrail map starts empty, so it
    depends on no guardrails having been initialised earlier in the process.
    """
    from typing import Dict, List, Optional

    from litellm.proxy.guardrails.init_guardrails import initialize_guardrails
    from litellm.types.guardrails import (
        GuardrailEventHooks,
        GuardrailItemSpec,
    )

    litellm.set_verbose = True

    presidio_env = {
        "PRESIDIO_ANALYZER_API_BASE": "http://localhost:5002",
        "PRESIDIO_ANONYMIZER_API_BASE": "http://localhost:5001",
    }

    guardrails_config: List[Dict[str, GuardrailItemSpec]] = [
        {
            "pii_masking": {
                "callbacks": ["presidio"],
                "default_on": True,
                "logging_only": True,
            }
        }
    ]
    litellm_settings = {"guardrails": guardrails_config}

    # Patch the environment with a context manager inside the coroutine (as
    # test_validate_environment_missing_http does) so the variables are
    # guaranteed to be set while the body runs; a decorator on an async
    # function may only cover creation of the coroutine object.
    with patch.dict(os.environ, presidio_env):
        assert len(litellm.guardrail_name_config_map) == 0
        initialize_guardrails(
            guardrails_config=guardrails_config,
            premium_user=True,
            config_file_path="",
            litellm_settings=litellm_settings,
        )

        assert len(litellm.guardrail_name_config_map) == 1

        # Find the Presidio callback that initialisation registered.
        pii_masking_obj: Optional[_OPTIONAL_PresidioPIIMasking] = None
        for callback in litellm.callbacks:
            print(f"CALLBACK: {callback}")
            if isinstance(callback, _OPTIONAL_PresidioPIIMasking):
                pii_masking_obj = callback

        assert pii_masking_obj is not None

        assert hasattr(pii_masking_obj, "logging_only")
        assert pii_masking_obj.event_hook == GuardrailEventHooks.logging_only

        assert pii_masking_obj.should_run_guardrail(
            data={}, event_type=GuardrailEventHooks.logging_only
        )


@pytest.mark.asyncio
async def test_presidio_language_configuration():
    """The analyze payload carries the configured ``presidio_language``, or "en" when unset."""
    litellm._turn_on_debug()

    # (extra constructor kwargs, text to analyze, language expected in the payload)
    cases = [
        ({"presidio_language": "de"}, "Meine Telefonnummer ist +49 30 12345678", "de"),
        ({"presidio_language": "es"}, "Mi número de teléfono es +34 912 345 678", "es"),
        # No language passed: the payload is expected to fall back to English.
        ({}, "My phone number is +1 555-123-4567", "en"),
    ]

    for language_kwargs, text, expected_language in cases:
        # mock_testing=True bypasses the API validation, so no Presidio
        # service is needed here.
        guardrail = _OPTIONAL_PresidioPIIMasking(
            pii_entities_config={},
            mock_testing=True,
            **language_kwargs,
        )

        payload = guardrail._get_presidio_analyze_request_payload(
            text=text,
            presidio_config=None,
            request_data={},
        )

        assert payload["language"] == expected_language
        assert payload["text"] == text


@pytest.mark.asyncio
async def test_presidio_language_configuration_with_per_request_override():
    """A per-request ``language`` wins over the guardrail's configured ``presidio_language``."""
    litellm._turn_on_debug()

    # German is the guardrail-level default for this test.
    guardrail = _OPTIONAL_PresidioPIIMasking(
        pii_entities_config={},
        presidio_language="de",
        mock_testing=True,
    )
    sample = "Test text with PII"

    # With a per-request config asking for French, the payload must use French.
    overridden = guardrail._get_presidio_analyze_request_payload(
        text=sample,
        presidio_config=PresidioPerRequestConfig(language="fr"),
        request_data={},
    )
    assert overridden["language"] == "fr"
    assert overridden["text"] == sample

    # Without a per-request config, the configured German default applies.
    fallback = guardrail._get_presidio_analyze_request_payload(
        text=sample,
        presidio_config=None,
        request_data={},
    )
    assert fallback["language"] == "de"
    assert fallback["text"] == sample