#!/usr/bin/env python3
"""
Run LLM Translation Tests and Generate Beautiful Markdown Report

This script runs the LLM translation tests and generates a comprehensive
markdown report with provider-specific breakdowns and test statistics.
"""

import os
import sys
import subprocess
import xml.etree.ElementTree as ET
from collections import defaultdict
from datetime import datetime
from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional


# ANSI color codes for terminal output
class Colors:
    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    RESET = '\033[0m'
    BOLD = '\033[1m'


def print_colored(message: str, color: str = Colors.RESET):
    """Print colored message to terminal"""
    print(f"{color}{message}{Colors.RESET}")


def get_provider_from_test_file(test_file: str) -> str:
    """Map test file names to provider names"""
    # Matching is by substring, so more specific keys must come before their
    # shorter counterparts (e.g. 'test_azure_ai' before 'test_azure').
    provider_mapping = {
        'test_anthropic': 'Anthropic',
        'test_azure_ai': 'Azure AI',
        'test_azure': 'Azure',
        'test_bedrock': 'AWS Bedrock',
        'test_openai': 'OpenAI',
        'test_vertex': 'Google Vertex AI',
        'test_gemini': 'Google Vertex AI',
        'test_cohere': 'Cohere',
        'test_databricks': 'Databricks',
        'test_groq': 'Groq',
        'test_together': 'Together AI',
        'test_mistral': 'Mistral',
        'test_deepseek': 'DeepSeek',
        'test_replicate': 'Replicate',
        'test_huggingface': 'HuggingFace',
        'test_fireworks': 'Fireworks AI',
        'test_perplexity': 'Perplexity',
        'test_cloudflare': 'Cloudflare',
        'test_voyage': 'Voyage AI',
        'test_xai': 'xAI',
        'test_nvidia': 'NVIDIA',
        'test_watsonx': 'IBM watsonx',
        'test_snowflake': 'Snowflake',
        'test_infinity': 'Infinity',
        'test_jina': 'Jina AI',
        'test_deepgram': 'Deepgram',
        'test_clarifai': 'Clarifai',
        'test_triton': 'Triton',
    }

    for key, provider in provider_mapping.items():
        if key in test_file:
            return provider

    # For cross-provider test files
    if any(name in test_file for name in ['test_optional_params', 'test_prompt_factory',
                                          'test_router', 'test_text_completion']):
        return f'Cross-Provider Tests ({test_file})'

    return 'Other Tests'


def format_duration(seconds: float) -> str:
    """Format duration in human-readable format"""
    if seconds < 60:
        return f"{seconds:.2f}s"
    elif seconds < 3600:
        minutes = int(seconds // 60)
        secs = seconds % 60
        return f"{minutes}m {secs:.0f}s"
    else:
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        return f"{hours}h {minutes}m"


def generate_markdown_report(junit_xml_path: str, output_path: str,
                             tag: str = None, commit: str = None):
    """Generate a beautiful markdown report from JUnit XML"""
    try:
        tree = ET.parse(junit_xml_path)
        root = tree.getroot()

        # Handle both testsuite and testsuites root
        if root.tag == 'testsuites':
            suites = root.findall('testsuite')
        else:
            suites = [root]

        # Overall statistics
        total_tests = 0
        total_failures = 0
        total_errors = 0
        total_skipped = 0
        total_time = 0.0

        # Provider breakdown
        provider_stats = defaultdict(lambda: {'passed': 0, 'failed': 0, 'skipped': 0,
                                              'errors': 0, 'time': 0.0})
        provider_tests = defaultdict(list)

        for suite in suites:
            total_tests += int(suite.get('tests', 0))
            total_failures += int(suite.get('failures', 0))
            total_errors += int(suite.get('errors', 0))
            total_skipped += int(suite.get('skipped', 0))
            total_time += float(suite.get('time', 0))

            for testcase in suite.findall('testcase'):
                classname = testcase.get('classname', '')
                test_name = testcase.get('name', '')
                test_time = float(testcase.get('time', 0))

                # Extract test file name from classname
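                # Example (illustrative values, not from a real run): a classname such as
                # "tests.llm_translation.test_anthropic_completion.TestAnthropicCompletion"
                # splits on '.' and the second-to-last element is the module name,
                # here "test_anthropic_completion", which the provider mapping keys on.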
                if '.' in classname:
                    parts = classname.split('.')
                    test_file = parts[-2] if len(parts) > 1 else 'unknown'
                else:
                    test_file = 'unknown'

                provider = get_provider_from_test_file(test_file)
                provider_stats[provider]['time'] += test_time

                # Check test status
                if testcase.find('failure') is not None:
                    provider_stats[provider]['failed'] += 1
                    failure = testcase.find('failure')
                    failure_msg = failure.get('message', '') if failure is not None else ''
                    provider_tests[provider].append({
                        'name': test_name,
                        'status': 'FAILED',
                        'time': test_time,
                        'message': failure_msg
                    })
                elif testcase.find('error') is not None:
                    provider_stats[provider]['errors'] += 1
                    error = testcase.find('error')
                    error_msg = error.get('message', '') if error is not None else ''
                    provider_tests[provider].append({
                        'name': test_name,
                        'status': 'ERROR',
                        'time': test_time,
                        'message': error_msg
                    })
                elif testcase.find('skipped') is not None:
                    provider_stats[provider]['skipped'] += 1
                    skip = testcase.find('skipped')
                    skip_msg = skip.get('message', '') if skip is not None else ''
                    provider_tests[provider].append({
                        'name': test_name,
                        'status': 'SKIPPED',
                        'time': test_time,
                        'message': skip_msg
                    })
                else:
                    provider_stats[provider]['passed'] += 1
                    provider_tests[provider].append({
                        'name': test_name,
                        'status': 'PASSED',
                        'time': test_time,
                        'message': ''
                    })

        passed = total_tests - total_failures - total_errors - total_skipped

        # Generate the markdown report
        with open(output_path, 'w') as f:
            # Header
            f.write("# LLM Translation Test Results\n\n")

            # Metadata table
            f.write("## Test Run Information\n\n")
            f.write("| Field | Value |\n")
            f.write("|-------|-------|\n")
            f.write(f"| **Tag** | `{tag or 'N/A'}` |\n")
            f.write(f"| **Date** | {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')} |\n")
            f.write(f"| **Commit** | `{commit or 'N/A'}` |\n")
            f.write(f"| **Duration** | {format_duration(total_time)} |\n")
            f.write("\n")

            # Overall statistics with visual elements
            f.write("## Overall Statistics\n\n")

            # Summary box
            f.write("```\n")
            f.write(f"Total Tests: {total_tests}\n")
            f.write(f"├── Passed: {passed:>4} ({(passed/total_tests)*100 if total_tests > 0 else 0:.1f}%)\n")
            f.write(f"├── Failed: {total_failures:>4} ({(total_failures/total_tests)*100 if total_tests > 0 else 0:.1f}%)\n")
            f.write(f"├── Errors: {total_errors:>4} ({(total_errors/total_tests)*100 if total_tests > 0 else 0:.1f}%)\n")
            f.write(f"└── Skipped: {total_skipped:>4} ({(total_skipped/total_tests)*100 if total_tests > 0 else 0:.1f}%)\n")
            f.write("```\n\n")

            # Provider summary table
            f.write("## Results by Provider\n\n")
            f.write("| Provider | Total | Pass | Fail | Error | Skip | Pass Rate | Duration |\n")
            f.write("|----------|-------|------|------|-------|------|-----------|----------|")

            # Sort providers: specific providers first, then cross-provider tests
            sorted_providers = []
            cross_provider = []
            for p in sorted(provider_stats.keys()):
                if 'Cross-Provider' in p or p == 'Other Tests':
                    cross_provider.append(p)
                else:
                    sorted_providers.append(p)

            all_providers = sorted_providers + cross_provider

            for provider in all_providers:
                stats = provider_stats[provider]
                total = stats['passed'] + stats['failed'] + stats['errors'] + stats['skipped']
                pass_rate = (stats['passed'] / total * 100) if total > 0 else 0
                f.write(f"\n| {provider} | {total} | {stats['passed']} | {stats['failed']} | ")
                f.write(f"{stats['errors']} | {stats['skipped']} | {pass_rate:.1f}% | ")
                f.write(f"{format_duration(stats['time'])} |")

            # Detailed test results by provider
            f.write("\n\n## Detailed Test Results\n\n")

            for provider in sorted_providers:
                if provider_tests[provider]:
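                    # Each provider gets its own subsection: a one-line pass/total summary,
                    # then the individual tests grouped by status (failed first, then
                    # errors, passed, and skipped).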
                    stats = provider_stats[provider]
                    total = stats['passed'] + stats['failed'] + stats['errors'] + stats['skipped']

                    f.write(f"### {provider}\n\n")
                    f.write(f"**Summary:** {stats['passed']}/{total} passed ")
                    f.write(f"({(stats['passed']/total)*100 if total > 0 else 0:.1f}%) ")
                    f.write(f"in {format_duration(stats['time'])}\n\n")

                    # Group tests by status
                    tests_by_status = defaultdict(list)
                    for test in provider_tests[provider]:
                        tests_by_status[test['status']].append(test)

                    # Show failed tests first (if any)
                    if tests_by_status['FAILED']:
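                        # <details>/<summary> render as a collapsible block on GitHub,
                        # which keeps long test lists from overwhelming the report.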
                        f.write("<details>\n<summary>Failed Tests</summary>\n\n")
                        for test in tests_by_status['FAILED']:
                            f.write(f"- `{test['name']}` ({test['time']:.2f}s)\n")
                            if test['message']:
                                # Truncate long error messages
                                msg = test['message'][:200] + '...' if len(test['message']) > 200 else test['message']
                                f.write(f" > {msg}\n")
\n\n") # Show errors (if any) if tests_by_status['ERROR']: f.write("
                        f.write("<details>\n<summary>Error Tests</summary>\n\n")
                        for test in tests_by_status['ERROR']:
                            f.write(f"- `{test['name']}` ({test['time']:.2f}s)\n")
\n\n") # Show passed tests in collapsible section if tests_by_status['PASSED']: f.write("
                        f.write("<details>\n<summary>Passed Tests</summary>\n\n")
                        for test in tests_by_status['PASSED']:
                            f.write(f"- `{test['name']}` ({test['time']:.2f}s)\n")
\n\n") # Show skipped tests (if any) if tests_by_status['SKIPPED']: f.write("
                        f.write("<details>\n<summary>Skipped Tests</summary>\n\n")
                        for test in tests_by_status['SKIPPED']:
                            f.write(f"- `{test['name']}`\n")
\n\n") # Cross-provider tests in a separate section if cross_provider: f.write("### Cross-Provider Tests\n\n") for provider in cross_provider: if provider_tests[provider]: stats = provider_stats[provider] total = stats['passed'] + stats['failed'] + stats['errors'] + stats['skipped'] f.write(f"#### {provider}\n\n") f.write(f"**Summary:** {stats['passed']}/{total} passed ") f.write(f"({(stats['passed']/total)*100 if total > 0 else 0:.1f}%)\n\n") # For cross-provider tests, just show counts f.write(f"- Passed: {stats['passed']}\n") if stats['failed'] > 0: f.write(f"- Failed: {stats['failed']}\n") if stats['errors'] > 0: f.write(f"- Errors: {stats['errors']}\n") if stats['skipped'] > 0: f.write(f"- Skipped: {stats['skipped']}\n") f.write("\n") print_colored(f"Report generated: {output_path}", Colors.GREEN) except Exception as e: print_colored(f"Error generating report: {e}", Colors.RED) raise def run_tests(test_path: str = "tests/llm_translation/", junit_xml: str = "test-results/junit.xml", report_path: str = "test-results/llm_translation_report.md", tag: str = None, commit: str = None) -> int: """Run the LLM translation tests and generate report""" # Create test results directory os.makedirs(os.path.dirname(junit_xml), exist_ok=True) print_colored("Starting LLM Translation Tests", Colors.BOLD + Colors.BLUE) print_colored(f"Test directory: {test_path}", Colors.CYAN) print_colored(f"Output: {junit_xml}", Colors.CYAN) print() # Run pytest cmd = [ "poetry", "run", "pytest", test_path, f"--junitxml={junit_xml}", "-v", "--tb=short", "--maxfail=500", "-n", "auto" ] # Add timeout if pytest-timeout is installed try: subprocess.run(["poetry", "run", "python", "-c", "import pytest_timeout"], capture_output=True, check=True) cmd.extend(["--timeout=300"]) except: print_colored("Warning: pytest-timeout not installed, skipping timeout option", Colors.YELLOW) print_colored("Running pytest with command:", Colors.YELLOW) print(f" {' '.join(cmd)}") print() # Run the tests result = subprocess.run(cmd, capture_output=False) # Generate the report regardless of test outcome if os.path.exists(junit_xml): print() print_colored("Generating test report...", Colors.BLUE) generate_markdown_report(junit_xml, report_path, tag, commit) # Print summary to console print() print_colored("Test Summary:", Colors.BOLD + Colors.PURPLE) # Parse XML for quick summary tree = ET.parse(junit_xml) root = tree.getroot() if root.tag == 'testsuites': suites = root.findall('testsuite') else: suites = [root] total = sum(int(s.get('tests', 0)) for s in suites) failures = sum(int(s.get('failures', 0)) for s in suites) errors = sum(int(s.get('errors', 0)) for s in suites) skipped = sum(int(s.get('skipped', 0)) for s in suites) passed = total - failures - errors - skipped print(f" Total: {total}") print_colored(f" Passed: {passed}", Colors.GREEN) if failures > 0: print_colored(f" Failed: {failures}", Colors.RED) if errors > 0: print_colored(f" Errors: {errors}", Colors.RED) if skipped > 0: print_colored(f" Skipped: {skipped}", Colors.YELLOW) if total > 0: pass_rate = (passed / total) * 100 color = Colors.GREEN if pass_rate >= 80 else Colors.YELLOW if pass_rate >= 60 else Colors.RED print_colored(f" Pass Rate: {pass_rate:.1f}%", color) else: print_colored("No test results found!", Colors.RED) print() print_colored("Test run complete!", Colors.BOLD + Colors.GREEN) return result.returncode if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Run LLM Translation Tests") parser.add_argument("--test-path", 
default="tests/llm_translation/", help="Path to test directory") parser.add_argument("--junit-xml", default="test-results/junit.xml", help="Path for JUnit XML output") parser.add_argument("--report", default="test-results/llm_translation_report.md", help="Path for markdown report") parser.add_argument("--tag", help="Git tag or version") parser.add_argument("--commit", help="Git commit SHA") args = parser.parse_args() # Get git info if not provided if not args.commit: try: result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True) if result.returncode == 0: args.commit = result.stdout.strip() except: pass if not args.tag: try: result = subprocess.run(["git", "describe", "--tags", "--abbrev=0"], capture_output=True, text=True) if result.returncode == 0: args.tag = result.stdout.strip() except: pass exit_code = run_tests( test_path=args.test_path, junit_xml=args.junit_xml, report_path=args.report, tag=args.tag, commit=args.commit ) sys.exit(exit_code)