import asyncio
import logging
import os
from datetime import datetime

from test_runner import run_single_test_with_timeout

logger = logging.getLogger(__name__)


async def run_reliability_test(computer, test_path, rp_client=None, launch_id=None,
                               max_turns=30, jan_app_path=None, jan_process_name="Jan.exe",
                               agent_config=None, enable_reportportal=False,
                               phase="development", runs=None):
    """
    Run a single test case multiple times to verify reliability and stability.

    Args:
        computer: Computer agent instance
        test_path: Path to the test file to run
        rp_client: ReportPortal client (optional)
        launch_id: ReportPortal launch ID (optional)
        max_turns: Maximum turns per test
        jan_app_path: Path to the Jan application
        jan_process_name: Jan process name for monitoring
        agent_config: Agent configuration
        enable_reportportal: Whether to upload results to ReportPortal
        phase: "development" (5 runs) or "deployment" (20 runs)
        runs: Explicit number of runs; overrides the phase default when given

    Returns:
        dict with reliability test results
    """
    # An explicit `runs` overrides the phase default, as documented above
    if runs is not None:
        target_runs = runs
    elif phase == "deployment":
        target_runs = 20
    else:
        # "development" (and any unrecognized phase) defaults to 5 runs
        target_runs = 5

    logger.info("=" * 100)
    logger.info(f"RELIABILITY TESTING: {test_path.upper()}")
    logger.info("=" * 100)
    logger.info(f"Phase: {phase.upper()}")
    logger.info(f"Target runs: {target_runs}")
    logger.info(f"Test file: {test_path}")
    logger.info("")

    # Load the test content
    if not os.path.exists(test_path):
        raise FileNotFoundError(f"Test file not found: {test_path}")

    with open(test_path, "r", encoding="utf-8") as f:
        test_content = f.read()

    test_data = {
        "path": test_path,
        "prompt": test_content
    }

    # Initialize results tracking
    reliability_results = {
        "test_path": test_path,
        "phase": phase,
        "target_runs": target_runs,
        "completed_runs": 0,
        "successful_runs": 0,
        "failed_runs": 0,
        "run_details": [],
        "start_time": datetime.now(),
        "end_time": None,
        "success_rate": 0.0,
        "overall_success": False
    }

    logger.info(f"Starting reliability testing with {target_runs} runs...")
    logger.info("=" * 80)

    try:
        for run_number in range(1, target_runs + 1):
            logger.info(f"Run {run_number}/{target_runs}")
            logger.info("-" * 40)

            run_start_time = datetime.now()

            try:
                # Run the test once
                test_result = await run_single_test_with_timeout(
                    computer=computer,
                    test_data=test_data,
                    rp_client=rp_client,
                    launch_id=launch_id,
                    max_turns=max_turns,
                    jan_app_path=jan_app_path,
                    jan_process_name=jan_process_name,
                    agent_config=agent_config,
                    enable_reportportal=enable_reportportal
                )

                # Extract the success flag; the runner may return a dict,
                # a bool, or an object with a `success` attribute
                success = False
                if test_result:
                    if isinstance(test_result, dict):
                        success = test_result.get('success', False)
                    elif isinstance(test_result, bool):
                        success = test_result
                    elif hasattr(test_result, 'success'):
                        success = getattr(test_result, 'success', False)
                    else:
                        success = bool(test_result)

                run_end_time = datetime.now()
                run_duration = (run_end_time - run_start_time).total_seconds()

                # Record the run result
                run_result = {
                    "run_number": run_number,
                    "success": success,
                    "start_time": run_start_time,
                    "end_time": run_end_time,
                    "duration_seconds": run_duration,
                    "test_result": test_result
                }
                reliability_results["run_details"].append(run_result)
                reliability_results["completed_runs"] += 1

                if success:
                    reliability_results["successful_runs"] += 1
                    logger.info(f"✅ Run {run_number}: SUCCESS ({run_duration:.1f}s)")
                else:
                    reliability_results["failed_runs"] += 1
                    logger.error(f"❌ Run {run_number}: FAILED ({run_duration:.1f}s)")
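
                # Note: the running tally is recomputed and logged after every
                # run so failures surface immediately (e.g. 3 successes after
                # 4 completed runs logs "3/4 (75.0%)").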
                # Calculate the current success rate
                current_success_rate = (reliability_results["successful_runs"] /
                                        reliability_results["completed_runs"]) * 100
                logger.info(f"Current success rate: {reliability_results['successful_runs']}/"
                            f"{reliability_results['completed_runs']} ({current_success_rate:.1f}%)")

            except Exception as e:
                run_end_time = datetime.now()
                run_duration = (run_end_time - run_start_time).total_seconds()

                # Record the failed run
                run_result = {
                    "run_number": run_number,
                    "success": False,
                    "start_time": run_start_time,
                    "end_time": run_end_time,
                    "duration_seconds": run_duration,
                    "error": str(e)
                }
                reliability_results["run_details"].append(run_result)
                reliability_results["completed_runs"] += 1
                reliability_results["failed_runs"] += 1

                logger.error(f"❌ Run {run_number}: EXCEPTION ({run_duration:.1f}s) - {e}")

                # Calculate the current success rate
                current_success_rate = (reliability_results["successful_runs"] /
                                        reliability_results["completed_runs"]) * 100
                logger.info(f"Current success rate: {reliability_results['successful_runs']}/"
                            f"{reliability_results['completed_runs']} ({current_success_rate:.1f}%)")

            # Delay between runs (skipped after the last run)
            if run_number < target_runs:
                delay_seconds = 5
                logger.info(f"Waiting {delay_seconds} seconds before next run...")
                await asyncio.sleep(delay_seconds)

        # Final calculations
        reliability_results["end_time"] = datetime.now()
        total_duration = (reliability_results["end_time"] - reliability_results["start_time"]).total_seconds()
        reliability_results["total_duration_seconds"] = total_duration

        if reliability_results["completed_runs"] > 0:
            reliability_results["success_rate"] = (reliability_results["successful_runs"] /
                                                   reliability_results["completed_runs"]) * 100

        # Determine overall success based on the phase requirement
        if phase == "development":
            # Development phase: ≥80% success rate required
            reliability_results["overall_success"] = reliability_results["success_rate"] >= 80.0
        else:
            # Deployment (and any other) phase: ≥90% success rate required
            reliability_results["overall_success"] = reliability_results["success_rate"] >= 90.0

        # Print the final summary
        logger.info("=" * 80)
        logger.info("RELIABILITY TEST SUMMARY")
        logger.info("=" * 80)
        logger.info(f"Test: {test_path}")
        logger.info(f"Phase: {phase.upper()}")
        logger.info(f"Completed runs: {reliability_results['completed_runs']}/{target_runs}")
        logger.info(f"Successful runs: {reliability_results['successful_runs']}")
        logger.info(f"Failed runs: {reliability_results['failed_runs']}")
        logger.info(f"Success rate: {reliability_results['success_rate']:.1f}%")
        logger.info(f"Total duration: {total_duration:.1f} seconds")
        if reliability_results["completed_runs"] > 0:  # guard against division by zero
            logger.info(f"Average duration per run: "
                        f"{total_duration / reliability_results['completed_runs']:.1f} seconds")
        logger.info(f"Overall result: {'✅ PASSED' if reliability_results['overall_success'] else '❌ FAILED'}")

        # Phase-specific requirements
        if phase == "development":
            logger.info("Development phase requirement: ≥80% success rate")
        else:
            logger.info("Deployment phase requirement: ≥90% success rate")

        return reliability_results

    except Exception as e:
        logger.error(f"Reliability testing failed with exception: {e}")
        reliability_results["end_time"] = datetime.now()
        reliability_results["error_message"] = str(e)
        return reliability_results
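

# A minimal sketch (not part of the existing API; the name is illustrative)
# of the pass/fail rule above as a pure function, which makes the thresholds
# easy to unit-test in isolation. With the defaults, development means at
# least 4 of 5 runs must pass, and deployment at least 18 of 20.
def meets_phase_threshold(successful_runs, completed_runs, phase):
    """Return True if the success rate satisfies the phase requirement."""
    if completed_runs == 0:
        return False
    success_rate = (successful_runs / completed_runs) * 100
    # Development requires ≥80%; deployment (and any other phase) ≥90%,
    # mirroring the logic in run_reliability_test above.
    threshold = 80.0 if phase == "development" else 90.0
    return success_rate >= threshold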


async def run_reliability_tests(computer, test_paths, rp_client=None, launch_id=None,
                                max_turns=30, jan_app_path=None, jan_process_name="Jan.exe",
                                agent_config=None, enable_reportportal=False,
                                phase="development", runs=None):
    """
    Run reliability tests for multiple test files.

    Args:
        computer: Computer agent instance
        test_paths: List of test file paths, or a single path
        rp_client: ReportPortal client (optional)
        launch_id: ReportPortal launch ID (optional)
        max_turns: Maximum turns per test
        jan_app_path: Path to the Jan application
        jan_process_name: Jan process name for monitoring
        agent_config: Agent configuration
        enable_reportportal: Whether to upload results to ReportPortal
        phase: "development" (5 runs) or "deployment" (20 runs)
        runs: Explicit number of runs; overrides the phase default when given

    Returns:
        dict with overall reliability test results
    """
    # Accept a single path as well as a list
    if isinstance(test_paths, str):
        test_paths = [test_paths]

    logger.info("=" * 100)
    logger.info("RELIABILITY TESTING SUITE")
    logger.info("=" * 100)
    logger.info(f"Phase: {phase.upper()}")
    logger.info(f"Test files: {len(test_paths)}")
    logger.info(f"Test paths: {', '.join(test_paths)}")
    logger.info("")

    overall_results = {
        "phase": phase,
        "total_tests": len(test_paths),
        "completed_tests": 0,
        "passed_tests": 0,
        "failed_tests": 0,
        "test_results": {},
        "start_time": datetime.now(),
        "end_time": None,
        "overall_success": False
    }

    try:
        for i, test_path in enumerate(test_paths, 1):
            logger.info(f"Starting reliability test {i}/{len(test_paths)}: {test_path}")

            test_result = await run_reliability_test(
                computer=computer,
                test_path=test_path,
                rp_client=rp_client,
                launch_id=launch_id,
                max_turns=max_turns,
                jan_app_path=jan_app_path,
                jan_process_name=jan_process_name,
                agent_config=agent_config,
                enable_reportportal=enable_reportportal,
                phase=phase,
                runs=runs
            )

            overall_results["test_results"][test_path] = test_result
            overall_results["completed_tests"] += 1

            if test_result and test_result.get("overall_success", False):
                overall_results["passed_tests"] += 1
                logger.info(f"✅ Test {i} PASSED: {test_path}")
            else:
                overall_results["failed_tests"] += 1
                logger.error(f"❌ Test {i} FAILED: {test_path}")

            # Delay between tests (skipped after the last test)
            if i < len(test_paths):
                delay_seconds = 10
                logger.info(f"Waiting {delay_seconds} seconds before next test...")
                await asyncio.sleep(delay_seconds)

        # Final calculations
        overall_results["end_time"] = datetime.now()
        total_duration = (overall_results["end_time"] - overall_results["start_time"]).total_seconds()
        overall_results["total_duration_seconds"] = total_duration

        # The suite passes only if every completed test passed
        if overall_results["completed_tests"] > 0:
            overall_results["overall_success"] = overall_results["failed_tests"] == 0

        # Print the overall summary
        logger.info("=" * 100)
        logger.info("RELIABILITY TESTING SUITE SUMMARY")
        logger.info("=" * 100)
        logger.info(f"Phase: {phase.upper()}")
        logger.info(f"Total tests: {overall_results['total_tests']}")
        logger.info(f"Completed tests: {overall_results['completed_tests']}")
        logger.info(f"Passed tests: {overall_results['passed_tests']}")
        logger.info(f"Failed tests: {overall_results['failed_tests']}")
        logger.info(f"Total duration: {total_duration:.1f} seconds")
        logger.info(f"Overall result: {'✅ PASSED' if overall_results['overall_success'] else '❌ FAILED'}")

        # Individual test results
        logger.info("")
        logger.info("Individual Test Results:")
        for test_path, test_result in overall_results["test_results"].items():
            if test_result:
                status = "✅ PASSED" if test_result.get("overall_success", False) else "❌ FAILED"
                success_rate = test_result.get("success_rate", 0.0)
                logger.info(f"  {test_path}: {status} ({success_rate:.1f}% success rate)")
            else:
                logger.info(f"  {test_path}: ❌ ERROR (no result)")

        return overall_results

    except Exception as e:
        logger.error(f"Reliability testing suite failed with exception: {e}")
        overall_results["end_time"] = datetime.now()
        overall_results["error_message"] = str(e)
        return overall_results
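

# Usage sketch (hedged): how a driver script might invoke the suite. The
# `Computer` import, its constructor, and the test paths below are
# assumptions for illustration only; substitute whatever agent factory and
# test files this project actually provides.
#
#     import asyncio
#     from computer import Computer  # hypothetical agent factory
#
#     async def main():
#         computer = Computer()  # assumed constructor
#         results = await run_reliability_tests(
#             computer=computer,
#             test_paths=["tests/example_test.txt"],  # illustrative path
#             phase="development",  # 5 runs per test, ≥80% required
#         )
#         raise SystemExit(0 if results["overall_success"] else 1)
#
#     if __name__ == "__main__":
#         asyncio.run(main())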