jan/autoqa/reliability_runner.py

"""Reliability test runner for Jan autoqa.

Executes a test case (or a set of test cases) repeatedly and reports
per-run results, success rates, and pass/fail against phase thresholds.
"""
import asyncio
import logging
import os
from datetime import datetime

from test_runner import run_single_test_with_timeout
from utils import scan_test_files

logger = logging.getLogger(__name__)


async def run_reliability_test(computer, test_path, rp_client=None, launch_id=None,
                               max_turns=30, jan_app_path=None, jan_process_name="Jan.exe",
                               agent_config=None, enable_reportportal=False,
                               phase="development", runs=None):
    """
    Run a single test case multiple times to verify reliability and stability.

    Args:
        computer: Computer agent instance
        test_path: Path to the test file to run
        rp_client: ReportPortal client (optional)
        launch_id: ReportPortal launch ID (optional)
        max_turns: Maximum turns per test
        jan_app_path: Path to the Jan application
        jan_process_name: Jan process name for monitoring
        agent_config: Agent configuration
        enable_reportportal: Whether to upload results to ReportPortal
        phase: "development" (5 runs) or "deployment" (20 runs)
        runs: Explicit number of runs; overrides the phase default if given

    Returns:
        dict with reliability test results
    """
    # Determine the number of runs: an explicit `runs` value overrides the
    # phase default (development=5, deployment=20).
    if runs is not None:
        target_runs = runs
    elif phase == "deployment":
        target_runs = 20
    else:
        target_runs = 5

    logger.info("=" * 100)
    logger.info(f"RELIABILITY TESTING: {test_path.upper()}")
    logger.info("=" * 100)
    logger.info(f"Phase: {phase.upper()}")
    logger.info(f"Target runs: {target_runs}")
    logger.info(f"Test file: {test_path}")
    logger.info("")

    # Load test content
    if not os.path.exists(test_path):
        raise FileNotFoundError(f"Test file not found: {test_path}")
    with open(test_path, "r", encoding="utf-8") as f:
        test_content = f.read()
    test_data = {
        "path": test_path,
        "prompt": test_content
    }

    # Initialize results tracking
    reliability_results = {
        "test_path": test_path,
        "phase": phase,
        "target_runs": target_runs,
        "completed_runs": 0,
        "successful_runs": 0,
        "failed_runs": 0,
        "run_details": [],
        "start_time": datetime.now(),
        "end_time": None,
        "success_rate": 0.0,
        "overall_success": False
    }

    logger.info(f"Starting reliability testing with {target_runs} runs...")
    logger.info("=" * 80)

    try:
        for run_number in range(1, target_runs + 1):
            logger.info(f"Run {run_number}/{target_runs}")
            logger.info("-" * 40)
            run_start_time = datetime.now()

            try:
                # Run the test once
                test_result = await run_single_test_with_timeout(
                    computer=computer,
                    test_data=test_data,
                    rp_client=rp_client,
                    launch_id=launch_id,
                    max_turns=max_turns,
                    jan_app_path=jan_app_path,
                    jan_process_name=jan_process_name,
                    agent_config=agent_config,
                    enable_reportportal=enable_reportportal
                )

                # Extract the success flag from whatever shape the result takes
                success = False
                if test_result:
                    if isinstance(test_result, dict):
                        success = test_result.get('success', False)
                    elif isinstance(test_result, bool):
                        success = test_result
                    elif hasattr(test_result, 'success'):
                        success = getattr(test_result, 'success', False)
                    else:
                        success = bool(test_result)

                run_end_time = datetime.now()
                run_duration = (run_end_time - run_start_time).total_seconds()

                # Record the run result
                reliability_results["run_details"].append({
                    "run_number": run_number,
                    "success": success,
                    "start_time": run_start_time,
                    "end_time": run_end_time,
                    "duration_seconds": run_duration,
                    "test_result": test_result
                })
                reliability_results["completed_runs"] += 1
                if success:
                    reliability_results["successful_runs"] += 1
                    logger.info(f"✅ Run {run_number}: SUCCESS ({run_duration:.1f}s)")
                else:
                    reliability_results["failed_runs"] += 1
                    logger.error(f"❌ Run {run_number}: FAILED ({run_duration:.1f}s)")

            except Exception as e:
                run_end_time = datetime.now()
                run_duration = (run_end_time - run_start_time).total_seconds()

                # Record the failed run
                reliability_results["run_details"].append({
                    "run_number": run_number,
                    "success": False,
                    "start_time": run_start_time,
                    "end_time": run_end_time,
                    "duration_seconds": run_duration,
                    "error": str(e)
                })
                reliability_results["completed_runs"] += 1
                reliability_results["failed_runs"] += 1
                logger.error(f"❌ Run {run_number}: EXCEPTION ({run_duration:.1f}s) - {e}")

            # Log the running success rate after every run
            current_success_rate = (reliability_results["successful_runs"] / reliability_results["completed_runs"]) * 100
            logger.info(f"Current success rate: {reliability_results['successful_runs']}/{reliability_results['completed_runs']} ({current_success_rate:.1f}%)")

            # Delay between runs (skipped after the last run)
            if run_number < target_runs:
                delay_seconds = 5
                logger.info(f"Waiting {delay_seconds} seconds before next run...")
                await asyncio.sleep(delay_seconds)

        # Final calculations
        reliability_results["end_time"] = datetime.now()
        total_duration = (reliability_results["end_time"] - reliability_results["start_time"]).total_seconds()
        reliability_results["total_duration_seconds"] = total_duration
        if reliability_results["completed_runs"] > 0:
            reliability_results["success_rate"] = (reliability_results["successful_runs"] / reliability_results["completed_runs"]) * 100

        # Determine overall success against the phase threshold:
        # development requires >=80%, deployment requires >=90%.
        if phase == "development":
            reliability_results["overall_success"] = reliability_results["success_rate"] >= 80.0
        else:
            reliability_results["overall_success"] = reliability_results["success_rate"] >= 90.0

        # Print final summary
        logger.info("=" * 80)
        logger.info("RELIABILITY TEST SUMMARY")
        logger.info("=" * 80)
        logger.info(f"Test: {test_path}")
        logger.info(f"Phase: {phase.upper()}")
        logger.info(f"Completed runs: {reliability_results['completed_runs']}/{target_runs}")
        logger.info(f"Successful runs: {reliability_results['successful_runs']}")
        logger.info(f"Failed runs: {reliability_results['failed_runs']}")
        logger.info(f"Success rate: {reliability_results['success_rate']:.1f}%")
        logger.info(f"Total duration: {total_duration:.1f} seconds")
        if reliability_results["completed_runs"] > 0:
            logger.info(f"Average duration per run: {total_duration / reliability_results['completed_runs']:.1f} seconds")
        logger.info(f"Overall result: {'✅ PASSED' if reliability_results['overall_success'] else '❌ FAILED'}")

        # Phase-specific requirements
        if phase == "development":
            logger.info("Development phase requirement: ≥80% success rate")
        else:
            logger.info("Deployment phase requirement: ≥90% success rate")

        return reliability_results

    except Exception as e:
        logger.error(f"Reliability testing failed with exception: {e}")
        reliability_results["end_time"] = datetime.now()
        reliability_results["error_message"] = str(e)
        return reliability_results
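
# Illustrative usage sketch (comments only, not executed): calling a single
# reliability pass from an async context. `make_computer()` and AGENT_CONFIG
# are hypothetical placeholders for whatever agent setup the caller uses.
#
#     results = await run_reliability_test(
#         computer=make_computer(),
#         test_path="tests/chat_basic.txt",
#         phase="development",   # 5 runs, >=80% success rate required
#         agent_config=AGENT_CONFIG,
#     )
#     logger.info(f"{results['success_rate']:.1f}% over {results['completed_runs']} runs")
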
async def run_reliability_tests(computer, test_paths, rp_client=None, launch_id=None,
                                max_turns=30, jan_app_path=None, jan_process_name="Jan.exe",
                                agent_config=None, enable_reportportal=False,
                                phase="development", runs=None):
    """
    Run reliability tests for multiple test files.

    Args:
        computer: Computer agent instance
        test_paths: List of test file paths, or a single path
        rp_client: ReportPortal client (optional)
        launch_id: ReportPortal launch ID (optional)
        max_turns: Maximum turns per test
        jan_app_path: Path to the Jan application
        jan_process_name: Jan process name for monitoring
        agent_config: Agent configuration
        enable_reportportal: Whether to upload results to ReportPortal
        phase: "development" (5 runs) or "deployment" (20 runs)
        runs: Explicit number of runs per test; overrides the phase default if given

    Returns:
        dict with overall reliability test results
    """
    # Accept a single path as well as a list
    if isinstance(test_paths, str):
        test_paths = [test_paths]

    logger.info("=" * 100)
    logger.info("RELIABILITY TESTING SUITE")
    logger.info("=" * 100)
    logger.info(f"Phase: {phase.upper()}")
    logger.info(f"Test files: {len(test_paths)}")
    logger.info(f"Test paths: {', '.join(test_paths)}")
    logger.info("")

    overall_results = {
        "phase": phase,
        "total_tests": len(test_paths),
        "completed_tests": 0,
        "passed_tests": 0,
        "failed_tests": 0,
        "test_results": {},
        "start_time": datetime.now(),
        "end_time": None,
        "overall_success": False
    }

    try:
        for i, test_path in enumerate(test_paths, 1):
            logger.info(f"Starting reliability test {i}/{len(test_paths)}: {test_path}")

            test_result = await run_reliability_test(
                computer=computer,
                test_path=test_path,
                rp_client=rp_client,
                launch_id=launch_id,
                max_turns=max_turns,
                jan_app_path=jan_app_path,
                jan_process_name=jan_process_name,
                agent_config=agent_config,
                enable_reportportal=enable_reportportal,
                phase=phase,
                runs=runs
            )

            overall_results["test_results"][test_path] = test_result
            overall_results["completed_tests"] += 1
            if test_result and test_result.get("overall_success", False):
                overall_results["passed_tests"] += 1
                logger.info(f"✅ Test {i} PASSED: {test_path}")
            else:
                overall_results["failed_tests"] += 1
                logger.error(f"❌ Test {i} FAILED: {test_path}")

            # Delay between tests (skipped after the last test)
            if i < len(test_paths):
                delay_seconds = 10
                logger.info(f"Waiting {delay_seconds} seconds before next test...")
                await asyncio.sleep(delay_seconds)

        # Final calculations
        overall_results["end_time"] = datetime.now()
        total_duration = (overall_results["end_time"] - overall_results["start_time"]).total_seconds()
        overall_results["total_duration_seconds"] = total_duration
        if overall_results["completed_tests"] > 0:
            overall_results["overall_success"] = overall_results["failed_tests"] == 0

        # Print overall summary
        logger.info("=" * 100)
        logger.info("RELIABILITY TESTING SUITE SUMMARY")
        logger.info("=" * 100)
        logger.info(f"Phase: {phase.upper()}")
        logger.info(f"Total tests: {overall_results['total_tests']}")
        logger.info(f"Completed tests: {overall_results['completed_tests']}")
        logger.info(f"Passed tests: {overall_results['passed_tests']}")
        logger.info(f"Failed tests: {overall_results['failed_tests']}")
        logger.info(f"Total duration: {total_duration:.1f} seconds")
        logger.info(f"Overall result: {'✅ PASSED' if overall_results['overall_success'] else '❌ FAILED'}")

        # Individual test results
        logger.info("")
        logger.info("Individual Test Results:")
        for test_path, test_result in overall_results["test_results"].items():
            if test_result:
                status = "✅ PASSED" if test_result.get("overall_success", False) else "❌ FAILED"
                success_rate = test_result.get("success_rate", 0.0)
                logger.info(f"  {test_path}: {status} ({success_rate:.1f}% success rate)")
            else:
                logger.info(f"  {test_path}: ❌ ERROR (no result)")

        return overall_results

    except Exception as e:
        logger.error(f"Reliability testing suite failed with exception: {e}")
        overall_results["end_time"] = datetime.now()
        overall_results["error_message"] = str(e)
        return overall_results
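
# Minimal command-line entry point, offered as a sketch rather than a
# definitive implementation: it scans a directory for test files with
# `scan_test_files` (imported above; assumed here to take a directory and
# return a list of test file paths) and runs the suite. Constructing the
# Computer agent depends on the caller's environment, so it is left as a
# labeled placeholder.
if __name__ == "__main__":
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="Jan reliability test runner (sketch)")
    parser.add_argument("test_dir", help="Directory containing test files")
    parser.add_argument("--phase", choices=["development", "deployment"], default="development")
    parser.add_argument("--runs", type=int, default=None, help="Override the per-phase run count")
    args = parser.parse_args()

    async def _main():
        test_paths = scan_test_files(args.test_dir)
        computer = None  # placeholder: construct the Computer agent instance here
        results = await run_reliability_tests(
            computer=computer,
            test_paths=test_paths,
            phase=args.phase,
            runs=args.runs,
        )
        # Exit non-zero if any test failed, so CI can gate on the result
        raise SystemExit(0 if results["overall_success"] else 1)

    asyncio.run(_main())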