diff --git a/autoqa/COMMAND_REFERENCE.md b/autoqa/COMMAND_REFERENCE.md index 5978f31a8..a40e91b29 100644 --- a/autoqa/COMMAND_REFERENCE.md +++ b/autoqa/COMMAND_REFERENCE.md @@ -140,8 +140,6 @@ MAX_TURNS=50 DELAY_BETWEEN_TESTS=5 python main.py ## Migration Testing Arguments -**Note**: These arguments are planned for future implementation based on your sample commands. - | Argument | Environment Variable | Default | Description | |----------|---------------------|---------|-------------| | `--enable-migration-test` | `ENABLE_MIGRATION_TEST` | `false` | Enable migration testing mode | @@ -150,6 +148,15 @@ MAX_TURNS=50 DELAY_BETWEEN_TESTS=5 python main.py | `--old-version` | `OLD_VERSION` | - | Path to old version installer | | `--new-version` | `NEW_VERSION` | - | Path to new version installer | +## Reliability Testing Arguments + +| Argument | Environment Variable | Default | Description | +|----------|---------------------|---------|-------------| +| `--enable-reliability-test` | `ENABLE_RELIABILITY_TEST` | `false` | Enable reliability testing mode | +| `--reliability-phase` | `RELIABILITY_PHASE` | `development` | Testing phase: development (5 runs) or deployment (20 runs) | +| `--reliability-runs` | `RELIABILITY_RUNS` | `0` | Custom number of runs (overrides phase setting) | +| `--reliability-test-path` | `RELIABILITY_TEST_PATH` | - | Specific test file path for reliability testing | + **Examples:** ```bash # Basic migration test @@ -216,6 +223,52 @@ python main.py \ --enable-reportportal \ --rp-token "YOUR_TOKEN" \ --rp-project "jan_migration_tests" +``` + +### Reliability Testing + +```bash +# Reliability testing - deployment phase with ReportPortal +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --reliability-test-path "tests/base/default-jan-assistant.txt" \ + --max-turns 50 \ + --enable-reportportal \ + --rp-token "YOUR_TOKEN" \ + --rp-project "jan_reliability_tests" + +# Development phase reliability test (5 runs) +python main.py \ + --enable-reliability-test \ + --reliability-phase development \ + --max-turns 40 + +# Deployment phase reliability test (20 runs) +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --max-turns 40 + +# Custom number of runs +python main.py \ + --enable-reliability-test \ + --reliability-runs 10 \ + --max-turns 40 + +# Test specific file with reliability testing +python main.py \ + --enable-reliability-test \ + --reliability-phase development \ + --reliability-test-path "tests/base/default-jan-assistant.txt" \ + --max-turns 40 + +# Reliability testing with ReportPortal +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --enable-reportportal \ + --rp-token "YOUR_TOKEN" \ + --max-turns 40 ``` ### Advanced Configuration @@ -265,13 +318,19 @@ python main.py \ - `TESTS_DIR`: Test files directory - `DELAY_BETWEEN_TESTS`: Delay between tests -### Migration Testing (Planned) +### Migration Testing - `ENABLE_MIGRATION_TEST`: Enable migration mode - `MIGRATION_TEST_CASE`: Migration test case - `MIGRATION_BATCH_MODE`: Use batch mode - `OLD_VERSION`: Old installer path - `NEW_VERSION`: New installer path +### Reliability Testing +- `ENABLE_RELIABILITY_TEST`: Enable reliability testing mode +- `RELIABILITY_PHASE`: Testing phase (development/deployment) +- `RELIABILITY_RUNS`: Custom number of runs +- `RELIABILITY_TEST_PATH`: Specific test file path + ## Help and Information ### Get Help diff --git a/autoqa/QUICK_START.md b/autoqa/QUICK_START.md index 8127114c7..eb145757d
100644 --- a/autoqa/QUICK_START.md +++ b/autoqa/QUICK_START.md @@ -46,6 +46,28 @@ python main.py \ --max-turns 75 ``` +### 4. Reliability Testing + +```bash +# Development phase (5 runs) +python main.py \ + --enable-reliability-test \ + --reliability-phase development \ + --max-turns 40 + +# Deployment phase (20 runs) +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --max-turns 40 + +# Custom number of runs +python main.py \ + --enable-reliability-test \ + --reliability-runs 10 \ + --max-turns 40 +``` + ## Test Types ### Base Test Cases @@ -61,6 +83,11 @@ python main.py \ - **`assistants`**: Test custom assistants persist after upgrade - **`assistants-complete`**: Test both creation and chat functionality +### Reliability Testing +- **Development Phase**: Run the test 5 times to verify basic stability (≥80% success rate) +- **Deployment Phase**: Run the test 20 times to verify production readiness (≥90% success rate) +- **Custom Runs**: Specify custom number of runs for specific testing needs + ## Common Commands ### Basic Workflow @@ -101,7 +128,18 @@ python main.py \ --migration-batch-mode \ --old-version "path/to/old.exe" \ --new-version "path/to/new.exe" -``` + +# Test reliability - development phase +python main.py \ + --enable-reliability-test \ + --reliability-phase development \ + --max-turns 40 + +# Test reliability - deployment phase +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --max-turns 40 +``` ## Configuration Options @@ -130,6 +168,14 @@ python main.py \ | `--rp-endpoint` | RP endpoint URL | No | | `--rp-project` | RP project name | No | +### Reliability Testing Arguments +| Argument | Description | Required | +|----------|-------------|----------| +| `--enable-reliability-test` | Enable reliability mode | Yes | +| `--reliability-phase` | Testing phase (development/deployment) | No | +| `--reliability-runs` | Custom number of runs | No | +| `--reliability-test-path` | Specific test file path | No | + ## Environment Variables ```bash @@ -179,6 +225,19 @@ python main.py \ --rp-project "jan_migration_tests" ``` +### Example 4: Reliability Testing +```bash +# Test reliability with deployment phase +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --reliability-test-path "tests/base/default-jan-assistant.txt" \ + --max-turns 50 \ + --enable-reportportal \ + --rp-token "YOUR_TOKEN" \ + --rp-project "jan_reliability_tests" +``` + ## Troubleshooting ### Common Issues diff --git a/autoqa/README.md b/autoqa/README.md index 1851909de..eb7e3d5c8 100644 --- a/autoqa/README.md +++ b/autoqa/README.md @@ -12,6 +12,7 @@ - 🎯 **Flexible Configuration**: Command-line arguments and environment variables - 🌐 **Cross-platform**: Windows, macOS, and Linux support - 📁 **Test Discovery**: Automatically scans test files from directory +- 🧪 **Reliability Testing**: Run tests multiple times to verify stability (development: 5 runs, deployment: 20 runs) ## Prerequisites @@ -74,6 +75,25 @@ python main.py \ --rp-token "YOUR_API_TOKEN" ``` +### Reliability Testing + +```bash +# Development phase (5 runs) - verify basic stability +python main.py --enable-reliability-test --reliability-phase development + +# Deployment phase (20 runs) - verify production readiness +python main.py --enable-reliability-test --reliability-phase deployment + +# Custom number of runs +python main.py --enable-reliability-test --reliability-runs 10 + +# Test specific file with reliability testing +python main.py \ +
--enable-reliability-test \ + --reliability-phase development \ + --reliability-test-path "tests/base/default-jan-assistant.txt" +``` + ## Configuration ### Command Line Arguments diff --git a/autoqa/RELIABILITY_TESTING.md b/autoqa/RELIABILITY_TESTING.md new file mode 100644 index 000000000..782c68d69 --- /dev/null +++ b/autoqa/RELIABILITY_TESTING.md @@ -0,0 +1,296 @@ +# AutoQA Reliability Testing Guide + +🚀 Comprehensive guide for running reliability tests with AutoQA to verify test case stability and reliability. + +## Overview + +Reliability testing is designed to verify that your test cases are stable and reliable by running them multiple times. This helps identify flaky tests and ensures consistent behavior before deploying to production. + +## Two Testing Phases + +### 1. Development Phase +- **Purpose**: Verify basic stability during development +- **Runs**: 5 times +- **Success Rate Requirement**: ≥80% +- **Use Case**: During development to catch obvious stability issues + +### 2. Deployment Phase +- **Purpose**: Verify production readiness +- **Runs**: 20 times +- **Success Rate Requirement**: ≥90% +- **Use Case**: Before deploying to production to ensure reliability + +## Command Line Usage + +### Basic Reliability Testing + +```bash +# Development phase (5 runs) +python main.py --enable-reliability-test --reliability-phase development + +# Deployment phase (20 runs) +python main.py --enable-reliability-test --reliability-phase deployment +``` + +### Custom Configuration + +```bash +# Custom number of runs +python main.py --enable-reliability-test --reliability-runs 10 + +# Specific test file +python main.py --enable-reliability-test --reliability-test-path "tests/base/default-jan-assistant.txt" + +# Custom max turns +python main.py --enable-reliability-test --reliability-phase development --max-turns 50 +``` + +### With ReportPortal Integration + +```bash +# Development phase with ReportPortal +python main.py \ + --enable-reliability-test \ + --reliability-phase development \ + --enable-reportportal \ + --rp-token "YOUR_TOKEN" \ + --rp-project "jan_reliability_tests" + +# Deployment phase with ReportPortal +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --enable-reportportal \ + --rp-token "YOUR_TOKEN" \ + --rp-project "jan_reliability_tests" +``` + +## Environment Variables + +```bash +# Enable reliability testing +export ENABLE_RELIABILITY_TEST=true + +# Set phase +export RELIABILITY_PHASE=deployment + +# Custom runs (overrides phase) +export RELIABILITY_RUNS=15 + +# Specific test path +export RELIABILITY_TEST_PATH="tests/base/my-test.txt" + +# Run with environment variables +python main.py --enable-reliability-test +``` + +## Command Line Arguments + +| Argument | Environment Variable | Default | Description | +|----------|---------------------|---------|-------------| +| `--enable-reliability-test` | `ENABLE_RELIABILITY_TEST` | `false` | Enable reliability testing mode | +| `--reliability-phase` | `RELIABILITY_PHASE` | `development` | Testing phase: development or deployment | +| `--reliability-runs` | `RELIABILITY_RUNS` | `0` | Custom number of runs (overrides phase) | +| `--reliability-test-path` | `RELIABILITY_TEST_PATH` | - | Specific test file path | + +## Test Execution Flow + +### Single Test Reliability Testing + +1. **Load Test File**: Read the specified test file +2. **Run Multiple Times**: Execute the test the specified number of times +3. **Track Results**: Monitor success/failure for each run +4.
**Calculate Success Rate**: Determine overall reliability +5. **Generate Report**: Provide detailed results and statistics + +### Multiple Tests Reliability Testing + +1. **Scan Test Files**: Find all test files in the specified directory +2. **Run Reliability Tests**: Execute reliability testing on each test file +3. **Aggregate Results**: Combine results from all tests +4. **Overall Assessment**: Determine if the entire test suite is reliable + +## Output and Results + +### Success Rate Calculation + +``` +Success Rate = (Successful Runs / Total Runs) × 100 +``` + +### Development Phase Requirements +- **Target**: 5 runs +- **Minimum Success Rate**: 80% +- **Result**: PASS if ≥80%, FAIL if <80% + +### Deployment Phase Requirements +- **Target**: 20 runs +- **Minimum Success Rate**: 90% +- **Result**: PASS if ≥90%, FAIL if <90% + +### Sample Output + +``` +========================================== +RELIABILITY TEST SUMMARY +========================================== +Test: tests/base/default-jan-assistant.txt +Phase: DEVELOPMENT +Completed runs: 5/5 +Successful runs: 4 +Failed runs: 1 +Success rate: 80.0% +Total duration: 125.3 seconds +Average duration per run: 25.1 seconds +Overall result: ✅ PASSED +Development phase requirement: ≥80% success rate +``` + +## Use Cases + +### 1. New Test Development +```bash +# Test a new test case for basic stability +python main.py \ + --enable-reliability-test \ + --reliability-phase development \ + --reliability-test-path "tests/base/my-new-test.txt" +``` + +### 2. Pre-Production Validation +```bash +# Verify test suite is production-ready +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --tests-dir "tests/base" +``` + +### 3. Flaky Test Investigation +```bash +# Run a potentially flaky test multiple times +python main.py \ + --enable-reliability-test \ + --reliability-runs 25 \ + --reliability-test-path "tests/base/flaky-test.txt" +``` + +### 4. CI/CD Integration +```bash +# Automated reliability testing in CI/CD +ENABLE_RELIABILITY_TEST=true \ +RELIABILITY_PHASE=deployment \ +python main.py --max-turns 40 +``` + +## Best Practices + +### 1. Start with Development Phase +- Begin with 5 runs to catch obvious issues +- Use during active development +- Quick feedback on test stability + +### 2. Use Deployment Phase for Production +- Run 20 times before production deployment +- Ensures high reliability standards +- Catches intermittent failures + +### 3. Custom Runs for Specific Needs +- Use custom run counts for special testing scenarios +- Investigate flaky tests with higher run counts +- Balance between thoroughness and execution time + +### 4. Monitor Execution Time +- Reliability testing takes longer than single runs +- Plan accordingly for CI/CD pipelines +- Consider parallel execution for multiple test files + +## Troubleshooting + +### Common Issues + +#### 1. Test File Not Found +```bash +# Ensure test path is correct +python main.py \ + --enable-reliability-test \ + --reliability-test-path "tests/base/existing-test.txt" +``` + +#### 2. Low Success Rate +- Check test environment stability +- Verify test dependencies +- Review test logic for race conditions + +#### 3. Long Execution Time +- Reduce max turns if appropriate +- Use development phase for quick feedback +- Consider running fewer test files + +### Debug Mode + +```bash +# Enable debug logging +export LOG_LEVEL=DEBUG +export PYTHONPATH=.
+ +# Run with verbose output +python main.py --enable-reliability-test --reliability-phase development +``` + +## Integration with Existing Workflows + +### Migration Testing +```bash +# Run reliability tests on migration test cases +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --tests-dir "tests/migration" +``` + +### Base Testing +```bash +# Run reliability tests on base test cases +python main.py \ + --enable-reliability-test \ + --reliability-phase development \ + --tests-dir "tests/base" +``` + +### Custom Test Directories +```bash +# Run reliability tests on custom test directory +python main.py \ + --enable-reliability-test \ + --reliability-phase deployment \ + --tests-dir "my_custom_tests" +``` + +## Performance Considerations + +### Execution Time +- **Development Phase**: ~5x single test execution time +- **Deployment Phase**: ~20x single test execution time +- **Multiple Tests**: Multiply by number of test files + +### Resource Usage +- Screen recordings for each run +- Trajectory data for each run +- ReportPortal uploads (if enabled) + +### Optimization Tips +- Use development phase for quick feedback +- Run deployment phase during off-peak hours +- Consider parallel execution for multiple test files +- Clean up old recordings and trajectories regularly + +## Next Steps + +1. **Start Simple**: Begin with development phase on single test files +2. **Scale Up**: Move to deployment phase for critical tests +3. **Automate**: Integrate into CI/CD pipelines +4. **Monitor**: Track reliability trends over time +5. **Improve**: Use results to identify and fix flaky tests + +For more information, see the main [README.md](README.md), [QUICK_START.md](QUICK_START.md), and explore the test files in the `tests/` directory. 
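The pass/fail rule described in this guide (success rate = successful runs divided by total runs, times 100, with an 80% bar for the development phase and a 90% bar for the deployment phase) fits in a few lines of Python. The sketch below is illustrative only; `evaluate_reliability` is a hypothetical helper, not a function in the AutoQA codebase, but the arithmetic and thresholds mirror the guide and the summary that `reliability_runner.py` prints.

```python
def evaluate_reliability(successful_runs: int, completed_runs: int, phase: str) -> tuple[float, bool]:
    """Illustrative only: compute the success rate and apply the phase threshold."""
    if completed_runs == 0:
        return 0.0, False
    success_rate = successful_runs / completed_runs * 100   # Success Rate = (Successful Runs / Total Runs) x 100
    threshold = 90.0 if phase == "deployment" else 80.0     # deployment needs >=90%, development needs >=80%
    return success_rate, success_rate >= threshold

# Matches the sample output above: 4 successful runs out of 5 in the development phase -> (80.0, True)
print(evaluate_reliability(4, 5, "development"))
```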
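The execution-time guidance above (development is roughly 5x and deployment roughly 20x a single run, multiplied by the number of test files) can also be turned into a rough schedule estimate for CI planning. The helper below is a back-of-the-envelope sketch, not part of the tool; the per-run duration is whatever you measure yourself (the sample output above averaged about 25 seconds per run), and the delay defaults are assumed to match the 5-second pause between runs and 10-second pause between test files used by `reliability_runner.py`.

```python
def estimate_suite_seconds(num_tests: int, runs_per_test: int, avg_run_seconds: float,
                           delay_between_runs: float = 5.0, delay_between_tests: float = 10.0) -> float:
    """Rough wall-clock estimate for a reliability suite (delay defaults mirror reliability_runner.py)."""
    per_test = runs_per_test * avg_run_seconds + (runs_per_test - 1) * delay_between_runs
    return num_tests * per_test + (num_tests - 1) * delay_between_tests

# Example: 3 test files in the deployment phase (20 runs each) at ~25 s per run -> about 30 minutes
print(estimate_suite_seconds(3, 20, 25.0) / 60)  # ~30.1
```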
diff --git a/autoqa/batch_migration_runner.py b/autoqa/batch_migration_runner.py index 31759c59e..87095f794 100644 --- a/autoqa/batch_migration_runner.py +++ b/autoqa/batch_migration_runner.py @@ -209,7 +209,7 @@ async def run_batch_migration_test(computer, old_version_path, new_version_path, test_case_setup_success = False continue - with open(setup_test_path, "r") as f: + with open(setup_test_path, "r", encoding="utf-8") as f: setup_content = f.read() setup_test_data = { @@ -331,7 +331,7 @@ async def run_batch_migration_test(computer, old_version_path, new_version_path, test_case_verify_success = False continue - with open(verify_test_path, "r") as f: + with open(verify_test_path, "r", encoding="utf-8") as f: verify_content = f.read() verify_test_data = { diff --git a/autoqa/individual_migration_runner.py b/autoqa/individual_migration_runner.py index e8df1b4ed..56dca89eb 100644 --- a/autoqa/individual_migration_runner.py +++ b/autoqa/individual_migration_runner.py @@ -101,7 +101,7 @@ async def run_individual_migration_test(computer, test_case_key, old_version_pat if not os.path.exists(setup_test_path): raise FileNotFoundError(f"Setup test file not found: {setup_test_path}") - with open(setup_test_path, "r") as f: + with open(setup_test_path, "r", encoding="utf-8") as f: setup_content = f.read() setup_test_data = { @@ -151,7 +151,7 @@ async def run_individual_migration_test(computer, test_case_key, old_version_pat if not os.path.exists(verify_test_path): raise FileNotFoundError(f"Verification test file not found: {verify_test_path}") - with open(verify_test_path, "r") as f: + with open(verify_test_path, "r", encoding="utf-8") as f: verify_content = f.read() verify_test_data = { diff --git a/autoqa/main.py b/autoqa/main.py index c26f9c3b7..5f009c566 100644 --- a/autoqa/main.py +++ b/autoqa/main.py @@ -13,6 +13,7 @@ from reportportal_client.helpers import timestamp from utils import scan_test_files from test_runner import run_single_test_with_timeout from individual_migration_runner import run_individual_migration_test, run_all_migration_tests, MIGRATION_TEST_CASES +from reliability_runner import run_reliability_test, run_reliability_tests # Configure logging logging.basicConfig( @@ -184,8 +185,21 @@ Examples: # Run with different model python main.py --model-name "gpt-4" --model-base-url "https://api.openai.com/v1" + # Reliability testing - development phase (5 runs) + python main.py --enable-reliability-test --reliability-phase development + + # Reliability testing - deployment phase (20 runs) + python main.py --enable-reliability-test --reliability-phase deployment + + # Reliability testing - custom number of runs + python main.py --enable-reliability-test --reliability-runs 10 + + # Reliability testing - specific test file + python main.py --enable-reliability-test --reliability-test-path "tests/base/default-jan-assistant.txt" + # Using environment variables ENABLE_REPORTPORTAL=true RP_TOKEN=xxx MODEL_NAME=gpt-4 python main.py + ENABLE_RELIABILITY_TEST=true RELIABILITY_PHASE=deployment python main.py """ ) @@ -321,6 +335,32 @@ Examples: help='List available migration test cases and exit' ) + # Reliability testing arguments + reliability_group = parser.add_argument_group('Reliability Testing Configuration') + reliability_group.add_argument( + '--enable-reliability-test', + action='store_true', + default=os.getenv('ENABLE_RELIABILITY_TEST', 'false').lower() == 'true', + help='Enable reliability testing mode (env: ENABLE_RELIABILITY_TEST, default: false)' + ) + 
reliability_group.add_argument( + '--reliability-phase', + choices=['development', 'deployment'], + default=os.getenv('RELIABILITY_PHASE', 'development'), + help='Reliability testing phase: development (5 runs) or deployment (20 runs) (env: RELIABILITY_PHASE, default: development)' + ) + reliability_group.add_argument( + '--reliability-runs', + type=int, + default=int(os.getenv('RELIABILITY_RUNS', '0')), + help='Custom number of runs for reliability testing (overrides phase setting) (env: RELIABILITY_RUNS, default: 0)' + ) + reliability_group.add_argument( + '--reliability-test-path', + default=os.getenv('RELIABILITY_TEST_PATH'), + help='Specific test file path for reliability testing (env: RELIABILITY_TEST_PATH, if not specified, uses --tests-dir)' + ) + args = parser.parse_args() # Handle list migration tests @@ -407,6 +447,17 @@ async def main(): if args.enable_migration_test: logger.info(f"Old version installer: {args.old_version}") logger.info(f"New version installer: {args.new_version}") + logger.info(f"Reliability testing: {'ENABLED' if args.enable_reliability_test else 'DISABLED'}") + if args.enable_reliability_test: + logger.info(f"Reliability phase: {args.reliability_phase}") + if args.reliability_runs > 0: + logger.info(f"Custom runs: {args.reliability_runs}") + else: + logger.info(f"Phase runs: {5 if args.reliability_phase == 'development' else 20}") + if args.reliability_test_path: + logger.info(f"Specific test path: {args.reliability_test_path}") + else: + logger.info(f"Tests directory: {args.tests_dir}") logger.info("======================") # Initialize ReportPortal client only if enabled @@ -463,8 +514,65 @@ async def main(): await computer.run() logger.info("Computer environment ready") + # Check if reliability testing is enabled + if args.enable_reliability_test: + logger.info("=" * 60) + logger.info("RELIABILITY TESTING MODE ENABLED") + logger.info("=" * 60) + logger.info(f"Phase: {args.reliability_phase}") + if args.reliability_runs > 0: + logger.info(f"Custom runs: {args.reliability_runs}") + else: + logger.info(f"Phase runs: {5 if args.reliability_phase == 'development' else 20}") + + # Determine test paths for reliability testing + if args.reliability_test_path: + # Use specific test path + if not os.path.exists(args.reliability_test_path): + logger.error(f"Reliability test file not found: {args.reliability_test_path}") + final_exit_code = 1 + return final_exit_code + test_paths = [args.reliability_test_path] + logger.info(f"Running reliability test on specific file: {args.reliability_test_path}") + else: + # Use tests directory + test_files = scan_test_files(args.tests_dir) + if not test_files: + logger.warning(f"No test files found in directory: {args.tests_dir}") + return + test_paths = [test_data['path'] for test_data in test_files] + logger.info(f"Running reliability tests on {len(test_paths)} test files from: {args.tests_dir}") + + # Run reliability tests + reliability_results = await run_reliability_tests( + computer=computer, + test_paths=test_paths, + rp_client=rp_client, + launch_id=launch_id, + max_turns=args.max_turns, + jan_app_path=args.jan_app_path, + jan_process_name=args.jan_process_name, + agent_config=agent_config, + enable_reportportal=args.enable_reportportal, + phase=args.reliability_phase, + runs=args.reliability_runs if args.reliability_runs > 0 else None + ) + + # Handle reliability test results + if reliability_results and reliability_results.get("overall_success", False): + logger.info(f"[SUCCESS] Reliability testing completed 
successfully!") + final_exit_code = 0 + else: + logger.error(f"[FAILED] Reliability testing failed!") + if reliability_results and reliability_results.get("error_message"): + logger.error(f"Error: {reliability_results['error_message']}") + final_exit_code = 1 + + # Skip regular test execution in reliability mode + logger.info("Reliability testing completed. Skipping regular test execution.") + # Check if migration testing is enabled - if args.enable_migration_test: + elif args.enable_migration_test: logger.info("=" * 60) logger.info("MIGRATION TESTING MODE ENABLED") logger.info("=" * 60) diff --git a/autoqa/reliability_runner.py b/autoqa/reliability_runner.py new file mode 100644 index 000000000..2039eed37 --- /dev/null +++ b/autoqa/reliability_runner.py @@ -0,0 +1,334 @@ +import asyncio +import logging +import os +import time +from datetime import datetime +from pathlib import Path + +from test_runner import run_single_test_with_timeout +from utils import scan_test_files + +logger = logging.getLogger(__name__) + +async def run_reliability_test(computer, test_path, rp_client=None, launch_id=None, + max_turns=30, jan_app_path=None, jan_process_name="Jan.exe", + agent_config=None, enable_reportportal=False, + phase="development", runs=5): + """ + Run a single test case multiple times to verify reliability and stability + + Args: + computer: Computer agent instance + test_path: Path to the test file to run + rp_client: ReportPortal client (optional) + launch_id: ReportPortal launch ID (optional) + max_turns: Maximum turns per test + jan_app_path: Path to Jan application + jan_process_name: Jan process name for monitoring + agent_config: Agent configuration + enable_reportportal: Whether to upload to ReportPortal + phase: "development" (5 runs) or "deployment" (20 runs) + runs: Number of runs to execute (overrides phase if specified) + + Returns: + dict with reliability test results + """ + # Determine number of runs based on phase + if phase == "development": + target_runs = 5 + elif phase == "deployment": + target_runs = 20 + else: + target_runs = runs + + logger.info("=" * 100) + logger.info(f"RELIABILITY TESTING: {test_path.upper()}") + logger.info("=" * 100) + logger.info(f"Phase: {phase.upper()}") + logger.info(f"Target runs: {target_runs}") + logger.info(f"Test file: {test_path}") + logger.info("") + + # Load test content + if not os.path.exists(test_path): + raise FileNotFoundError(f"Test file not found: {test_path}") + + with open(test_path, "r", encoding="utf-8") as f: + test_content = f.read() + + test_data = { + "path": test_path, + "prompt": test_content + } + + # Initialize results tracking + reliability_results = { + "test_path": test_path, + "phase": phase, + "target_runs": target_runs, + "completed_runs": 0, + "successful_runs": 0, + "failed_runs": 0, + "run_details": [], + "start_time": datetime.now(), + "end_time": None, + "success_rate": 0.0, + "overall_success": False + } + + logger.info(f"Starting reliability testing with {target_runs} runs...") + logger.info("=" * 80) + + try: + for run_number in range(1, target_runs + 1): + logger.info(f"Run {run_number}/{target_runs}") + logger.info("-" * 40) + + run_start_time = datetime.now() + + try: + # Run the test + test_result = await run_single_test_with_timeout( + computer=computer, + test_data=test_data, + rp_client=rp_client, + launch_id=launch_id, + max_turns=max_turns, + jan_app_path=jan_app_path, + jan_process_name=jan_process_name, + agent_config=agent_config, + enable_reportportal=enable_reportportal + ) + + # Extract 
success status + success = False + if test_result: + if isinstance(test_result, dict): + success = test_result.get('success', False) + elif isinstance(test_result, bool): + success = test_result + elif hasattr(test_result, 'success'): + success = getattr(test_result, 'success', False) + else: + success = bool(test_result) + + run_end_time = datetime.now() + run_duration = (run_end_time - run_start_time).total_seconds() + + # Record run result + run_result = { + "run_number": run_number, + "success": success, + "start_time": run_start_time, + "end_time": run_end_time, + "duration_seconds": run_duration, + "test_result": test_result + } + + reliability_results["run_details"].append(run_result) + reliability_results["completed_runs"] += 1 + + if success: + reliability_results["successful_runs"] += 1 + logger.info(f"✅ Run {run_number}: SUCCESS ({run_duration:.1f}s)") + else: + reliability_results["failed_runs"] += 1 + logger.error(f"❌ Run {run_number}: FAILED ({run_duration:.1f}s)") + + # Calculate current success rate + current_success_rate = (reliability_results["successful_runs"] / reliability_results["completed_runs"]) * 100 + logger.info(f"Current success rate: {reliability_results['successful_runs']}/{reliability_results['completed_runs']} ({current_success_rate:.1f}%)") + + except Exception as e: + run_end_time = datetime.now() + run_duration = (run_end_time - run_start_time).total_seconds() + + # Record failed run + run_result = { + "run_number": run_number, + "success": False, + "start_time": run_start_time, + "end_time": run_end_time, + "duration_seconds": run_duration, + "error": str(e) + } + + reliability_results["run_details"].append(run_result) + reliability_results["completed_runs"] += 1 + reliability_results["failed_runs"] += 1 + + logger.error(f"❌ Run {run_number}: EXCEPTION ({run_duration:.1f}s) - {e}") + + # Calculate current success rate + current_success_rate = (reliability_results["successful_runs"] / reliability_results["completed_runs"]) * 100 + logger.info(f"Current success rate: {reliability_results['successful_runs']}/{reliability_results['completed_runs']} ({current_success_rate:.1f}%)") + + # Add delay between runs (except for the last run) + if run_number < target_runs: + delay_seconds = 5 + logger.info(f"Waiting {delay_seconds} seconds before next run...") + await asyncio.sleep(delay_seconds) + + # Final calculations + reliability_results["end_time"] = datetime.now() + total_duration = (reliability_results["end_time"] - reliability_results["start_time"]).total_seconds() + reliability_results["total_duration_seconds"] = total_duration + + if reliability_results["completed_runs"] > 0: + reliability_results["success_rate"] = (reliability_results["successful_runs"] / reliability_results["completed_runs"]) * 100 + + # Determine overall success based on phase + if phase == "development": + # Development phase: 80% success rate required + reliability_results["overall_success"] = reliability_results["success_rate"] >= 80.0 + else: + # Deployment phase: 90% success rate required + reliability_results["overall_success"] = reliability_results["success_rate"] >= 90.0 + + # Print final summary + logger.info("=" * 80) + logger.info("RELIABILITY TEST SUMMARY") + logger.info("=" * 80) + logger.info(f"Test: {test_path}") + logger.info(f"Phase: {phase.upper()}") + logger.info(f"Completed runs: {reliability_results['completed_runs']}/{target_runs}") + logger.info(f"Successful runs: {reliability_results['successful_runs']}") + logger.info(f"Failed runs:
{reliability_results['failed_runs']}") + logger.info(f"Success rate: {reliability_results['success_rate']:.1f}%") + logger.info(f"Total duration: {total_duration:.1f} seconds") + logger.info(f"Average duration per run: {total_duration / reliability_results['completed_runs']:.1f} seconds") + logger.info(f"Overall result: {'✅ PASSED' if reliability_results['overall_success'] else '❌ FAILED'}") + + # Phase-specific requirements + if phase == "development": + logger.info("Development phase requirement: ≥80% success rate") + else: + logger.info("Deployment phase requirement: ≥90% success rate") + + return reliability_results + + except Exception as e: + logger.error(f"Reliability testing failed with exception: {e}") + reliability_results["end_time"] = datetime.now() + reliability_results["error_message"] = str(e) + return reliability_results + +async def run_reliability_tests(computer, test_paths, rp_client=None, launch_id=None, + max_turns=30, jan_app_path=None, jan_process_name="Jan.exe", + agent_config=None, enable_reportportal=False, + phase="development", runs=None): + """ + Run reliability tests for multiple test files + + Args: + computer: Computer agent instance + test_paths: List of test file paths or single path + rp_client: ReportPortal client (optional) + launch_id: ReportPortal launch ID (optional) + max_turns: Maximum turns per test + jan_app_path: Path to Jan application + jan_process_name: Jan process name for monitoring + agent_config: Agent configuration + enable_reportportal: Whether to upload to ReportPortal + phase: "development" (5 runs) or "deployment" (20 runs) + runs: Number of runs to execute (overrides phase if specified) + + Returns: + dict with overall reliability test results + """ + # Convert single path to list + if isinstance(test_paths, str): + test_paths = [test_paths] + + logger.info("=" * 100) + logger.info("RELIABILITY TESTING SUITE") + logger.info("=" * 100) + logger.info(f"Phase: {phase.upper()}") + logger.info(f"Test files: {len(test_paths)}") + logger.info(f"Test paths: {', '.join(test_paths)}") + logger.info("") + + overall_results = { + "phase": phase, + "total_tests": len(test_paths), + "completed_tests": 0, + "passed_tests": 0, + "failed_tests": 0, + "test_results": {}, + "start_time": datetime.now(), + "end_time": None, + "overall_success": False + } + + try: + for i, test_path in enumerate(test_paths, 1): + logger.info(f"Starting reliability test {i}/{len(test_paths)}: {test_path}") + + test_result = await run_reliability_test( + computer=computer, + test_path=test_path, + rp_client=rp_client, + launch_id=launch_id, + max_turns=max_turns, + jan_app_path=jan_app_path, + jan_process_name=jan_process_name, + agent_config=agent_config, + enable_reportportal=enable_reportportal, + phase=phase, + runs=runs + ) + + overall_results["test_results"][test_path] = test_result + overall_results["completed_tests"] += 1 + + if test_result and test_result.get("overall_success", False): + overall_results["passed_tests"] += 1 + logger.info(f"✅ Test {i} PASSED: {test_path}") + else: + overall_results["failed_tests"] += 1 + logger.error(f"❌ Test {i} FAILED: {test_path}") + + # Add delay between tests (except for the last test) + if i < len(test_paths): + delay_seconds = 10 + logger.info(f"Waiting {delay_seconds} seconds before next test...") + await asyncio.sleep(delay_seconds) + + # Final calculations + overall_results["end_time"] = datetime.now() + total_duration = (overall_results["end_time"] - overall_results["start_time"]).total_seconds() +
overall_results["total_duration_seconds"] = total_duration + + if overall_results["completed_tests"] > 0: + overall_results["overall_success"] = overall_results["failed_tests"] == 0 + + # Print overall summary + logger.info("=" * 100) + logger.info("RELIABILITY TESTING SUITE SUMMARY") + logger.info("=" * 100) + logger.info(f"Phase: {phase.upper()}") + logger.info(f"Total tests: {overall_results['total_tests']}") + logger.info(f"Completed tests: {overall_results['completed_tests']}") + logger.info(f"Passed tests: {overall_results['passed_tests']}") + logger.info(f"Failed tests: {overall_results['failed_tests']}") + logger.info(f"Total duration: {total_duration:.1f} seconds") + logger.info(f"Overall result: {'โœ… PASSED' if overall_results['overall_success'] else 'โŒ FAILED'}") + + # Individual test results + logger.info("") + logger.info("Individual Test Results:") + for test_path, test_result in overall_results["test_results"].items(): + if test_result: + status = "โœ… PASSED" if test_result.get("overall_success", False) else "โŒ FAILED" + success_rate = test_result.get("success_rate", 0.0) + logger.info(f" {test_path}: {status} ({success_rate:.1f}% success rate)") + else: + logger.info(f" {test_path}: โŒ ERROR (no result)") + + return overall_results + + except Exception as e: + logger.error(f"Reliability testing suite failed with exception: {e}") + overall_results["end_time"] = datetime.now() + overall_results["error_message"] = str(e) + return overall_results diff --git a/autoqa/tests/migration/assistants/setup-chat-with-assistant.txt b/autoqa/tests/migration/assistants/setup-chat-with-assistant.txt index 9d38e3870..df526f1b2 100644 --- a/autoqa/tests/migration/assistants/setup-chat-with-assistant.txt +++ b/autoqa/tests/migration/assistants/setup-chat-with-assistant.txt @@ -49,7 +49,7 @@ Step-by-step instructions: - Choose: `jan-nano-gguf` under the `Llama.Cpp` section. 5. Send a test message: - - Type: `Hello world` and press Enter or click send message (button with right arrow). + - Type: `Hello world` and press Enter or click send message (button with right arrow). You should click at the center of the button. - Wait up to 1โ€“2 minutes for the model to load and respond. 6. Verify the model responds: