From c5ccd14048f4b702105491e7ef54077edf698756 Mon Sep 17 00:00:00 2001 From: NicholaiVogel Date: Mon, 20 Oct 2025 16:43:35 -0600 Subject: [PATCH] feat: Initial Whisper Transcription TUI implementation - Added Textual-based TUI with file selection and progress monitoring - Implemented transcription service with OpenAI API and local Whisper backends - Added markdown formatter for transcription output - Configuration management for persistent API keys and output directory - Comprehensive README with installation and usage instructions - Support for multi-file batch processing - Beautiful terminal UI with modal dialogs for user input --- .env.example | 8 + .gitignore | 55 +++++- README.md | 258 +++++++++++++++++++++++++ main.py | 26 +++ requirements.txt | 26 ++- src/__init__.py | 1 + src/app.py | 444 ++++++++++++++++++++++++++++++++++++++++++++ src/config.py | 93 ++++++++++ src/file_handler.py | 121 ++++++++++++ src/formatter.py | 79 ++++++++ src/transcriber.py | 141 ++++++++++++++ 11 files changed, 1237 insertions(+), 15 deletions(-) create mode 100644 .env.example create mode 100644 README.md create mode 100644 main.py create mode 100644 src/__init__.py create mode 100644 src/app.py create mode 100644 src/config.py create mode 100644 src/file_handler.py create mode 100644 src/formatter.py create mode 100644 src/transcriber.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..14cf71a --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +# OpenAI API Key (required for API-based transcription) +# Get your key from: https://platform.openai.com/api-keys +OPENAI_API_KEY=your_api_key_here + +# Optional: Whisper Model Size for local transcription +# Options: tiny, base, small, medium, large +# Default: base +WHISPER_MODEL=base diff --git a/.gitignore b/.gitignore index 6c531da..2440f65 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,55 @@ -opencode +# Environment and Configuration .env +.env.local +config.json + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual Environments +venv/ +ENV/ +env/ +.venv + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Test and coverage +.pytest_cache/ +.coverage +htmlcov/ + +# Output +transcriptions/ +output/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..69f0a5b --- /dev/null +++ b/README.md @@ -0,0 +1,258 @@ +# Whisper Transcription TUI + +A modern terminal user interface (TUI) application for transcribing audio and video files using OpenAI's Whisper, with support for both cloud-based API and local processing. 
+ +## Features + +- **Dual Transcription Methods**: + - OpenAI Whisper API (fast, cloud-based, paid per minute) + - Local Whisper (free, offline, slower) +- **Flexible File Selection**: Browse and select single or multiple audio/video files +- **Persistent Configuration**: Remember API keys and output directory between sessions +- **Beautiful TUI**: Modern terminal interface similar to OpenCode +- **Markdown Output**: Transcriptions saved as markdown with metadata headers +- **Multi-file Processing**: Batch transcribe multiple files sequentially + +## Supported Formats + +### Audio +MP3, WAV, M4A, FLAC, OGG, WMA, AAC + +### Video +MP4, AVI, MKV, MOV, WebM, FLV, WMV, M4V + +## Installation + +### Prerequisites + +- Python 3.8 or higher +- FFmpeg (for audio/video processing) + +### FFmpeg Installation + +#### Windows +```bash +# Using Chocolatey +choco install ffmpeg + +# Or using Scoop +scoop install ffmpeg + +# Or download from https://ffmpeg.org/download.html +``` + +#### macOS +```bash +brew install ffmpeg +``` + +#### Linux +```bash +# Ubuntu/Debian +sudo apt-get install ffmpeg + +# Fedora +sudo dnf install ffmpeg + +# Arch +sudo pacman -S ffmpeg +``` + +### Python Package Installation + +1. Clone the repository: +```bash +git clone +cd 00_Whisper +``` + +2. Create and activate a virtual environment: +```bash +# Windows +python -m venv venv +venv\Scripts\activate + +# macOS/Linux +python3 -m venv venv +source venv/bin/activate +``` + +3. Install dependencies: +```bash +pip install -r requirements.txt +``` + +## Configuration + +### OpenAI API Key Setup + +1. Get your API key from [OpenAI Platform](https://platform.openai.com/api-keys) +2. The first time you select the "OpenAI Whisper API" method, you'll be prompted to enter your API key +3. The key will be saved to `.env` file and reused in future sessions + +⚠️ **Important**: Never commit your `.env` file to version control. It's already in `.gitignore`. + +### Local Whisper Model + +You can configure the local Whisper model size by editing your `.env` file: + +```bash +# Options: tiny, base, small, medium, large +# Default: base +WHISPER_MODEL=base +``` + +Larger models are more accurate but require more memory and time. + +## Usage + +### Running the Application + +```bash +python main.py +``` + +### Basic Workflow + +1. **Select Output Directory** (first time only) + - Navigate using arrow keys + - Press "Select" to choose a directory for transcription outputs + +2. **Select Files** + - Navigate the directory tree to find your audio/video files + - Click "Add File" to add files to the queue + - Add multiple files or just one + - Click "Continue" when done + +3. **Choose Transcription Method** + - Select between OpenAI API (fast, paid) or Local Whisper (free, slower) + - If using OpenAI and no key is configured, you'll be prompted to enter it + +4. **View Progress** + - Monitor transcription progress in real-time + - Each file is processed sequentially + +5. **Review Results** + - See list of successfully transcribed files + - Output files are saved as markdown in your configured directory + +### Output Format + +Each transcription is saved as a markdown file with the format: `{filename}_transcription.md` + +```markdown +# Transcription: meeting.mp4 + +**Source File:** meeting.mp4 +**Date:** 2025-01-20T10:30:45.123456 +**Duration:** 1:23:45 +**Language:** en +**Word Count:** 2847 + +--- + +[Transcribed text here...] 
+```
+
+## Keyboard Shortcuts
+
+- **Tab**: Navigate between buttons and widgets
+- **Enter**: Activate selected button
+- **Arrow Keys**: Navigate menus and trees
+- **Ctrl+C**: Exit application at any time
+
+## Troubleshooting
+
+### "API key not found" error
+Make sure you've entered your OpenAI API key when prompted. Check that your `.env` file contains:
+```
+OPENAI_API_KEY=your_key_here
+```
+
+### "File format not supported" error
+Check that the file's extension is in the supported formats list; the app relies on the extension to detect the format.
+
+### Local Whisper is very slow
+This is normal for larger model sizes. Try using the "tiny" or "base" model in your `.env`:
+```
+WHISPER_MODEL=tiny
+```
+
+### FFmpeg not found error
+Ensure FFmpeg is installed and in your system PATH. Test by running:
+```bash
+ffmpeg -version
+```
+
+### Permission denied errors
+Ensure the output directory is writable and the application has permission to create files there.
+
+## Project Structure
+
+```
+.
+├── main.py              # Entry point
+├── requirements.txt     # Python dependencies
+├── .env.example         # Environment template
+├── .gitignore           # Git ignore rules
+├── README.md            # This file
+└── src/
+    ├── __init__.py
+    ├── app.py           # Main TUI application
+    ├── config.py        # Configuration management
+    ├── transcriber.py   # Transcription service
+    ├── formatter.py     # Markdown formatting
+    └── file_handler.py  # File handling utilities
+```
+
+## Development
+
+### Running Tests
+```bash
+# Coming soon
+```
+
+### Building Documentation
+```bash
+# Coming soon
+```
+
+## Performance Tips
+
+1. **Use Local Whisper for Small Files**: Local processing is free and fast for small audio files
+2. **Use OpenAI API for Accuracy**: The cloud version handles edge cases better
+3. **Set Appropriate Model Size**: Larger models are more accurate but slower and use more memory
+4. **Batch Processing**: Process multiple files to amortize setup time
+
+## API Costs
+
+### OpenAI Whisper API
+- Current pricing: $0.006 per minute of audio, rounded to the nearest second
+
+Check [OpenAI Pricing](https://openai.com/pricing/) for current rates.
+
+## Limitations
+
+- Local Whisper models can be large (up to ~3 GB for "large")
+- First-time local Whisper setup downloads the model (about 140 MB for "base"; larger sizes run into the gigabytes)
+- OpenAI API requires internet connection and active API credits
+- Batch processing is sequential (one file at a time)
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit pull requests or open issues.
+
+## License
+
+This project is provided as-is for educational and personal use.
+
+## Support
+
+For issues, questions, or feature requests, please open an issue on GitHub or contact the maintainers.
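+
+## Estimating API Cost
+
+As a rough companion to the API Costs section above, the sketch below uses `ffprobe` (bundled with FFmpeg) to read a file's duration and works out a back-of-envelope charge at the per-minute rate. The figures are illustrative only; the actual charge is whatever OpenAI's current pricing says.
+
+```bash
+# Print duration in seconds (ffprobe ships with FFmpeg)
+ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 meeting.mp4
+
+# Back-of-envelope cost at $0.006 per minute, e.g. for a 5025-second file:
+#   5025 / 60 * 0.006 ≈ $0.50
+```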
+ +## Acknowledgments + +- [OpenAI Whisper](https://github.com/openai/whisper) for transcription technology +- [Textual](https://github.com/textualize/textual) for the TUI framework +- [OpenCode](https://opencode.ai) for UI/UX inspiration diff --git a/main.py b/main.py new file mode 100644 index 0000000..758dc4c --- /dev/null +++ b/main.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +"""Whisper Transcription TUI - Main entry point.""" +import sys +from pathlib import Path + +# Add src to path for imports +sys.path.insert(0, str(Path(__file__).parent / "src")) + +from app import TranscriptionApp + + +def main() -> None: + """Run the transcription application.""" + try: + app = TranscriptionApp() + app.run() + except KeyboardInterrupt: + print("\nApplication interrupted by user.") + sys.exit(0) + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index c4e47c5..aac85a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,17 @@ -# Minimal requirements for Phase 1 MVP -# UI / tray choices are optional at this stage - keep core deps minimal +# TUI Transcription Application Dependencies -# For local whisper (optional when implemented) -# faster-whisper +# Terminal UI Framework +textual>=0.40.0 -# For audio capture (choose one) -# sounddevice -# pyaudio +# OpenAI API and Whisper +openai>=1.0.0 +openai-whisper>=20240314 -# OpenAI API client -openai>=0.27.0 +# Configuration and Environment +python-dotenv>=1.0.0 -# Optional: a GUI toolkit for Phase 2 -# PySide6 +# Utilities and Formatting +rich>=13.0.0 -# For packaging and utilities -rich -python-dotenv +# Audio processing +ffmpeg-python>=0.2.1 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..256ff30 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +"""Whisper Transcription TUI Application""" diff --git a/src/app.py b/src/app.py new file mode 100644 index 0000000..c248a48 --- /dev/null +++ b/src/app.py @@ -0,0 +1,444 @@ +"""Main Textual TUI application for transcription.""" +from pathlib import Path +from typing import Optional + +from textual.app import ComposeResult, App +from textual.containers import Container, Vertical +from textual.screen import Screen, ModalScreen +from textual.widgets import ( + Header, + Footer, + Static, + Input, + Button, + Label, + DirectoryTree, + SelectionList, + Select, + RichLog, +) +from textual.widgets.selection_list import Selection + +from .config import ConfigManager +from .file_handler import FileHandler +from .transcriber import TranscriptionService +from .formatter import MarkdownFormatter + + +class ApiKeyModal(ModalScreen): + """Modal screen for entering OpenAI API key.""" + + def __init__(self, config: ConfigManager): + """Initialize API key modal.""" + super().__init__() + self.config = config + self.api_key: Optional[str] = None + + def compose(self) -> ComposeResult: + """Compose modal widgets.""" + yield Vertical( + Label("OpenAI API Key Required"), + Label("Enter your OpenAI API key (get it from https://platform.openai.com/api-keys):"), + Input(id="api_key_input", password=True), + Container( + Button("Save", id="save_api", variant="primary"), + Button("Cancel", id="cancel_api"), + ), + id="api_key_modal", + ) + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button press.""" + if event.button.id == "save_api": + api_key_input = self.query_one("#api_key_input", Input) + if api_key_input.value.strip(): + 
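+                # Persist the key through ConfigManager (written to .env via python-dotenv) so future sessions reuse it.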
self.api_key = api_key_input.value.strip() + self.config.set_api_key(self.api_key) + self.app.pop_screen() + else: + self.app.notify("API key cannot be empty", timeout=2) + elif event.button.id == "cancel_api": + self.app.pop_screen() + + +class MethodSelectModal(ModalScreen): + """Modal screen for selecting transcription method.""" + + def __init__(self): + """Initialize method selection modal.""" + super().__init__() + self.selected_method: Optional[str] = None + + def compose(self) -> ComposeResult: + """Compose modal widgets.""" + yield Vertical( + Label("Select Transcription Method"), + Select( + options=[ + ("OpenAI Whisper API (fast, costs money)", "openai"), + ("Local Whisper (free, slower)", "local"), + ], + id="method_select", + ), + Container( + Button("Select", id="select_method", variant="primary"), + Button("Cancel", id="cancel_method"), + ), + id="method_modal", + ) + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button press.""" + if event.button.id == "select_method": + select = self.query_one("#method_select", Select) + if select.value != Select.BLANK: + self.selected_method = select.value + self.app.pop_screen() + else: + self.app.notify("Please select a method", timeout=2) + elif event.button.id == "cancel_method": + self.app.pop_screen() + + +class OutputDirModal(ModalScreen): + """Modal screen for selecting output directory.""" + + def __init__(self, config: ConfigManager): + """Initialize output directory modal.""" + super().__init__() + self.config = config + self.selected_dir: Optional[Path] = None + + def compose(self) -> ComposeResult: + """Compose modal widgets.""" + yield Vertical( + Label("Select Output Directory"), + DirectoryTree("/", id="dir_tree"), + Container( + Button("Select", id="select_dir", variant="primary"), + Button("Cancel", id="cancel_dir"), + ), + id="output_dir_modal", + ) + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button press.""" + if event.button.id == "select_dir": + tree = self.query_one("#dir_tree", DirectoryTree) + if tree.cursor_node: + self.selected_dir = Path(tree.cursor_node.data) + is_valid, error = FileHandler.validate_directory(self.selected_dir) + if is_valid: + self.config.set_output_directory(self.selected_dir) + self.app.pop_screen() + else: + self.app.notify(f"Error: {error}", timeout=3) + else: + self.app.notify("Please select a directory", timeout=2) + elif event.button.id == "cancel_dir": + self.app.pop_screen() + + +class FileSelectScreen(Screen): + """Screen for selecting files to transcribe.""" + + def __init__(self, config: ConfigManager): + """Initialize file selection screen.""" + super().__init__() + self.config = config + self.selected_files: list[Path] = [] + + def compose(self) -> ComposeResult: + """Compose screen widgets.""" + yield Header() + yield Vertical( + Label("Select files to transcribe (Supported: MP3, WAV, M4A, FLAC, MP4, AVI, MKV, MOV)"), + DirectoryTree("/", id="file_tree"), + Static(id="file_info"), + Container( + Button("Add File", id="add_file", variant="primary"), + Button("Continue", id="continue_btn", variant="success"), + Button("Cancel", id="cancel_btn", variant="error"), + ), + id="file_select_container", + ) + yield Footer() + + def on_mount(self) -> None: + """Called when screen is mounted.""" + self.query_one("#file_tree", DirectoryTree).focus() + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button press.""" + if event.button.id == "add_file": + tree = self.query_one("#file_tree", DirectoryTree) + 
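+            # cursor_node is the entry currently highlighted in the tree; validate_file rejects missing paths and unsupported extensions.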
if tree.cursor_node: + file_path = Path(tree.cursor_node.data) + is_valid, error = FileHandler.validate_file(file_path) + if is_valid: + if file_path not in self.selected_files: + self.selected_files.append(file_path) + self._update_file_info() + else: + self.app.notify("File already added", timeout=2) + else: + self.app.notify(f"Error: {error}", timeout=2) + else: + self.app.notify("Please select a file", timeout=2) + elif event.button.id == "continue_btn": + if self.selected_files: + self.app.post_message(self.FileSelected(self.selected_files)) + else: + self.app.notify("Please select at least one file", timeout=2) + elif event.button.id == "cancel_btn": + self.app.exit() + + def _update_file_info(self) -> None: + """Update file information display.""" + info = self.query_one("#file_info", Static) + file_list = "\n".join([f"✓ {f.name}" for f in self.selected_files]) + info.update(f"Selected files:\n{file_list}\n\nTotal: {len(self.selected_files)}") + + class FileSelected: + """Message for file selection.""" + + def __init__(self, files: list[Path]): + """Initialize message.""" + self.files = files + + +class ProgressScreen(Screen): + """Screen showing transcription progress.""" + + def __init__(self, files: list[Path], method: str, config: ConfigManager): + """Initialize progress screen.""" + super().__init__() + self.files = files + self.method = method + self.config = config + self.service = TranscriptionService() + self.results: list[tuple[Path, str]] = [] + + def compose(self) -> ComposeResult: + """Compose screen widgets.""" + yield Header() + yield Vertical( + Label("Transcription Progress"), + RichLog(id="progress_log", markup=True), + Container( + Button("View Results", id="view_results", variant="primary"), + Button("Exit", id="exit_btn", variant="error"), + id="progress_controls", + ), + ) + yield Footer() + + def on_mount(self) -> None: + """Called when screen is mounted.""" + self.app.call_later(self._run_transcription) + + def _run_transcription(self) -> None: + """Run transcription on all files.""" + log = self.query_one("#progress_log", RichLog) + + # Setup transcription service + try: + if self.method == "openai": + api_key = self.config.get_api_key() + if not api_key: + log.write("[red]Error: No API key configured[/red]") + return + self.service.set_openai_backend(api_key) + log.write("[green]Using OpenAI Whisper API[/green]") + else: + model_size = self.config.get_whisper_model() + self.service.set_local_backend(model_size) + log.write(f"[green]Using Local Whisper ({model_size})[/green]") + except Exception as e: + log.write(f"[red]Error initializing backend: {str(e)}[/red]") + return + + output_dir = self.config.get_output_directory() + if not output_dir: + log.write("[red]Error: Output directory not configured[/red]") + return + + # Process each file + for i, file_path in enumerate(self.files, 1): + log.write(f"\n[yellow]Processing {i}/{len(self.files)}: {file_path.name}[/yellow]") + + try: + # Transcribe + result = self.service.transcribe(file_path) + log.write(f"[green]✓ Transcribed[/green]") + + # Format as markdown + markdown = MarkdownFormatter.format_transcription( + result["text"], + file_path, + result.get("duration", 0.0), + result.get("language", "en"), + ) + + # Save to file + output_filename = MarkdownFormatter.get_output_filename(file_path) + output_path = FileHandler.get_output_path(file_path, output_dir, output_filename) + output_path.write_text(markdown, encoding="utf-8") + log.write(f"[green]✓ Saved to {output_path.name}[/green]") + + 
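+                # Collect the (source, output) pair so ResultsScreen can list what was written.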
self.results.append((file_path, str(output_path))) + except Exception as e: + log.write(f"[red]✗ Error: {str(e)}[/red]") + + log.write(f"\n[cyan]Completed {len(self.results)}/{len(self.files)} files[/cyan]") + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button press.""" + if event.button.id == "view_results": + self.app.push_screen(ResultsScreen(self.results)) + elif event.button.id == "exit_btn": + self.app.exit() + + +class ResultsScreen(Screen): + """Screen displaying transcription results.""" + + def __init__(self, results: list[tuple[Path, str]]): + """Initialize results screen.""" + super().__init__() + self.results = results + + def compose(self) -> ComposeResult: + """Compose screen widgets.""" + yield Header() + yield Vertical( + Label("Transcription Results"), + RichLog(id="results_log", markup=True), + Container( + Button("Back", id="back_btn"), + Button("Exit", id="exit_btn", variant="error"), + ), + ) + yield Footer() + + def on_mount(self) -> None: + """Called when screen is mounted.""" + log = self.query_one("#results_log", RichLog) + log.write("[cyan]Transcription Results[/cyan]\n") + for source, output in self.results: + log.write(f"[green]✓[/green] {source.name}") + log.write(f" → {Path(output).name}\n") + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button press.""" + if event.button.id == "back_btn": + self.app.pop_screen() + elif event.button.id == "exit_btn": + self.app.exit() + + +class TranscriptionApp(App): + """Main transcription application.""" + + CSS = """ + Screen { + layout: vertical; + } + + #api_key_modal { + width: 60; + height: 12; + border: solid green; + } + + #method_modal { + width: 50; + height: 10; + border: solid blue; + } + + #output_dir_modal { + width: 80; + height: 20; + border: solid purple; + } + + #file_select_container { + width: 100%; + height: 100%; + } + + DirectoryTree { + width: 1fr; + height: 1fr; + } + + #file_info { + width: 100%; + height: auto; + border: solid $accent; + padding: 1; + } + + #progress_log { + width: 100%; + height: 1fr; + border: solid $accent; + padding: 1; + } + + Container { + height: auto; + margin: 1; + } + + Button { + margin-right: 1; + } + + Label { + margin-bottom: 1; + } + """ + + def __init__(self): + """Initialize application.""" + super().__init__() + self.config = ConfigManager() + + def on_mount(self) -> None: + """Called when app is mounted.""" + self.title = "Whisper Transcription TUI" + self._check_setup() + + def _check_setup(self) -> None: + """Check if setup is needed.""" + if not self.config.output_directory_configured(): + self.push_screen_wait(OutputDirModal(self.config), self._output_dir_set) + else: + self.push_screen(FileSelectScreen(self.config)) + + def _output_dir_set(self) -> None: + """Called when output directory is set.""" + self.push_screen(FileSelectScreen(self.config)) + + def on_file_select_screen_file_selected(self, message: FileSelectScreen.FileSelected) -> None: + """Handle file selection.""" + self.push_screen_wait(MethodSelectModal(), self._method_selected(message.files)) + + def _method_selected(self, files: list[Path]): + """Return handler for method selection.""" + def handler(modal: MethodSelectModal) -> None: + if modal.selected_method: + if modal.selected_method == "openai": + if not self.config.api_key_configured(): + self.push_screen_wait(ApiKeyModal(self.config), lambda: self._start_transcription(files, "openai")) + else: + self._start_transcription(files, "openai") + else: + 
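+                    # Local Whisper needs no API key; the model size comes from WHISPER_MODEL in .env (default "base").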
self._start_transcription(files, "local") + return handler + + def _start_transcription(self, files: list[Path], method: str) -> None: + """Start transcription process.""" + self.push_screen(ProgressScreen(files, method, self.config)) diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..ca29983 --- /dev/null +++ b/src/config.py @@ -0,0 +1,93 @@ +"""Configuration management for API keys and output directory.""" +import json +import os +from pathlib import Path +from typing import Optional + +from dotenv import load_dotenv, set_key + + +class ConfigManager: + """Manages configuration including API keys and output directory preferences.""" + + def __init__(self): + """Initialize config manager and load existing configuration.""" + self.env_path = Path(".env") + self.config_path = Path("config.json") + self.config_data: dict = {} + load_dotenv() + self._load_config() + + def _load_config(self) -> None: + """Load configuration from config.json if it exists.""" + if self.config_path.exists(): + try: + with open(self.config_path, "r") as f: + self.config_data = json.load(f) + except (json.JSONDecodeError, IOError): + self.config_data = {} + + def _save_config(self) -> None: + """Save configuration to config.json.""" + with open(self.config_path, "w") as f: + json.dump(self.config_data, f, indent=2) + + def get_api_key(self) -> Optional[str]: + """ + Get OpenAI API key from environment. + + Returns: + The API key if set, None otherwise. + """ + return os.getenv("OPENAI_API_KEY") + + def set_api_key(self, api_key: str) -> None: + """ + Save API key to .env file. + + Args: + api_key: The API key to save. + """ + if not self.env_path.exists(): + self.env_path.write_text("# OpenAI Configuration\n") + set_key(str(self.env_path), "OPENAI_API_KEY", api_key) + os.environ["OPENAI_API_KEY"] = api_key + + def get_output_directory(self) -> Optional[Path]: + """ + Get the configured output directory. + + Returns: + Path to output directory if configured, None otherwise. + """ + output_dir = self.config_data.get("output_directory") + if output_dir: + return Path(output_dir) + return None + + def set_output_directory(self, directory: Path) -> None: + """ + Save output directory preference to config. + + Args: + directory: The output directory path. + """ + self.config_data["output_directory"] = str(directory.resolve()) + self._save_config() + + def get_whisper_model(self) -> str: + """ + Get the Whisper model size from environment. + + Returns: + Model size (default: "base"). 
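+            Larger sizes ("small", "medium", "large") are generally more accurate but slower and heavier on memory.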
+ """ + return os.getenv("WHISPER_MODEL", "base") + + def api_key_configured(self) -> bool: + """Check if API key is configured.""" + return bool(self.get_api_key()) + + def output_directory_configured(self) -> bool: + """Check if output directory is configured.""" + return bool(self.get_output_directory()) diff --git a/src/file_handler.py b/src/file_handler.py new file mode 100644 index 0000000..5761c00 --- /dev/null +++ b/src/file_handler.py @@ -0,0 +1,121 @@ +"""File handling and validation for audio/video files.""" +from pathlib import Path +from typing import List + +SUPPORTED_AUDIO_FORMATS = { + ".mp3", ".wav", ".m4a", ".flac", ".ogg", ".wma", ".aac", +} + +SUPPORTED_VIDEO_FORMATS = { + ".mp4", ".avi", ".mkv", ".mov", ".webm", ".flv", ".wmv", ".m4v", +} + +SUPPORTED_FORMATS = SUPPORTED_AUDIO_FORMATS | SUPPORTED_VIDEO_FORMATS + + +class FileHandler: + """Handles file operations for transcription.""" + + @staticmethod + def is_supported_file(file_path: Path) -> bool: + """ + Check if file is a supported audio/video format. + + Args: + file_path: Path to file. + + Returns: + True if file is supported, False otherwise. + """ + return file_path.suffix.lower() in SUPPORTED_FORMATS + + @staticmethod + def validate_file(file_path: Path) -> tuple[bool, str]: + """ + Validate that file exists and is supported. + + Args: + file_path: Path to file. + + Returns: + Tuple of (is_valid, error_message). + """ + if not file_path.exists(): + return False, f"File not found: {file_path}" + + if not file_path.is_file(): + return False, f"Path is not a file: {file_path}" + + if not FileHandler.is_supported_file(file_path): + return False, f"Unsupported format: {file_path.suffix}" + + return True, "" + + @staticmethod + def validate_directory(directory: Path) -> tuple[bool, str]: + """ + Validate that directory exists and is writable. + + Args: + directory: Path to directory. + + Returns: + Tuple of (is_valid, error_message). + """ + try: + directory.mkdir(parents=True, exist_ok=True) + test_file = directory / ".write_test" + test_file.write_text("test") + test_file.unlink() + return True, "" + except Exception as e: + return False, f"Directory not writable: {str(e)}" + + @staticmethod + def get_output_path(file_path: Path, output_dir: Path, filename: str) -> Path: + """ + Get the output path for a transcription. + + Args: + file_path: Source audio/video file. + output_dir: Output directory. + filename: Output filename. + + Returns: + Full output path. + """ + output_path = output_dir / filename + counter = 1 + base_stem = output_path.stem + suffix = output_path.suffix + + while output_path.exists(): + output_path = output_dir / f"{base_stem}_{counter}{suffix}" + counter += 1 + + return output_path + + @staticmethod + def get_supported_formats_display() -> str: + """ + Get display string of supported formats. + + Returns: + Formatted string listing supported formats. + """ + audio = ", ".join(sorted(SUPPORTED_AUDIO_FORMATS)) + video = ", ".join(sorted(SUPPORTED_VIDEO_FORMATS)) + return f"Audio: {audio}\nVideo: {video}" + + @staticmethod + def filter_supported_files(paths: List[Path]) -> List[Path]: + """ + Filter a list of paths to only supported files. + + Args: + paths: List of file paths. + + Returns: + Filtered list of supported files. 
+ """ + return [p for p in paths if p.is_file() and FileHandler.is_supported_file(p)] diff --git a/src/formatter.py b/src/formatter.py new file mode 100644 index 0000000..4ee4303 --- /dev/null +++ b/src/formatter.py @@ -0,0 +1,79 @@ +"""Markdown formatting for transcriptions.""" +from datetime import datetime +from pathlib import Path + + +class MarkdownFormatter: + """Formats transcriptions as markdown with metadata headers.""" + + @staticmethod + def format_transcription( + text: str, + source_file: Path, + duration: float = 0.0, + language: str = "en", + ) -> str: + """ + Format transcription as markdown with metadata. + + Args: + text: The transcription text. + source_file: Path to the source audio/video file. + duration: Duration of the audio in seconds. + language: Language code of the transcription. + + Returns: + Formatted markdown string. + """ + timestamp = datetime.now().isoformat() + word_count = len(text.split()) + + markdown = f"""# Transcription: {source_file.name} + +**Source File:** {source_file.name} +**Date:** {timestamp} +**Duration:** {MarkdownFormatter._format_duration(duration)} +**Language:** {language} +**Word Count:** {word_count} + +--- + +{text} + +""" + return markdown + + @staticmethod + def _format_duration(seconds: float) -> str: + """ + Format duration in seconds to human-readable format. + + Args: + seconds: Duration in seconds. + + Returns: + Formatted duration string (e.g., "1:23:45"). + """ + if seconds <= 0: + return "Unknown" + + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + + if hours > 0: + return f"{hours}:{minutes:02d}:{secs:02d}" + return f"{minutes}:{secs:02d}" + + @staticmethod + def get_output_filename(source_file: Path) -> str: + """ + Generate markdown filename from source file. + + Args: + source_file: Path to the source audio/video file. + + Returns: + Filename with .md extension. + """ + return f"{source_file.stem}_transcription.md" diff --git a/src/transcriber.py b/src/transcriber.py new file mode 100644 index 0000000..a0a6792 --- /dev/null +++ b/src/transcriber.py @@ -0,0 +1,141 @@ +"""Transcription service supporting both OpenAI API and local Whisper.""" +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional + +import whisper +from openai import OpenAI + + +class TranscriberBackend(ABC): + """Abstract base class for transcription backends.""" + + @abstractmethod + def transcribe(self, file_path: Path) -> dict: + """ + Transcribe audio file. + + Args: + file_path: Path to audio/video file. + + Returns: + Dictionary with 'text' and optional 'language' and 'duration' keys. + """ + pass + + +class OpenAITranscriber(TranscriberBackend): + """OpenAI Whisper API transcriber.""" + + def __init__(self, api_key: str): + """ + Initialize OpenAI transcriber. + + Args: + api_key: OpenAI API key. + """ + self.client = OpenAI(api_key=api_key) + + def transcribe(self, file_path: Path) -> dict: + """ + Transcribe using OpenAI API. + + Args: + file_path: Path to audio/video file. + + Returns: + Dictionary with transcription result. 
+ """ + with open(file_path, "rb") as audio_file: + transcript = self.client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="verbose_json", + language="en", + ) + + return { + "text": transcript.text, + "language": getattr(transcript, "language", "en"), + "duration": getattr(transcript, "duration", 0.0), + } + + +class LocalWhisperTranscriber(TranscriberBackend): + """Local Whisper model transcriber.""" + + def __init__(self, model_size: str = "base"): + """ + Initialize local Whisper transcriber. + + Args: + model_size: Size of the Whisper model (tiny, base, small, medium, large). + """ + self.model_size = model_size + self.model = whisper.load_model(model_size) + + def transcribe(self, file_path: Path) -> dict: + """ + Transcribe using local Whisper model. + + Args: + file_path: Path to audio/video file. + + Returns: + Dictionary with transcription result. + """ + result = self.model.transcribe( + str(file_path), + language="en", + fp16=False, + ) + + return { + "text": result["text"], + "language": result.get("language", "en"), + "duration": result.get("duration", 0.0), + } + + +class TranscriptionService: + """Main transcription service that manages both backends.""" + + def __init__(self): + """Initialize transcription service.""" + self.transcriber: Optional[TranscriberBackend] = None + + def set_openai_backend(self, api_key: str) -> None: + """ + Set OpenAI as the transcription backend. + + Args: + api_key: OpenAI API key. + """ + self.transcriber = OpenAITranscriber(api_key) + + def set_local_backend(self, model_size: str = "base") -> None: + """ + Set local Whisper as the transcription backend. + + Args: + model_size: Size of the Whisper model. + """ + self.transcriber = LocalWhisperTranscriber(model_size) + + def transcribe(self, file_path: Path) -> dict: + """ + Transcribe audio file using configured backend. + + Args: + file_path: Path to audio/video file. + + Returns: + Dictionary with transcription result. + + Raises: + RuntimeError: If no backend is configured. + """ + if not self.transcriber: + raise RuntimeError("No transcription backend configured") + + return self.transcriber.transcribe(file_path)