v1

2025-08-23 16:38:01 +02:00 · 2025-08-23 16:38:01 +02:00 · 5eb977dee6
commit 5eb977dee6
20 changed files with 591 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,25 @@
 .env
 venv/
 __pycache__/
 *.pyc
 *.pyo
 *.pyd
 .Python
 env/
 pip-log.txt
 pip-delete-this-directory.txt
 .tox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.log
 .git
 .mypy_cache
 .pytest_cache
 .hypothesis
 *.db
 *.sqlite
 *.sqlite3
--- a/main.py
+++ b/main.py
@ -0,0 +1,109 @@
 import os
 from dotenv import load_dotenv
 from src.database.manager import setup_database_tables
 from src.database.video_operations import save_videos_to_database, save_video_features_to_database, get_unrated_videos_from_database
 from src.database.preference_operations import save_video_rating_to_database, get_training_data_from_database, get_unrated_videos_with_features_from_database, get_rated_count_from_database
 from src.youtube.search import search_youtube_videos_by_query, get_coding_search_queries
 from src.youtube.details import get_video_details_from_youtube
 from src.youtube.utils import remove_duplicate_videos
 from src.ml.feature_extraction import extract_all_features_from_video
 from src.ml.model_training import create_recommendation_model, train_model_on_user_preferences
 from src.ml.predictions import predict_video_preferences_with_model
 from src.rating.display import display_video_information_for_rating, display_rating_session_header, display_session_type_message
 from src.rating.user_input import get_user_rating_response, get_user_notes_for_rating
 from src.rating.session import process_user_rating_for_video, should_continue_rating_session, has_videos_to_rate
 # Runs at import time, before main() looks up YOUTUBE_API_KEY with os.getenv.
 # NOTE(review): presumably this loads variables from a local .env file (see the
 # hint printed by main()) — confirm against the python-dotenv documentation.
 load_dotenv()
 class VideoInspirationFinderApp:
    """Console application that gathers coding videos, collects ratings and trains a recommender.

    State kept on the instance: the YouTube API key, the SQLite database path,
    the recommendation model (``None`` until first needed) and a flag recording
    whether that model has been trained successfully.
    """
    def __init__(self, api_key: str):
        """Remember the API key and create the database tables if they are missing.

        Args:
            api_key: Key handed to the YouTube search/details helpers.
        """
        self.api_key = api_key
        self.db_path = "video_inspiration.db"
        self.model = None
        self.model_trained = False
        setup_database_tables(self.db_path)
    def search_and_save_coding_videos(self, max_queries: int = 5, results_per_query: int = 10):
        """Search YouTube, then store the de-duplicated videos and their features.

        Args:
            max_queries: How many of the predefined search queries to run.
            results_per_query: Number of results requested for each query.
        """
        print("🔍 Searching for coding videos...")
        all_videos = []
        for query in get_coding_search_queries()[:max_queries]:
            video_ids = search_youtube_videos_by_query(self.api_key, query, results_per_query)
            all_videos.extend(get_video_details_from_youtube(self.api_key, video_ids))
        unique_videos = remove_duplicate_videos(all_videos)
        save_videos_to_database(unique_videos, self.db_path)
        for video in unique_videos:
            features = extract_all_features_from_video(video)
            save_video_features_to_database(video['id'], features, self.db_path)
        print(f"Found and saved {len(unique_videos)} videos")
    def start_interactive_rating_session(self):
        """Show videos one by one and record ratings until the user quits or none remain."""
        display_rating_session_header()
        # Defined once for the whole session; it only closes over self.db_path.
        def save_rating(video_id, liked, notes):
            save_video_rating_to_database(video_id, liked, notes, self.db_path)
        while True:
            videos = self._get_videos_for_rating()
            rated_count = get_rated_count_from_database(self.db_path)
            session_message = display_session_type_message(self.model_trained, rated_count)
            print(f"\n{session_message}")
            if not has_videos_to_rate(videos):
                print("No more videos to rate!")
                break
            for video in videos:
                display_video_information_for_rating(video)
                response = get_user_rating_response()
                if not should_continue_rating_session(response):
                    return
                process_user_rating_for_video(video, response, save_rating, get_user_notes_for_rating)
                # Attempt training after every rating so the next batch can be model-ranked.
                self._try_train_model()
    def _get_videos_for_rating(self):
        """Return model-ranked recommendations once trained, otherwise plain unrated videos."""
        # Compare with None instead of truth-testing: bool() on an estimator may
        # invoke its __len__, which for scikit-learn ensembles depends on fitted state.
        if self.model_trained and self.model is not None:
            video_features = get_unrated_videos_with_features_from_database(self.db_path)
            return predict_video_preferences_with_model(self.model, video_features)
        return get_unrated_videos_from_database(10, self.db_path)
    def _try_train_model(self):
        """Train the model on the stored ratings unless it has already been trained."""
        if self.model_trained:
            return
        # `not self.model` would truth-test an unfitted estimator created by an
        # earlier, unsuccessful attempt; an identity check avoids touching it.
        if self.model is None:
            self.model = create_recommendation_model()
        training_data = get_training_data_from_database(self.db_path)
        if train_model_on_user_preferences(self.model, training_data):
            self.model_trained = True
 def main():
    """Read the YouTube API key from the environment and run the search and rating flow.

    Prints a two-line hint and returns without doing anything else when the
    key is missing or empty.
    """
    api_key = os.getenv('YOUTUBE_API_KEY')
    if api_key:
        app = VideoInspirationFinderApp(api_key)
        app.search_and_save_coding_videos()
        app.start_interactive_rating_session()
    else:
        print("Error: YOUTUBE_API_KEY not found in environment variables")
        print("Please create a .env file with your YouTube API key")
 # Start the command-line workflow only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
--- a/setup.sh
+++ b/setup.sh
@ -0,0 +1,23 @@
 #!/bin/bash
 # Bootstrap script: creates a local virtual environment, installs the Python
 # dependencies into it and then launches main.py.
 # Stop at the first failing command so a broken venv or a failed install never
 # falls through to "Setup complete" and a run of main.py that cannot work.
 set -e
 echo "🔧 Setting up Video Inspiration Finder..."
 # Pick an interpreter for creating the venv; some systems only ship python3
 if ! PYTHON_BIN="$(command -v python3 || command -v python)"; then
    echo "❌ No Python interpreter found on PATH" >&2
    exit 1
 fi
 # Create virtual environment if it doesn't exist
 if [ ! -d "venv" ]; then
    echo "📦 Creating virtual environment..."
    "$PYTHON_BIN" -m venv venv
 fi
 # Activate virtual environment
 echo "🔄 Activating virtual environment..."
 source venv/bin/activate
 # Install dependencies (via the venv's interpreter so they land inside venv/)
 echo "📚 Installing dependencies..."
 python -m pip install requests pandas scikit-learn numpy python-dotenv
 echo "✅ Setup complete!"
 echo "🚀 Running Video Inspiration Finder..."
 # Run the main script
 python main.py
--- a/src/init.py
+++ b/src/init.py
@ -0,0 +1 @@
 # Video Inspiration Finder package
--- a/src/database/init.py
+++ b/src/database/init.py
@ -0,0 +1 @@
 # Database operations package
--- a/src/database/manager.py
+++ b/src/database/manager.py
@ -0,0 +1,56 @@
 import sqlite3
 from datetime import datetime
 from typing import List, Dict
 def setup_database_tables(db_path: str):
    """Create the ``videos``, ``preferences`` and ``video_features`` tables if absent.

    Safe to call repeatedly: every statement uses ``CREATE TABLE IF NOT EXISTS``.

    Args:
        db_path: Path of the SQLite database file to open.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS videos (
                id TEXT PRIMARY KEY,
                title TEXT,
                description TEXT,
                view_count INTEGER,
                like_count INTEGER,
                comment_count INTEGER,
                duration TEXT,
                published_at TEXT,
                channel_name TEXT,
                thumbnail_url TEXT,
                tags TEXT,
                category_id INTEGER,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS preferences (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                video_id TEXT,
                liked BOOLEAN,
                notes TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (video_id) REFERENCES videos (id)
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS video_features (
                video_id TEXT PRIMARY KEY,
                title_length INTEGER,
                description_length INTEGER,
                view_like_ratio REAL,
                engagement_score REAL,
                title_sentiment REAL,
                has_tutorial_keywords BOOLEAN,
                has_time_constraint BOOLEAN,
                has_beginner_keywords BOOLEAN,
                has_ai_keywords BOOLEAN,
                has_challenge_keywords BOOLEAN,
                FOREIGN KEY (video_id) REFERENCES videos (id)
            )
        ''')
        conn.commit()
    finally:
        # Release the connection even when one of the statements fails.
        conn.close()
--- a/src/database/preference_operations.py
+++ b/src/database/preference_operations.py
@ -0,0 +1,46 @@
 import sqlite3
 import pandas as pd
 def save_video_rating_to_database(video_id: str, liked: bool, notes: str, db_path: str):
    """Insert one rating row into the ``preferences`` table.

    Args:
        video_id: Identifier of the rated video.
        liked: Whether the user liked the video.
        notes: Free-text comment stored with the rating.
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            'INSERT INTO preferences (video_id, liked, notes) VALUES (?, ?, ?)',
            (video_id, liked, notes),
        )
        conn.commit()
    finally:
        # Close the connection even if the insert raises.
        conn.close()
 def get_training_data_from_database(db_path: str) -> pd.DataFrame:
    """Load the feature rows of rated videos together with their ``liked`` label.

    The inner join yields one row per rating that has matching features.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        DataFrame with every ``video_features`` column plus ``liked``.
    """
    query = '''
        SELECT vf.*, p.liked
        FROM video_features vf
        JOIN preferences p ON vf.video_id = p.video_id
    '''
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query fails.
        conn.close()
 def get_unrated_videos_with_features_from_database(db_path: str) -> pd.DataFrame:
    """Load videos that have features but no rating yet, most viewed first.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        DataFrame combining the ``videos`` and ``video_features`` columns.
    """
    query = '''
        SELECT v.*, vf.*
        FROM videos v
        JOIN video_features vf ON v.id = vf.video_id
        LEFT JOIN preferences p ON v.id = p.video_id
        WHERE p.video_id IS NULL
        ORDER BY v.view_count DESC
    '''
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query fails.
        conn.close()
 def get_rated_count_from_database(db_path: str) -> int:
    """Return the number of rows stored in the ``preferences`` table.

    Args:
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute("SELECT COUNT(*) FROM preferences").fetchone()[0]
    finally:
        # Close the connection even if the query fails.
        conn.close()
--- a/src/database/video_operations.py
+++ b/src/database/video_operations.py
@ -0,0 +1,58 @@
 import sqlite3
 from datetime import datetime
 from typing import List, Dict, Tuple
 def save_videos_to_database(videos: List[Dict], db_path: str):
    """Insert or replace the given videos in the ``videos`` table.

    Args:
        videos: Video dicts carrying the keys listed in ``columns`` below.
        db_path: Path of the SQLite database file.

    Raises:
        KeyError: If a video dict lacks one of the required keys (raised
            before the database is opened).
    """
    columns = (
        'id', 'title', 'description', 'view_count', 'like_count', 'comment_count',
        'duration', 'published_at', 'channel_name', 'thumbnail_url', 'tags', 'category_id',
    )
    saved_at = datetime.now().isoformat()
    rows = [tuple(video[column] for column in columns) + (saved_at,) for video in videos]
    # Naming the columns keeps the insert correct even if the table layout changes.
    statement = (
        f"INSERT OR REPLACE INTO videos ({', '.join(columns)}, created_at) "
        f"VALUES ({', '.join('?' * (len(columns) + 1))})"
    )
    conn = sqlite3.connect(db_path)
    try:
        conn.executemany(statement, rows)
        conn.commit()
    finally:
        # Close the connection even if an insert fails.
        conn.close()
 def save_video_features_to_database(video_id: str, features: Tuple, db_path: str):
    """Insert or replace the feature row of one video.

    Args:
        video_id: Identifier of the video the features belong to.
        features: Ten values bound positionally to the columns named in the
            statement below, i.e. in the order the ``video_features`` table
            declares them after ``video_id``.
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Spelling out the columns documents the positional contract of `features`.
        conn.execute('''
            INSERT OR REPLACE INTO video_features (
                video_id, title_length, description_length, view_like_ratio,
                engagement_score, title_sentiment, has_tutorial_keywords,
                has_time_constraint, has_beginner_keywords, has_ai_keywords,
                has_challenge_keywords
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (video_id,) + tuple(features))
        conn.commit()
    finally:
        # Close the connection even if the insert fails.
        conn.close()
 def get_unrated_videos_from_database(limit: int, db_path: str) -> List[Dict]:
    """Return up to ``limit`` videos without a rating, most viewed first.

    Args:
        limit: Maximum number of videos to return.
        db_path: Path of the SQLite database file.

    Returns:
        Dicts with the keys ``id``, ``title``, ``channel_name``, ``view_count``
        and ``url``.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Select the needed columns by name rather than relying on the
        # positions that `SELECT v.*` happens to produce.
        rows = conn.execute('''
            SELECT v.id, v.title, v.channel_name, v.view_count
            FROM videos v
            LEFT JOIN preferences p ON v.id = p.video_id
            WHERE p.video_id IS NULL
            ORDER BY v.view_count DESC
            LIMIT ?
        ''', (limit,)).fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()
    return [
        {
            'id': video_id,
            'title': title,
            'channel_name': channel_name,
            'view_count': view_count,
            'url': f"https://www.youtube.com/watch?v={video_id}"
        }
        for video_id, title, channel_name, view_count in rows
    ]
--- a/src/ml/init.py
+++ b/src/ml/init.py
@ -0,0 +1 @@
 # Machine learning operations package
--- a/src/ml/feature_extraction.py
+++ b/src/ml/feature_extraction.py
@ -0,0 +1,42 @@
 from typing import Dict, Tuple
 def calculate_basic_video_metrics(video: Dict) -> Tuple:
    """Compute length and engagement numbers for one video.

    Returns:
        ``(title_length, description_length, view_like_ratio, engagement_score)``;
        both ratios divide by the view count, floored at 1 to avoid dividing
        by zero.
    """
    title_chars = len(video['title'])
    description_chars = len(video['description'])
    likes = video['like_count']
    views = max(video['view_count'], 1)
    likes_per_view = likes / views
    interactions_per_view = (likes + video['comment_count']) / views
    return (title_chars, description_chars, likes_per_view, interactions_per_view)
 def detect_keyword_features_in_video(title: str, description: str) -> Tuple:
    """Flag which keyword groups occur in a video's title and description.

    Matching is case-sensitive; the caller in this module passes lower-cased
    text. Time-constraint and challenge keywords are looked up in the title
    only, the other groups in title and description.

    Args:
        title: Video title.
        description: Video description.

    Returns:
        ``(has_tutorial, has_time_constraint, has_beginner, has_ai, has_challenge)``
        as booleans.
    """
    import re  # local import: this module's header does not import re
    tutorial_keywords = ['tutorial', 'learn', 'course', 'guide', 'how to']
    time_keywords = ['24 hours', '1 day', '1 hour', 'minutes', 'seconds', 'crash course']
    beginner_keywords = ['beginner', 'start', 'basics', 'introduction', 'getting started']
    ai_phrases = ['artificial intelligence', 'machine learning', 'neural network']
    challenge_keywords = ['challenge', 'build', 'create', 'project', 'coding']
    def mentions(text, keywords):
        return any(kw in text for kw in keywords)
    # "ai" must be a standalone word: as a plain substring it also matches
    # "train", "main", "explained", ... and would flag nearly every video.
    ai_word = re.compile(r'\bai\b')
    has_tutorial = mentions(title, tutorial_keywords) or mentions(description, tutorial_keywords)
    has_time_constraint = mentions(title, time_keywords)
    has_beginner = mentions(title, beginner_keywords) or mentions(description, beginner_keywords)
    has_ai = any(
        ai_word.search(text) is not None or mentions(text, ai_phrases)
        for text in (title, description)
    )
    has_challenge = mentions(title, challenge_keywords)
    return (has_tutorial, has_time_constraint, has_beginner, has_ai, has_challenge)
 def calculate_title_sentiment_score(title: str) -> float:
    """Score a title as (positive words found) minus (negative words found).

    Each listed word counts at most once and is matched as a case-sensitive
    substring of ``title``.
    """
    upbeat = ('amazing', 'best', 'awesome', 'great', 'perfect', 'love', 'incredible')
    gloomy = ('hard', 'difficult', 'impossible', 'failed', 'broke', 'wrong')
    score = 0
    for term in upbeat:
        if term in title:
            score += 1
    for term in gloomy:
        if term in title:
            score -= 1
    return score
 def extract_all_features_from_video(video: Dict) -> Tuple:
    """Build the ten-value feature tuple for one video.

    The tuple follows the column order of the ``video_features`` table after
    ``video_id``: title_length, description_length, view_like_ratio,
    engagement_score, title_sentiment, then the five keyword flags. The
    storage layer binds these values positionally, so the sentiment score has
    to sit directly after the basic metrics rather than at the end.

    Args:
        video: Video dict with at least ``title``, ``description``,
            ``view_count``, ``like_count`` and ``comment_count``.
    """
    # Keyword and sentiment checks run on lower-cased text.
    title = video['title'].lower()
    description = video['description'].lower()
    basic_metrics = calculate_basic_video_metrics(video)
    keyword_features = detect_keyword_features_in_video(title, description)
    sentiment_score = calculate_title_sentiment_score(title)
    return basic_metrics + (sentiment_score,) + keyword_features
--- a/src/ml/model_training.py
+++ b/src/ml/model_training.py
@ -0,0 +1,23 @@
 from sklearn.ensemble import RandomForestClassifier
 import pandas as pd
 def create_recommendation_model():
    """Build an untrained random-forest classifier with 100 trees and a fixed seed of 42."""
    forest = RandomForestClassifier(random_state=42, n_estimators=100)
    return forest
 def train_model_on_user_preferences(model, training_data: pd.DataFrame, min_samples: int = 10) -> bool:
    """Fit ``model`` on the rated videos when there is enough usable data.

    Args:
        model: Estimator exposing ``fit(X, y)``.
        training_data: Frame holding the feature columns listed below plus a
            ``liked`` label column.
        min_samples: Minimum number of rated videos required before training.

    Returns:
        ``True`` if the model was fitted, ``False`` if training was skipped.
    """
    if len(training_data) < min_samples:
        print(f"Need at least {min_samples} rated videos to train model")
        return False
    # A model fitted on one class only cannot report a separate "liked"
    # probability, which the prediction step relies on.
    if training_data['liked'].nunique() < 2:
        print("Need both liked and disliked videos to train model")
        return False
    feature_columns = [
        'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
        'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
        'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
    ]
    X = training_data[feature_columns]
    y = training_data['liked']
    model.fit(X, y)
    print(f"Model trained on {len(training_data)} rated videos")
    return True
--- a/src/ml/predictions.py
+++ b/src/ml/predictions.py
@ -0,0 +1,33 @@
 from typing import List, Dict
 import pandas as pd
 def predict_video_preferences_with_model(model, video_features: pd.DataFrame, top_n: int = 10) -> List[Dict]:
    """Rank unrated videos by the model's predicted probability of being liked.

    Args:
        model: Fitted classifier exposing ``predict_proba`` (and normally
            ``classes_``).
        video_features: Frame holding the feature columns listed below plus
            ``id``, ``title``, ``channel_name`` and ``view_count``.
        top_n: Maximum number of recommendations to return.

    Returns:
        Up to ``top_n`` dicts ordered by descending ``like_probability``.
    """
    if video_features.empty:
        return []
    feature_columns = [
        'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
        'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
        'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
    ]
    X = video_features[feature_columns]
    class_probabilities = model.predict_proba(X)
    # Find the "liked" column through classes_ rather than assuming index 1:
    # a model fitted on a single class yields only one probability column.
    classes = list(getattr(model, 'classes_', (0, 1)))
    if 1 in classes:
        probabilities = class_probabilities[:, classes.index(1)]
    else:
        probabilities = [0.0] * len(video_features)
    video_features_copy = video_features.copy()
    video_features_copy['like_probability'] = probabilities
    top_videos = video_features_copy.nlargest(top_n, 'like_probability')
    recommendations = []
    for _, row in top_videos.iterrows():
        recommendations.append({
            'id': row['id'],
            'title': row['title'],
            'channel_name': row['channel_name'],
            'view_count': row['view_count'],
            'url': f"https://www.youtube.com/watch?v={row['id']}",
            'like_probability': row['like_probability']
        })
    return recommendations
--- a/src/rating/init.py
+++ b/src/rating/init.py
@ -0,0 +1 @@
 # Rating system operations package
--- a/src/rating/display.py
+++ b/src/rating/display.py
@ -0,0 +1,20 @@
 from typing import Dict
 def display_video_information_for_rating(video: Dict):
    """Print title, channel, view count and URL of a video between two divider lines.

    Args:
        video: Dict with the keys ``title``, ``channel_name``, ``view_count``
            and ``url``.
    """
    divider = '=' * 50
    print(f"\n{divider}")
    print(f"Title: {video['title']}")
    print(f"Channel: {video['channel_name']}")
    print(f"Views: {video['view_count']:,}")
    print(f"URL: {video['url']}")
    print(divider)
 def display_rating_session_header():
    """Print the session banner followed by the key legend for rating."""
    banner_lines = (
        "🎯 Video Inspiration Finder - Interactive Session",
        "Rate videos with 'y' (like), 'n' (dislike), 'q' (quit)",
    )
    for line in banner_lines:
        print(line)
 def display_session_type_message(is_ml_ready: bool, rated_count: int, required_ratings: int = 10) -> str:
    """Return the headline shown above the next batch of videos.

    Args:
        is_ml_ready: Whether recommendations come from a trained model.
        rated_count: Number of ratings stored so far.
        required_ratings: Ratings needed before the model can be trained.

    Returns:
        The recommendation headline, or the unrated-videos headline stating
        how many more ratings are needed (never below zero).
    """
    if is_ml_ready:
        return "📊 ML Recommendations based on your preferences:"
    # Clamp at zero: more ratings than required may already exist while the
    # model is still untrained, e.g. right after the program starts.
    remaining_needed = max(0, required_ratings - rated_count)
    return f"📹 Unrated videos (need {remaining_needed} more to train ML):"
--- a/src/rating/session.py
+++ b/src/rating/session.py
@ -0,0 +1,17 @@
 from typing import List, Dict
 def process_user_rating_for_video(video: Dict, response: str, save_rating_func, get_notes_func):
    """Turn a 'y'/'n' answer into a stored rating; any other answer is ignored.

    Args:
        video: Dict with at least an ``id`` key.
        response: The user's answer.
        save_rating_func: Called as ``save_rating_func(video_id, liked, notes)``.
        get_notes_func: Called as ``get_notes_func(liked)`` to obtain the notes.
    """
    if response == 'y':
        liked, thumb = True, '👍'
    elif response == 'n':
        liked, thumb = False, '👎'
    else:
        return
    notes = get_notes_func(liked)
    save_rating_func(video['id'], liked, notes)
    print(f"Rated video {video['id']}: {thumb}")
 def should_continue_rating_session(response: str) -> bool:
    """Return ``False`` only for the quit answer ``'q'``; every other answer continues."""
    wants_to_quit = response == 'q'
    return not wants_to_quit
 def has_videos_to_rate(videos: List[Dict]) -> bool:
    """Tell whether the given collection holds at least one video."""
    return len(videos) >= 1
--- a/src/rating/user_input.py
+++ b/src/rating/user_input.py
@ -0,0 +1,12 @@
 def get_user_rating_response() -> str:
    """Prompt until the user enters 'y', 'n' or 'q' and return that answer.

    Input is stripped and lower-cased before it is checked.
    """
    accepted = ('y', 'n', 'q')
    answer = input("Rate this video (y/n/q): ").strip().lower()
    while answer not in accepted:
        print("Please enter 'y', 'n', or 'q'")
        answer = input("Rate this video (y/n/q): ").strip().lower()
    return answer
 def get_user_notes_for_rating(liked: bool) -> str:
    """Ask why the video was liked or disliked and return the stripped answer."""
    prompt = "Why did you like it? (optional): " if liked else "Why didn't you like it? (optional): "
    return input(prompt).strip()
--- a/src/youtube/init.py
+++ b/src/youtube/init.py
@ -0,0 +1 @@
 # YouTube API operations package
--- a/src/youtube/details.py
+++ b/src/youtube/details.py
@ -0,0 +1,67 @@
 import requests
 import json
 from typing import List, Dict
 def get_video_details_from_youtube(api_key: str, video_ids: List[str]) -> List[Dict]:
    """Fetch details for the given video ids and keep the relevant coding videos.

    Args:
        api_key: YouTube Data API key.
        video_ids: Ids to look up; an empty list short-circuits to ``[]``.

    Returns:
        Parsed video dicts that pass ``is_relevant_coding_video``. An empty
        list is returned when the request or JSON decoding fails.
    """
    if not video_ids:
        return []
    details_url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        'key': api_key,
        'id': ','.join(video_ids),
        'part': 'snippet,statistics,contentDetails'
    }
    try:
        # A timeout keeps a stalled connection from hanging the program, and
        # raise_for_status surfaces HTTP errors instead of silently returning
        # nothing.
        response = requests.get(details_url, params=params, timeout=10)
        response.raise_for_status()
        items = response.json().get('items', [])
    except (requests.RequestException, ValueError) as e:
        print(f"Error getting video details: {e}")
        return []
    videos = []
    for item in items:
        try:
            video = parse_youtube_video_response(item)
        except (KeyError, TypeError, ValueError) as e:
            # One malformed entry should not discard the rest of the batch.
            print(f"Error getting video details: {e}")
            continue
        if is_relevant_coding_video(video):
            videos.append(video)
    return videos
 def parse_youtube_video_response(item: Dict) -> Dict:
    """Flatten one API video item into the dict shape used by the rest of the app.

    Missing counters default to 0; a missing description, duration or
    thumbnail yields an empty string instead of raising.

    Args:
        item: One entry of the ``items`` list from the videos endpoint.

    Returns:
        Dict with id, title, description, counters, duration, publish date,
        channel name, thumbnail URL, JSON-encoded tags, category id and URL.

    Raises:
        KeyError: If ``id``, ``snippet``, ``title``, ``publishedAt`` or
            ``channelTitle`` is missing.
    """
    snippet = item['snippet']
    statistics = item.get('statistics', {})
    thumbnails = snippet.get('thumbnails', {})
    # Prefer the "high" rendition as before, but fall back to another size
    # when it is absent instead of failing the whole item.
    # NOTE(review): the size names are taken from the YouTube thumbnails
    # resource — confirm against the API reference.
    thumbnail_url = ''
    for size in ('high', 'maxres', 'standard', 'medium', 'default'):
        if size in thumbnails:
            thumbnail_url = thumbnails[size].get('url', '')
            break
    return {
        'id': item['id'],
        'title': snippet['title'],
        'description': snippet.get('description', ''),
        'view_count': int(statistics.get('viewCount', 0)),
        'like_count': int(statistics.get('likeCount', 0)),
        'comment_count': int(statistics.get('commentCount', 0)),
        'duration': item.get('contentDetails', {}).get('duration', ''),
        'published_at': snippet['publishedAt'],
        'channel_name': snippet['channelTitle'],
        'thumbnail_url': thumbnail_url,
        'tags': json.dumps(snippet.get('tags', [])),
        'category_id': int(snippet.get('categoryId', 0)),
        'url': f"https://www.youtube.com/watch?v={item['id']}"
    }
 def is_relevant_coding_video(video: Dict, min_view_count: int = 100000) -> bool:
    """Decide whether a video is popular enough and mentions a programming topic.

    Args:
        video: Dict with ``title``, ``description`` and ``view_count``.
        min_view_count: Videos with fewer views are rejected.

    Returns:
        ``True`` when the view threshold is met and the lower-cased title or
        description contains one of the keywords.
    """
    import re  # local import: this module's header does not import re
    if video['view_count'] < min_view_count:
        return False
    title = video['title'].lower()
    description = video['description'].lower()
    programming_keywords = [
        'coding', 'programming', 'javascript', 'python', 'react', 'web development',
        'tutorial', 'learn', 'build', 'create', 'app', 'website', 'algorithm'
    ]
    # "ai" is matched as a standalone word only: as a plain substring it hits
    # "train", "main", "explained", ... and would accept almost any video.
    ai_word = re.compile(r'\bai\b')
    for text in (title, description):
        if ai_word.search(text) or any(keyword in text for keyword in programming_keywords):
            return True
    return False
--- a/src/youtube/search.py
+++ b/src/youtube/search.py
@ -0,0 +1,43 @@
 import requests
 from typing import List, Dict
 def search_youtube_videos_by_query(api_key: str, query: str, max_results: int) -> List[str]:
    """Search YouTube for videos matching ``query`` and return their ids.

    The request asks for videos in category 28 published after 2020-01-01,
    ordered by view count.

    Args:
        api_key: YouTube Data API key.
        query: Free-text search query.
        max_results: Number of results to request.

    Returns:
        Video id strings; an empty list when the request fails or the
        response carries no items.
    """
    search_url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        'key': api_key,
        'q': query,
        'part': 'snippet',
        'type': 'video',
        'order': 'viewCount',
        'maxResults': max_results,
        'videoCategoryId': '28',
        'publishedAfter': '2020-01-01T00:00:00Z'
    }
    try:
        # A timeout keeps a stalled connection from hanging the program, and
        # raise_for_status reports HTTP errors instead of hiding them behind
        # an empty result.
        response = requests.get(search_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error searching videos: {e}")
        return []
    video_ids = []
    for item in data.get('items', []):
        # Skip entries that carry no video id rather than failing the search.
        video_id = (item.get('id') or {}).get('videoId')
        if video_id:
            video_ids.append(video_id)
    return video_ids
 def get_coding_search_queries() -> List[str]:
    """Return a fresh list with the ten predefined search queries, in fixed order."""
    queries = (
        "coding tutorial millions views",
        "programming challenge viral",
        "I built app hours",
        "learn programming beginner",
        "coding project from scratch",
        "AI coding tutorial",
        "web development crash course",
        "javascript tutorial millions",
        "python tutorial viral",
        "coding in 24 hours",
    )
    return list(queries)
--- a/src/youtube/utils.py
+++ b/src/youtube/utils.py
@ -0,0 +1,12 @@
 from typing import List, Dict
 def remove_duplicate_videos(videos: List[Dict]) -> List[Dict]:
    """Drop videos whose ``id`` was already seen, keeping the first occurrence and the input order."""
    first_by_id = {}
    for candidate in videos:
        # setdefault stores only the first video seen for each id.
        first_by_id.setdefault(candidate['id'], candidate)
    return list(first_by_id.values())