v1

2025-08-23 16:38:01 +02:00 · 2025-08-23 16:38:01 +02:00 · 5eb977dee6
commit 5eb977dee6
20 changed files with 591 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,25 @@
+.env
+venv/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+pip-log.txt
+pip-delete-this-directory.txt
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git
+.mypy_cache
+.pytest_cache
+.hypothesis
+*.db
+*.sqlite
+*.sqlite3
--- a/main.py
+++ b/main.py
@ -0,0 +1,109 @@
+import os
+from dotenv import load_dotenv
+
+from src.database.manager import setup_database_tables
+from src.database.video_operations import save_videos_to_database, save_video_features_to_database, get_unrated_videos_from_database
+from src.database.preference_operations import save_video_rating_to_database, get_training_data_from_database, get_unrated_videos_with_features_from_database, get_rated_count_from_database
+
+from src.youtube.search import search_youtube_videos_by_query, get_coding_search_queries
+from src.youtube.details import get_video_details_from_youtube
+from src.youtube.utils import remove_duplicate_videos
+
+from src.ml.feature_extraction import extract_all_features_from_video
+from src.ml.model_training import create_recommendation_model, train_model_on_user_preferences
+from src.ml.predictions import predict_video_preferences_with_model
+
+from src.rating.display import display_video_information_for_rating, display_rating_session_header, display_session_type_message
+from src.rating.user_input import get_user_rating_response, get_user_notes_for_rating
+from src.rating.session import process_user_rating_for_video, should_continue_rating_session, has_videos_to_rate
+
+# Load settings from a local .env file (if present) into the environment at
+# import time, before main() reads YOUTUBE_API_KEY with os.getenv.
+load_dotenv()
+
+class VideoInspirationFinderApp:
+    """Fetch coding videos, collect like/dislike ratings and rank unrated videos.
+
+    Videos, their extracted features and the user's ratings are persisted in
+    the SQLite file named by ``db_path``. A recommendation model is created
+    lazily and trained as soon as ``train_model_on_user_preferences`` reports
+    success; from then on unrated videos are ordered by the model.
+    """
+
+    def __init__(self, api_key: str):
+        """Store the YouTube API key and make sure the database tables exist."""
+        self.api_key = api_key
+        self.db_path = "video_inspiration.db"
+        self.model = None
+        self.model_trained = False
+
+        setup_database_tables(self.db_path)
+
+    def search_and_save_coding_videos(self, max_queries: int = 5, results_per_query: int = 10):
+        """Search YouTube for coding videos and store them with their features.
+
+        Args:
+            max_queries: How many of the predefined search queries to run.
+            results_per_query: Maximum number of results requested per query.
+        """
+        print("🔍 Searching for coding videos...")
+
+        all_videos = []
+        for query in get_coding_search_queries()[:max_queries]:
+            video_ids = search_youtube_videos_by_query(self.api_key, query, results_per_query)
+            all_videos.extend(get_video_details_from_youtube(self.api_key, video_ids))
+
+        # Different queries can return the same video; keep one copy of each.
+        unique_videos = remove_duplicate_videos(all_videos)
+        save_videos_to_database(unique_videos, self.db_path)
+
+        for video in unique_videos:
+            features = extract_all_features_from_video(video)
+            save_video_features_to_database(video['id'], features, self.db_path)
+
+        print(f"Found and saved {len(unique_videos)} videos")
+
+    def start_interactive_rating_session(self):
+        """Present unrated videos in batches until the user quits or none remain."""
+        display_rating_session_header()
+
+        # The callback only depends on self.db_path, so build it once instead
+        # of once per displayed video.
+        def save_rating(video_id, liked, notes):
+            save_video_rating_to_database(video_id, liked, notes, self.db_path)
+
+        while True:
+            videos = self._get_videos_for_rating()
+            rated_count = get_rated_count_from_database(self.db_path)
+            session_message = display_session_type_message(self.model_trained, rated_count)
+
+            print(f"\n{session_message}")
+
+            if not has_videos_to_rate(videos):
+                print("No more videos to rate!")
+                break
+
+            for video in videos:
+                display_video_information_for_rating(video)
+
+                response = get_user_rating_response()
+                if not should_continue_rating_session(response):
+                    return
+
+                process_user_rating_for_video(video, response, save_rating, get_user_notes_for_rating)
+                self._try_train_model()
+
+    def _get_videos_for_rating(self):
+        """Return the next batch: model-ranked once trained, otherwise the top 10 unrated by views."""
+        # Compare with None explicitly. scikit-learn ensembles define __len__
+        # over their fitted estimators, so truth-testing the model object is
+        # not a reliable "is a model present" check.
+        if self.model_trained and self.model is not None:
+            video_features = get_unrated_videos_with_features_from_database(self.db_path)
+            return predict_video_preferences_with_model(self.model, video_features)
+        return get_unrated_videos_from_database(10, self.db_path)
+
+    def _try_train_model(self):
+        """Attempt the initial training; does nothing once a training run has succeeded."""
+        if self.model_trained:
+            return
+
+        # `not self.model` would call len() on an unfitted ensemble, which
+        # raises AttributeError (no `estimators_` yet) on the second attempt.
+        if self.model is None:
+            self.model = create_recommendation_model()
+
+        training_data = get_training_data_from_database(self.db_path)
+        if train_model_on_user_preferences(self.model, training_data):
+            self.model_trained = True
+
+def main():
+    """Run the app: fetch videos, then start rating, provided an API key is configured."""
+    api_key = os.getenv('YOUTUBE_API_KEY')
+
+    if api_key:
+        app = VideoInspirationFinderApp(api_key)
+        app.search_and_save_coding_videos()
+        app.start_interactive_rating_session()
+        return
+
+    # Without a key nothing can be fetched, so explain how to provide one.
+    print("Error: YOUTUBE_API_KEY not found in environment variables")
+    print("Please create a .env file with your YouTube API key")
+
+if __name__ == "__main__":
+    main()
--- a/setup.sh
+++ b/setup.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# Creates a virtual environment in ./venv (if missing), installs the Python
+# dependencies into it and then starts main.py.
+
+# Abort on the first failing step so main.py is never started in a
+# half-configured environment.
+set -e
+
+echo "🔧 Setting up Video Inspiration Finder..."
+
+# Some systems only ship a `python3` executable; prefer it when available.
+if command -v python3 >/dev/null 2>&1; then
+    PYTHON_BIN=python3
+else
+    PYTHON_BIN=python
+fi
+
+# Create virtual environment if it doesn't exist
+if [ ! -d "venv" ]; then
+    echo "📦 Creating virtual environment..."
+    "$PYTHON_BIN" -m venv venv
+fi
+
+# Activate virtual environment
+echo "🔄 Activating virtual environment..."
+source venv/bin/activate
+
+# Install dependencies with the environment's own interpreter
+echo "📚 Installing dependencies..."
+python -m pip install requests pandas scikit-learn numpy python-dotenv
+
+echo "✅ Setup complete!"
+echo "🚀 Running Video Inspiration Finder..."
+
+# Run the main script
+python main.py
--- a/src/__init__.py
+++ b/src/__init__.py
@ -0,0 +1 @@
+# Video Inspiration Finder package
--- a/src/database/__init__.py
+++ b/src/database/__init__.py
@ -0,0 +1 @@
+# Database operations package
--- a/src/database/manager.py
+++ b/src/database/manager.py
@ -0,0 +1,56 @@
+import sqlite3
+from datetime import datetime
+from typing import List, Dict
+
+def setup_database_tables(db_path: str):
+    """Create the ``videos``, ``preferences`` and ``video_features`` tables if missing.
+
+    Safe to call repeatedly: every statement uses ``CREATE TABLE IF NOT EXISTS``.
+
+    Args:
+        db_path: Path of the SQLite database file (created if it does not exist).
+    """
+    table_statements = (
+        '''
+        CREATE TABLE IF NOT EXISTS videos (
+            id TEXT PRIMARY KEY,
+            title TEXT,
+            description TEXT,
+            view_count INTEGER,
+            like_count INTEGER,
+            comment_count INTEGER,
+            duration TEXT,
+            published_at TEXT,
+            channel_name TEXT,
+            thumbnail_url TEXT,
+            tags TEXT,
+            category_id INTEGER,
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+        ''',
+        '''
+        CREATE TABLE IF NOT EXISTS preferences (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            video_id TEXT,
+            liked BOOLEAN,
+            notes TEXT,
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            FOREIGN KEY (video_id) REFERENCES videos (id)
+        )
+        ''',
+        '''
+        CREATE TABLE IF NOT EXISTS video_features (
+            video_id TEXT PRIMARY KEY,
+            title_length INTEGER,
+            description_length INTEGER,
+            view_like_ratio REAL,
+            engagement_score REAL,
+            title_sentiment REAL,
+            has_tutorial_keywords BOOLEAN,
+            has_time_constraint BOOLEAN,
+            has_beginner_keywords BOOLEAN,
+            has_ai_keywords BOOLEAN,
+            has_challenge_keywords BOOLEAN,
+            FOREIGN KEY (video_id) REFERENCES videos (id)
+        )
+        ''',
+    )
+
+    conn = sqlite3.connect(db_path)
+    try:
+        cursor = conn.cursor()
+        for statement in table_statements:
+            cursor.execute(statement)
+        conn.commit()
+    finally:
+        # Release the file handle even when a statement fails.
+        conn.close()
--- a/src/database/preference_operations.py
+++ b/src/database/preference_operations.py
@ -0,0 +1,46 @@
+import sqlite3
+import pandas as pd
+
+def save_video_rating_to_database(video_id: str, liked: bool, notes: str, db_path: str):
+    """Append one rating row to the ``preferences`` table.
+
+    Args:
+        video_id: ID of the rated video.
+        liked: True for a like, False for a dislike.
+        notes: Free-text reason entered by the user (may be empty).
+        db_path: Path of the SQLite database file.
+    """
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.execute(
+            'INSERT INTO preferences (video_id, liked, notes) VALUES (?, ?, ?)',
+            (video_id, liked, notes),
+        )
+        conn.commit()
+    finally:
+        # Close the connection even if the insert fails.
+        conn.close()
+
+def get_training_data_from_database(db_path: str) -> pd.DataFrame:
+    """Return one row per stored rating: the video's feature columns plus ``liked``.
+
+    Args:
+        db_path: Path of the SQLite database file.
+
+    Returns:
+        DataFrame with all ``video_features`` columns and the ``liked`` column
+        of the matching ``preferences`` row.
+    """
+    query = '''
+        SELECT vf.*, p.liked
+        FROM video_features vf
+        JOIN preferences p ON vf.video_id = p.video_id
+    '''
+    conn = sqlite3.connect(db_path)
+    try:
+        return pd.read_sql_query(query, conn)
+    finally:
+        # Close the connection even if the query fails.
+        conn.close()
+
+def get_unrated_videos_with_features_from_database(db_path: str) -> pd.DataFrame:
+    """Return every video that has features but no rating yet, most viewed first.
+
+    Args:
+        db_path: Path of the SQLite database file.
+
+    Returns:
+        DataFrame containing all ``videos`` columns followed by all
+        ``video_features`` columns.
+    """
+    # The LEFT JOIN plus IS NULL filter keeps only videos without any
+    # preferences row.
+    query = '''
+        SELECT v.*, vf.*
+        FROM videos v
+        JOIN video_features vf ON v.id = vf.video_id
+        LEFT JOIN preferences p ON v.id = p.video_id
+        WHERE p.video_id IS NULL
+        ORDER BY v.view_count DESC
+    '''
+    conn = sqlite3.connect(db_path)
+    try:
+        return pd.read_sql_query(query, conn)
+    finally:
+        # Close the connection even if the query fails.
+        conn.close()
+
+def get_rated_count_from_database(db_path: str) -> int:
+    """Return the number of rows stored in the ``preferences`` table.
+
+    Args:
+        db_path: Path of the SQLite database file.
+    """
+    conn = sqlite3.connect(db_path)
+    try:
+        (count,) = conn.execute("SELECT COUNT(*) FROM preferences").fetchone()
+        return count
+    finally:
+        # Close the connection even if the query fails.
+        conn.close()
--- a/src/database/video_operations.py
+++ b/src/database/video_operations.py
@ -0,0 +1,58 @@
+import sqlite3
+from datetime import datetime
+from typing import List, Dict, Tuple
+
+def save_videos_to_database(videos: List[Dict], db_path: str):
+    """Insert the given videos into the ``videos`` table, replacing rows with the same id.
+
+    Args:
+        videos: Dicts with the keys ``id``, ``title``, ``description``,
+            ``view_count``, ``like_count``, ``comment_count``, ``duration``,
+            ``published_at``, ``channel_name``, ``thumbnail_url``, ``tags``
+            and ``category_id``.
+        db_path: Path of the SQLite database file.
+
+    Raises:
+        KeyError: If a video dict lacks one of the keys above.
+    """
+    columns = (
+        'id', 'title', 'description', 'view_count', 'like_count', 'comment_count',
+        'duration', 'published_at', 'channel_name', 'thumbnail_url', 'tags', 'category_id',
+    )
+    # Name the target columns so the statement does not depend on the
+    # physical column order of the table.
+    statement = (
+        f"INSERT OR REPLACE INTO videos ({', '.join(columns)}, created_at) "
+        f"VALUES ({', '.join('?' * (len(columns) + 1))})"
+    )
+    rows = [
+        tuple(video[column] for column in columns) + (datetime.now().isoformat(),)
+        for video in videos
+    ]
+
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.executemany(statement, rows)
+        conn.commit()
+    finally:
+        # Close the connection even if an insert fails.
+        conn.close()
+
+def save_video_features_to_database(video_id: str, features: Tuple, db_path: str):
+    """Store (or replace) the feature row of one video.
+
+    Args:
+        video_id: ID of the video the features belong to.
+        features: Ten values in the column order of ``video_features`` after
+            ``video_id``: title_length, description_length, view_like_ratio,
+            engagement_score, title_sentiment, has_tutorial_keywords,
+            has_time_constraint, has_beginner_keywords, has_ai_keywords,
+            has_challenge_keywords.
+        db_path: Path of the SQLite database file.
+    """
+    # Columns are listed explicitly so the expected order of `features` is
+    # visible at the point of use.
+    statement = '''
+        INSERT OR REPLACE INTO video_features (
+            video_id, title_length, description_length, view_like_ratio,
+            engagement_score, title_sentiment, has_tutorial_keywords,
+            has_time_constraint, has_beginner_keywords, has_ai_keywords,
+            has_challenge_keywords
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    '''
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.execute(statement, (video_id, *features))
+        conn.commit()
+    finally:
+        # Close the connection even if the insert fails.
+        conn.close()
+
+def get_unrated_videos_from_database(limit: int, db_path: str) -> List[Dict]:
+    """Return up to ``limit`` videos without any rating, most viewed first.
+
+    Args:
+        limit: Maximum number of videos to return.
+        db_path: Path of the SQLite database file.
+
+    Returns:
+        Dicts with the keys ``id``, ``title``, ``channel_name``,
+        ``view_count`` and ``url``.
+    """
+    # Select the needed columns by name instead of `v.*` plus positional
+    # indexes, so the mapping does not depend on the table's column order.
+    query = '''
+        SELECT v.id, v.title, v.channel_name, v.view_count
+        FROM videos v
+        LEFT JOIN preferences p ON v.id = p.video_id
+        WHERE p.video_id IS NULL
+        ORDER BY v.view_count DESC
+        LIMIT ?
+    '''
+    conn = sqlite3.connect(db_path)
+    try:
+        rows = conn.execute(query, (limit,)).fetchall()
+    finally:
+        # Close the connection even if the query fails.
+        conn.close()
+
+    return [
+        {
+            'id': video_id,
+            'title': title,
+            'channel_name': channel_name,
+            'view_count': view_count,
+            'url': f"https://www.youtube.com/watch?v={video_id}",
+        }
+        for video_id, title, channel_name, view_count in rows
+    ]
--- a/src/ml/__init__.py
+++ b/src/ml/__init__.py
@ -0,0 +1 @@
+# Machine learning operations package
--- a/src/ml/feature_extraction.py
+++ b/src/ml/feature_extraction.py
@ -0,0 +1,42 @@
+from typing import Dict, Tuple
+
+def calculate_basic_video_metrics(video: Dict) -> Tuple:
+    """Return (title length, description length, likes per view, likes+comments per view).
+
+    The view count is clamped to at least 1 so videos without views do not
+    cause a division by zero.
+    """
+    title_chars = len(video['title'])
+    description_chars = len(video['description'])
+
+    likes = video['like_count']
+    views = max(video['view_count'], 1)
+    likes_per_view = likes / views
+    interactions_per_view = (likes + video['comment_count']) / views
+
+    return (title_chars, description_chars, likes_per_view, interactions_per_view)
+
+def detect_keyword_features_in_video(title: str, description: str) -> Tuple:
+    """Report which keyword groups occur in a video's title/description.
+
+    Matching is case-sensitive against lower-case keywords, so callers should
+    pass lower-cased text.
+
+    Args:
+        title: Video title.
+        description: Video description.
+
+    Returns:
+        Tuple of booleans ``(has_tutorial, has_time_constraint, has_beginner,
+        has_ai, has_challenge)``. The time-constraint and challenge groups are
+        checked against the title only.
+    """
+    import re  # local import: only needed for the whole-word match below
+
+    tutorial_keywords = ['tutorial', 'learn', 'course', 'guide', 'how to']
+    time_keywords = ['24 hours', '1 day', '1 hour', 'minutes', 'seconds', 'crash course']
+    beginner_keywords = ['beginner', 'start', 'basics', 'introduction', 'getting started']
+    ai_keywords = ['ai', 'artificial intelligence', 'machine learning', 'neural network']
+    challenge_keywords = ['challenge', 'build', 'create', 'project', 'coding']
+
+    def mentions(text: str, keyword: str) -> bool:
+        # Very short keywords such as 'ai' also occur inside unrelated words
+        # ('train', 'explain'), so they must match as whole words. Longer
+        # keywords keep substring matching so 'learn' still hits 'learning'.
+        if len(keyword) <= 2:
+            return re.search(rf'\b{re.escape(keyword)}\b', text) is not None
+        return keyword in text
+
+    def in_title_or_description(keywords) -> bool:
+        return any(mentions(title, kw) or mentions(description, kw) for kw in keywords)
+
+    def in_title(keywords) -> bool:
+        return any(mentions(title, kw) for kw in keywords)
+
+    has_tutorial = in_title_or_description(tutorial_keywords)
+    has_time_constraint = in_title(time_keywords)
+    has_beginner = in_title_or_description(beginner_keywords)
+    has_ai = in_title_or_description(ai_keywords)
+    has_challenge = in_title(challenge_keywords)
+
+    return (has_tutorial, has_time_constraint, has_beginner, has_ai, has_challenge)
+
+def calculate_title_sentiment_score(title: str) -> float:
+    """Score a title as (# positive words present) minus (# negative words present).
+
+    Each listed word counts at most once and is detected by substring search,
+    so the caller is expected to pass lower-cased text.
+    """
+    score = 0
+
+    for upbeat in ('amazing', 'best', 'awesome', 'great', 'perfect', 'love', 'incredible'):
+        if upbeat in title:
+            score += 1
+
+    for downbeat in ('hard', 'difficult', 'impossible', 'failed', 'broke', 'wrong'):
+        if downbeat in title:
+            score -= 1
+
+    return score
+
+def extract_all_features_from_video(video: Dict) -> Tuple:
+    """Compute the full feature tuple for one video.
+
+    Args:
+        video: Dict with at least ``title``, ``description``, ``view_count``,
+            ``like_count`` and ``comment_count``.
+
+    Returns:
+        Ten values in the column order of the ``video_features`` table after
+        ``video_id``: title_length, description_length, view_like_ratio,
+        engagement_score, title_sentiment, has_tutorial_keywords,
+        has_time_constraint, has_beginner_keywords, has_ai_keywords,
+        has_challenge_keywords.
+    """
+    # Keyword and sentiment detection compare against lower-case word lists.
+    title = video['title'].lower()
+    description = video['description'].lower()
+
+    basic_metrics = calculate_basic_video_metrics(video)
+    keyword_features = detect_keyword_features_in_video(title, description)
+    sentiment_score = calculate_title_sentiment_score(title)
+
+    # The features are inserted positionally, so the sentiment score must sit
+    # between the basic metrics and the keyword flags (where the table has
+    # title_sentiment). Appending it last shifted six columns by one.
+    return basic_metrics + (sentiment_score,) + keyword_features
--- a/src/ml/model_training.py
+++ b/src/ml/model_training.py
@ -0,0 +1,23 @@
+from sklearn.ensemble import RandomForestClassifier
+import pandas as pd
+
+def create_recommendation_model():
+    """Build an untrained random-forest classifier with a fixed seed."""
+    settings = {'n_estimators': 100, 'random_state': 42}
+    return RandomForestClassifier(**settings)
+
+def train_model_on_user_preferences(model, training_data: pd.DataFrame) -> bool:
+    """Fit ``model`` on the rated videos when there is enough usable data.
+
+    Args:
+        model: Classifier exposing ``fit(X, y)``.
+        training_data: One row per rating with the ten feature columns and a
+            ``liked`` column.
+
+    Returns:
+        True if the model was fitted. False if there are fewer than 10 rows
+        or all ratings have the same value.
+    """
+    if len(training_data) < 10:
+        print("Need at least 10 rated videos to train model")
+        return False
+
+    y = training_data['liked']
+    # A classifier fitted on a single class only reports that class, so no
+    # "like" probability could be ranked afterwards. Wait for both outcomes.
+    if y.nunique() < 2:
+        print("Need both liked and disliked videos to train model")
+        return False
+
+    feature_columns = [
+        'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
+        'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
+        'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
+    ]
+    X = training_data[feature_columns]
+
+    model.fit(X, y)
+    print(f"Model trained on {len(training_data)} rated videos")
+    return True
--- a/src/ml/predictions.py
+++ b/src/ml/predictions.py
@ -0,0 +1,33 @@
+from typing import List, Dict
+import pandas as pd
+
+def predict_video_preferences_with_model(model, video_features: pd.DataFrame) -> List[Dict]:
+    """Return the ten unrated videos the model considers most likely to be liked.
+
+    Args:
+        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
+        video_features: One row per candidate video with the ten feature
+            columns plus ``id``, ``title``, ``channel_name`` and ``view_count``.
+
+    Returns:
+        Up to ten dicts (``id``, ``title``, ``channel_name``, ``view_count``,
+        ``url``, ``like_probability``) sorted by descending probability.
+        Empty list when ``video_features`` is empty.
+    """
+    if video_features.empty:
+        return []
+
+    feature_columns = [
+        'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
+        'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
+        'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
+    ]
+
+    X = video_features[feature_columns]
+    class_probabilities = model.predict_proba(X)
+
+    # predict_proba has one column per entry of model.classes_, in that order.
+    # Look up the "liked" class (stored as 1/True) instead of assuming it is
+    # column 1, which fails when the model has only seen one class.
+    liked_columns = [i for i, label in enumerate(model.classes_) if label == 1]
+
+    ranked = video_features.copy()
+    if liked_columns:
+        ranked['like_probability'] = class_probabilities[:, liked_columns[0]]
+    else:
+        # The model never saw a liked video, so it cannot predict a like.
+        ranked['like_probability'] = 0.0
+
+    top_videos = ranked.nlargest(10, 'like_probability')
+
+    return [
+        {
+            'id': row['id'],
+            'title': row['title'],
+            'channel_name': row['channel_name'],
+            'view_count': row['view_count'],
+            'url': f"https://www.youtube.com/watch?v={row['id']}",
+            'like_probability': row['like_probability'],
+        }
+        for _, row in top_videos.iterrows()
+    ]
--- a/src/rating/__init__.py
+++ b/src/rating/__init__.py
@ -0,0 +1 @@
+# Rating system operations package
--- a/src/rating/display.py
+++ b/src/rating/display.py
@ -0,0 +1,20 @@
+from typing import Dict
+
+def display_video_information_for_rating(video: Dict):
+    """Print title, channel, view count (with thousands separators) and URL between two rules."""
+    rule = '=' * 50
+    # (label, dict key, format spec) for each printed line, in display order.
+    fields = (
+        ("Title", 'title', ''),
+        ("Channel", 'channel_name', ''),
+        ("Views", 'view_count', ','),
+        ("URL", 'url', ''),
+    )
+
+    print(f"\n{rule}")
+    for label, key, spec in fields:
+        print(f"{label}: {format(video[key], spec)}")
+    print(rule)
+
+def display_rating_session_header():
+    """Print the session banner followed by the list of accepted rating keys."""
+    banner_lines = (
+        "🎯 Video Inspiration Finder - Interactive Session",
+        "Rate videos with 'y' (like), 'n' (dislike), 'q' (quit)",
+    )
+    for line in banner_lines:
+        print(line)
+
+def display_session_type_message(is_ml_ready: bool, rated_count: int, required_ratings: int = 10) -> str:
+    """Return the heading shown above a batch of videos.
+
+    Args:
+        is_ml_ready: Whether a trained model is ranking the videos.
+        rated_count: Number of ratings stored so far.
+        required_ratings: Number of ratings needed before training.
+
+    Returns:
+        The recommendation heading when the model is ready, otherwise a
+        heading stating how many more ratings are needed (never negative).
+    """
+    if is_ml_ready:
+        return "📊 ML Recommendations based on your preferences:"
+
+    # There can already be more ratings than required while the model is
+    # still untrained; clamp so the message never shows a negative number.
+    remaining_needed = max(0, required_ratings - rated_count)
+    return f"📹 Unrated videos (need {remaining_needed} more to train ML):"
--- a/src/rating/session.py
+++ b/src/rating/session.py
@ -0,0 +1,17 @@
+from typing import List, Dict
+
+def process_user_rating_for_video(video: Dict, response: str, save_rating_func, get_notes_func):
+    """Ask for notes, store the rating and confirm it; responses other than 'y'/'n' do nothing.
+
+    ``get_notes_func`` receives the liked flag; ``save_rating_func`` receives
+    the video id, the liked flag and the notes.
+    """
+    outcomes = {'y': (True, "👍"), 'n': (False, "👎")}
+    if response not in outcomes:
+        return
+
+    liked, emoji = outcomes[response]
+    notes = get_notes_func(liked)
+    save_rating_func(video['id'], liked, notes)
+    print(f"Rated video {video['id']}: {emoji}")
+
+def should_continue_rating_session(response: str) -> bool:
+    """Return False only for the quit response 'q'."""
+    wants_to_quit = response == 'q'
+    return not wants_to_quit
+
+def has_videos_to_rate(videos: List[Dict]) -> bool:
+    """Return True when the batch contains at least one video."""
+    batch_size = len(videos)
+    return batch_size >= 1
--- a/src/rating/user_input.py
+++ b/src/rating/user_input.py
@ -0,0 +1,12 @@
+def get_user_rating_response() -> str:
+    """Prompt until the user enters 'y', 'n' or 'q' and return that letter.
+
+    Input is trimmed and lower-cased before it is checked.
+    """
+    accepted = ('y', 'n', 'q')
+
+    answer = input("Rate this video (y/n/q): ").strip().lower()
+    while answer not in accepted:
+        print("Please enter 'y', 'n', or 'q'")
+        answer = input("Rate this video (y/n/q): ").strip().lower()
+
+    return answer
+
+def get_user_notes_for_rating(liked: bool) -> str:
+    """Ask why the video was liked or disliked and return the trimmed answer."""
+    prompt = "Why did you like it? (optional): " if liked else "Why didn't you like it? (optional): "
+    answer = input(prompt)
+    return answer.strip()
--- a/src/youtube/__init__.py
+++ b/src/youtube/__init__.py
@ -0,0 +1 @@
+# YouTube API operations package
--- a/src/youtube/details.py
+++ b/src/youtube/details.py
@ -0,0 +1,67 @@
+import requests
+import json
+from typing import List, Dict
+
+def get_video_details_from_youtube(api_key: str, video_ids: List[str]) -> List[Dict]:
+    """Fetch details for the given video IDs and keep the relevant coding videos.
+
+    Args:
+        api_key: YouTube Data API key.
+        video_ids: IDs to look up; sent in a single request.
+
+    Returns:
+        Parsed video dicts that pass ``is_relevant_coding_video``. Empty list
+        when no IDs are given or the request fails; the error is printed.
+    """
+    if not video_ids:
+        return []
+
+    details_url = "https://www.googleapis.com/youtube/v3/videos"
+    params = {
+        'key': api_key,
+        'id': ','.join(video_ids),
+        'part': 'snippet,statistics,contentDetails'
+    }
+
+    try:
+        # Without a timeout a stalled connection would block the app forever.
+        response = requests.get(details_url, params=params, timeout=10)
+        # Surface HTTP errors (bad key, exhausted quota) instead of silently
+        # treating the error payload as "no items".
+        response.raise_for_status()
+        data = response.json()
+    except (requests.RequestException, ValueError) as e:
+        print(f"Error getting video details: {e}")
+        return []
+
+    items = data.get('items', []) if isinstance(data, dict) else []
+
+    videos = []
+    for item in items:
+        try:
+            video = parse_youtube_video_response(item)
+        except (KeyError, TypeError, ValueError) as e:
+            # One malformed entry should not discard the rest of the batch.
+            print(f"Error getting video details: {e}")
+            continue
+        if is_relevant_coding_video(video):
+            videos.append(video)
+
+    return videos
+
+def parse_youtube_video_response(item: Dict) -> Dict:
+    """Flatten one item of a videos API response into the app's video dict.
+
+    Missing counters and a missing category default to 0; tags are stored as
+    a JSON-encoded list (``[]`` when absent).
+    """
+    snippet = item['snippet']
+    statistics = item['statistics']
+
+    def counter(name: str) -> int:
+        # Counters are converted with int(); absent ones count as 0.
+        return int(statistics.get(name, 0))
+
+    video = {}
+    video['id'] = item['id']
+    video['title'] = snippet['title']
+    video['description'] = snippet['description']
+    video['view_count'] = counter('viewCount')
+    video['like_count'] = counter('likeCount')
+    video['comment_count'] = counter('commentCount')
+    video['duration'] = item['contentDetails']['duration']
+    video['published_at'] = snippet['publishedAt']
+    video['channel_name'] = snippet['channelTitle']
+    video['thumbnail_url'] = snippet['thumbnails']['high']['url']
+    video['tags'] = json.dumps(snippet.get('tags', []))
+    video['category_id'] = int(snippet.get('categoryId', 0))
+    video['url'] = f"https://www.youtube.com/watch?v={item['id']}"
+    return video
+
+def is_relevant_coding_video(video: Dict) -> bool:
+    """Decide whether a video is popular enough and about programming.
+
+    Args:
+        video: Dict with ``title``, ``description`` and ``view_count``.
+
+    Returns:
+        True when the video has at least 100,000 views and its title or
+        description (case-insensitively) mentions a programming keyword.
+    """
+    import re  # local import: only needed for the whole-word match below
+
+    title = video['title'].lower()
+    description = video['description'].lower()
+
+    programming_keywords = [
+        'coding', 'programming', 'javascript', 'python', 'react', 'web development',
+        'tutorial', 'learn', 'build', 'create', 'app', 'website', 'algorithm', 'ai'
+    ]
+
+    if video['view_count'] < 100000:
+        return False
+
+    def mentions(keyword: str) -> bool:
+        # Short keywords ('ai', 'app') appear inside many unrelated words
+        # ('email', 'happy'), which made this filter accept almost any text.
+        # Match them as whole words, allowing a plural 's'.
+        if len(keyword) <= 3:
+            pattern = rf'\b{re.escape(keyword)}s?\b'
+            return bool(re.search(pattern, title) or re.search(pattern, description))
+        return keyword in title or keyword in description
+
+    return any(mentions(keyword) for keyword in programming_keywords)
--- a/src/youtube/search.py
+++ b/src/youtube/search.py
@ -0,0 +1,43 @@
+import requests
+from typing import List, Dict
+
+def search_youtube_videos_by_query(api_key: str, query: str, max_results: int) -> List[str]:
+    """Search YouTube and return the IDs of the matching videos.
+
+    The search is limited to video category 28, to videos published after
+    2020-01-01 and is ordered by view count.
+
+    Args:
+        api_key: YouTube Data API key.
+        query: Search terms.
+        max_results: Maximum number of results to request.
+
+    Returns:
+        List of video ID strings. Empty when nothing matched or the request
+        failed; the error is printed.
+    """
+    search_url = "https://www.googleapis.com/youtube/v3/search"
+    params = {
+        'key': api_key,
+        'q': query,
+        'part': 'snippet',
+        'type': 'video',
+        'order': 'viewCount',
+        'maxResults': max_results,
+        'videoCategoryId': '28',
+        'publishedAfter': '2020-01-01T00:00:00Z'
+    }
+
+    try:
+        # Without a timeout a stalled connection would block the app forever.
+        response = requests.get(search_url, params=params, timeout=10)
+        # Report HTTP errors (bad key, exhausted quota) instead of silently
+        # returning no results.
+        response.raise_for_status()
+        data = response.json()
+
+        if 'items' not in data:
+            return []
+
+        return [item['id']['videoId'] for item in data['items']]
+
+    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
+        print(f"Error searching videos: {e}")
+        return []
+
+def get_coding_search_queries() -> List[str]:
+    """Return the predefined search phrases, in the order they should be tried."""
+    queries = []
+    queries.append("coding tutorial millions views")
+    queries.append("programming challenge viral")
+    queries.append("I built app hours")
+    queries.append("learn programming beginner")
+    queries.append("coding project from scratch")
+    queries.append("AI coding tutorial")
+    queries.append("web development crash course")
+    queries.append("javascript tutorial millions")
+    queries.append("python tutorial viral")
+    queries.append("coding in 24 hours")
+    return queries
--- a/src/youtube/utils.py
+++ b/src/youtube/utils.py
@ -0,0 +1,12 @@
+from typing import List, Dict
+
+def remove_duplicate_videos(videos: List[Dict]) -> List[Dict]:
+    """Return the videos with repeated ids dropped, keeping each id's first occurrence in order."""
+    first_by_id = {}
+    for candidate in videos:
+        # setdefault leaves an existing entry untouched, so the first video
+        # seen for an id wins; dicts preserve insertion order.
+        first_by_id.setdefault(candidate['id'], candidate)
+    return list(first_by_id.values())