class VideoInspirationFinderApp:
    """Command-line app that collects coding videos and learns the user's taste.

    Videos are fetched from YouTube, stored in a SQLite database together with
    derived features, and then rated interactively.  Once enough ratings exist
    a recommendation model is trained and used to pick the next videos shown.
    """

    def __init__(self, api_key: str, db_path: str = "video_inspiration.db"):
        """Store configuration and make sure the database tables exist.

        Args:
            api_key: YouTube Data API key passed to every API helper.
            db_path: SQLite database file (defaults to the previously
                hard-coded location).
        """
        self.api_key = api_key
        self.db_path = db_path
        self.model = None
        self.model_trained = False

        setup_database_tables(self.db_path)

    def search_and_save_coding_videos(self, max_queries: int = 5, results_per_query: int = 10):
        """Search YouTube, then persist the de-duplicated videos and their features.

        Args:
            max_queries: How many of the predefined search queries to run.
            results_per_query: Maximum search results requested per query.
        """
        print("🔍 Searching for coding videos...")

        all_videos = []
        for query in get_coding_search_queries()[:max_queries]:
            video_ids = search_youtube_videos_by_query(self.api_key, query, results_per_query)
            all_videos.extend(get_video_details_from_youtube(self.api_key, video_ids))

        unique_videos = remove_duplicate_videos(all_videos)
        save_videos_to_database(unique_videos, self.db_path)

        for video in unique_videos:
            features = extract_all_features_from_video(video)
            save_video_features_to_database(video['id'], features, self.db_path)

        print(f"Found and saved {len(unique_videos)} videos")

    def start_interactive_rating_session(self):
        """Show batches of unrated videos until none remain or the user quits."""
        display_rating_session_header()

        # Defined once: the callback only depends on self.db_path, so there is
        # no reason to rebuild it for every video.
        def save_rating(video_id, liked, notes):
            save_video_rating_to_database(video_id, liked, notes, self.db_path)

        while True:
            videos = self._get_videos_for_rating()
            rated_count = get_rated_count_from_database(self.db_path)
            session_message = display_session_type_message(self.model_trained, rated_count)

            print(f"\n{session_message}")

            if not has_videos_to_rate(videos):
                print("No more videos to rate!")
                break

            for video in videos:
                display_video_information_for_rating(video)
                response = get_user_rating_response()

                if not should_continue_rating_session(response):
                    return

                process_user_rating_for_video(video, response, save_rating, get_user_notes_for_rating)
                self._try_train_model()

    def _get_videos_for_rating(self):
        """Return the next batch: model-ranked once trained, otherwise top unrated."""
        # Identity check, not truthiness: see _try_train_model.
        if self.model_trained and self.model is not None:
            video_features = get_unrated_videos_with_features_from_database(self.db_path)
            return predict_video_preferences_with_model(self.model, video_features)
        return get_unrated_videos_from_database(10, self.db_path)

    def _try_train_model(self):
        """Attempt to train the model from stored ratings if it is not trained yet."""
        if self.model_trained:
            return

        # `not self.model` would call the estimator's __len__; scikit-learn
        # ensembles implement it via `estimators_`, which does not exist before
        # fitting, so truth-testing an unfitted model raises AttributeError.
        if self.model is None:
            self.model = create_recommendation_model()

        training_data = get_training_data_from_database(self.db_path)
        if train_model_on_user_preferences(self.model, training_data):
            self.model_trained = True


def main():
    """Read the API key from the environment and run search plus rating session."""
    api_key = os.getenv('YOUTUBE_API_KEY')
    if not api_key:
        print("Error: YOUTUBE_API_KEY not found in environment variables")
        print("Please create a .env file with your YouTube API key")
        return

    app = VideoInspirationFinderApp(api_key)
    app.search_and_save_coding_videos()
    app.start_interactive_rating_session()


if __name__ == "__main__":
    main()
def setup_database_tables(db_path: str):
    """Create the ``videos``, ``preferences`` and ``video_features`` tables.

    Every statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this
    repeatedly on the same database is harmless.

    Args:
        db_path: Path of the SQLite database file to open (created if missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS videos (
                id TEXT PRIMARY KEY,
                title TEXT,
                description TEXT,
                view_count INTEGER,
                like_count INTEGER,
                comment_count INTEGER,
                duration TEXT,
                published_at TEXT,
                channel_name TEXT,
                thumbnail_url TEXT,
                tags TEXT,
                category_id INTEGER,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS preferences (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                video_id TEXT,
                liked BOOLEAN,
                notes TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (video_id) REFERENCES videos (id)
            )
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS video_features (
                video_id TEXT PRIMARY KEY,
                title_length INTEGER,
                description_length INTEGER,
                view_like_ratio REAL,
                engagement_score REAL,
                title_sentiment REAL,
                has_tutorial_keywords BOOLEAN,
                has_time_constraint BOOLEAN,
                has_beginner_keywords BOOLEAN,
                has_ai_keywords BOOLEAN,
                has_challenge_keywords BOOLEAN,
                FOREIGN KEY (video_id) REFERENCES videos (id)
            )
        ''')

        conn.commit()
    finally:
        # Close even when a statement fails so the file handle is not leaked.
        conn.close()
def save_video_rating_to_database(video_id: str, liked: bool, notes: str, db_path: str):
    """Insert one rating row into the ``preferences`` table.

    Args:
        video_id: Id of the rated video.
        liked: Whether the user liked the video.
        notes: Free-text notes entered by the user.
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            'INSERT INTO preferences (video_id, liked, notes) VALUES (?, ?, ?)',
            (video_id, liked, notes),
        )
        conn.commit()
    finally:
        # Always release the connection, including when the insert fails.
        conn.close()


def get_training_data_from_database(db_path: str) -> pd.DataFrame:
    """Return every feature row joined with its rating (``liked`` column)."""
    query = '''
        SELECT vf.*, p.liked
        FROM video_features vf
        JOIN preferences p ON vf.video_id = p.video_id
    '''
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        conn.close()


def get_unrated_videos_with_features_from_database(db_path: str) -> pd.DataFrame:
    """Return videos (with their features) that have no rating, most viewed first."""
    query = '''
        SELECT v.*, vf.*
        FROM videos v
        JOIN video_features vf ON v.id = vf.video_id
        LEFT JOIN preferences p ON v.id = p.video_id
        WHERE p.video_id IS NULL
        ORDER BY v.view_count DESC
    '''
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        conn.close()


def get_rated_count_from_database(db_path: str) -> int:
    """Return the number of rows stored in the ``preferences`` table."""
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute("SELECT COUNT(*) FROM preferences").fetchone()[0]
    finally:
        conn.close()
def save_videos_to_database(videos: List[Dict], db_path: str):
    """Insert or replace the given video dicts in the ``videos`` table.

    Columns are named explicitly so the statement does not depend on the
    physical column order of the table.

    Args:
        videos: Dicts carrying the keys read below (``id``, ``title``, ...).
        db_path: Path of the SQLite database file.
    """
    rows = [
        (
            video['id'], video['title'], video['description'],
            video['view_count'], video['like_count'], video['comment_count'],
            video['duration'], video['published_at'], video['channel_name'],
            video['thumbnail_url'], video['tags'], video['category_id'],
            datetime.now().isoformat(),
        )
        for video in videos
    ]

    conn = sqlite3.connect(db_path)
    try:
        conn.executemany('''
            INSERT OR REPLACE INTO videos (
                id, title, description, view_count, like_count, comment_count,
                duration, published_at, channel_name, thumbnail_url, tags,
                category_id, created_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', rows)
        conn.commit()
    finally:
        conn.close()


def save_video_features_to_database(video_id: str, features: Tuple, db_path: str):
    """Insert or replace the feature row of one video.

    ``features`` is expected in the order produced by
    ``extract_all_features_from_video``: the four basic metrics, the five
    keyword flags, and the title sentiment LAST.  The ``video_features`` table
    declares ``title_sentiment`` before the keyword flags, so a positional
    ``VALUES`` insert would store six values in the wrong columns; the explicit
    column list below maps each value to its matching column.

    Args:
        video_id: Id of the video the features belong to.
        features: Ten feature values in the order described above.
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            INSERT OR REPLACE INTO video_features (
                video_id,
                title_length, description_length, view_like_ratio, engagement_score,
                has_tutorial_keywords, has_time_constraint, has_beginner_keywords,
                has_ai_keywords, has_challenge_keywords,
                title_sentiment
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (video_id,) + tuple(features))
        conn.commit()
    finally:
        conn.close()


def get_unrated_videos_from_database(limit: int, db_path: str) -> List[Dict]:
    """Return up to ``limit`` videos without a rating, most viewed first.

    Each result dict has the keys ``id``, ``title``, ``channel_name``,
    ``view_count`` and ``url``.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Select the needed columns by name instead of indexing into v.*.
        rows = conn.execute('''
            SELECT v.id, v.title, v.channel_name, v.view_count
            FROM videos v
            LEFT JOIN preferences p ON v.id = p.video_id
            WHERE p.video_id IS NULL
            ORDER BY v.view_count DESC
            LIMIT ?
        ''', (limit,)).fetchall()
    finally:
        conn.close()

    return [
        {
            'id': video_id,
            'title': title,
            'channel_name': channel_name,
            'view_count': view_count,
            'url': f"https://www.youtube.com/watch?v={video_id}",
        }
        for video_id, title, channel_name, view_count in rows
    ]
def calculate_basic_video_metrics(video: Dict) -> Tuple:
    """Return ``(title_length, description_length, view_like_ratio, engagement_score)``.

    Both ratios divide by ``max(view_count, 1)`` so a video with zero views
    does not cause a division by zero.
    """
    views = max(video['view_count'], 1)
    title_length = len(video['title'])
    description_length = len(video['description'])
    view_like_ratio = video['like_count'] / views
    engagement_score = (video['like_count'] + video['comment_count']) / views

    return (title_length, description_length, view_like_ratio, engagement_score)


def _contains_keyword(text: str, keyword: str) -> bool:
    """Return True when ``keyword`` occurs in ``text``.

    Keywords of three characters or fewer (here: ``'ai'``) must appear as a
    whole word, optionally followed by a plural ``s``.  Plain substring
    matching would otherwise fire on unrelated words such as "explain" or
    "train".  Longer keywords keep substring semantics so that "learn" still
    matches "learning".
    """
    import re  # local import: keeps this block self-contained

    if len(keyword) <= 3:
        return re.search(rf"\b{re.escape(keyword)}s?\b", text) is not None
    return keyword in text


def detect_keyword_features_in_video(title: str, description: str) -> Tuple:
    """Return five boolean keyword flags for a video.

    The flags are, in order: tutorial, time constraint, beginner, AI,
    challenge.  Time-constraint and challenge keywords are looked up in the
    title only; the others in title or description.  Matching is
    case-sensitive, so callers should pass lower-cased text (as
    ``extract_all_features_from_video`` does).
    """
    tutorial_keywords = ['tutorial', 'learn', 'course', 'guide', 'how to']
    time_keywords = ['24 hours', '1 day', '1 hour', 'minutes', 'seconds', 'crash course']
    beginner_keywords = ['beginner', 'start', 'basics', 'introduction', 'getting started']
    ai_keywords = ['ai', 'artificial intelligence', 'machine learning', 'neural network']
    challenge_keywords = ['challenge', 'build', 'create', 'project', 'coding']

    def in_title(keywords):
        return any(_contains_keyword(title, kw) for kw in keywords)

    def in_title_or_description(keywords):
        return any(
            _contains_keyword(title, kw) or _contains_keyword(description, kw)
            for kw in keywords
        )

    return (
        in_title_or_description(tutorial_keywords),
        in_title(time_keywords),
        in_title_or_description(beginner_keywords),
        in_title_or_description(ai_keywords),
        in_title(challenge_keywords),
    )


def calculate_title_sentiment_score(title: str) -> float:
    """Return (# positive words found) minus (# negative words found) in ``title``.

    Each listed word counts at most once and is matched as a substring.
    """
    positive_words = ['amazing', 'best', 'awesome', 'great', 'perfect', 'love', 'incredible']
    negative_words = ['hard', 'difficult', 'impossible', 'failed', 'broke', 'wrong']

    positive_count = sum(1 for word in positive_words if word in title)
    negative_count = sum(1 for word in negative_words if word in title)
    return positive_count - negative_count


def extract_all_features_from_video(video: Dict) -> Tuple:
    """Return the ten-element feature tuple for one video.

    Order: the four basic metrics, the five keyword flags, then the title
    sentiment score as the last element.  Keyword and sentiment matching run
    on the lower-cased title and description.
    """
    title = video['title'].lower()
    description = video['description'].lower()

    basic_metrics = calculate_basic_video_metrics(video)
    keyword_features = detect_keyword_features_in_video(title, description)
    sentiment_score = calculate_title_sentiment_score(title)

    return basic_metrics + keyword_features + (sentiment_score,)
def create_recommendation_model():
    """Return a new, unfitted random-forest classifier (100 trees, fixed seed)."""
    return RandomForestClassifier(n_estimators=100, random_state=42)


def train_model_on_user_preferences(model, training_data: pd.DataFrame) -> bool:
    """Fit ``model`` on the rated videos.

    Args:
        model: Estimator exposing ``fit(X, y)``.
        training_data: Frame with the ten feature columns and a ``liked`` column.

    Returns:
        ``False`` (without fitting) when fewer than 10 rows are available,
        ``True`` after a successful fit.
    """
    if len(training_data) < 10:
        print("Need at least 10 rated videos to train model")
        return False

    feature_columns = [
        'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
        'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
        'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
    ]

    model.fit(training_data[feature_columns], training_data['liked'])
    print(f"Model trained on {len(training_data)} rated videos")
    return True


def predict_video_preferences_with_model(model, video_features: pd.DataFrame, top_n: int = 10) -> List[Dict]:
    """Rank unrated videos by predicted probability of being liked.

    Args:
        model: Fitted classifier exposing ``predict_proba`` (and normally
            ``classes_``).
        video_features: Frame with the feature columns plus ``id``, ``title``,
            ``channel_name`` and ``view_count``.
        top_n: Maximum number of recommendations to return.

    Returns:
        Up to ``top_n`` dicts ordered by descending ``like_probability``;
        an empty list when ``video_features`` is empty.
    """
    if video_features.empty:
        return []

    feature_columns = [
        'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
        'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
        'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
    ]

    X = video_features[feature_columns]
    raw_probabilities = model.predict_proba(X)

    # predict_proba has one column per label seen during fit.  If every rating
    # was a like (or every one a dislike) there is a single column and the old
    # hard-coded `[:, 1]` raised IndexError, so locate the "liked" column.
    classes = list(getattr(model, 'classes_', (0, 1)))
    liked_column = next((i for i, label in enumerate(classes) if label == 1), None)
    if liked_column is None:
        probabilities = [0.0] * len(X)  # the model never saw a liked video
    else:
        probabilities = raw_probabilities[:, liked_column]

    scored = video_features.copy()
    scored['like_probability'] = probabilities

    return [
        {
            'id': row['id'],
            'title': row['title'],
            'channel_name': row['channel_name'],
            'view_count': row['view_count'],
            'url': f"https://www.youtube.com/watch?v={row['id']}",
            'like_probability': row['like_probability'],
        }
        for _, row in scored.nlargest(top_n, 'like_probability').iterrows()
    ]
def display_video_information_for_rating(video: Dict):
    """Print title, channel, view count and URL of ``video`` between separator lines."""
    separator = '=' * 50
    print(f"\n{separator}")
    print(f"Title: {video['title']}")
    print(f"Channel: {video['channel_name']}")
    print(f"Views: {video['view_count']:,}")
    print(f"URL: {video['url']}")
    print(separator)


def display_rating_session_header():
    """Print the session banner and the accepted rating keys."""
    print("🎯 Video Inspiration Finder - Interactive Session")
    print("Rate videos with 'y' (like), 'n' (dislike), 'q' (quit)")


def display_session_type_message(is_ml_ready: bool, rated_count: int) -> str:
    """Return the heading shown above the next batch of videos.

    Args:
        is_ml_ready: Whether a trained model is producing the batch.
        rated_count: Number of ratings stored so far.

    Returns:
        The heading text.  Despite the ``display_`` prefix nothing is printed.
    """
    if is_ml_ready:
        return "📊 ML Recommendations based on your preferences:"

    # Clamp at zero: with more than 10 ratings and no trained model yet the
    # message previously announced a negative number of videos.
    remaining_needed = max(0, 10 - rated_count)
    return f"📹 Unrated videos (need {remaining_needed} more to train ML):"


def process_user_rating_for_video(video: Dict, response: str, save_rating_func, get_notes_func):
    """Persist a like/dislike for ``video``; any response other than 'y'/'n' is ignored.

    Args:
        video: Dict with at least an ``id`` key.
        response: ``'y'`` for like, ``'n'`` for dislike.
        save_rating_func: Callable ``(video_id, liked, notes)`` storing the rating.
        get_notes_func: Callable ``(liked)`` returning the user's notes.
    """
    if response not in ('y', 'n'):
        return

    liked = response == 'y'
    notes = get_notes_func(liked)
    save_rating_func(video['id'], liked, notes)
    print(f"Rated video {video['id']}: {'👍' if liked else '👎'}")


def should_continue_rating_session(response: str) -> bool:
    """Return False only when the user answered ``'q'`` (quit)."""
    return response != 'q'


def has_videos_to_rate(videos: List[Dict]) -> bool:
    """Return True when ``videos`` contains at least one entry."""
    return len(videos) > 0
def get_user_rating_response() -> str:
    """Prompt until the user enters ``y``, ``n`` or ``q`` and return that letter."""
    while True:
        response = input("Rate this video (y/n/q): ").strip().lower()
        if response in ('y', 'n', 'q'):
            return response
        print("Please enter 'y', 'n', or 'q'")


def get_user_notes_for_rating(liked: bool) -> str:
    """Ask why the user liked or disliked the video; return the stripped answer."""
    prompt = "Why did you like it? (optional): " if liked else "Why didn't you like it? (optional): "
    return input(prompt).strip()


def get_video_details_from_youtube(api_key: str, video_ids: List[str]) -> List[Dict]:
    """Fetch details for ``video_ids`` and keep the relevant coding videos.

    Args:
        api_key: YouTube Data API key.
        video_ids: Ids to look up; an empty list short-circuits to ``[]``.

    Returns:
        Parsed video dicts that pass ``is_relevant_coding_video``.  Returns
        ``[]`` when the request fails; the error is printed.
    """
    if not video_ids:
        return []

    details_url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        'key': api_key,
        'id': ','.join(video_ids),
        'part': 'snippet,statistics,contentDetails'
    }

    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(details_url, params=params, timeout=10)
        # Surface HTTP errors (bad key, quota) instead of silently parsing
        # an error payload that simply has no 'items'.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error getting video details: {e}")
        return []

    videos = []
    for item in data.get('items', []):
        try:
            video = parse_youtube_video_response(item)
        except (KeyError, TypeError, ValueError) as e:
            # One malformed item must not discard the rest of the batch.
            print(f"Error getting video details: {e}")
            continue
        if is_relevant_coding_video(video):
            videos.append(video)

    return videos


def parse_youtube_video_response(item: Dict) -> Dict:
    """Flatten one item of the YouTube ``videos`` response into a plain dict.

    Counters default to 0 when the ``statistics`` entry (or the whole
    ``statistics`` object) is missing.  The thumbnail falls back from ``high``
    to ``medium`` to ``default`` and finally to an empty string.  ``tags`` is
    stored as a JSON-encoded list.
    """
    snippet = item['snippet']
    statistics = item.get('statistics', {})
    thumbnails = snippet.get('thumbnails', {})
    thumbnail = (
        thumbnails.get('high')
        or thumbnails.get('medium')
        or thumbnails.get('default')
        or {}
    )

    return {
        'id': item['id'],
        'title': snippet['title'],
        'description': snippet.get('description', ''),
        'view_count': int(statistics.get('viewCount', 0)),
        'like_count': int(statistics.get('likeCount', 0)),
        'comment_count': int(statistics.get('commentCount', 0)),
        'duration': item.get('contentDetails', {}).get('duration', ''),
        'published_at': snippet['publishedAt'],
        'channel_name': snippet['channelTitle'],
        'thumbnail_url': thumbnail.get('url', ''),
        'tags': json.dumps(snippet.get('tags', [])),
        'category_id': int(snippet.get('categoryId', 0)),
        'url': f"https://www.youtube.com/watch?v={item['id']}"
    }
def _mentions_keyword(text: str, keyword: str) -> bool:
    """Return True when ``keyword`` occurs in ``text``.

    Keywords of three characters or fewer ('ai', 'app') must appear as a whole
    word, optionally with a plural ``s``.  As plain substrings they match
    words like "happy" or "rain".  Longer keywords keep substring semantics.
    """
    import re  # local import: keeps this block self-contained

    if len(keyword) <= 3:
        return re.search(rf"\b{re.escape(keyword)}s?\b", text) is not None
    return keyword in text


def is_relevant_coding_video(video: Dict, min_view_count: int = 100000) -> bool:
    """Return True for a sufficiently popular video that mentions programming.

    Args:
        video: Dict with ``title``, ``description`` and ``view_count``.
        min_view_count: Videos with fewer views are rejected outright.
    """
    if video['view_count'] < min_view_count:
        return False

    title = video['title'].lower()
    description = video['description'].lower()

    programming_keywords = [
        'coding', 'programming', 'javascript', 'python', 'react', 'web development',
        'tutorial', 'learn', 'build', 'create', 'app', 'website', 'algorithm', 'ai'
    ]

    return any(
        _mentions_keyword(title, keyword) or _mentions_keyword(description, keyword)
        for keyword in programming_keywords
    )


def search_youtube_videos_by_query(api_key: str, query: str, max_results: int) -> List[str]:
    """Search YouTube and return the ids of the matching videos.

    The search is restricted to videos in category ``28`` published after
    2020-01-01 and ordered by view count.

    Args:
        api_key: YouTube Data API key.
        query: Free-text search query.
        max_results: Maximum number of results to request.

    Returns:
        A list of video id strings; ``[]`` when the request fails (the error
        is printed) or the response has no ``items``.
    """
    search_url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        'key': api_key,
        'q': query,
        'part': 'snippet',
        'type': 'video',
        'order': 'viewCount',
        'maxResults': max_results,
        'videoCategoryId': '28',
        'publishedAfter': '2020-01-01T00:00:00Z'
    }

    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(search_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error searching videos: {e}")
        return []

    # Skip any result that lacks a videoId instead of failing the whole search.
    return [
        item['id']['videoId']
        for item in data.get('items', [])
        if 'videoId' in item.get('id', {})
    ]


def get_coding_search_queries() -> List[str]:
    """Return the predefined search queries, in the order they are tried."""
    return [
        "coding tutorial millions views",
        "programming challenge viral",
        "I built app hours",
        "learn programming beginner",
        "coding project from scratch",
        "AI coding tutorial",
        "web development crash course",
        "javascript tutorial millions",
        "python tutorial viral",
        "coding in 24 hours",
    ]
def remove_duplicate_videos(videos: List[Dict]) -> List[Dict]:
    """Return ``videos`` without repeated ids, keeping the first occurrence of each.

    The relative order of the retained videos is unchanged.
    """
    first_by_id = {}
    for candidate in videos:
        # setdefault stores the video only if its id has not been seen yet.
        first_by_id.setdefault(candidate['id'], candidate)
    return list(first_by_id.values())