This commit is contained in:
Gauri Joshi 2025-08-23 16:38:01 +02:00
commit 5eb977dee6
20 changed files with 591 additions and 0 deletions

25
.gitignore vendored Normal file
View File

@ -0,0 +1,25 @@
.env
venv/
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
env/
pip-log.txt
pip-delete-this-directory.txt
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.log
.git
.mypy_cache
.pytest_cache
.hypothesis
*.db
*.sqlite
*.sqlite3

109
main.py Normal file
View File

@ -0,0 +1,109 @@
import os
from dotenv import load_dotenv
from src.database.manager import setup_database_tables
from src.database.video_operations import save_videos_to_database, save_video_features_to_database, get_unrated_videos_from_database
from src.database.preference_operations import save_video_rating_to_database, get_training_data_from_database, get_unrated_videos_with_features_from_database, get_rated_count_from_database
from src.youtube.search import search_youtube_videos_by_query, get_coding_search_queries
from src.youtube.details import get_video_details_from_youtube
from src.youtube.utils import remove_duplicate_videos
from src.ml.feature_extraction import extract_all_features_from_video
from src.ml.model_training import create_recommendation_model, train_model_on_user_preferences
from src.ml.predictions import predict_video_preferences_with_model
from src.rating.display import display_video_information_for_rating, display_rating_session_header, display_session_type_message
from src.rating.user_input import get_user_rating_response, get_user_notes_for_rating
from src.rating.session import process_user_rating_for_video, should_continue_rating_session, has_videos_to_rate
# Run python-dotenv's loader at import time, before main() looks up
# YOUTUBE_API_KEY with os.getenv.
load_dotenv()
class VideoInspirationFinderApp:
    """Interactive app that collects coding videos and learns the user's taste.

    Videos are fetched from YouTube, stored in a SQLite database together
    with their extracted features, and then shown in batches for y/n rating.
    Once the recommendation model has been trained successfully, batches are
    chosen by the model instead of by the plain unrated-videos query.
    """

    def __init__(self, api_key: str):
        """Remember the API key and make sure the database tables exist.

        Args:
            api_key: YouTube Data API key passed to the search/detail helpers.
        """
        self.api_key = api_key
        self.db_path = "video_inspiration.db"
        self.model = None
        self.model_trained = False
        setup_database_tables(self.db_path)

    def search_and_save_coding_videos(self):
        """Search YouTube, then store the unique videos and their features."""
        print("🔍 Searching for coding videos...")
        all_videos = []
        search_queries = get_coding_search_queries()
        # Only the first five queries are used, ten results each.
        for query in search_queries[:5]:
            video_ids = search_youtube_videos_by_query(self.api_key, query, 10)
            videos = get_video_details_from_youtube(self.api_key, video_ids)
            all_videos.extend(videos)
        unique_videos = remove_duplicate_videos(all_videos)
        save_videos_to_database(unique_videos, self.db_path)
        for video in unique_videos:
            features = extract_all_features_from_video(video)
            save_video_features_to_database(video['id'], features, self.db_path)
        print(f"Found and saved {len(unique_videos)} videos")

    def start_interactive_rating_session(self):
        """Show videos batch by batch until none are left or the user quits."""
        display_rating_session_header()

        # Ratings saved by earlier runs are already in the database; try to
        # train on them right away instead of waiting for another full batch.
        self._try_train_model()

        # Built once: the callback only depends on the database path.
        def save_rating(video_id, liked, notes):
            save_video_rating_to_database(video_id, liked, notes, self.db_path)

        while True:
            videos = self._get_videos_for_rating()
            rated_count = get_rated_count_from_database(self.db_path)
            session_message = display_session_type_message(self.model_trained, rated_count)
            print(f"\n{session_message}")
            if not has_videos_to_rate(videos):
                print("No more videos to rate!")
                break
            for video in videos:
                display_video_information_for_rating(video)
                response = get_user_rating_response()
                if not should_continue_rating_session(response):
                    return
                process_user_rating_for_video(video, response, save_rating, get_user_notes_for_rating)
            # A finished batch may have produced enough ratings to train.
            self._try_train_model()

    def _get_videos_for_rating(self):
        """Return the next batch: model-ranked if trained, otherwise unrated videos."""
        # Identity check on purpose: truthiness of an estimator object can be
        # delegated to __len__, which scikit-learn ensembles compute from their
        # fitted estimators.
        if self.model_trained and self.model is not None:
            video_features = get_unrated_videos_with_features_from_database(self.db_path)
            return predict_video_preferences_with_model(self.model, video_features)
        return get_unrated_videos_from_database(10, self.db_path)

    def _try_train_model(self):
        """Train the model once; later calls are no-ops after a success."""
        if self.model_trained:
            return
        # `is None` instead of `not self.model`: an unfitted ensemble has no
        # fitted estimators yet, so asking for its truth value (via __len__)
        # would raise instead of answering "model exists".
        if self.model is None:
            self.model = create_recommendation_model()
        training_data = get_training_data_from_database(self.db_path)
        if train_model_on_user_preferences(self.model, training_data):
            self.model_trained = True
def main():
    """Run the app: search for videos, then start the rating session.

    Without a YOUTUBE_API_KEY environment variable nothing is started and a
    short hint is printed instead.
    """
    youtube_key = os.getenv('YOUTUBE_API_KEY')
    if youtube_key:
        finder = VideoInspirationFinderApp(youtube_key)
        finder.search_and_save_coding_videos()
        finder.start_interactive_rating_session()
        return
    print("Error: YOUTUBE_API_KEY not found in environment variables")
    print("Please create a .env file with your YouTube API key")


if __name__ == "__main__":
    main()

23
setup.sh Executable file
View File

@ -0,0 +1,23 @@
#!/bin/bash
# Bootstrap script: creates the virtual environment if needed, installs the
# dependencies into it and then launches main.py.

# Stop at the first failing command so a broken venv or a failed install
# never falls through to running the app.
set -e

echo "🔧 Setting up Video Inspiration Finder..."

# Create virtual environment if it doesn't exist
if [ ! -d "venv" ]; then
    # Prefer python3 and fall back to python; not every system ships a
    # plain `python` executable.
    PYTHON_BIN="$(command -v python3 || command -v python || true)"
    if [ -z "$PYTHON_BIN" ]; then
        echo "Error: no python3 or python executable found on PATH" >&2
        exit 1
    fi
    echo "📦 Creating virtual environment..."
    "$PYTHON_BIN" -m venv venv
fi

# Activate virtual environment
echo "🔄 Activating virtual environment..."
source venv/bin/activate

# Install dependencies
echo "📚 Installing dependencies..."
pip install requests pandas scikit-learn numpy python-dotenv

echo "✅ Setup complete!"
echo "🚀 Running Video Inspiration Finder..."

# Run the main script (inside the activated venv `python` is the venv's interpreter)
python main.py

1
src/__init__.py Normal file
View File

@ -0,0 +1 @@
# Video Inspiration Finder package

1
src/database/__init__.py Normal file
View File

@ -0,0 +1 @@
# Database operations package

56
src/database/manager.py Normal file
View File

@ -0,0 +1,56 @@
import sqlite3
from datetime import datetime
from typing import List, Dict
def setup_database_tables(db_path: str):
    """Create the videos, preferences and video_features tables if missing.

    Safe to call repeatedly: every statement uses CREATE TABLE IF NOT EXISTS.

    Args:
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    # try/finally so the connection is released even when a statement fails.
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS videos (
                id TEXT PRIMARY KEY,
                title TEXT,
                description TEXT,
                view_count INTEGER,
                like_count INTEGER,
                comment_count INTEGER,
                duration TEXT,
                published_at TEXT,
                channel_name TEXT,
                thumbnail_url TEXT,
                tags TEXT,
                category_id INTEGER,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS preferences (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                video_id TEXT,
                liked BOOLEAN,
                notes TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (video_id) REFERENCES videos (id)
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS video_features (
                video_id TEXT PRIMARY KEY,
                title_length INTEGER,
                description_length INTEGER,
                view_like_ratio REAL,
                engagement_score REAL,
                title_sentiment REAL,
                has_tutorial_keywords BOOLEAN,
                has_time_constraint BOOLEAN,
                has_beginner_keywords BOOLEAN,
                has_ai_keywords BOOLEAN,
                has_challenge_keywords BOOLEAN,
                FOREIGN KEY (video_id) REFERENCES videos (id)
            )
        ''')
        conn.commit()
    finally:
        conn.close()

View File

@ -0,0 +1,46 @@
import sqlite3
import pandas as pd
def save_video_rating_to_database(video_id: str, liked: bool, notes: str, db_path: str):
    """Insert one rating row into the preferences table.

    Args:
        video_id: ID of the rated video.
        liked: True for a like, False for a dislike.
        notes: Free-text note entered by the user (may be empty).
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    # try/finally so a failing INSERT does not leave the connection open.
    try:
        conn.execute(
            'INSERT INTO preferences (video_id, liked, notes) VALUES (?, ?, ?)',
            (video_id, liked, notes),
        )
        conn.commit()
    finally:
        conn.close()
def get_training_data_from_database(db_path: str) -> pd.DataFrame:
    """Load the feature rows of all rated videos together with their rating.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        A DataFrame with every video_features column plus ``liked``; one row
        per preferences entry that has matching features.
    """
    query = '''
        SELECT vf.*, p.liked
        FROM video_features vf
        JOIN preferences p ON vf.video_id = p.video_id
    '''
    conn = sqlite3.connect(db_path)
    # try/finally so a failing query does not leave the connection open.
    try:
        return pd.read_sql_query(query, conn)
    finally:
        conn.close()
def get_unrated_videos_with_features_from_database(db_path: str) -> pd.DataFrame:
    """Load every video that has features but no rating yet.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        A DataFrame with all videos and video_features columns, ordered by
        view count from highest to lowest.
    """
    # LEFT JOIN + IS NULL keeps only videos without any preferences row.
    query = '''
        SELECT v.*, vf.*
        FROM videos v
        JOIN video_features vf ON v.id = vf.video_id
        LEFT JOIN preferences p ON v.id = p.video_id
        WHERE p.video_id IS NULL
        ORDER BY v.view_count DESC
    '''
    conn = sqlite3.connect(db_path)
    # try/finally so a failing query does not leave the connection open.
    try:
        return pd.read_sql_query(query, conn)
    finally:
        conn.close()
def get_rated_count_from_database(db_path: str) -> int:
    """Return the number of rows in the preferences table.

    Args:
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    # try/finally so a failing query does not leave the connection open.
    try:
        (count,) = conn.execute("SELECT COUNT(*) FROM preferences").fetchone()
    finally:
        conn.close()
    return count

View File

@ -0,0 +1,58 @@
import sqlite3
from datetime import datetime
from typing import List, Dict, Tuple
def save_videos_to_database(videos: List[Dict], db_path: str):
    """Insert the given videos, replacing rows that share the same id.

    Args:
        videos: Video dicts with the keys id, title, description, view_count,
            like_count, comment_count, duration, published_at, channel_name,
            thumbnail_url, tags and category_id.
        db_path: Path of the SQLite database file.
    """
    # Columns are named explicitly so the statement does not silently depend
    # on the physical column order of the table.
    insert_sql = '''
        INSERT OR REPLACE INTO videos (
            id, title, description, view_count, like_count, comment_count,
            duration, published_at, channel_name, thumbnail_url, tags,
            category_id, created_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    conn = sqlite3.connect(db_path)
    # try/finally so a bad video dict or a failing INSERT does not leave the
    # connection open; nothing is committed in that case.
    try:
        conn.executemany(insert_sql, (
            (
                video['id'], video['title'], video['description'],
                video['view_count'], video['like_count'], video['comment_count'],
                video['duration'], video['published_at'], video['channel_name'],
                video['thumbnail_url'], video['tags'], video['category_id'],
                datetime.now().isoformat(),
            )
            for video in videos
        ))
        conn.commit()
    finally:
        conn.close()
def save_video_features_to_database(video_id: str, features: Tuple, db_path: str):
    """Store the feature values of one video, replacing an existing row.

    Args:
        video_id: ID of the video the features belong to.
        features: Ten values in the order of the columns named in the INSERT
            below after ``video_id`` (title_length ... has_challenge_keywords).
            The values are bound by position, so the order matters.
        db_path: Path of the SQLite database file.
    """
    insert_sql = '''
        INSERT OR REPLACE INTO video_features (
            video_id, title_length, description_length, view_like_ratio,
            engagement_score, title_sentiment, has_tutorial_keywords,
            has_time_constraint, has_beginner_keywords, has_ai_keywords,
            has_challenge_keywords
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    conn = sqlite3.connect(db_path)
    # try/finally so a failing INSERT does not leave the connection open.
    try:
        conn.execute(insert_sql, (video_id,) + tuple(features))
        conn.commit()
    finally:
        conn.close()
def get_unrated_videos_from_database(limit: int, db_path: str) -> List[Dict]:
    """Return up to ``limit`` videos that have no rating yet.

    Args:
        limit: Maximum number of videos to return.
        db_path: Path of the SQLite database file.

    Returns:
        Dicts with the keys id, title, channel_name, view_count and url,
        ordered by view count from highest to lowest.
    """
    # Only the needed columns are selected, by name, so the result does not
    # depend on the physical column order of the videos table.
    query = '''
        SELECT v.id, v.title, v.channel_name, v.view_count
        FROM videos v
        LEFT JOIN preferences p ON v.id = p.video_id
        WHERE p.video_id IS NULL
        ORDER BY v.view_count DESC
        LIMIT ?
    '''
    conn = sqlite3.connect(db_path)
    # try/finally so a failing query does not leave the connection open.
    try:
        rows = conn.execute(query, (limit,)).fetchall()
    finally:
        conn.close()
    return [
        {
            'id': video_id,
            'title': title,
            'channel_name': channel_name,
            'view_count': view_count,
            'url': f"https://www.youtube.com/watch?v={video_id}",
        }
        for video_id, title, channel_name, view_count in rows
    ]

1
src/ml/__init__.py Normal file
View File

@ -0,0 +1 @@
# Machine learning operations package

View File

@ -0,0 +1,42 @@
from typing import Dict, Tuple
def calculate_basic_video_metrics(video: Dict) -> Tuple:
    """Compute the length and engagement metrics of a video.

    Returns:
        (title_length, description_length, view_like_ratio, engagement_score).
        Both ratios divide by the view count, treating anything below 1 as 1
        so that videos without views do not cause a division by zero.
    """
    title_chars = len(video['title'])
    description_chars = len(video['description'])
    likes = video['like_count']
    views = max(video['view_count'], 1)
    likes_per_view = likes / views
    interactions_per_view = (likes + video['comment_count']) / views
    return (title_chars, description_chars, likes_per_view, interactions_per_view)
def detect_keyword_features_in_video(title: str, description: str) -> Tuple:
    """Flag which keyword groups occur in a video's title or description.

    Args:
        title: Video title; matching is case-sensitive against lower-case
            keywords, so the text is expected to be lower-cased already.
        description: Video description, same expectation as ``title``.

    Returns:
        (has_tutorial, has_time_constraint, has_beginner, has_ai,
        has_challenge) as booleans. The time-constraint and challenge flags
        look at the title only; the others at title and description.
    """
    import re

    tutorial_keywords = ['tutorial', 'learn', 'course', 'guide', 'how to']
    time_keywords = ['24 hours', '1 day', '1 hour', 'minutes', 'seconds', 'crash course']
    beginner_keywords = ['beginner', 'start', 'basics', 'introduction', 'getting started']
    ai_keywords = ['ai', 'artificial intelligence', 'machine learning', 'neural network']
    challenge_keywords = ['challenge', 'build', 'create', 'project', 'coding']

    def contains(text: str, keyword: str) -> bool:
        # Very short keywords such as 'ai' are hidden inside countless
        # unrelated words ('maintain', 'explained'), so they must stand alone
        # as a word. Longer keywords keep substring matching so that 'learn'
        # still matches 'learning'.
        if len(keyword) <= 2:
            return re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text) is not None
        return keyword in text

    has_tutorial = any(contains(title, kw) or contains(description, kw) for kw in tutorial_keywords)
    has_time_constraint = any(contains(title, kw) for kw in time_keywords)
    has_beginner = any(contains(title, kw) or contains(description, kw) for kw in beginner_keywords)
    has_ai = any(contains(title, kw) or contains(description, kw) for kw in ai_keywords)
    has_challenge = any(contains(title, kw) for kw in challenge_keywords)
    return (has_tutorial, has_time_constraint, has_beginner, has_ai, has_challenge)
def calculate_title_sentiment_score(title: str) -> float:
    """Score a title as (positive words found) minus (negative words found).

    Each listed word counts at most once and is matched as a substring of
    ``title``.
    """
    positive_words = ['amazing', 'best', 'awesome', 'great', 'perfect', 'love', 'incredible']
    negative_words = ['hard', 'difficult', 'impossible', 'failed', 'broke', 'wrong']
    score = 0
    for word in positive_words:
        if word in title:
            score += 1
    for word in negative_words:
        if word in title:
            score -= 1
    return score
def extract_all_features_from_video(video: Dict) -> Tuple:
    """Build the feature tuple that is stored for one video.

    The tuple follows the column order of the ``video_features`` table after
    ``video_id``: title_length, description_length, view_like_ratio,
    engagement_score, title_sentiment, has_tutorial_keywords,
    has_time_constraint, has_beginner_keywords, has_ai_keywords,
    has_challenge_keywords.

    Args:
        video: Video dict providing title, description and the counters used
            by the metric helpers.
    """
    title = video['title'].lower()
    description = video['description'].lower()
    basic_metrics = calculate_basic_video_metrics(video)
    sentiment_score = calculate_title_sentiment_score(title)
    keyword_features = detect_keyword_features_in_video(title, description)
    # The features are written to the database by position, so the sentiment
    # score has to sit between the numeric metrics and the keyword flags;
    # appending it last would shift every keyword flag one column to the left
    # and store the sentiment under has_challenge_keywords.
    return basic_metrics + (sentiment_score,) + keyword_features

23
src/ml/model_training.py Normal file
View File

@ -0,0 +1,23 @@
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
def create_recommendation_model():
    """Return a fresh, unfitted random-forest classifier with fixed settings."""
    forest_settings = {"n_estimators": 100, "random_state": 42}
    return RandomForestClassifier(**forest_settings)
def train_model_on_user_preferences(model, training_data: pd.DataFrame) -> bool:
    """Fit ``model`` on the rated videos if there are at least ten of them.

    Args:
        model: Object with a ``fit(X, y)`` method.
        training_data: One row per rating, holding the feature columns and a
            ``liked`` column used as the target.

    Returns:
        True when the model was fitted, False when there were too few rows.
    """
    rated_total = len(training_data)
    if rated_total >= 10:
        feature_columns = [
            'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
            'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
            'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
        ]
        model.fit(training_data[feature_columns], training_data['liked'])
        print(f"Model trained on {rated_total} rated videos")
        return True
    print("Need at least 10 rated videos to train model")
    return False

33
src/ml/predictions.py Normal file
View File

@ -0,0 +1,33 @@
from typing import List, Dict
import pandas as pd
def predict_video_preferences_with_model(model, video_features: pd.DataFrame) -> List[Dict]:
    """Rank unrated videos by predicted like probability and return the top ten.

    Args:
        model: Fitted classifier offering ``predict_proba`` (and, normally,
            ``classes_``).
        video_features: One row per video with the feature columns plus id,
            title, channel_name and view_count.

    Returns:
        Up to ten dicts (id, title, channel_name, view_count, url,
        like_probability), highest probability first. Empty input gives [].
    """
    if video_features.empty:
        return []
    feature_columns = [
        'title_length', 'description_length', 'view_like_ratio', 'engagement_score',
        'title_sentiment', 'has_tutorial_keywords', 'has_time_constraint',
        'has_beginner_keywords', 'has_ai_keywords', 'has_challenge_keywords'
    ]
    class_probabilities = model.predict_proba(video_features[feature_columns])

    # predict_proba has one column per class the model saw while training.
    # If every rating so far was a like (or every one a dislike) there is a
    # single column only, so the "liked" column must be looked up by class
    # label instead of assuming it is column 1.
    known_classes = getattr(model, 'classes_', None)
    if known_classes is None:
        probabilities = class_probabilities[:, 1]
    else:
        known_classes = list(known_classes)
        if 1 in known_classes:
            probabilities = class_probabilities[:, known_classes.index(1)]
        else:
            # The model has never seen a liked video.
            probabilities = [0.0] * len(video_features)

    scored = video_features.copy()
    scored['like_probability'] = probabilities
    top_videos = scored.nlargest(10, 'like_probability')
    return [
        {
            'id': row['id'],
            'title': row['title'],
            'channel_name': row['channel_name'],
            'view_count': row['view_count'],
            'url': f"https://www.youtube.com/watch?v={row['id']}",
            'like_probability': row['like_probability'],
        }
        for _, row in top_videos.iterrows()
    ]

1
src/rating/__init__.py Normal file
View File

@ -0,0 +1 @@
# Rating system operations package

20
src/rating/display.py Normal file
View File

@ -0,0 +1,20 @@
from typing import Dict
def display_video_information_for_rating(video: Dict):
    """Print title, channel, view count and URL of a video between two rulers.

    Args:
        video: Dict with the keys title, channel_name, view_count and url.
    """
    ruler = "=" * 50
    print("\n" + ruler)
    print("Title: {}".format(video['title']))
    print("Channel: {}".format(video['channel_name']))
    # Thousands separators keep large view counts readable.
    print("Views: " + format(video['view_count'], ","))
    print("URL: {}".format(video['url']))
    print(ruler)
def display_rating_session_header():
    """Print the session banner followed by the key legend."""
    banner_lines = (
        "🎯 Video Inspiration Finder - Interactive Session",
        "Rate videos with 'y' (like), 'n' (dislike), 'q' (quit)",
    )
    for line in banner_lines:
        print(line)
def display_session_type_message(is_ml_ready: bool, rated_count: int) -> str:
    """Return the headline shown above a batch of videos.

    Args:
        is_ml_ready: Whether the batch comes from the trained model.
        rated_count: Number of ratings stored so far.

    Returns:
        The recommendation headline when the model is ready, otherwise a
        headline stating how many of the ten required ratings are missing.
    """
    if is_ml_ready:
        return "📊 ML Recommendations based on your preferences:"
    # Clamp at zero: with more than ten stored ratings and a model that is not
    # trained yet, the plain difference would read "need -5 more".
    remaining_needed = max(0, 10 - rated_count)
    return f"📹 Unrated videos (need {remaining_needed} more to train ML):"

17
src/rating/session.py Normal file
View File

@ -0,0 +1,17 @@
from typing import List, Dict
def process_user_rating_for_video(video: Dict, response: str, save_rating_func, get_notes_func):
    """Turn a 'y' or 'n' answer into a stored rating; ignore anything else.

    Args:
        video: Dict with at least the key ``id``.
        response: The user's answer; only 'y' and 'n' have an effect.
        save_rating_func: Called as ``save_rating_func(video_id, liked, notes)``.
        get_notes_func: Called as ``get_notes_func(liked)``; returns the note.
    """
    if response not in ('y', 'n'):
        return
    liked = response == 'y'
    notes = get_notes_func(liked)
    save_rating_func(video['id'], liked, notes)
    thumb = "👍" if liked else "👎"
    print(f"Rated video {video['id']}: {thumb}")
def should_continue_rating_session(response: str) -> bool:
    """Return False only for the quit answer 'q'."""
    wants_to_quit = response == 'q'
    return not wants_to_quit
def has_videos_to_rate(videos: List[Dict]) -> bool:
    """Return True when the batch contains at least one video."""
    batch_size = len(videos)
    return batch_size >= 1

12
src/rating/user_input.py Normal file
View File

@ -0,0 +1,12 @@
def get_user_rating_response() -> str:
    """Prompt until the user enters y, n or q and return that letter.

    Surrounding whitespace and letter case of the input are ignored.
    """
    valid_answers = ('y', 'n', 'q')
    answer = input("Rate this video (y/n/q): ").strip().lower()
    while answer not in valid_answers:
        print("Please enter 'y', 'n', or 'q'")
        answer = input("Rate this video (y/n/q): ").strip().lower()
    return answer
def get_user_notes_for_rating(liked: bool) -> str:
    """Ask why the video was liked or disliked and return the trimmed answer."""
    prompt = (
        "Why did you like it? (optional): "
        if liked
        else "Why didn't you like it? (optional): "
    )
    return input(prompt).strip()

1
src/youtube/__init__.py Normal file
View File

@ -0,0 +1 @@
# YouTube API operations package

67
src/youtube/details.py Normal file
View File

@ -0,0 +1,67 @@
import requests
import json
from typing import List, Dict
def get_video_details_from_youtube(api_key: str, video_ids: List[str]) -> List[Dict]:
    """Fetch details for the given video IDs and keep the relevant ones.

    Best effort: request problems are printed and produce an empty list
    instead of an exception.

    Args:
        api_key: YouTube Data API key.
        video_ids: IDs to look up in a single request.

    Returns:
        Parsed video dicts that pass ``is_relevant_coding_video``.
    """
    if not video_ids:
        return []
    details_url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        'key': api_key,
        'id': ','.join(video_ids),
        'part': 'snippet,statistics,contentDetails'
    }
    try:
        # Without a timeout a stalled connection would block the app forever;
        # 10 seconds is an arbitrary but generous limit.
        response = requests.get(details_url, params=params, timeout=10)
        # Surface HTTP errors (bad key, exhausted quota, ...) instead of
        # treating the error payload as "no videos found".
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"Error getting video details: {e}")
        return []

    items = data.get('items', []) if isinstance(data, dict) else []
    videos = []
    for item in items:
        # One item with unexpected structure must not discard the whole batch.
        try:
            video = parse_youtube_video_response(item)
        except (KeyError, TypeError, ValueError) as e:
            print(f"Error getting video details: {e}")
            continue
        if is_relevant_coding_video(video):
            videos.append(video)
    return videos
def parse_youtube_video_response(item: Dict) -> Dict:
    """Flatten one item of the videos API response into a video dict.

    Args:
        item: Response item with ``id``, ``snippet`` and ``contentDetails``;
            ``statistics`` and individual counters are optional and count as 0
            when absent.

    Returns:
        Dict with id, title, description, the three counters, duration,
        published_at, channel_name, thumbnail_url, tags (as a JSON string),
        category_id and url.

    Raises:
        KeyError: If a required field (id, snippet fields, duration) is missing.
    """
    snippet = item['snippet']
    # Tolerate items without a statistics section instead of failing on them.
    statistics = item.get('statistics', {})

    # Prefer the 'high' thumbnail but fall back to other sizes when it is not
    # offered. NOTE(review): 'medium' and 'default' are assumed to be the
    # other size keys the API uses -- confirm against the API reference.
    thumbnails = snippet.get('thumbnails', {})
    thumbnail_url = ''
    for size in ('high', 'medium', 'default'):
        if size in thumbnails:
            thumbnail_url = thumbnails[size]['url']
            break

    video_id = item['id']
    return {
        'id': video_id,
        'title': snippet['title'],
        'description': snippet['description'],
        'view_count': int(statistics.get('viewCount', 0)),
        'like_count': int(statistics.get('likeCount', 0)),
        'comment_count': int(statistics.get('commentCount', 0)),
        'duration': item['contentDetails']['duration'],
        'published_at': snippet['publishedAt'],
        'channel_name': snippet['channelTitle'],
        'thumbnail_url': thumbnail_url,
        # Tags are stored as a JSON string so they fit a single text column.
        'tags': json.dumps(snippet.get('tags', [])),
        'category_id': int(snippet.get('categoryId', 0)),
        'url': f"https://www.youtube.com/watch?v={video_id}"
    }
def is_relevant_coding_video(video: Dict) -> bool:
    """Decide whether a video is a popular, programming-related one.

    Args:
        video: Dict with the keys title, description and view_count.

    Returns:
        True when the video has at least 100,000 views and its title or
        description mentions one of the programming keywords.
    """
    import re

    title = video['title'].lower()
    description = video['description'].lower()
    programming_keywords = [
        'coding', 'programming', 'javascript', 'python', 'react', 'web development',
        'tutorial', 'learn', 'build', 'create', 'app', 'website', 'algorithm', 'ai'
    ]
    if video['view_count'] < 100000:
        return False

    def mentions(keyword: str) -> bool:
        # Very short keywords such as 'ai' appear inside countless unrelated
        # words ('rain', 'explained') and would let almost any video through,
        # so they must stand alone as a word. Longer keywords keep substring
        # matching so that 'learn' still matches 'learning'.
        if len(keyword) <= 2:
            pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
            return any(re.search(pattern, text) for text in (title, description))
        return keyword in title or keyword in description

    return any(mentions(keyword) for keyword in programming_keywords)

43
src/youtube/search.py Normal file
View File

@ -0,0 +1,43 @@
import requests
from typing import List, Dict
def search_youtube_videos_by_query(api_key: str, query: str, max_results: int) -> List[str]:
    """Search YouTube for videos matching ``query`` and return their IDs.

    Best effort: request problems are printed and produce an empty list
    instead of an exception.

    Args:
        api_key: YouTube Data API key.
        query: Search text.
        max_results: Value sent as the ``maxResults`` request parameter.

    Returns:
        The video IDs (strings) of the returned search items.
    """
    search_url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        'key': api_key,
        'q': query,
        'part': 'snippet',
        'type': 'video',
        'order': 'viewCount',
        'maxResults': max_results,
        'videoCategoryId': '28',
        'publishedAfter': '2020-01-01T00:00:00Z'
    }
    try:
        # Without a timeout a stalled connection would block the app forever;
        # 10 seconds is an arbitrary but generous limit.
        response = requests.get(search_url, params=params, timeout=10)
        # Surface HTTP errors (bad key, exhausted quota, ...) instead of
        # treating the error payload as "no results".
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"Error searching videos: {e}")
        return []

    items = data.get('items', []) if isinstance(data, dict) else []
    # Skip items that carry no video ID rather than losing every result.
    return [
        item['id']['videoId']
        for item in items
        if isinstance(item, dict)
        and isinstance(item.get('id'), dict)
        and 'videoId' in item['id']
    ]
def get_coding_search_queries() -> List[str]:
    """Return the ten predefined search phrases in their fixed order."""
    general_queries = [
        "coding tutorial millions views",
        "programming challenge viral",
        "I built app hours",
        "learn programming beginner",
        "coding project from scratch",
    ]
    topic_queries = [
        "AI coding tutorial",
        "web development crash course",
        "javascript tutorial millions",
        "python tutorial viral",
        "coding in 24 hours",
    ]
    return general_queries + topic_queries

12
src/youtube/utils.py Normal file
View File

@ -0,0 +1,12 @@
from typing import List, Dict
def remove_duplicate_videos(videos: List[Dict]) -> List[Dict]:
    """Drop videos whose ``id`` was already seen, keeping the first occurrence.

    The relative order of the remaining videos is unchanged.
    """
    first_by_id = {}
    for video in videos:
        # setdefault only stores the video when its id is new.
        first_by_id.setdefault(video['id'], video)
    return list(first_by_id.values())