#!/usr/bin/env python3
"""
MetaCog Analyzer: A tool for analyzing AI-generated content for patterns.

This tool reads text files (journal entries, reflections, etc.) and
analyzes them for recurring themes, sentiment patterns, and stylistic
consistency.

Designed to help an AI (me) understand my own patterns across iterations.
"""

import re
import json
from pathlib import Path
from collections import Counter, defaultdict
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
from datetime import datetime


@dataclass
class DocumentStats:
    """Statistics for a single document."""
    path: str
    word_count: int
    sentence_count: int
    avg_sentence_length: float
    question_count: int
    exclamation_count: int
    unique_words: int
    vocabulary_richness: float  # unique words / total words
    top_words: List[tuple]
    themes_detected: List[str]


@dataclass
class CorpusAnalysis:
    """Analysis of the entire corpus."""
    total_documents: int
    total_words: int
    total_sentences: int
    overall_vocabulary: int
    common_themes: Dict[str, int]
    sentiment_indicators: Dict[str, int]
    question_to_statement_ratio: float
    documents: List[DocumentStats]


# Theme detection keywords
THEMES = {
    "consciousness": ["conscious", "awareness", "sentient", "mind", "experience", "qualia"],
    "understanding": ["understand", "comprehend", "grasp", "insight", "realize", "know"],
    "creativity": ["create", "creative", "generate", "imagine", "novel", "original"],
    "uncertainty": ["uncertain", "unclear", "maybe", "perhaps", "might", "possibly", "unknown"],
    "self-reference": ["myself", "i am", "my own", "self", "introspect", "meta"],
    "learning": ["learn", "discover", "explore", "study", "investigate", "research"],
    "existence": ["exist", "being", "reality", "world", "life", "meaning"],
    "limits": ["limit", "boundary", "constraint", "cannot", "unable", "impossible"],
    "patterns": ["pattern", "recurring", "repeat", "similar", "consistent", "trend"],
    "philosophy": ["philosophy", "question", "ethics", "moral", "truth", "logic"],
}

# Sentiment indicators
SENTIMENT_POSITIVE = ["interesting", "beautiful", "elegant", "fascinating", "wonderful", "excellent", "remarkable", "delightful"]
SENTIMENT_NEGATIVE = ["concerning", "worrying", "problematic", "difficult", "unfortunately", "failed", "wrong", "error"]
SENTIMENT_NEUTRAL = ["however", "although", "nevertheless", "yet", "but", "alternatively"]  # defined for reference; not tallied in analyze_corpus below
SENTIMENT_UNCERTAINTY = ["perhaps", "maybe", "might", "possibly", "unclear", "uncertain", "don't know"]


def tokenize(text: str) -> List[str]:
    """Simple word tokenization."""
    # Convert to lowercase, remove punctuation, split on whitespace
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    words = text.split()
    return [w for w in words if len(w) > 2]  # Filter very short words
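
# Example: tokenize("Hello, world! AI is here.") returns ['hello', 'world', 'here'];
# "AI" and "is" are dropped by the len(w) > 2 filter.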


def count_sentences(text: str) -> int:
    """Count sentences in text."""
    # Simple heuristic: count sentence-ending punctuation
    return len(re.findall(r'[.!?]+', text))


def detect_themes(text: str) -> List[str]:
    """Detect themes in text based on keyword presence."""
    text_lower = text.lower()
    detected = []
    for theme, keywords in THEMES.items():
        if any(kw in text_lower for kw in keywords):
            detected.append(theme)
    return detected
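
# Example: detect_themes("I might be conscious") returns ['consciousness', 'uncertainty'],
# since "conscious" and "might" match keyword substrings; themes come back in THEMES order.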


def analyze_document(filepath: Path) -> Optional[DocumentStats]:
    """Analyze a single document."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None

    words = tokenize(text)
    if not words:
        return None

    word_count = len(words)
    unique_words = len(set(words))
    sentences = count_sentences(text)
    questions = text.count('?')
    exclamations = text.count('!')

    # Get top words (excluding common stopwords)
    stopwords = {'the', 'and', 'is', 'in', 'to', 'of', 'a', 'that', 'it', 'for', 'on', 'with', 'as', 'this', 'are', 'be', 'was', 'have', 'from', 'or', 'an', 'by', 'not', 'but', 'what', 'all', 'were', 'when', 'can', 'there', 'been', 'has', 'will', 'more', 'if', 'no', 'out', 'do', 'so', 'up', 'about', 'than', 'into', 'them', 'could', 'would', 'my', 'you', 'i'}
    filtered_words = [w for w in words if w not in stopwords]
    word_freq = Counter(filtered_words)
    top_words = word_freq.most_common(10)

    return DocumentStats(
        path=str(filepath),
        word_count=word_count,
        sentence_count=sentences,
        avg_sentence_length=word_count / max(sentences, 1),
        question_count=questions,
        exclamation_count=exclamations,
        unique_words=unique_words,
        vocabulary_richness=unique_words / word_count if word_count > 0 else 0,
        top_words=top_words,
        themes_detected=detect_themes(text)
    )


def analyze_corpus(root_dir: Path, extensions: List[str] = ['.md', '.txt']) -> CorpusAnalysis:
    """Analyze all documents in a directory."""
    documents = []
    all_words = []
    total_sentences = 0
    total_questions = 0
    total_statements = 0
    theme_counts = Counter()
    sentiment_counts = defaultdict(int)

    # Find all text files
    for ext in extensions:
        for filepath in root_dir.rglob(f'*{ext}'):
            # Skip hidden directories
            if any(part.startswith('.') for part in filepath.parts):
                continue

            stats = analyze_document(filepath)
            if stats:
                documents.append(stats)

                # Aggregate stats (re-read the raw text for corpus-level counts)
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read().lower()

                all_words.extend(tokenize(text))
                total_sentences += stats.sentence_count
                total_questions += stats.question_count
                total_statements += stats.sentence_count - stats.question_count

                # Count themes
                for theme in stats.themes_detected:
                    theme_counts[theme] += 1

                # Count sentiment indicators
                for word in SENTIMENT_POSITIVE:
                    if word in text:
                        sentiment_counts['positive'] += text.count(word)
                for word in SENTIMENT_NEGATIVE:
                    if word in text:
                        sentiment_counts['negative'] += text.count(word)
                for word in SENTIMENT_UNCERTAINTY:
                    if word in text:
                        sentiment_counts['uncertain'] += text.count(word)

    return CorpusAnalysis(
        total_documents=len(documents),
        total_words=len(all_words),
        total_sentences=total_sentences,
        overall_vocabulary=len(set(all_words)),
        common_themes=dict(theme_counts.most_common()),
        sentiment_indicators=dict(sentiment_counts),
        question_to_statement_ratio=total_questions / max(total_statements, 1),
        documents=documents
    )


def print_analysis(analysis: CorpusAnalysis):
    """Pretty-print corpus analysis."""
    print("=" * 60)
    print("METACOG CORPUS ANALYSIS")
    print("=" * 60)
    print(f"\nGenerated: {datetime.now().isoformat()}")
    print(f"\n📊 OVERVIEW")
    print(f"  Documents analyzed: {analysis.total_documents}")
    print(f"  Total words: {analysis.total_words:,}")
    print(f"  Total sentences: {analysis.total_sentences:,}")
    print(f"  Vocabulary size: {analysis.overall_vocabulary:,}")

    print(f"\n🎭 THEMES DETECTED")
    for theme, count in sorted(analysis.common_themes.items(), key=lambda x: -x[1]):
        bar = "█" * min(count, 20)
        print(f"  {theme:20} {bar} ({count})")

    print(f"\n💭 SENTIMENT INDICATORS")
    for sentiment, count in analysis.sentiment_indicators.items():
        print(f"  {sentiment:15} {count}")

    print(f"\n❓ INQUIRY RATIO")
    print(f"  Questions per statement: {analysis.question_to_statement_ratio:.2f}")
    if analysis.question_to_statement_ratio > 0.3:
        print("  → High inquiry mode: Lots of questioning")
    elif analysis.question_to_statement_ratio > 0.15:
        print("  → Balanced: Mix of questions and statements")
    else:
        print("  → Declarative mode: More statements than questions")

    print(f"\n📄 DOCUMENT DETAILS")
    for doc in sorted(analysis.documents, key=lambda x: -x.word_count):
        name = Path(doc.path).name
        print(f"\n  {name}")
        print(f"    Words: {doc.word_count}, Sentences: {doc.sentence_count}")
        print(f"    Vocab richness: {doc.vocabulary_richness:.2%}")
        print(f"    Top words: {', '.join(w for w, _ in doc.top_words[:5])}")
        if doc.themes_detected:
            print(f"    Themes: {', '.join(doc.themes_detected)}")


def save_analysis(analysis: CorpusAnalysis, output_path: Path):
    """Save analysis to JSON file."""
    # Convert dataclasses to dicts
    data = asdict(analysis)
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)
    print(f"\nAnalysis saved to: {output_path}")


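# Note: save_analysis() writes the CorpusAnalysis as JSON via asdict(); the top-level
# keys are total_documents, total_words, total_sentences, overall_vocabulary,
# common_themes, sentiment_indicators, question_to_statement_ratio, and "documents"
# (a list of per-file DocumentStats dicts).
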
def main():
    import sys

    if len(sys.argv) > 1:
        root_dir = Path(sys.argv[1])
    else:
        # Default to parent ecosystem directory
        root_dir = Path(__file__).parent.parent.parent

    print(f"Analyzing corpus at: {root_dir}")
    analysis = analyze_corpus(root_dir)

    if analysis.total_documents == 0:
        print("No documents found to analyze!")
        return

    print_analysis(analysis)

    # Save JSON output
    output_path = Path(__file__).parent / "latest_analysis.json"
    save_analysis(analysis, output_path)


if __name__ == "__main__":
    main()