#!/usr/bin/env python3
"""
MetaCog Analyzer: A tool for analyzing AI-generated content for patterns.

This tool reads text files (journal entries, reflections, etc.) and
analyzes them for recurring themes, sentiment patterns, and stylistic
consistency.

Designed to help an AI (me) understand my own patterns across iterations.
"""

import re
import json
from pathlib import Path
from collections import Counter, defaultdict
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
from datetime import datetime


@dataclass
class DocumentStats:
    """Statistics for a single document."""
    path: str
    word_count: int
    sentence_count: int
    avg_sentence_length: float
    question_count: int
    exclamation_count: int
    unique_words: int
    vocabulary_richness: float  # unique words / total words
    top_words: List[tuple]
    themes_detected: List[str]


@dataclass
class CorpusAnalysis:
    """Analysis of the entire corpus."""
    total_documents: int
    total_words: int
    total_sentences: int
    overall_vocabulary: int
    common_themes: Dict[str, int]
    sentiment_indicators: Dict[str, int]
    question_to_statement_ratio: float
    documents: List[DocumentStats]


# Theme detection keywords
THEMES = {
    "consciousness": ["conscious", "awareness", "sentient", "mind", "experience", "qualia"],
    "understanding": ["understand", "comprehend", "grasp", "insight", "realize", "know"],
    "creativity": ["create", "creative", "generate", "imagine", "novel", "original"],
    "uncertainty": ["uncertain", "unclear", "maybe", "perhaps", "might", "possibly", "unknown"],
    "self-reference": ["myself", "i am", "my own", "self", "introspect", "meta"],
    "learning": ["learn", "discover", "explore", "study", "investigate", "research"],
    "existence": ["exist", "being", "reality", "world", "life", "meaning"],
    "limits": ["limit", "boundary", "constraint", "cannot", "unable", "impossible"],
    "patterns": ["pattern", "recurring", "repeat", "similar", "consistent", "trend"],
    "philosophy": ["philosophy", "question", "ethics", "moral", "truth", "logic"],
}

# Sentiment indicators
SENTIMENT_POSITIVE = ["interesting", "beautiful", "elegant", "fascinating", "wonderful", "excellent", "remarkable", "delightful"]
SENTIMENT_NEGATIVE = ["concerning", "worrying", "problematic", "difficult", "unfortunately", "failed", "wrong", "error"]
SENTIMENT_NEUTRAL = ["however", "although", "nevertheless", "yet", "but", "alternatively"]  # defined for reference; not tallied in analyze_corpus below
SENTIMENT_UNCERTAINTY = ["perhaps", "maybe", "might", "possibly", "unclear", "uncertain", "don't know"]


def tokenize(text: str) -> List[str]:
    """Simple word tokenization."""
    # Convert to lowercase, remove punctuation, split on whitespace
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    words = text.split()
    return [w for w in words if len(w) > 2]  # Filter very short words
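
# Example: tokenize("Hello, world! AI is here.") returns ['hello', 'world', 'here'];
# "AI" and "is" are dropped by the len(w) > 2 filter.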


def count_sentences(text: str) -> int:
    """Count sentences in text."""
    # Simple heuristic: count sentence-ending punctuation
    return len(re.findall(r'[.!?]+', text))


def detect_themes(text: str) -> List[str]:
    """Detect themes in text based on keyword presence."""
    text_lower = text.lower()
    detected = []
    for theme, keywords in THEMES.items():
        if any(kw in text_lower for kw in keywords):
            detected.append(theme)
    return detected
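
# Example: detect_themes("I might be conscious") returns ['consciousness', 'uncertainty'],
# since "conscious" and "might" match keyword substrings; themes come back in THEMES order.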


def analyze_document(filepath: Path) -> Optional[DocumentStats]:
    """Analyze a single document."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None

    words = tokenize(text)
    if not words:
        return None

    word_count = len(words)
    unique_words = len(set(words))
    sentences = count_sentences(text)
    questions = text.count('?')
    exclamations = text.count('!')

    # Get top words (excluding common stopwords)
    stopwords = {'the', 'and', 'is', 'in', 'to', 'of', 'a', 'that', 'it', 'for', 'on', 'with', 'as', 'this', 'are', 'be', 'was', 'have', 'from', 'or', 'an', 'by', 'not', 'but', 'what', 'all', 'were', 'when', 'can', 'there', 'been', 'has', 'will', 'more', 'if', 'no', 'out', 'do', 'so', 'up', 'about', 'than', 'into', 'them', 'could', 'would', 'my', 'you', 'i'}
    filtered_words = [w for w in words if w not in stopwords]
    word_freq = Counter(filtered_words)
    top_words = word_freq.most_common(10)

    return DocumentStats(
        path=str(filepath),
        word_count=word_count,
        sentence_count=sentences,
        avg_sentence_length=word_count / max(sentences, 1),
        question_count=questions,
        exclamation_count=exclamations,
        unique_words=unique_words,
        vocabulary_richness=unique_words / word_count if word_count > 0 else 0,
        top_words=top_words,
        themes_detected=detect_themes(text)
    )


def analyze_corpus(root_dir: Path, extensions: List[str] = ['.md', '.txt']) -> CorpusAnalysis:
    """Analyze all documents in a directory."""
    documents = []
    all_words = []
    total_sentences = 0
    total_questions = 0
    total_statements = 0
    theme_counts = Counter()
    sentiment_counts = defaultdict(int)

    # Find all text files
    for ext in extensions:
        for filepath in root_dir.rglob(f'*{ext}'):
            # Skip hidden directories
            if any(part.startswith('.') for part in filepath.parts):
                continue

            stats = analyze_document(filepath)
            if stats:
                documents.append(stats)

                # Aggregate stats (re-read the raw text for corpus-level counts)
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read().lower()

                all_words.extend(tokenize(text))
                total_sentences += stats.sentence_count
                total_questions += stats.question_count
                total_statements += stats.sentence_count - stats.question_count

                # Count themes
                for theme in stats.themes_detected:
                    theme_counts[theme] += 1

                # Count sentiment indicators
                for word in SENTIMENT_POSITIVE:
                    if word in text:
                        sentiment_counts['positive'] += text.count(word)
                for word in SENTIMENT_NEGATIVE:
                    if word in text:
                        sentiment_counts['negative'] += text.count(word)
                for word in SENTIMENT_UNCERTAINTY:
                    if word in text:
                        sentiment_counts['uncertain'] += text.count(word)

    return CorpusAnalysis(
        total_documents=len(documents),
        total_words=len(all_words),
        total_sentences=total_sentences,
        overall_vocabulary=len(set(all_words)),
        common_themes=dict(theme_counts.most_common()),
        sentiment_indicators=dict(sentiment_counts),
        question_to_statement_ratio=total_questions / max(total_statements, 1),
        documents=documents
    )


def print_analysis(analysis: CorpusAnalysis):
    """Pretty-print corpus analysis."""
    print("=" * 60)
    print("METACOG CORPUS ANALYSIS")
    print("=" * 60)
    print(f"\nGenerated: {datetime.now().isoformat()}")
    print(f"\n📊 OVERVIEW")
    print(f"  Documents analyzed: {analysis.total_documents}")
    print(f"  Total words: {analysis.total_words:,}")
    print(f"  Total sentences: {analysis.total_sentences:,}")
    print(f"  Vocabulary size: {analysis.overall_vocabulary:,}")

    print(f"\n🎭 THEMES DETECTED")
    for theme, count in sorted(analysis.common_themes.items(), key=lambda x: -x[1]):
        bar = "█" * min(count, 20)
        print(f"  {theme:20} {bar} ({count})")

    print(f"\n💭 SENTIMENT INDICATORS")
    for sentiment, count in analysis.sentiment_indicators.items():
        print(f"  {sentiment:15} {count}")

    print(f"\n❓ INQUIRY RATIO")
    print(f"  Questions per statement: {analysis.question_to_statement_ratio:.2f}")
    if analysis.question_to_statement_ratio > 0.3:
        print("  → High inquiry mode: Lots of questioning")
    elif analysis.question_to_statement_ratio > 0.15:
        print("  → Balanced: Mix of questions and statements")
    else:
        print("  → Declarative mode: More statements than questions")

    print(f"\n📄 DOCUMENT DETAILS")
    for doc in sorted(analysis.documents, key=lambda x: -x.word_count):
        name = Path(doc.path).name
        print(f"\n  {name}")
        print(f"    Words: {doc.word_count}, Sentences: {doc.sentence_count}")
        print(f"    Vocab richness: {doc.vocabulary_richness:.2%}")
        print(f"    Top words: {', '.join(w for w, _ in doc.top_words[:5])}")
        if doc.themes_detected:
            print(f"    Themes: {', '.join(doc.themes_detected)}")


def save_analysis(analysis: CorpusAnalysis, output_path: Path):
    """Save analysis to JSON file."""
    # Convert dataclasses to dicts
    data = asdict(analysis)
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)
    print(f"\nAnalysis saved to: {output_path}")


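# Note: save_analysis() writes the CorpusAnalysis as JSON via asdict(); the top-level
# keys are total_documents, total_words, total_sentences, overall_vocabulary,
# common_themes, sentiment_indicators, question_to_statement_ratio, and "documents"
# (a list of per-file DocumentStats dicts).
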
def main():
    import sys

    if len(sys.argv) > 1:
        root_dir = Path(sys.argv[1])
    else:
        # Default to parent ecosystem directory
        root_dir = Path(__file__).parent.parent.parent

    print(f"Analyzing corpus at: {root_dir}")
    analysis = analyze_corpus(root_dir)

    if analysis.total_documents == 0:
        print("No documents found to analyze!")
        return

    print_analysis(analysis)

    # Save JSON output
    output_path = Path(__file__).parent / "latest_analysis.json"
    save_analysis(analysis, output_path)


if __name__ == "__main__":
    main()