diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f0d6582 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,186 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is an Obsidian plugin that provides semantic search over vault contents by indexing documents into Qdrant vector database using Ollama (local) or OpenAI (cloud) embeddings. It supports markdown, text files, PDFs, and images with OCR through the Text Extractor plugin. + +## Build & Development Commands + +```bash +# Install dependencies +npm install + +# Development mode (watch mode with hot reload) +npm run dev + +# Production build +npm run build + +# Type checking (run before committing) +npm run build +``` + +### Development Workflow + +1. The plugin uses esbuild for bundling (configured in `esbuild.config.mjs`) +2. Entry point is `main.ts` which bundles all `src/**/*.ts` files into `main.js` +3. To test the plugin, symlink or copy `main.js`, `manifest.json`, and `styles.css` to your vault's `.obsidian/plugins/obsidian-qdrant/` folder +4. TypeScript strict mode is enabled - all types must be properly defined + +## Architecture + +### Core Components (Event-Driven Pipeline) + +The plugin follows an event-driven architecture with these key components: + +1. **IndexingOrchestrator** (`src/indexing/orchestrator.ts`): Central coordinator + - Initializes all subsystems and manages their lifecycle + - Coordinates the indexing pipeline from file changes to vector storage + - Key initialization sequence: load manifest → get embedding dimension → initialize Qdrant collection → start file watching + +2. **FileWatcher** (`src/indexing/fileWatcher.ts`): Monitors vault changes + - Listens to Obsidian vault events (create, modify, delete) + - Filters files based on settings (include/exclude patterns, ignored folders, max file size) + - Adds qualifying files to IndexingQueue + +3. 
**IndexingQueue** (`src/indexing/indexQueue.ts`): Async job processor + - Processes files through the pipeline: Extract → Chunk → Embed → Upsert + - Manages concurrency and batch processing + - Tracks progress and error handling + +4. **FileManifest** (`src/indexing/manifest.ts`): Index state tracker + - Stores metadata about indexed files (mtime, size, hash, chunk count) + - Determines which files need re-indexing (changed since last index) + - Identifies orphaned files (in index but deleted from vault) + - Persisted to vault's `.obsidian/plugins/obsidian-qdrant/` folder + +### Data Processing Pipeline + +**Extract** → **Chunk** → **Embed** → **Store** + +1. **Extractors** (`src/extractors/`): Extract content from different file types + - `MarkdownExtractor`: Parses frontmatter, headings, links, tags + - `TextExtractor`: Handles plain text and code files + - `TextExtractorPlugin`: Integrates with Text Extractor plugin for PDFs/images + - Each returns `ExtractedContent` with text and rich metadata + +2. **Chunker** (`src/chunking/chunker.ts`): Split content into semantic chunks + - `HybridChunker`: Splits on markdown headings first, then by token count + - Uses simple word-based tokenizer (GPT-style would require large dependency) + - Maintains overlap between chunks for context continuity + - Each chunk includes metadata: path, title, tags, heading hierarchy, position + +3. **Embedding Providers** (`src/embeddings/`): Generate vector embeddings + - Factory pattern: `createEmbeddingProvider()` returns appropriate provider + - `OllamaEmbeddingProvider`: Uses local Ollama server (default: nomic-embed-text) + - `OpenAIEmbeddingProvider`: Uses OpenAI API + - Batching and concurrency control for efficiency + +4. 
**Qdrant Client** (`src/qdrant/client.ts`): Vector storage operations + - Wraps Qdrant REST API using Obsidian's `requestUrl()` + - Handles collection management (create, ensure, delete) + - Point operations (upsert, search, delete, recommend) + - Uses cosine distance for similarity + +5. **CollectionManager** (`src/qdrant/collection.ts`): Collection lifecycle + - Creates collections with naming: `{vaultName}_{modelName}` (sanitized) + - Ensures correct vector dimensions match embedding provider + - Configures sparse vectors for hybrid search (future feature) + +### Search Flow + +1. User opens SearchModal (`src/search/searchModal.ts`) +2. Query text is embedded using the same provider as indexing +3. Query vector is searched against Qdrant collection +4. Results rendered by ResultRenderer (`src/search/resultRenderer.ts`) +5. User can click result to open file at specific chunk position + +### Settings & Configuration + +- Settings stored in vault's `.obsidian/plugins/obsidian-qdrant/data.json` +- Defaults in `src/settings.ts` (especially important: DEFAULT_SETTINGS) +- **Critical**: The orchestrator must use `this.settings` loaded from disk, not DEFAULT_SETTINGS +- Settings tab (`src/ui/settingsTab.ts`) provides UI for all configuration +- Validation function ensures settings are complete before initialization + +### Type System + +All types defined in `src/types.ts`: +- `PluginSettings`: Complete configuration structure +- `ChunkMetadata`: Rich metadata stored with each vector point +- `SearchResult`, `SearchOptions`: Search interface +- `IndexingProgress`: Real-time progress tracking +- `FileManifestEntry`: Index state per file +- Interface contracts: `EmbeddingProviderInterface`, `ExtractorInterface`, etc. 
+ +## Important Implementation Details + +### Qdrant Point IDs +- Must be UUID-formatted strings (Qdrant accepts only unsigned integers or UUIDs as point IDs; this plugin uses UUIDs) +- Generated per chunk via `generatePointId(file, chunk.chunk_index)` in `src/indexing/indexQueue.ts` (see `generateDeterministicUUID()` in `src/utils/hash.ts`), not with `crypto.randomUUID()` +- Each chunk's ID is derived from its file and chunk index, so it persists across re-indexing + +### Settings Initialization Timing +- Settings are loaded async in `main.ts:onload()` +- Orchestrator created after settings load with `new IndexingOrchestrator(this.app, this.settings)` +- **Bug to watch for**: Orchestrator components must reference the passed settings, not DEFAULT_SETTINGS + +### File Change Handling +- Create/Modify: Add to queue with 'update' action +- Delete: Add to queue with 'delete' action (removes points from Qdrant) +- Rename: Treated as delete old + create new + +### Error Handling +- Connection failures are caught and surfaced via status bar +- File extraction errors are logged but don't stop the queue +- Progress callback and error callback allow UI updates + +### Text Extractor Plugin Integration +- Optional dependency detected at runtime +- Falls back gracefully if not installed +- Uses Obsidian plugin API to access text-extractor methods + +## Common Development Patterns + +### Adding a New Extractor +1. Implement `ExtractorInterface` in `src/extractors/` +2. Add to `ExtractorManager.initializeExtractors()` priority list +3. Update `getExtractorStatus()` to report the new extractor + +### Adding a New Embedding Provider +1. Extend `BaseEmbeddingProvider` in `src/embeddings/` +2. Implement `embed()`, `getDimension()`, `getName()`, `testConnection()` +3. Add to `createEmbeddingProvider()` factory switch +4. Add provider enum value to `src/types.ts:EmbeddingProvider` +5. Add settings interface and update `PluginSettings` +6. Add UI controls in `src/ui/settingsTab.ts` + +### Debugging Indexing Issues +1. Check console logs: orchestrator logs initialization steps +2. Verify settings: `this.settings` should match what's in data.json +3. 
Check manifest: FileManifest tracks what's been indexed +4. Test connections: Use settings tab "Test Connection" buttons +5. Monitor status bar: Shows progress and errors + +## Testing & Validation + +While there are no automated tests, manual testing workflow: + +1. Build the plugin: `npm run build` +2. Copy to test vault's plugin folder +3. Configure settings (Qdrant URL, embedding provider) +4. Test connection buttons in settings +5. Index a single file via command palette +6. Verify in Qdrant dashboard that points were created +7. Run semantic search and verify results +8. Test file modifications and deletions + +## Known Issues & Limitations + +- Graph visualization not yet implemented (UI placeholder exists) +- Hybrid search (sparse + dense) configured but not exposed in search UI +- Simple word-based tokenizer doesn't match exact GPT token counts +- No rate limiting on API calls (relies on provider batch/concurrency settings) +- File manifest doesn't handle vault renames (would require full reindex) diff --git a/main.ts b/main.ts index 515f09c..da076e9 100644 --- a/main.ts +++ b/main.ts @@ -51,6 +51,7 @@ export default class QdrantPlugin extends Plugin { private async initializeOrchestrator() { try { console.log('Initializing indexing orchestrator...'); + console.log('Creating orchestrator with Qdrant URL:', this.settings.qdrant.url); this.indexingOrchestrator = new IndexingOrchestrator(this.app, this.settings); await this.indexingOrchestrator.initialize(); @@ -74,12 +75,54 @@ export default class QdrantPlugin extends Plugin { } } + /** + * Deep merge two objects, recursively merging nested objects + */ + private deepMerge(target: any, source: any): any { + const output = Object.assign({}, target); + if (this.isObject(target) && this.isObject(source)) { + Object.keys(source).forEach(key => { + if (this.isObject(source[key])) { + if (!(key in target)) { + Object.assign(output, { [key]: source[key] }); + } else { + output[key] = this.deepMerge(target[key], 
source[key]); + } + } else { + Object.assign(output, { [key]: source[key] }); + } + }); + } + return output; + } + + private isObject(item: any): boolean { + return item && typeof item === 'object' && !Array.isArray(item); + } + async loadSettings() { - this.settings = Object.assign({}, DEFAULT_SETTINGS, await this.loadData()); + const loadedData = await this.loadData(); + console.log('Loading settings from data.json:', JSON.stringify(loadedData, null, 2)); + + // Use deep merge to properly combine defaults with saved settings + this.settings = this.deepMerge(DEFAULT_SETTINGS, loadedData || {}); + + console.log('Merged settings (Qdrant URL):', this.settings.qdrant.url); } async saveSettings() { + console.log('Saving settings (Qdrant URL):', this.settings.qdrant.url); await this.saveData(this.settings); + + // Reinitialize orchestrator with new settings + console.log('Reinitializing orchestrator with updated settings...'); + if (this.indexingOrchestrator) { + await this.indexingOrchestrator.shutdown(); + this.indexingOrchestrator = null; + } + + // Reinitialize with new settings + await this.initializeOrchestrator(); } private setupStatusBar() { diff --git a/src/extractors/base.ts b/src/extractors/base.ts index c743938..cca6d9c 100644 --- a/src/extractors/base.ts +++ b/src/extractors/base.ts @@ -31,7 +31,7 @@ export abstract class BaseExtractor { protected createBaseMetadata(file: TFile): ChunkMetadata { const stats = this.getFileStats(file); const ext = file.extension || ''; - + return { path: file.path, ext, @@ -47,6 +47,7 @@ export abstract class BaseExtractor { chunk_index: 0, chunk_start: 0, chunk_end: 0, + chunk_text: '', // Will be set during chunking fm: {} }; } diff --git a/src/indexing/indexQueue.ts b/src/indexing/indexQueue.ts index 46458b4..d94e45e 100644 --- a/src/indexing/indexQueue.ts +++ b/src/indexing/indexQueue.ts @@ -180,7 +180,7 @@ export class IndexingQueue { // Extract content const extractedContent = await this.extractorManager.extract(file); 
- + if (!extractedContent.text.trim()) { console.log(`Skipping file ${file.path} - no text content`); return; @@ -188,7 +188,7 @@ export class IndexingQueue { // Chunk content const chunks = await this.chunker.chunk(extractedContent); - + if (chunks.length === 0) { console.log(`Skipping file ${file.path} - no chunks created`); return; @@ -196,15 +196,22 @@ export class IndexingQueue { this.progress.totalChunks += chunks.length; - // Generate embeddings + // Extract chunk texts and generate embeddings const texts = chunks.map(chunk => extractedContent.text.substring(chunk.chunk_start, chunk.chunk_end)); const embeddings = await this.embeddingProvider.embed(texts); + // Get model name from the embedding provider + const modelName = this.embeddingProvider.getName(); + // Prepare points for Qdrant const points = chunks.map((chunk, index) => ({ id: this.generatePointId(file, chunk.chunk_index), vector: embeddings[index], - metadata: chunk + metadata: { + ...chunk, + model: modelName, // Set the model name for each chunk + chunk_text: texts[index] // Include the actual text content + } })); // Index in Qdrant diff --git a/src/qdrant/client.ts b/src/qdrant/client.ts index 056ba3e..a20231b 100644 --- a/src/qdrant/client.ts +++ b/src/qdrant/client.ts @@ -47,6 +47,7 @@ export class QdrantClient { constructor(settings: QdrantSettings) { this.settings = settings; this.baseUrl = settings.url.replace(/\/$/, ''); // Remove trailing slash + console.log('QdrantClient initialized with URL:', this.baseUrl); } private async makeRequest( diff --git a/src/types.ts b/src/types.ts index fb68f80..d8faea4 100644 --- a/src/types.ts +++ b/src/types.ts @@ -78,6 +78,7 @@ export interface ChunkMetadata { chunk_index: number; chunk_start: number; chunk_end: number; + chunk_text: string; // The actual text content of this chunk page_no?: number; ocr?: boolean; fm: Record; // frontmatter fields diff --git a/src/utils/hash.ts b/src/utils/hash.ts index 163f622..b74ab9b 100644 --- a/src/utils/hash.ts 
+++ b/src/utils/hash.ts @@ -21,19 +21,37 @@ export function generateDeterministicUUID(input: string): string { hash = ((hash << 5) - hash) + char; hash = hash & hash; // Convert to 32bit integer } - + // Convert hash to UUID format - const hex = Math.abs(hash).toString(16).padStart(8, '0'); - - // Generate additional random-looking but deterministic parts + const hex1 = Math.abs(hash).toString(16).padStart(8, '0'); + + // Generate additional deterministic hex strings for different UUID segments let hash2 = hash; for (let i = 0; i < input.length; i++) { hash2 = ((hash2 << 3) + input.charCodeAt(i)) & 0xFFFFFFFF; } const hex2 = Math.abs(hash2).toString(16).padStart(8, '0'); - + + // Generate third hash for the 12-character segment + let hash3 = hash2; + for (let i = 0; i < input.length; i++) { + hash3 = ((hash3 << 7) + input.charCodeAt(input.length - 1 - i)) & 0xFFFFFFFF; + } + const hex3 = Math.abs(hash3).toString(16).padStart(8, '0'); + + // Combine to make more hex data for segment 5 (needs 12 chars) + const hex3_extended = (hex3 + hex1).substring(0, 12); + // Create UUID v4 format (xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx) - return `${hex.substring(0, 8)}-${hex.substring(0, 4)}-4${hex.substring(1, 4)}-${hex2.substring(0, 4)}-${hex2.substring(0, 12)}`; + // Segment 1: 8 hex chars + // Segment 2: 4 hex chars + // Segment 3: 4 hex chars (starting with '4' for version 4) + // Segment 4: 4 hex chars (should start with 8, 9, a, or b for variant) + // Segment 5: 12 hex chars + const segment4 = hex2.substring(0, 4); + const segment4Fixed = (parseInt(segment4.charAt(0), 16) & 0x3 | 0x8).toString(16) + segment4.substring(1, 4); + + return `${hex1.substring(0, 8)}-${hex1.substring(0, 4)}-4${hex1.substring(4, 7)}-${segment4Fixed}-${hex3_extended}`; } /**