obsidian-qdrant/src/extractors/text.ts.backup
Nicholai 92f49f4bf7 Summary
 What's Working:
Plugin loads successfully in Obsidian
Settings are being saved correctly to disk
Qdrant server is accessible and responding
Ollama is set up with the embedding model
UUID generation fixed for Qdrant compatibility
 Main Issue:
Plugin is using default localhost:6333 URL instead of your saved https://vectors.biohazardvfx.com URL
This is a settings initialization timing problem
🎯 Next Step:
Fix the IndexingOrchestrator to use the loaded settings instead of defaults
This is likely a simple fix - the orchestrator needs to reference this.settings that were loaded from data.json
Progress: ~95% complete - just need to fix this one settings issue and then test the full indexing + search workflow!
2025-10-23 09:24:03 -06:00

256 lines
8.2 KiB
Plaintext

import { TFile } from 'obsidian';
import { BaseExtractor } from './base';
import { ExtractedContent } from '../types';
/**
 * Extractor for plain-text and source-code files.
 *
 * The file content is returned verbatim as the extracted text. For files whose
 * extension identifies a known programming language, a lightweight regex scan
 * additionally records the names of functions, classes/types and imported
 * modules under `metadata.fm`.
 *
 * The scan is a heuristic, not a parser: it does not strip comments or string
 * literals, so names that merely look like declarations may be reported.
 */
export class TextExtractor extends BaseExtractor {
  /** File extensions (lower-case, without the dot) this extractor accepts. */
  private readonly supportedExtensions = ['txt', 'js', 'ts', 'json', 'html', 'css', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb', 'sh', 'yml', 'yaml', 'xml'];

  /** Subset of the supported extensions that receive a code-structure scan. */
  private static readonly CODE_EXTENSIONS: readonly string[] = ['js', 'ts', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb'];

  /**
   * Words that can appear in a `word word(` sequence in Java/C-like code
   * without that sequence being a function declaration (e.g. `return foo(`,
   * `new Foo(`, `else if (`).
   */
  private static readonly NON_DECLARATION_WORDS: readonly string[] = [
    'if', 'for', 'while', 'switch', 'catch', 'return', 'new', 'else',
    'throw', 'do', 'sizeof', 'delete', 'case', 'goto'
  ];

  /**
   * Reports whether this extractor accepts the given file.
   *
   * @param file - File to test.
   * @returns `true` when the file's extension (compared case-insensitively) is supported.
   */
  canHandle(file: TFile): boolean {
    return this.supportedExtensions.includes(this.extensionOf(file));
  }

  /**
   * Reads the file and returns its content together with metadata.
   *
   * `getFileContent` and `createBaseMetadata` are not defined in this class;
   * presumably they are inherited from `BaseExtractor`.
   *
   * @param file - File to extract.
   * @returns The raw text, the metadata (enriched with `language`, `functions`,
   *          `classes` and `imports` for code files) and no page numbers.
   */
  async extract(file: TFile): Promise<ExtractedContent> {
    const content = await this.getFileContent(file);
    const metadata = this.createBaseMetadata(file);

    // For code files, record a coarse outline alongside any existing fm data.
    if (this.isCodeFile(file)) {
      const extension = this.extensionOf(file);
      const { functions, classes, imports } = this.extractCodeElements(content, extension);
      metadata.fm = {
        ...metadata.fm,
        language: extension,
        functions,
        classes,
        imports
      };
    }

    return {
      text: content,
      metadata,
      pageNumbers: undefined
    };
  }

  /** Returns the file's extension in lower case, or `''` when it has none. */
  private extensionOf(file: TFile): string {
    return (file.extension || '').toLowerCase();
  }

  /** Reports whether the file's extension is one that gets a code-structure scan. */
  private isCodeFile(file: TFile): boolean {
    return TextExtractor.CODE_EXTENSIONS.includes(this.extensionOf(file));
  }

  /**
   * Dispatches to the language-specific scanner for `extension`.
   *
   * @param content - Full source text.
   * @param extension - File extension without the dot (case-insensitive).
   * @returns Names found, in order of appearance; all three lists are empty
   *          for an extension without a scanner.
   */
  private extractCodeElements(content: string, extension: string): {
    functions: string[];
    classes: string[];
    imports: string[];
  } {
    const functions: string[] = [];
    const classes: string[] = [];
    const imports: string[] = [];

    switch (extension.toLowerCase()) {
      case 'js':
      case 'ts':
        this.extractJSElements(content, functions, classes, imports);
        break;
      case 'py':
        this.extractPythonElements(content, functions, classes, imports);
        break;
      case 'java':
        this.extractJavaElements(content, functions, classes, imports);
        break;
      case 'cpp':
      case 'c':
        this.extractCElements(content, functions, classes, imports);
        break;
      case 'go':
        this.extractGoElements(content, functions, classes, imports);
        break;
      case 'rs':
        this.extractRustElements(content, functions, classes, imports);
        break;
      case 'php':
        this.extractPhpElements(content, functions, classes, imports);
        break;
      case 'rb':
        this.extractRubyElements(content, functions, classes, imports);
        break;
    }

    return { functions, classes, imports };
  }

  /**
   * Calls `visit` once for every match of `pattern` in `content`.
   *
   * @param content - Text to scan.
   * @param pattern - Regular expression; should carry the `g` flag. Without it
   *                  only the first match is visited.
   * @param visit - Callback receiving each match.
   */
  private forEachMatch(
    content: string,
    pattern: RegExp,
    visit: (match: RegExpExecArray) => void
  ): void {
    // Always start at the beginning, even if the regex object was used before.
    pattern.lastIndex = 0;
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(content)) !== null) {
      visit(match);
      // A non-global regex would return the same match forever.
      if (!pattern.global) break;
      // A zero-width match does not advance lastIndex; step past it manually.
      if (match[0] === '') pattern.lastIndex++;
    }
  }

  /**
   * Appends the first non-empty capture group of every match to `target`.
   *
   * @param content - Text to scan.
   * @param pattern - Global regular expression with at least one capture group.
   * @param target - Array that receives the captured names.
   */
  private collectNames(content: string, pattern: RegExp, target: string[]): void {
    this.forEachMatch(content, pattern, (match) => {
      const name = match.slice(1).find((group) => Boolean(group));
      if (name) target.push(name);
    });
  }

  /**
   * Collects names from `type name(` sequences, as used by Java and C-like
   * function declarations, skipping sequences that start with or name a
   * control-flow/expression keyword (`return foo(`, `new Foo(`, `else if (`).
   */
  private collectDeclaredFunctions(content: string, functions: string[]): void {
    const excluded = TextExtractor.NON_DECLARATION_WORDS;
    this.forEachMatch(content, /\b(\w+)\s+(\w+)\s*\(/g, (match) => {
      const leadingWord = match[1];
      const name = match[2];
      if (!leadingWord || !name) return;
      if (excluded.includes(leadingWord) || excluded.includes(name)) return;
      functions.push(name);
    });
  }

  /** Scans JavaScript/TypeScript source for functions, classes and import specifiers. */
  private extractJSElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Three shapes: `function name` / `function* name`, `const|let|var name = (` or
    // `= function`, and `name: (` (object-literal members). The lookahead keeps
    // `function` unconsumed so a named function expression is reported as well.
    this.collectNames(
      content,
      /\bfunction\b\s*\*?\s*(\w+)|\b(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\(|(?=function\b))|(\w+)\s*:\s*(?:async\s+)?\(/g,
      functions
    );

    this.collectNames(content, /\bclass\s+(\w+)/g, classes);

    // The import clause may span several lines; restricting it to identifier,
    // brace, comma, star and whitespace characters keeps it from running into
    // unrelated code.
    this.collectNames(
      content,
      /\bimport\s+(?:[\w$\s{},*]*?\s+from\s+)?['"]([^'"]+)['"]/g,
      imports
    );
  }

  /** Scans Python source for `def`, `class` and `import` / `from … import` statements. */
  private extractPythonElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectNames(content, /\bdef\s+(\w+)\s*\(/g, functions);
    this.collectNames(content, /\bclass\s+(\w+)/g, classes);

    // Only statements at the start of a line (or after `;`) count, so the word
    // "import" inside prose or docstrings is ignored.
    this.forEachMatch(
      content,
      /(?:^|;)[ \t]*(?:from[ \t]+(\S+)[ \t]+import\b|import[ \t]+([^\n#;]+))/gm,
      (match) => {
        if (match[1]) {
          // `from pkg import name` -> record the package.
          imports.push(match[1]);
          return;
        }
        // `import a, b as c` -> record each module without comma or alias.
        for (const clause of (match[2] || '').split(',')) {
          const moduleName = clause.trim().split(/\s+/)[0];
          if (moduleName) imports.push(moduleName);
        }
      }
    );
  }

  /** Scans Java source for method-like declarations, classes and imports. */
  private extractJavaElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectDeclaredFunctions(content, functions);
    this.collectNames(content, /\bclass\s+(\w+)/g, classes);
    // Drop the `static` modifier so only the qualified name is recorded.
    this.collectNames(content, /\bimport\s+(?:static\s+)?([\w.*]+)\s*;/g, imports);
  }

  /** Scans C/C++ source for function-like declarations, structs and `#include`s. */
  private extractCElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectDeclaredFunctions(content, functions);
    this.collectNames(content, /\bstruct\s+(\w+)/g, classes);
    this.collectNames(content, /#include\s*[<"]([^>"]+)[>"]/g, imports);
  }

  /** Scans Go source for functions/methods, struct/interface types and imports. */
  private extractGoElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Optional receiver of any shape, then the name, then `(` or `[` (generics).
    this.collectNames(content, /\bfunc\s+(?:\([^)]*\)\s*)?(\w+)\s*[(\[]/g, functions);
    this.collectNames(content, /\btype\s+(\w+)\s+(?:struct|interface)\b/g, classes);

    // One optionally aliased import spec: `"path"`, `alias "path"`, `_ "path"`, `. "path"`.
    const importSpec = /^\s*(?:[\w.]+\s+)?["'`]([^"'`]+)["'`]/;
    this.forEachMatch(
      content,
      /\bimport\s+(?:\(([^)]*)\)|(?:[\w.]+\s+)?["'`]([^"'`]+)["'`])/g,
      (match) => {
        if (match[1] !== undefined) {
          // Grouped import: one spec per line (or `;`-separated). Blank lines
          // and comment lines do not match the spec pattern and are skipped.
          for (const line of match[1].split(/[\n;]/)) {
            const spec = importSpec.exec(line);
            if (spec && spec[1]) imports.push(spec[1]);
          }
        } else if (match[2]) {
          imports.push(match[2]);
        }
      }
    );
  }

  /** Scans Rust source for `fn`, `struct`/`enum` and `use` items. */
  private extractRustElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Accept `<` after the name so generic functions are found as well.
    this.collectNames(content, /\bfn\s+(\w+)\s*[<(]/g, functions);
    this.collectNames(content, /\b(?:struct|enum)\s+(\w+)/g, classes);
    this.collectNames(content, /\buse\s+([^;]+);/g, imports);
  }

  /** Scans PHP source for functions, classes and `require`/`include` paths. */
  private extractPhpElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectNames(content, /\bfunction\s+(\w+)\s*\(/g, functions);
    this.collectNames(content, /\bclass\s+(\w+)/g, classes);
    // Both `require 'x'` and `require('x')` forms, with optional `_once`.
    this.collectNames(content, /\b(?:require|include)(?:_once)?\s*\(?\s*['"]([^'"]+)['"]/g, imports);
  }

  /** Scans Ruby source for methods, classes and `require`/`require_relative` paths. */
  private extractRubyElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Skip a `self.` receiver and keep a trailing `?`, `!` or `=` as part of the name.
    this.collectNames(content, /\bdef\s+(?:self\.)?(\w+[?!=]?)/g, functions);
    this.collectNames(content, /\bclass\s+(\w+)/g, classes);
    this.collectNames(content, /\brequire(?:_relative)?\s*\(?\s*['"]([^'"]+)['"]/g, imports);
  }
}