import { TFile } from 'obsidian';
import { BaseExtractor } from './base';
import { ExtractedContent } from '../types';

/**
 * Words that can precede `name(` in C-like source but are never a return
 * type, e.g. `return foo(` or `new Foo(`. A match whose type-position token
 * is one of these is a call or expression, not a declaration.
 */
const NON_TYPE_KEYWORDS: ReadonlySet<string> = new Set([
  'return', 'new', 'else', 'throw', 'case', 'goto', 'do', 'delete', 'assert', 'yield',
]);

/**
 * Control-flow keywords that are followed by `(` in C-like source, e.g.
 * `else if (`. They must never be reported as function names.
 */
const NON_NAME_KEYWORDS: ReadonlySet<string> = new Set([
  'if', 'for', 'while', 'switch', 'catch', 'return', 'sizeof', 'synchronized',
]);

/**
 * Runs a global regex over `content` and appends, for every match, the first
 * non-empty capture group to `out`.
 *
 * @param content Text to scan.
 * @param pattern Regex with the `g` flag and at least one capture group.
 * @param out Array that receives the captured names (mutated in place).
 */
function collectMatches(content: string, pattern: RegExp, out: string[]): void {
  let match: RegExpExecArray | null;
  while ((match = pattern.exec(content)) !== null) {
    // Alternations leave the unused groups undefined; take whichever one hit.
    const captured = match.slice(1).find(group => Boolean(group));
    if (captured) out.push(captured);
  }
}

/**
 * Collects function names from C-like source (used for Java and C/C++).
 *
 * `pattern` must capture the type-position token in group 1 and the candidate
 * name in group 2. Matches such as `else if (` or `return foo(` are skipped
 * because they are control flow or calls rather than declarations.
 *
 * @param content Text to scan.
 * @param pattern Regex with the `g` flag and the two capture groups described above.
 * @param out Array that receives the function names (mutated in place).
 */
function collectCStyleFunctions(content: string, pattern: RegExp, out: string[]): void {
  let match: RegExpExecArray | null;
  while ((match = pattern.exec(content)) !== null) {
    const [, typeToken, name] = match;
    if (!typeToken || !name) continue;
    if (NON_TYPE_KEYWORDS.has(typeToken) || NON_NAME_KEYWORDS.has(name)) continue;
    out.push(name);
  }
}

/**
 * Extractor for plain-text and source-code files.
 *
 * The file content is returned verbatim as `text`. For recognised programming
 * languages a lightweight regex scan additionally records the names of
 * functions, classes (or structs/types) and imports in `metadata.fm`. The scan
 * is heuristic: it does not parse the language, so matches inside comments or
 * string literals are reported as well.
 */
export class TextExtractor extends BaseExtractor {
  private readonly supportedExtensions = ['txt', 'js', 'ts', 'json', 'html', 'css', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb', 'sh', 'yml', 'yaml', 'xml'];

  /** Returns true when the file's extension is one of the supported text/code extensions. */
  canHandle(file: TFile): boolean {
    return this.supportedExtensions.includes(file.extension || '');
  }

  /**
   * Reads the file and builds its extracted representation.
   *
   * @param file File to extract.
   * @returns The raw content as `text`, the base metadata (augmented with
   *   `language`, `functions`, `classes` and `imports` for code files) and an
   *   undefined `pageNumbers`.
   */
  async extract(file: TFile): Promise<ExtractedContent> {
    const content = await this.getFileContent(file);
    const metadata = this.createBaseMetadata(file);

    // For code files, we might want to extract some basic structure
    if (this.isCodeFile(file)) {
      const codeElements = this.extractCodeElements(content, file.extension || '');
      metadata.fm = {
        ...metadata.fm,
        language: file.extension,
        functions: codeElements.functions,
        classes: codeElements.classes,
        imports: codeElements.imports
      };
    }

    return {
      text: content,
      metadata,
      pageNumbers: undefined
    };
  }

  /** Returns true when the file's extension belongs to a language that has a structure scanner. */
  private isCodeFile(file: TFile): boolean {
    const codeExtensions = ['js', 'ts', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb'];
    return codeExtensions.includes(file.extension || '');
  }

  /**
   * Dispatches to the language-specific scanner for `extension`.
   *
   * @param content Source text to scan.
   * @param extension File extension without the dot (e.g. `'ts'`).
   * @returns The collected names; all three arrays are empty for an
   *   extension that has no scanner.
   */
  private extractCodeElements(content: string, extension: string): {
    functions: string[];
    classes: string[];
    imports: string[];
  } {
    const functions: string[] = [];
    const classes: string[] = [];
    const imports: string[] = [];

    switch (extension) {
      case 'js':
      case 'ts':
        this.extractJSElements(content, functions, classes, imports);
        break;
      case 'py':
        this.extractPythonElements(content, functions, classes, imports);
        break;
      case 'java':
        this.extractJavaElements(content, functions, classes, imports);
        break;
      case 'cpp':
      case 'c':
        this.extractCElements(content, functions, classes, imports);
        break;
      case 'go':
        this.extractGoElements(content, functions, classes, imports);
        break;
      case 'rs':
        this.extractRustElements(content, functions, classes, imports);
        break;
      case 'php':
        this.extractPhpElements(content, functions, classes, imports);
        break;
      case 'rb':
        this.extractRubyElements(content, functions, classes, imports);
        break;
    }

    return { functions, classes, imports };
  }

  /** Scans JavaScript/TypeScript: `function` declarations, `const x = (`, `name: (` properties, classes and import specifiers. */
  private extractJSElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract function declarations
    collectMatches(content, /(?:function\s+(\w+)|const\s+(\w+)\s*=\s*(?:async\s+)?\(|(\w+)\s*:\s*(?:async\s+)?\(/g, functions);

    // Extract class declarations
    collectMatches(content, /class\s+(\w+)/g, classes);

    // Extract imports
    collectMatches(content, /import\s+(?:.*\s+from\s+)?['"]([^'"]+)['"]/g, imports);
  }

  /** Scans Python: `def` functions, classes, and the module named by `import x` / `from x import`. */
  private extractPythonElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract function definitions
    collectMatches(content, /def\s+(\w+)\s*\(/g, functions);

    // Extract class definitions
    collectMatches(content, /class\s+(\w+)/g, classes);

    // Extract imports
    collectMatches(content, /(?:from\s+(\S+)\s+import|import\s+(\S+))/g, imports);
  }

  /** Scans Java: method declarations (control-flow keywords filtered out), classes and import statements. */
  private extractJavaElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract method declarations; group 1 is the type-position token so
    // that `return foo(` / `else if (` can be told apart from declarations.
    collectCStyleFunctions(content, /(?:public|private|protected)?\s*(?:static\s+)?\s*(void|\w+)\s+(\w+)\s*\(/g, functions);

    // Extract class declarations
    collectMatches(content, /(?:public\s+)?class\s+(\w+)/g, classes);

    // Extract imports
    collectMatches(content, /import\s+([^;]+);/g, imports);
  }

  /** Scans C/C++: function declarations (control-flow keywords filtered out), structs and `#include` targets. */
  private extractCElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract function declarations; group 1 is the type-position token so
    // that `return foo(` / `else if (` can be told apart from declarations.
    collectCStyleFunctions(content, /(?:static\s+)?\s*(void|\w+)\s+(\w+)\s*\(/g, functions);

    // Extract struct declarations
    collectMatches(content, /struct\s+(\w+)/g, classes);

    // Extract includes
    collectMatches(content, /#include\s*[<"]([^>"]+)[>"]/g, imports);
  }

  /** Scans Go: functions and methods, struct/interface types, and single or grouped imports. */
  private extractGoElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract function declarations
    collectMatches(content, /func\s+(?:\(\w+\s+\*?\w+\)\s+)?(\w+)\s*\(/g, functions);

    // Extract type declarations
    collectMatches(content, /type\s+(\w+)\s+(?:struct|interface)/g, classes);

    // Extract imports
    const importRegex = /import\s+(?:\(([^)]+)\)|['"]([^'"]+)['"])/g;
    let match: RegExpExecArray | null;
    while ((match = importRegex.exec(content)) !== null) {
      const [, groupedBlock, singlePath] = match;
      if (groupedBlock) {
        // Grouped import: keep only the quoted paths, so blank lines and
        // aliases (`f "fmt"`) do not leak into the result.
        collectMatches(groupedBlock, /['"]([^'"]+)['"]/g, imports);
      } else if (singlePath) {
        imports.push(singlePath);
      }
    }
  }

  /** Scans Rust: `fn` functions, structs/enums and `use` paths. */
  private extractRustElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract function declarations
    collectMatches(content, /fn\s+(\w+)\s*\(/g, functions);

    // Extract struct and enum declarations
    collectMatches(content, /(?:struct|enum)\s+(\w+)/g, classes);

    // Extract use statements
    collectMatches(content, /use\s+([^;]+);/g, imports);
  }

  /** Scans PHP: functions, classes and `require`/`include` (optionally `_once`) targets. */
  private extractPhpElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract function declarations
    collectMatches(content, /function\s+(\w+)\s*\(/g, functions);

    // Extract class declarations
    collectMatches(content, /class\s+(\w+)/g, classes);

    // Extract require/include statements
    collectMatches(content, /(?:require|include)(?:_once)?\s*['"]([^'"]+)['"]/g, imports);
  }

  /** Scans Ruby: `def` methods, classes and `require` targets. */
  private extractRubyElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Extract method definitions
    collectMatches(content, /def\s+(\w+)/g, functions);

    // Extract class definitions
    collectMatches(content, /class\s+(\w+)/g, classes);

    // Extract require statements
    collectMatches(content, /require\s+['"]([^'"]+)['"]/g, imports);
  }
}