✅ What's Working:
- Plugin loads successfully in Obsidian.
- Settings are being saved correctly to disk.
- Qdrant server is accessible and responding.
- Ollama is set up with the embedding model.
- UUID generation is fixed for Qdrant compatibility.

❌ Main Issue:
- The plugin is using the default localhost:6333 URL instead of your saved https://vectors.biohazardvfx.com URL.
- This is a settings-initialization timing problem.

🎯 Next Step:
- Fix the IndexingOrchestrator to use the loaded settings instead of the defaults.
- This is likely a simple fix: the orchestrator needs to reference the this.settings values that were loaded from data.json.

Progress: ~95% complete. Once this one settings issue is fixed, the remaining step is to test the full indexing + search workflow.
256 lines
8.2 KiB
Plaintext
256 lines
8.2 KiB
Plaintext
import { TFile } from 'obsidian';
|
|
import { BaseExtractor } from './base';
|
|
import { ExtractedContent } from '../types';
|
|
|
|
/**
 * Extractor for plain-text and source-code files.
 *
 * The file text is returned unchanged. For files whose extension is one of
 * the recognised programming languages, a regex-based scan additionally
 * records the names of functions, classes/structs and imported modules under
 * `metadata.fm`.
 *
 * The scan is a heuristic, not a parser: it does not understand comments,
 * string literals or nesting, so it can both miss declarations and report
 * spurious ones. Names are reported in source order and are not de-duplicated.
 */
export class TextExtractor extends BaseExtractor {
  /** Extensions (without the dot, lower case) this extractor accepts. */
  private readonly supportedExtensions = ['txt', 'js', 'ts', 'json', 'html', 'css', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb', 'sh', 'yml', 'yaml', 'xml'];

  /** Subset of the supported extensions that receive the structural scan. */
  private static readonly CODE_EXTENSIONS = ['js', 'ts', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb'];

  /**
   * Words that, when they directly precede `name(`, show that `name(` is an
   * expression (a call or instantiation) rather than a declaration.
   */
  private static readonly EXPRESSION_PREFIXES = new Set<string>([
    'return', 'new', 'else', 'throw', 'case', 'goto', 'do', 'delete',
    'instanceof', 'assert', 'co_return', 'co_await', 'co_yield'
  ]);

  /** Keywords that look like `type name(` to the C/Java scan but are not names. */
  private static readonly CONTROL_KEYWORDS = new Set<string>([
    'if', 'for', 'while', 'switch', 'catch', 'return', 'sizeof',
    'synchronized', 'else', 'do', 'new', 'throw', 'typeof', 'alignof',
    'decltype', 'defined'
  ]);

  /**
   * Reports whether this extractor accepts the file, based only on its
   * extension. The comparison is case-insensitive.
   */
  canHandle(file: TFile): boolean {
    return this.supportedExtensions.includes(this.normalizedExtension(file));
  }

  /**
   * Reads the file and returns its text together with metadata.
   *
   * `getFileContent` and `createBaseMetadata` are not defined in this class;
   * they are expected to come from `BaseExtractor`. For code files the
   * existing `metadata.fm` entries are kept and `language`, `functions`,
   * `classes` and `imports` are added (overwriting same-named keys).
   *
   * @returns The unmodified text, the metadata, and `pageNumbers: undefined`.
   */
  async extract(file: TFile): Promise<ExtractedContent> {
    const content = await this.getFileContent(file);
    const metadata = this.createBaseMetadata(file);

    if (this.isCodeFile(file)) {
      const { functions, classes, imports } = this.extractCodeElements(content, file.extension || '');
      metadata.fm = {
        ...metadata.fm,
        language: file.extension,
        functions,
        classes,
        imports
      };
    }

    return {
      text: content,
      metadata,
      pageNumbers: undefined
    };
  }

  /** Lower-cased extension of the file, or `''` when it has none. */
  private normalizedExtension(file: TFile): string {
    return (file.extension || '').toLowerCase();
  }

  /** True when the file's extension is one that gets the structural scan. */
  private isCodeFile(file: TFile): boolean {
    return TextExtractor.CODE_EXTENSIONS.includes(this.normalizedExtension(file));
  }

  /**
   * Dispatches to the language-specific scanner for `extension`.
   *
   * @param content   Full source text.
   * @param extension File extension without the dot; matched case-insensitively.
   * @returns Three arrays of names; all empty for an unrecognised extension.
   */
  private extractCodeElements(content: string, extension: string): {
    functions: string[];
    classes: string[];
    imports: string[];
  } {
    const functions: string[] = [];
    const classes: string[] = [];
    const imports: string[] = [];

    switch (extension.toLowerCase()) {
      case 'js':
      case 'ts':
        this.extractJSElements(content, functions, classes, imports);
        break;
      case 'py':
        this.extractPythonElements(content, functions, classes, imports);
        break;
      case 'java':
        this.extractJavaElements(content, functions, classes, imports);
        break;
      case 'cpp':
      case 'c':
        this.extractCElements(content, functions, classes, imports);
        break;
      case 'go':
        this.extractGoElements(content, functions, classes, imports);
        break;
      case 'rs':
        this.extractRustElements(content, functions, classes, imports);
        break;
      case 'php':
        this.extractPhpElements(content, functions, classes, imports);
        break;
      case 'rb':
        this.extractRubyElements(content, functions, classes, imports);
        break;
    }

    return { functions, classes, imports };
  }

  /**
   * Runs a global regex over `content` and calls `visit` for every match.
   * `pattern` must carry the `g` flag, otherwise `exec` never advances.
   */
  private forEachMatch(content: string, pattern: RegExp, visit: (match: RegExpExecArray) => void): void {
    pattern.lastIndex = 0;
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(content)) !== null) {
      // A zero-width match leaves lastIndex in place; step over it so the loop terminates.
      if (match[0].length === 0) pattern.lastIndex += 1;
      visit(match);
    }
  }

  /** Appends capture group 1 of every match of `pattern` to `into`. */
  private collectFirstGroup(content: string, pattern: RegExp, into: string[]): void {
    this.forEachMatch(content, pattern, match => {
      if (match[1]) into.push(match[1]);
    });
  }

  /**
   * Shared function scan for C-family syntax (`type name(`).
   * Matches whose leading word marks an expression (`return foo(`,
   * `new Foo(`) or whose name is a control keyword (`else if (`) are dropped.
   */
  private collectCStyleFunctions(content: string, functions: string[]): void {
    const declaration = /\b(\w+)\s+(\w+)\s*\(/g;
    this.forEachMatch(content, declaration, match => {
      const precedingWord = match[1];
      const name = match[2];
      if (!precedingWord || !name) return;
      if (TextExtractor.EXPRESSION_PREFIXES.has(precedingWord)) return;
      if (TextExtractor.CONTROL_KEYWORDS.has(name)) return;
      functions.push(name);
    });
  }

  /** JavaScript / TypeScript: functions, classes and import specifiers. */
  private extractJSElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Named (generator) functions, `const|let|var x = (async) (`, and `key: (async) (` members.
    const functionRegex = /\bfunction\b\s*\*?\s*(\w+)|\b(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\(|(\w+)\s*:\s*(?:async\s+)?\(/g;
    this.forEachMatch(content, functionRegex, match => {
      const funcName = match[1] || match[2] || match[3];
      if (funcName) functions.push(funcName);
    });

    this.collectFirstGroup(content, /\bclass\s+(\w+)/g, classes);

    // The binding list may span lines but may not contain a quote or `;`,
    // so one match can never swallow a neighbouring import statement.
    this.collectFirstGroup(content, /\bimport\s+(?:[^'";]*?\s+from\s+)?['"]([^'"]+)['"]/g, imports);
  }

  /** Python: `def`, `class`, and `import` / `from … import` statements. */
  private extractPythonElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectFirstGroup(content, /\bdef\s+(\w+)\s*\(/g, functions);
    this.collectFirstGroup(content, /\bclass\s+(\w+)/g, classes);

    // Anchored to the start of a line so prose such as "# we import x" is ignored.
    const importLine = /^[ \t]*(?:from[ \t]+(\S+)[ \t]+import\b|import[ \t]+([^\n#]+))/gm;
    this.forEachMatch(content, importLine, match => {
      if (match[1]) {
        imports.push(match[1]);
        return;
      }
      // `import a, b as c` lists several modules; keep each module name, drop aliases.
      for (const part of (match[2] || '').split(',')) {
        const moduleName = part.trim().split(/\s+/)[0];
        if (moduleName) imports.push(moduleName);
      }
    });
  }

  /** Java: methods, classes and import statements. */
  private extractJavaElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectCStyleFunctions(content, functions);
    this.collectFirstGroup(content, /\bclass\s+(\w+)/g, classes);
    this.forEachMatch(content, /\bimport\s+([^;]+);/g, match => {
      const target = (match[1] || '').trim();
      if (target) imports.push(target);
    });
  }

  /** C / C++: functions, `struct`/`class` names and `#include` targets. */
  private extractCElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectCStyleFunctions(content, functions);
    // `class` is included because the `cpp` extension is routed here as well.
    this.collectFirstGroup(content, /\b(?:struct|class)\s+(\w+)/g, classes);
    this.collectFirstGroup(content, /#include\s*[<"]([^>"]+)[>"]/g, imports);
  }

  /** Go: functions/methods, struct and interface types, import paths. */
  private extractGoElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectFirstGroup(content, /\bfunc\s+(?:\(\w+\s+\*?\w+\)\s+)?(\w+)\s*\(/g, functions);
    this.collectFirstGroup(content, /\btype\s+(\w+)\s+(?:struct|interface)/g, classes);

    // Group 1: body of a parenthesised import block. Group 2: path of a single
    // import, optionally preceded by an alias (`name`, `_` or `.`).
    const importDecl = /\bimport\s*\(([^)]*)\)|\bimport\s+(?:[\w.]+\s+)?["'`]([^"'`]+)["'`]/g;
    const quotedPath = /["'`]([^"'`]+)["'`]/g;
    this.forEachMatch(content, importDecl, match => {
      if (match[1] !== undefined) {
        // Pull only the quoted paths so blank lines and aliases are not recorded.
        this.collectFirstGroup(match[1], quotedPath, imports);
      } else if (match[2]) {
        imports.push(match[2]);
      }
    });
  }

  /** Rust: `fn` items, `struct`/`enum` names and `use` paths. */
  private extractRustElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // The name may be followed by a generic parameter list before the `(`.
    this.collectFirstGroup(content, /\bfn\s+(\w+)\s*(?:<|\()/g, functions);
    this.collectFirstGroup(content, /\b(?:struct|enum)\s+(\w+)/g, classes);
    this.forEachMatch(content, /\buse\s+([^;]+);/g, match => {
      // Multi-line `use a::{…};` is flattened to a single line.
      const path = (match[1] || '').replace(/\s+/g, ' ').trim();
      if (path) imports.push(path);
    });
  }

  /** PHP: functions, classes and `require`/`include` targets. */
  private extractPhpElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectFirstGroup(content, /\bfunction\s+(\w+)\s*\(/g, functions);
    this.collectFirstGroup(content, /\bclass\s+(\w+)/g, classes);
    // Accepts both `require 'x'` and the parenthesised `require('x')` form.
    this.collectFirstGroup(content, /\b(?:require|include)(?:_once)?\s*\(?\s*['"]([^'"]+)['"]/g, imports);
  }

  /** Ruby: methods, classes and `require` / `require_relative` targets. */
  private extractRubyElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Skip a `self.` receiver and keep a trailing `?`, `!` or `=` as part of the name.
    this.collectFirstGroup(content, /\bdef\s+(?:self\.)?(\w+[?!=]?)/g, functions);
    this.collectFirstGroup(content, /\bclass\s+(\w+)/g, classes);
    this.collectFirstGroup(content, /\brequire(?:_relative)?\s*\(?\s*['"]([^'"]+)['"]/g, imports);
  }
}
|