// obsidian-qdrant/src/extractors/text.ts.backup

import { TFile } from 'obsidian';
import { BaseExtractor } from './base';
import { ExtractedContent } from '../types';

/**
 * Extractor for plain-text and source-code files.
 *
 * The file's text is returned as-is. For extensions listed in
 * `codeExtensions`, a lightweight regex scan additionally records the names of
 * functions, classes/structs/types and imported modules in `metadata.fm`.
 *
 * The scan is heuristic: it works on raw text (comments and string literals
 * are not stripped), so it can miss declarations or report false positives.
 */
export class TextExtractor extends BaseExtractor {
  /** File extensions (without the leading dot) this extractor accepts. */
  private supportedExtensions = ['txt', 'js', 'ts', 'json', 'html', 'css', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb', 'sh', 'yml', 'yaml', 'xml'];

  /** Subset of the supported extensions that receives the structural scan. */
  private readonly codeExtensions = ['js', 'ts', 'py', 'java', 'cpp', 'c', 'go', 'rs', 'php', 'rb'];

  /**
   * Words that, when they directly precede `name(`, show that `name(` is an
   * expression (call / instantiation) rather than a declaration.
   */
  private readonly nonTypeKeywords = ['return', 'new', 'else', 'throw', 'case', 'goto', 'do', 'delete'];

  /** Keywords that look like `type name(` in C-style code but are never function names. */
  private readonly nonFunctionKeywords = ['if', 'for', 'while', 'switch', 'catch', 'return', 'sizeof', 'synchronized', 'throw', 'new'];

  /**
   * @param file - File to test.
   * @returns `true` when the file's extension is in `supportedExtensions`.
   */
  canHandle(file: TFile): boolean {
    return this.supportedExtensions.includes(file.extension || '');
  }

  /**
   * Reads the file and returns its text together with metadata.
   *
   * `getFileContent` and `createBaseMetadata` are not defined in this class
   * (inherited); their exact behaviour is outside this file.
   *
   * @param file - File to extract.
   * @returns The raw text, the metadata (with `language`, `functions`,
   *   `classes` and `imports` merged into `metadata.fm` for code files) and
   *   `pageNumbers: undefined`.
   */
  async extract(file: TFile): Promise<ExtractedContent> {
    const content = await this.getFileContent(file);
    const metadata = this.createBaseMetadata(file);

    // For code files, record some basic structure alongside the existing fm fields.
    if (this.isCodeFile(file)) {
      const codeElements = this.extractCodeElements(content, file.extension || '');
      metadata.fm = {
        ...metadata.fm,
        language: file.extension,
        functions: codeElements.functions,
        classes: codeElements.classes,
        imports: codeElements.imports
      };
    }

    return {
      text: content,
      metadata,
      pageNumbers: undefined
    };
  }

  /** @returns `true` when the file's extension is one of `codeExtensions`. */
  private isCodeFile(file: TFile): boolean {
    return this.codeExtensions.includes(file.extension || '');
  }

  /**
   * Dispatches to the language-specific scanner for `extension`.
   *
   * @param content - Full source text.
   * @param extension - File extension without the dot (e.g. `'ts'`).
   * @returns Names found, in order of appearance; all three lists are empty
   *   for an extension without a scanner.
   */
  private extractCodeElements(content: string, extension: string): {
    functions: string[];
    classes: string[];
    imports: string[];
  } {
    const functions: string[] = [];
    const classes: string[] = [];
    const imports: string[] = [];

    switch (extension) {
      case 'js':
      case 'ts':
        this.extractJSElements(content, functions, classes, imports);
        break;
      case 'py':
        this.extractPythonElements(content, functions, classes, imports);
        break;
      case 'java':
        this.extractJavaElements(content, functions, classes, imports);
        break;
      case 'cpp':
      case 'c':
        this.extractCElements(content, functions, classes, imports);
        break;
      case 'go':
        this.extractGoElements(content, functions, classes, imports);
        break;
      case 'rs':
        this.extractRustElements(content, functions, classes, imports);
        break;
      case 'php':
        this.extractPhpElements(content, functions, classes, imports);
        break;
      case 'rb':
        this.extractRubyElements(content, functions, classes, imports);
        break;
    }

    return { functions, classes, imports };
  }

  /**
   * Runs a global regex over `content` and appends, for every match, the first
   * non-empty capture group to `into`.
   *
   * @param pattern - Must carry the `g` flag, otherwise `exec` never advances.
   */
  private collectCaptures(content: string, pattern: RegExp, into: string[]): void {
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(content)) !== null) {
      if (match[0] === '') {
        // A zero-length match does not advance lastIndex; step manually to avoid looping forever.
        pattern.lastIndex++;
        continue;
      }
      const captured = match.slice(1).find(group => Boolean(group));
      if (captured) into.push(captured);
    }
  }

  /**
   * Collects names that appear as `word name(` (the shape of a C/Java
   * declaration), skipping control-flow keywords and call sites such as
   * `else if (`, `return helper(` or `new Foo(`.
   */
  private collectCStyleFunctions(content: string, functions: string[]): void {
    const signatureRegex = /\b(\w+)\s+(\w+)\s*\(/g;
    let match: RegExpExecArray | null;
    while ((match = signatureRegex.exec(content)) !== null) {
      const leadingWord = match[1];
      const name = match[2];
      if (this.nonTypeKeywords.includes(leadingWord) || this.nonFunctionKeywords.includes(name)) {
        continue;
      }
      functions.push(name);
    }
  }

  /** Scans JavaScript/TypeScript source. */
  private extractJSElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // `function name`, `const name = (` / `const name = async (`, or `name: (`.
    // The last form also matches things like parenthesised type annotations.
    this.collectCaptures(
      content,
      /(?:function\s+(\w+)|const\s+(\w+)\s*=\s*(?:async\s+)?\(|(\w+)\s*:\s*(?:async\s+)?\()/g,
      functions
    );

    this.collectCaptures(content, /class\s+(\w+)/g, classes);

    // The import clause is matched with a character class that includes
    // whitespace so clauses spread over several lines are recognised; a bare
    // `import 'module'` (no clause) is matched by skipping the optional group.
    this.collectCaptures(content, /\bimport\s+(?:[\w$*{},\s]+?\s+from\s+)?['"]([^'"]+)['"]/g, imports);
  }

  /** Scans Python source. Patterns are anchored to the start of a (possibly indented) line. */
  private extractPythonElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectCaptures(content, /^[ \t]*(?:async[ \t]+)?def[ \t]+(\w+)[ \t]*\(/gm, functions);
    this.collectCaptures(content, /^[ \t]*class[ \t]+(\w+)/gm, classes);

    // Group 1: module of `from X import ...`; group 2: the list after a plain `import`.
    const importRegex = /^[ \t]*(?:from[ \t]+(\S+)[ \t]+import\b|import[ \t]+([^\n#]+))/gm;
    let match: RegExpExecArray | null;
    while ((match = importRegex.exec(content)) !== null) {
      if (match[1]) {
        imports.push(match[1]);
        continue;
      }
      // `import a, b as c` -> ['a', 'b']: split the list and drop any `as alias` suffix.
      for (const entry of (match[2] || '').split(',')) {
        const moduleName = entry.trim().split(/\s+/)[0];
        if (moduleName) imports.push(moduleName);
      }
    }
  }

  /** Scans Java source. Constructors are reported as functions, since they also look like `word Name(`. */
  private extractJavaElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectCStyleFunctions(content, functions);
    this.collectCaptures(content, /(?:public\s+)?class\s+(\w+)/g, classes);
    // `static` in `import static a.b.c;` is a modifier, not part of the imported name.
    this.collectCaptures(content, /\bimport\s+(?:static\s+)?([^;\s]+)\s*;/g, imports);
  }

  /** Scans C/C++ source; structs are reported in `classes`, `#include` targets in `imports`. */
  private extractCElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectCStyleFunctions(content, functions);
    this.collectCaptures(content, /struct\s+(\w+)/g, classes);
    this.collectCaptures(content, /#include\s*[<"]([^>"]+)[>"]/g, imports);
  }

  /** Scans Go source; struct and interface types are reported in `classes`. */
  private extractGoElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Optional `(recv *Type)` receiver before the function name.
    this.collectCaptures(content, /func\s+(?:\(\w+\s+\*?\w+\)\s+)?(\w+)\s*\(/g, functions);
    this.collectCaptures(content, /type\s+(\w+)\s+(?:struct|interface)/g, classes);

    // Group 1: body of a parenthesised import group; group 2: path of a single import
    // (optionally preceded by an alias, `_` or `.`).
    const importRegex = /\bimport\s*(?:\(([^)]*)\)|(?:[\w.]+\s+)?["'`]([^"'`]+)["'`])/g;
    let match: RegExpExecArray | null;
    while ((match = importRegex.exec(content)) !== null) {
      if (match[1]) {
        // Drop line comments, then take only the quoted paths so blank lines
        // and aliases never end up in the result.
        const groupBody = match[1].replace(/\/\/[^\n]*/g, '');
        this.collectCaptures(groupBody, /["'`]([^"'`\n]+)["'`]/g, imports);
      } else if (match[2]) {
        imports.push(match[2]);
      }
    }
  }

  /** Scans Rust source; structs and enums are reported in `classes`, `use` paths in `imports`. */
  private extractRustElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectCaptures(content, /fn\s+(\w+)\s*\(/g, functions);
    this.collectCaptures(content, /(?:struct|enum)\s+(\w+)/g, classes);
    this.collectCaptures(content, /\buse\s+([^;]+);/g, imports);
  }

  /** Scans PHP source; `require`/`include` (and their `_once` forms) are reported in `imports`. */
  private extractPhpElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    this.collectCaptures(content, /function\s+(\w+)\s*\(/g, functions);
    this.collectCaptures(content, /class\s+(\w+)/g, classes);
    this.collectCaptures(content, /(?:require|include)(?:_once)?\s*['"]([^'"]+)['"]/g, imports);
  }

  /** Scans Ruby source. */
  private extractRubyElements(content: string, functions: string[], classes: string[], imports: string[]): void {
    // Skip a `self.` receiver and keep a trailing `?`, `!` or `=`, which is part of the method name.
    this.collectCaptures(content, /\bdef\s+(?:self\.)?(\w+[?!=]?)/g, functions);
    this.collectCaptures(content, /class\s+(\w+)/g, classes);
    this.collectCaptures(content, /\brequire(?:_relative)?\s+['"]([^'"]+)['"]/g, imports);
  }
}