#!/usr/bin/env tsx
/**
 * Ingest Nextcloud files into Elasticsearch (BM25 baseline).
 *
 * Flow:
 * - Crawl Nextcloud via WebDAV starting from NEXTCLOUD_ROOT_PATH
 * - For each file, build an IndexDocument
 * - Optionally extract text with Apache Tika if TIKA_BASE_URL is set
 * - Index into Elasticsearch using env.ELASTICSEARCH_INDEX/ALIAS
 *
 * Usage:
 *   npm run ingest:nextcloud
 *   npm run ingest:nextcloud -- --root=/remote.php/dav/files/admin/SomeFolder
 *
 * Notes:
 * - Requires .env.local with Nextcloud + Elasticsearch creds
 * - Tika is optional; if it is not configured or extraction fails, content is omitted
 */

import * as Sentry from "@sentry/nextjs";
import { env } from "@/lib/env";
import { nextcloud, NextcloudClient } from "@/lib/webdav";
import {
  basename,
  joinPath,
  normalizePath,
  pathToId,
  parentPath as getParent,
} from "@/lib/paths";
import type { WebDavEntry } from "@/types/files";
import type { IndexDocument } from "@/types/ingest";
import { indexDocument, ensureIndex } from "@/lib/elasticsearch";

// Simple fetch helper for Node (global fetch is available in the Next.js runtime)
async function tikaExtractText(
  stream: NodeJS.ReadableStream,
  contentType?: string,
): Promise<{ text?: string; meta?: Record<string, unknown> }> {
  if (!env.TIKA_BASE_URL) return {};
  try {
    // Tika Server v2: the /tika/text resource returns extracted plain text
    const url = `${env.TIKA_BASE_URL.replace(/\/$/, "")}/tika/text`;
    const res = await fetch(url, {
      method: "PUT", // Some deployments use PUT; POST is also supported depending on the image
      headers: {
        Accept: "text/plain",
        ...(contentType ? { "Content-Type": contentType } : {}),
      },
      body: stream as unknown as BodyInit,
      // Node's fetch (undici) requires half-duplex mode when sending a streamed body
      duplex: "half",
    } as RequestInit);
    if (!res.ok) {
      throw new Error(`Tika extraction failed: ${res.status} ${res.statusText}`);
    }
    const text = await res.text();
    return { text };
  } catch (err) {
    Sentry.captureException(err);
    return {};
  }
}

type CrawlOpts = {
  client: NextcloudClient;
  root: string;
};

// Depth-first traversal of the WebDAV tree; yields file entries only.
async function* crawl({ client, root }: CrawlOpts): AsyncGenerator<WebDavEntry> {
  const stack: string[] = [root];
  while (stack.length) {
    const current = stack.pop()!;
    const entries = await client.listDirectory(current);
    for (const e of entries) {
      // Skip the container itself when PROPFIND returns the directory entry
      if (normalizePath(e.href) === normalizePath(current) && e.isDirectory) continue;
      if (e.isDirectory) {
        stack.push(e.href);
      } else {
        yield e;
      }
    }
  }
}

// Map a WebDAV entry onto the Elasticsearch document shape.
function toIndexDoc(e: WebDavEntry): IndexDocument {
  const p = normalizePath(e.href);
  const doc: IndexDocument = {
    id: pathToId(p),
    name: e.name || basename(p),
    path: p,
    parentPath: getParent(p)!,
    sizeBytes: typeof e.size === "number" ? e.size : 0,
    mimeType: e.contentType || "application/octet-stream",
    owner: env.NEXTCLOUD_USERNAME,
    modifiedAt: e.lastmod,
    etag: e.etag,
    previewAvailable: false,
    tags: [],
  };
  return doc;
}

// Parse "--key=value" pairs and bare "--flag" arguments into a map.
function parseArgs(argv: string[]) {
  const out: Record<string, string | boolean> = {};
  for (const arg of argv) {
    if (arg.startsWith("--")) {
      const [k, v] = arg.slice(2).split("=");
      out[k] = v ?? true;
    }
  }
  return out;
}

async function main() {
  const args = parseArgs(process.argv.slice(2));
  const root = normalizePath(
    typeof args.root === "string" && args.root.length > 1
      ? args.root
      : env.NEXTCLOUD_ROOT_PATH,
  );

  console.log(`[ingest] Start crawl from: ${root}`);

  // Ensure the index exists (creates mappings and alias if missing)
  await ensureIndex();

  // Create a client rooted at NEXTCLOUD_ROOT_PATH (path resolution uses absolute paths)
  const client = new NextcloudClient(env.NEXTCLOUD_ROOT_PATH);

  let count = 0;
  for await (const e of crawl({ client, root })) {
    try {
      // Build the document from DAV metadata
      const baseDoc = toIndexDoc(e);

      // Try Tika extraction (optional)
      let contentText: string | undefined;
      try {
        const stream = await client.downloadStream(e.href);
        const { text } = await tikaExtractText(stream, e.contentType);
        if (text && text.trim().length > 0) contentText = text;
      } catch (err) {
        // Non-fatal; proceed without content
        Sentry.captureException(err);
      }

      const doc: IndexDocument = {
        ...baseDoc,
        content: contentText,
        title: baseDoc.name,
      };

      // Index into ES
      await indexDocument(doc);
      count++;
      if (count % 50 === 0) {
        console.log(`[ingest] Indexed ${count} documents`);
      }
    } catch (err) {
      console.error(`[ingest] Failed for ${e.href}:`, err);
      Sentry.captureException(err);
    }
  }

  console.log(`[ingest] Done. Indexed ${count} documents.`);
}

main().catch((err) => {
  console.error("[ingest] Fatal error:", err);
  process.exit(1);
});
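
// For reference only: a minimal sketch of the document shape this script sends to
// Elasticsearch, inferred from the usage above (toIndexDoc plus the content/title
// fields added in main). The authoritative definition is the IndexDocument type in
// "@/types/ingest"; field names and optionality there may differ from this sketch.
type IngestedDocSketch = {
  id: string;              // pathToId(path)
  name: string;            // DAV display name or basename of the path
  path: string;            // normalized WebDAV path
  parentPath: string;      // parent directory of path
  sizeBytes: number;       // 0 when the DAV entry has no size
  mimeType: string;        // falls back to "application/octet-stream"
  owner: string;           // env.NEXTCLOUD_USERNAME
  modifiedAt?: string;     // DAV lastmod
  etag?: string;           // DAV etag
  previewAvailable: boolean;
  tags: string[];
  title?: string;          // set to name at index time
  content?: string;        // Tika-extracted text, when available
};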