#!/usr/bin/env tsx
/**
* Ingest Nextcloud files into Elasticsearch (BM25 baseline).
*
* Flow:
* - Crawl Nextcloud via WebDAV starting from NEXTCLOUD_ROOT_PATH
* - For each file, build an IndexDocument
* - Optionally extract text with Apache Tika if TIKA_BASE_URL is set
* - Index into Elasticsearch using env.ELASTICSEARCH_INDEX/ALIAS
*
* Usage:
* npm run ingest:nextcloud
* npm run ingest:nextcloud -- --root=/remote.php/dav/files/admin/SomeFolder
*
* Notes:
* - Requires .env.local with Nextcloud + Elasticsearch creds
* - Tika is optional; if not configured or fails, content is omitted
*/
import * as Sentry from "@sentry/nextjs";
import { env } from "@/lib/env";
import { nextcloud, NextcloudClient } from "@/lib/webdav";
import { basename, joinPath, normalizePath, pathToId, parentPath as getParent } from "@/lib/paths";
import type { WebDavEntry } from "@/types/files";
import type { IndexDocument } from "@/types/ingest";
import { indexDocument, ensureIndex } from "@/lib/elasticsearch";
// Simple fetch helper for Node (global fetch supported in Next runtime)
/**
 * Extract plain text from a file stream via an Apache Tika Server.
 *
 * Best-effort: returns `{}` (no text) when Tika is not configured or the
 * extraction fails for any reason; failures are reported to Sentry but never
 * propagate, so ingestion continues without content.
 *
 * @param stream - Readable stream of the file body (from WebDAV download).
 * @param contentType - Optional MIME type hint forwarded to Tika.
 * @returns `{ text }` on success, `{}` when extraction was skipped or failed.
 */
async function tikaExtractText(stream: NodeJS.ReadableStream, contentType?: string): Promise<{ text?: string; meta?: Record<string, unknown> }> {
  if (!env.TIKA_BASE_URL) return {};
  try {
    // Tika Server v2: PUT /tika with "Accept: text/plain" returns the
    // extracted plain text. (The previous "/tika/text" path is a JSON
    // endpoint, so reading it with res.text() yielded JSON, not content.)
    const url = `${env.TIKA_BASE_URL.replace(/\/$/, "")}/tika`;
    const res = await fetch(url, {
      method: "PUT",
      headers: {
        Accept: "text/plain",
        ...(contentType ? { "Content-Type": contentType } : {}),
      },
      body: stream as unknown as BodyInit,
      // Node's fetch (undici) requires half-duplex mode for streaming
      // request bodies; without this the request rejects at runtime.
      duplex: "half",
    } as RequestInit);
    if (!res.ok) {
      throw new Error(`Tika extraction failed: ${res.status} ${res.statusText}`);
    }
    const text = await res.text();
    return { text };
  } catch (err) {
    // Non-fatal: report and let the caller index the document without content.
    Sentry.captureException(err);
    return {};
  }
}
/** Arguments for {@link crawl}: the WebDAV client and the directory path to start from. */
type CrawlOpts = {
client: NextcloudClient;
root: string;
};
/**
 * Depth-first walk of the Nextcloud tree rooted at `root`, yielding every
 * non-directory WebDAV entry discovered via the client's directory listings.
 */
async function* crawl({ client, root }: CrawlOpts): AsyncGenerator<WebDavEntry> {
  const pending: string[] = [root];
  while (pending.length > 0) {
    const dir = pending.pop()!;
    for (const entry of await client.listDirectory(dir)) {
      // PROPFIND responses may echo the listed directory itself; skip that self-entry.
      const isSelf = entry.isDirectory && normalizePath(entry.href) === normalizePath(dir);
      if (isSelf) continue;
      if (entry.isDirectory) {
        pending.push(entry.href);
      } else {
        yield entry;
      }
    }
  }
}
/**
 * Map a WebDAV entry onto the Elasticsearch document shape.
 * Missing metadata falls back to safe defaults (size 0, octet-stream MIME).
 */
function toIndexDoc(e: WebDavEntry): IndexDocument {
  const path = normalizePath(e.href);
  return {
    id: pathToId(path),
    name: e.name || basename(path),
    path,
    // NOTE(review): assumes the crawled root itself is never indexed, so a
    // parent path always exists — TODO confirm getParent's behavior at "/".
    parentPath: getParent(path)!,
    sizeBytes: typeof e.size === "number" ? e.size : 0,
    mimeType: e.contentType || "application/octet-stream",
    owner: env.NEXTCLOUD_USERNAME,
    modifiedAt: e.lastmod,
    etag: e.etag,
    previewAvailable: false,
    tags: [],
  };
}
/**
 * Parse `--key=value` / `--flag` style CLI arguments.
 *
 * @param argv - Raw argument list (typically `process.argv.slice(2)`).
 * @returns Map from flag name to its string value, or `true` for bare flags.
 *          Arguments not starting with `--` are ignored.
 */
function parseArgs(argv: string[]): Record<string, string | boolean> {
  const out: Record<string, string | boolean> = {};
  for (const arg of argv) {
    if (!arg.startsWith("--")) continue;
    const body = arg.slice(2);
    // Split on the FIRST '=' only, so values containing '=' (e.g. a path
    // with a query string) survive intact. The previous split("=") silently
    // dropped everything after a second '='.
    const eq = body.indexOf("=");
    if (eq === -1) {
      out[body] = true;
    } else {
      out[body.slice(0, eq)] = body.slice(eq + 1);
    }
  }
  return out;
}
/**
 * Entry point: crawl Nextcloud from the requested root and index every file
 * into Elasticsearch, optionally enriching documents with Tika-extracted text.
 * Per-file failures are logged and reported but do not stop the run.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  // `--root=...` overrides the configured root; single-character values
  // (e.g. "/") fall back to NEXTCLOUD_ROOT_PATH — presumably to reject a
  // bare-root crawl — TODO confirm intent.
  const root = normalizePath(
    typeof args.root === "string" && args.root.length > 1 ? args.root : env.NEXTCLOUD_ROOT_PATH,
  );
  console.log(`[ingest] Start crawl from: ${root}`);
  // Ensure index exists (creates mappings and alias if missing)
  await ensureIndex();
  // Create a client rooted at NEXTCLOUD_ROOT_PATH (resolve uses absolute)
  const client = new NextcloudClient(env.NEXTCLOUD_ROOT_PATH);
  let count = 0;
  for await (const e of crawl({ client, root })) {
    try {
      // Build document from DAV metadata
      const baseDoc = toIndexDoc(e);
      // Text extraction is optional. Gate the download on TIKA_BASE_URL:
      // previously every file body was fully downloaded even when Tika was
      // not configured, only for tikaExtractText to return {} immediately.
      let contentText: string | undefined;
      if (env.TIKA_BASE_URL) {
        try {
          const stream = await client.downloadStream(e.href);
          const { text } = await tikaExtractText(stream, e.contentType);
          if (text && text.trim().length > 0) contentText = text;
        } catch (err) {
          // Non-fatal; index the document without content.
          Sentry.captureException(err);
        }
      }
      const doc: IndexDocument = {
        ...baseDoc,
        content: contentText,
        title: baseDoc.name,
      };
      // Index into ES
      await indexDocument(doc);
      count++;
      if (count % 50 === 0) {
        console.log(`[ingest] Indexed ${count} documents`);
      }
    } catch (err) {
      console.error(`[ingest] Failed for ${e.href}:`, err);
      Sentry.captureException(err);
    }
  }
  console.log(`[ingest] Done. Indexed ${count} documents.`);
}
// Kick off the ingest run; any unhandled error is fatal for the script.
void main().catch((err: unknown) => {
  console.error("[ingest] Fatal error:", err);
  process.exit(1);
});