#!/usr/bin/env tsx
/**
 * Ingest Nextcloud files into Elasticsearch (BM25 baseline).
 *
 * Flow:
 * - Crawl Nextcloud via WebDAV starting from NEXTCLOUD_ROOT_PATH
 * - For each file, build an IndexDocument
 * - Optionally extract text with Apache Tika if TIKA_BASE_URL is set
 * - Index into Elasticsearch using env.ELASTICSEARCH_INDEX/ALIAS
 *
 * Usage:
 *   npm run ingest:nextcloud
 *   npm run ingest:nextcloud -- --root=/remote.php/dav/files/admin/SomeFolder
 *
 * Notes:
 * - Requires .env.local with Nextcloud + Elasticsearch credentials
 * - Tika is optional; if it is not configured or extraction fails, content is omitted
 */
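
// Example .env.local (illustrative values; only variables referenced in this
// script are shown — see @/lib/env for the remaining credentials):
//   NEXTCLOUD_ROOT_PATH=/remote.php/dav/files/admin
//   NEXTCLOUD_USERNAME=admin
//   TIKA_BASE_URL=http://localhost:9998   (9998 is Tika Server's default port)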

import * as Sentry from "@sentry/nextjs";
import { env } from "@/lib/env";
import { NextcloudClient } from "@/lib/webdav";
import { basename, normalizePath, pathToId, parentPath as getParent } from "@/lib/paths";
import type { WebDavEntry } from "@/types/files";
import type { IndexDocument } from "@/types/ingest";
import { indexDocument, ensureIndex } from "@/lib/elasticsearch";

// Extract plain text from a file stream via Apache Tika Server, using the
// global fetch available in the Next/Node runtime. Returns {} when Tika is
// not configured or the request fails.
async function tikaExtractText(
  stream: NodeJS.ReadableStream,
  contentType?: string,
): Promise<{ text?: string; meta?: Record<string, unknown> }> {
  if (!env.TIKA_BASE_URL) return {};
  try {
    // Tika Server v2: send the raw bytes to /tika/text for plain text
    const url = `${env.TIKA_BASE_URL.replace(/\/$/, "")}/tika/text`;
    const res = await fetch(url, {
      method: "PUT", // Some deployments use PUT; POST is also supported depending on image
      headers: {
        Accept: "text/plain",
        ...(contentType ? { "Content-Type": contentType } : {}),
      },
      body: stream as unknown as BodyInit,
      // Node's fetch (undici) requires half-duplex mode for streaming request bodies
      duplex: "half",
    } as RequestInit);

    if (!res.ok) {
      throw new Error(`Tika extraction failed: ${res.status} ${res.statusText}`);
    }
    const text = await res.text();
    return { text };
  } catch (err) {
    Sentry.captureException(err);
    return {};
  }
}
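
// Example usage (hypothetical path; `client` as constructed in main()):
//   const stream = await client.downloadStream("/remote.php/dav/files/admin/report.pdf");
//   const { text } = await tikaExtractText(stream, "application/pdf");
//   // `text` is undefined when TIKA_BASE_URL is unset or extraction fails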

type CrawlOpts = {
  client: NextcloudClient;
  root: string;
};

// Iterative depth-first walk of the WebDAV tree; yields file entries only.
// The explicit stack avoids recursion limits on deeply nested directories.
async function* crawl({ client, root }: CrawlOpts): AsyncGenerator<WebDavEntry> {
  const stack: string[] = [root];
  while (stack.length) {
    const current = stack.pop()!;
    const entries = await client.listDirectory(current);
    for (const e of entries) {
      // Skip the container itself when PROPFIND returns the directory entry
      if (normalizePath(e.href) === normalizePath(current) && e.isDirectory) continue;
      if (e.isDirectory) {
        stack.push(e.href);
      } else {
        yield e;
      }
    }
  }
}
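
// Example (illustrative folder): log every file path under a subtree:
//   for await (const entry of crawl({ client, root: "/remote.php/dav/files/admin/Docs" })) {
//     console.log(entry.href);
//   }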

// Map a WebDAV entry onto the Elasticsearch document shape (metadata only;
// `content` and `title` are filled in later by the ingest loop).
function toIndexDoc(e: WebDavEntry): IndexDocument {
  const p = normalizePath(e.href);
  const doc: IndexDocument = {
    id: pathToId(p),
    name: e.name || basename(p),
    path: p,
    parentPath: getParent(p)!,
    sizeBytes: typeof e.size === "number" ? e.size : 0,
    mimeType: e.contentType || "application/octet-stream",
    owner: env.NEXTCLOUD_USERNAME,
    modifiedAt: e.lastmod,
    etag: e.etag,
    previewAvailable: false,
    tags: [],
  };
  return doc;
}
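
// Example (illustrative entry; unrelated fields elided):
//   toIndexDoc({ href: "/remote.php/dav/files/admin/Docs/a.txt", name: "a.txt", size: 42, ... })
//   // => { name: "a.txt", path: "/remote.php/dav/files/admin/Docs/a.txt",
//   //      parentPath: "/remote.php/dav/files/admin/Docs", sizeBytes: 42, ... }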

// Parse `--key=value` and bare `--flag` CLI arguments into a record.
function parseArgs(argv: string[]) {
  const out: Record<string, string | boolean> = {};
  for (const arg of argv) {
    if (!arg.startsWith("--")) continue;
    const body = arg.slice(2);
    const eq = body.indexOf("=");
    // Split on the first "=" only, so values may themselves contain "="
    if (eq === -1) out[body] = true;
    else out[body.slice(0, eq)] = body.slice(eq + 1);
  }
  return out;
}
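
// Example ("--verbose" is a hypothetical flag):
//   parseArgs(["--root=/remote.php/dav/files/admin/Docs", "--verbose"])
//   // => { root: "/remote.php/dav/files/admin/Docs", verbose: true }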

async function main() {
  const args = parseArgs(process.argv.slice(2));
  const root = normalizePath(
    typeof args.root === "string" && args.root.length > 1 ? args.root : env.NEXTCLOUD_ROOT_PATH,
  );
  console.log(`[ingest] Start crawl from: ${root}`);

  // Ensure the index exists (creates mappings and alias if missing)
  await ensureIndex();

  // Create a client rooted at NEXTCLOUD_ROOT_PATH (paths resolve as absolute)
  const client = new NextcloudClient(env.NEXTCLOUD_ROOT_PATH);

  let count = 0;
  for await (const e of crawl({ client, root })) {
    try {
      // Build the document from DAV metadata
      const baseDoc = toIndexDoc(e);

      // Try Tika extraction (optional)
      let contentText: string | undefined;
      try {
        const stream = await client.downloadStream(e.href);
        const { text } = await tikaExtractText(stream, e.contentType);
        if (text && text.trim().length > 0) contentText = text;
      } catch (err) {
        // Non-fatal; proceed without content
        Sentry.captureException(err);
      }

      const doc: IndexDocument = {
        ...baseDoc,
        content: contentText,
        title: baseDoc.name,
      };

      // Index into Elasticsearch
      await indexDocument(doc);
      count++;
      if (count % 50 === 0) {
        console.log(`[ingest] Indexed ${count} documents`);
      }
    } catch (err) {
      console.error(`[ingest] Failed for ${e.href}:`, err);
      Sentry.captureException(err);
    }
  }

  console.log(`[ingest] Done. Indexed ${count} documents.`);
}

main().catch((err) => {
  console.error("[ingest] Fatal error:", err);
  process.exit(1);
});