#!/usr/bin/env tsx
/**
* Ingest Nextcloud files into Elasticsearch (BM25 baseline).
*
* Flow:
* - Crawl Nextcloud via WebDAV starting from NEXTCLOUD_ROOT_PATH
* - For each file, build an IndexDocument
* - Optionally extract text with Apache Tika if TIKA_BASE_URL is set
* - Index into Elasticsearch using env.ELASTICSEARCH_INDEX/ALIAS
*
* Usage:
* npm run ingest:nextcloud
* npm run ingest:nextcloud -- --root=/remote.php/dav/files/admin/SomeFolder
*
* Notes:
* - Requires .env.local with Nextcloud + Elasticsearch creds
* - Tika is optional; if not configured or fails, content is omitted
*/
import * as Sentry from "@sentry/nextjs";
import { env } from "@/lib/env";
import { nextcloud, NextcloudClient } from "@/lib/webdav";
import { basename, joinPath, normalizePath, pathToId, parentPath as getParent } from "@/lib/paths";
import type { WebDavEntry } from "@/types/files";
import type { IndexDocument } from "@/types/ingest";
import { indexDocument, ensureIndex } from "@/lib/elasticsearch";
// Simple fetch helper for Node (global fetch supported in Next runtime)
/**
 * Extract plain text from a file stream via an Apache Tika Server.
 *
 * Best-effort: returns `{}` (no text) when Tika is not configured or the
 * extraction fails for any reason; failures are reported to Sentry but never
 * propagate, so ingestion continues without content.
 *
 * @param stream - Readable stream of the file body (from WebDAV download).
 * @param contentType - Optional MIME type hint forwarded to Tika.
 * @returns `{ text }` on success, `{}` when extraction was skipped or failed.
 */
async function tikaExtractText(stream: NodeJS.ReadableStream, contentType?: string): Promise<{ text?: string; meta?: Record<string, unknown> }> {
  if (!env.TIKA_BASE_URL) return {};
  try {
    // Tika Server v2: PUT /tika with "Accept: text/plain" returns the
    // extracted plain text. (The previous "/tika/text" path is a JSON
    // endpoint, so reading it with res.text() yielded JSON, not content.)
    const url = `${env.TIKA_BASE_URL.replace(/\/$/, "")}/tika`;
    const res = await fetch(url, {
      method: "PUT",
      headers: {
        Accept: "text/plain",
        ...(contentType ? { "Content-Type": contentType } : {}),
      },
      body: stream as unknown as BodyInit,
      // Node's fetch (undici) requires half-duplex mode for streaming
      // request bodies; without this the request rejects at runtime.
      duplex: "half",
    } as RequestInit);
    if (!res.ok) {
      throw new Error(`Tika extraction failed: ${res.status} ${res.statusText}`);
    }
    const text = await res.text();
    return { text };
  } catch (err) {
    // Non-fatal: report and let the caller index the document without content.
    Sentry.captureException(err);
    return {};
  }
}
/** Arguments for {@link crawl}: the WebDAV client and the directory path to start from. */
type CrawlOpts = {
client: NextcloudClient;
root: string;
};
/**
 * Depth-first walk of the Nextcloud tree rooted at `root`, yielding every
 * non-directory WebDAV entry discovered via the client's directory listings.
 */
async function* crawl({ client, root }: CrawlOpts): AsyncGenerator<WebDavEntry> {
  const pending: string[] = [root];
  while (pending.length > 0) {
    const dir = pending.pop()!;
    for (const entry of await client.listDirectory(dir)) {
      // PROPFIND responses may echo the listed directory itself; skip that self-entry.
      const isSelf = entry.isDirectory && normalizePath(entry.href) === normalizePath(dir);
      if (isSelf) continue;
      if (entry.isDirectory) {
        pending.push(entry.href);
      } else {
        yield entry;
      }
    }
  }
}
/**
 * Map a WebDAV entry onto the Elasticsearch document shape.
 * Missing metadata falls back to safe defaults (size 0, octet-stream MIME).
 */
function toIndexDoc(e: WebDavEntry): IndexDocument {
  const path = normalizePath(e.href);
  return {
    id: pathToId(path),
    name: e.name || basename(path),
    path,
    // NOTE(review): assumes the crawled root itself is never indexed, so a
    // parent path always exists — TODO confirm getParent's behavior at "/".
    parentPath: getParent(path)!,
    sizeBytes: typeof e.size === "number" ? e.size : 0,
    mimeType: e.contentType || "application/octet-stream",
    owner: env.NEXTCLOUD_USERNAME,
    modifiedAt: e.lastmod,
    etag: e.etag,
    previewAvailable: false,
    tags: [],
  };
}
/**
 * Parse `--key=value` / `--flag` style CLI arguments.
 *
 * @param argv - Raw argument list (typically `process.argv.slice(2)`).
 * @returns Map from flag name to its string value, or `true` for bare flags.
 *          Arguments not starting with `--` are ignored.
 */
function parseArgs(argv: string[]): Record<string, string | boolean> {
  const out: Record<string, string | boolean> = {};
  for (const arg of argv) {
    if (!arg.startsWith("--")) continue;
    const body = arg.slice(2);
    // Split on the FIRST '=' only, so values containing '=' (e.g. a path
    // with a query string) survive intact. The previous split("=") silently
    // dropped everything after a second '='.
    const eq = body.indexOf("=");
    if (eq === -1) {
      out[body] = true;
    } else {
      out[body.slice(0, eq)] = body.slice(eq + 1);
    }
  }
  return out;
}
/**
 * Entry point: crawl Nextcloud from the requested root and index every file
 * into Elasticsearch, optionally enriching documents with Tika-extracted text.
 * Per-file failures are logged and reported but do not stop the run.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  // `--root=...` overrides the configured root; single-character values
  // (e.g. "/") fall back to NEXTCLOUD_ROOT_PATH — presumably to reject a
  // bare-root crawl — TODO confirm intent.
  const root = normalizePath(
    typeof args.root === "string" && args.root.length > 1 ? args.root : env.NEXTCLOUD_ROOT_PATH,
  );
  console.log(`[ingest] Start crawl from: ${root}`);
  // Ensure index exists (creates mappings and alias if missing)
  await ensureIndex();
  // Create a client rooted at NEXTCLOUD_ROOT_PATH (resolve uses absolute)
  const client = new NextcloudClient(env.NEXTCLOUD_ROOT_PATH);
  let count = 0;
  for await (const e of crawl({ client, root })) {
    try {
      // Build document from DAV metadata
      const baseDoc = toIndexDoc(e);
      // Text extraction is optional. Gate the download on TIKA_BASE_URL:
      // previously every file body was fully downloaded even when Tika was
      // not configured, only for tikaExtractText to return {} immediately.
      let contentText: string | undefined;
      if (env.TIKA_BASE_URL) {
        try {
          const stream = await client.downloadStream(e.href);
          const { text } = await tikaExtractText(stream, e.contentType);
          if (text && text.trim().length > 0) contentText = text;
        } catch (err) {
          // Non-fatal; index the document without content.
          Sentry.captureException(err);
        }
      }
      const doc: IndexDocument = {
        ...baseDoc,
        content: contentText,
        title: baseDoc.name,
      };
      // Index into ES
      await indexDocument(doc);
      count++;
      if (count % 50 === 0) {
        console.log(`[ingest] Indexed ${count} documents`);
      }
    } catch (err) {
      console.error(`[ingest] Failed for ${e.href}:`, err);
      Sentry.captureException(err);
    }
  }
  console.log(`[ingest] Done. Indexed ${count} documents.`);
}
// Kick off the ingest run; any unhandled error is fatal for the script.
void main().catch((err: unknown) => {
  console.error("[ingest] Fatal error:", err);
  process.exit(1);
});