jan/extensions/inference-nitro-extension/src/node/index.ts

import fs from 'fs'
import path from 'path'
import { ChildProcessWithoutNullStreams, spawn } from 'child_process'
import tcpPortUsed from 'tcp-port-used'
import fetchRT from 'fetch-retry'
import {
  log,
  getSystemResourceInfo,
  Model,
  InferenceEngine,
  ModelSettingParams,
  PromptTemplate,
  SystemInformation,
  getJanDataFolderPath,
} from '@janhq/core/node'
import { executableNitroFile } from './execute'
import terminate from 'terminate'
import decompress from 'decompress'

// Wrap the global fetch with fetch-retry so callers in this module can pass
// `retries` / `retryDelay` alongside the regular request options.
const fetchRetry = fetchRT(fetch)

/**
 * Input accepted by `loadModel`.
 */
interface ModelInitOptions {
  // Folder of the model being started; `loadModel` lists this folder to locate
  // the model file and resolves a relative `mmproj` setting against it.
  modelFolder: string
  // Model descriptor; `loadModel` reads its engine, id, sources and settings.
  model: Model
}
// TCP port handed to the Nitro subprocess on its command line and polled by
// the port checks in this module
const PORT = 3928
// Host address handed to the Nitro subprocess on its command line
const LOCAL_HOST = '127.0.0.1'
// Base URL of the Nitro subprocess HTTP server
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${PORT}`
// Endpoint that receives the model settings (POST) to load a model
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/loadmodel`
// Endpoint queried (POST) to check whether a model is loaded
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/modelstatus`
// Endpoint called (DELETE) to ask the Nitro subprocess to shut itself down
const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`

// Retry interval passed to the tcp-port-used wait helpers
// (presumably milliseconds - confirm against the tcp-port-used documentation)
const NITRO_PORT_FREE_CHECK_INTERVAL = 100

// Substring used to recognise a model file inside the model folder
// TODO: Should be an array to support more models
const SUPPORTED_MODEL_FORMAT = '.gguf'

// Handle of the spawned Nitro subprocess; undefined until one is spawned and
// reset to undefined when the subprocess emits `close`
let subprocess: ChildProcessWithoutNullStreams | undefined = undefined

// Settings prepared by the latest `loadModel` call; sent to the load-model
// endpoint and consulted (ngl) when choosing the run mode at spawn time
let currentSettings: (ModelSettingParams & { model?: string }) | undefined =
  undefined

/**
 * Unloads the running model by shutting down the Nitro subprocess.
 * Takes no arguments; the work is delegated entirely to `killSubprocess`.
 * @returns The promise produced by `killSubprocess`: it resolves once the subprocess is gone and rejects when it could not be stopped.
 */
function unloadModel(): Promise<void> {
  const shutdown = killSubprocess()
  return shutdown
}

/**
 * Prepares the settings for a model and (re)starts the Nitro subprocess with it.
 *
 * Steps:
 * 1. Ignore models whose engine is not `InferenceEngine.nitro`.
 * 2. Split `settings.prompt_template` into system/user/ai prompts. The result is
 *    written back onto `params.model.settings`.
 * 3. Resolve the model file path (see the inline comments for the lookup order).
 * 4. Store the merged settings in `currentSettings` and hand over to
 *    `runNitroAndLoadModel`.
 *
 * @param params - The model folder and the model descriptor.
 * @param systemInfo - Optional system information forwarded to the spawn step.
 * @returns Resolves with nothing for non-nitro models, otherwise with the result of
 * `runNitroAndLoadModel`. Rejects with a string when the prompt template cannot be
 * split or when no model file can be found.
 * TODO: Should pass absolute of the model file instead of just the name - So we can modurize the module.ts to npm package
 */
async function loadModel(
  params: ModelInitOptions,
  systemInfo?: SystemInformation
): Promise<ModelOperationResponse | void> {
  if (params.model.engine !== InferenceEngine.nitro) {
    // Not a nitro model
    return
  }

  const nitroResourceProbe = await getSystemResourceInfo()

  // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
  if (params.model.settings.prompt_template) {
    const promptTemplate = params.model.settings.prompt_template
    const prompt = promptTemplateConverter(promptTemplate)
    if (prompt?.error) {
      return Promise.reject(prompt.error)
    }
    params.model.settings.system_prompt = prompt.system_prompt
    params.model.settings.user_prompt = prompt.user_prompt
    params.model.settings.ai_prompt = prompt.ai_prompt
  }

  // modelFolder is the absolute path to the running model folder
  // e.g. ~/jan/models/llama-2
  const modelFolder = params.modelFolder

  let llama_model_path = params.model.settings.llama_model_path

  // Absolute model path support: when every source url exists on disk, use the
  // source itself (the only one, or the one whose url contains the predefined
  // file name / model id).
  const sources = params.model.sources ?? []
  if (sources.length && sources.every((e) => fs.existsSync(e.url))) {
    const preferredName = llama_model_path ?? params.model.id
    llama_model_path =
      sources.length === 1
        ? sources[0].url
        : sources.find((e) => e.url.includes(preferredName))?.url
  }

  if (!llama_model_path || !path.isAbsolute(llama_model_path)) {
    // Look for the model file inside the model folder. Each rule is tried over the
    // whole listing before falling back to the next one, so the priority does not
    // depend on the order in which the directory entries are returned.
    const modelFiles: string[] = fs.readdirSync(modelFolder)
    const predefinedName = llama_model_path
    const ggufBinFile =
      // 1. Prioritize llama_model_path (predefined)
      (predefinedName
        ? modelFiles.find((file) => file === predefinedName)
        : undefined) ??
      // 2. Prioritize GGUF File (manual import)
      modelFiles.find((file) =>
        file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT)
      ) ??
      // 3. Fallback Model ID (for backward compatibility)
      modelFiles.find((file) => file === params.model.id)
    if (ggufBinFile) llama_model_path = path.join(modelFolder, ggufBinFile)
  }

  if (!llama_model_path) return Promise.reject('No GGUF model file found')

  currentSettings = {
    // This is critical and requires real CPU physical core count (or performance core)
    cpu_threads: Math.max(1, nitroResourceProbe.numCpuPhysicalCore),
    // model.settings can override the default settings
    ...params.model.settings,
    llama_model_path,
    model: params.model.id,
    // A relative mmproj is resolved against the model folder
    ...(params.model.settings.mmproj && {
      mmproj: path.isAbsolute(params.model.settings.mmproj)
        ? params.model.settings.mmproj
        : path.join(modelFolder, params.model.settings.mmproj),
    }),
  }
  return runNitroAndLoadModel(params.model.id, systemInfo)
}

/**
 * Restarts Nitro and loads the current model into it:
 * 1. Stop any running subprocess and wait for the port to be free
 * 2. Spawn the Nitro process
 * 3. Send `currentSettings` to the load-model endpoint
 * 4. Validate the model status
 * @param modelId - Id of the model whose status is validated at the end.
 * @param systemInfo - Optional system information forwarded to the spawn step.
 * @returns Resolves with nothing on success; on any failure the error is logged and
 * the promise resolves with `{ error }` instead of rejecting.
 */
async function runNitroAndLoadModel(
  modelId: string,
  systemInfo?: SystemInformation
) {
  try {
    await killSubprocess()
    await tcpPortUsed.waitUntilFree(PORT, NITRO_PORT_FREE_CHECK_INTERVAL, 5000)
    await spawnNitroProcess(systemInfo)
    await loadLLMModel(currentSettings)
    return await validateModelStatus(modelId)
    // `any` keeps the same loosely typed rejection reason a promise `.catch` callback receives
  } catch (err: any) {
    // TODO: Broadcast error so app could display proper error message
    log(`[CORTEX]::Error: ${err}`)
    return { error: err }
  }
}

/**
 * Splits a prompt template into the prompt fragments used as model settings.
 *
 * - With both `{system_message}` and `{prompt}`: the text before the system marker
 *   is `system_prompt`, the text between the two markers is `user_prompt`, and the
 *   text after the prompt marker is `ai_prompt`.
 * - With only `{prompt}`: the text before it is `user_prompt`, the text after it is
 *   `ai_prompt`.
 * - Without `{prompt}`: an object carrying only an `error` message.
 *
 * @param promptTemplate Template as string
 * @returns The split fragments, or `{ error }` when the template cannot be split
 */
function promptTemplateConverter(promptTemplate: string): PromptTemplate {
  const systemToken = '{system_message}'
  const promptToken = '{prompt}'

  // Nothing can be split without the prompt marker
  const promptAt = promptTemplate.indexOf(promptToken)
  if (promptAt === -1) {
    return { error: 'Cannot split prompt template' }
  }

  // Everything after the prompt marker belongs to the assistant in both layouts
  const ai_prompt = promptTemplate.substring(promptAt + promptToken.length)

  const systemAt = promptTemplate.indexOf(systemToken)
  if (systemAt === -1) {
    // Only the prompt marker is present
    const user_prompt = promptTemplate.substring(0, promptAt)
    return { user_prompt, ai_prompt }
  }

  // Both markers are present
  const system_prompt = promptTemplate.substring(0, systemAt)
  const user_prompt = promptTemplate.substring(
    systemAt + systemToken.length,
    promptAt
  )
  return { system_prompt, user_prompt, ai_prompt }
}

/**
 * Loads a LLM model into the Nitro subprocess by sending a HTTP POST request
 * with the given settings as the JSON body.
 *
 * `ngl` defaults to 100 only when it is absent (null/undefined). An explicit `0` is
 * kept as is: `spawnNitroProcess` treats `ngl === 0` as a request for CPU mode, so
 * it must not be overwritten here. The default is written onto the passed-in object.
 *
 * The request is retried up to 3 times with a 300 delay between attempts. The HTTP
 * status is logged but not checked here.
 *
 * @param settings - The model settings to send.
 * @returns A Promise that resolves with the HTTP response, or rejects when no
 * settings are provided or the request keeps failing.
 */
function loadLLMModel(settings: any): Promise<Response> {
  if (settings == null) {
    // Reject instead of throwing synchronously on `settings.ngl = ...`
    return Promise.reject(
      new Error('Cannot load model: no model settings were provided')
    )
  }
  if (settings.ngl == null) {
    settings.ngl = 100
  }
  log(`[CORTEX]::Debug: Loading model with params ${JSON.stringify(settings)}`)
  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 300,
  })
    .then((res) => {
      // A Response serializes to "{}" with JSON.stringify, so log the status instead
      log(
        `[CORTEX]::Debug: Load model request completed with status ${res.status} ${res.statusText}`
      )
      return res
    })
    .catch((err) => {
      log(`[CORTEX]::Error: Load model failed with error ${err}`)
      return Promise.reject(err)
    })
}

/**
 * Validates that a model is loaded in the Nitro subprocess.
 *
 * Sends a POST request to the model-status endpoint (retried up to 5 times with a
 * 300 delay between attempts) and checks the `model_loaded` flag of the JSON
 * response.
 *
 * @param modelId - Id of the model to validate.
 * @returns A promise that resolves with nothing when the model is loaded, and
 * rejects with `'Validate model status failed'` when the response is not OK, is
 * not valid JSON, or does not report the model as loaded.
 */
async function validateModelStatus(modelId: string): Promise<void> {
  log(`[CORTEX]::Debug: Validating model ${modelId}`)
  const res: Response = await fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: 'POST',
    body: JSON.stringify({
      model: modelId,
      // TODO: force to use cortex llamacpp by default
      engine: 'cortex.llamacpp',
    }),
    headers: {
      'Content-Type': 'application/json',
    },
    retries: 5,
    retryDelay: 300,
  })
  log(
    `[CORTEX]::Debug: Validate model state with response ${JSON.stringify(
      res.status
    )}`
  )

  // A response body can only be consumed once, so read it as text a single time
  // and parse it manually; the raw text stays available for the failure log.
  const rawBody = await res.text()

  // If the response is OK, check model_loaded status.
  if (res.ok) {
    let body: { model_loaded?: unknown } | null | undefined
    try {
      body = JSON.parse(rawBody)
    } catch {
      // Not JSON: fall through to the failure path below
      body = undefined
    }
    if (body?.model_loaded) {
      log(
        `[CORTEX]::Debug: Validate model state success with response ${JSON.stringify(
          body
        )}`
      )
      return
    }
  }

  log(
    `[CORTEX]::Debug: Validate model state failed with response ${rawBody} and status is ${JSON.stringify(
      res.statusText
    )}`
  )
  return Promise.reject('Validate model status failed')
}

/**
 * Terminates the Nitro subprocess.
 *
 * When a subprocess with a pid is tracked and the platform is not darwin, the pid
 * is terminated first and the port is polled until it is free. If that fails, or
 * in every other case, a DELETE request is sent to the kill endpoint instead and
 * the port is polled again.
 *
 * @returns A Promise that resolves when the port is free again, or rejects with
 * `'PORT_NOT_AVAILABLE'` when the port is still in use after the kill request.
 */
async function killSubprocess(): Promise<void> {
  log(`[CORTEX]::Debug: Request to kill cortex`)

  const killRequest = (): Promise<void> => {
    // The abort timer is created per request, so a fallback request issued after
    // the pid-termination path has already spent its time budget still gets a full
    // 5000 before being aborted. It is cleared as soon as the request settles.
    const controller = new AbortController()
    const abortTimer = setTimeout(() => controller.abort(), 5000)
    return fetch(NITRO_HTTP_KILL_URL, {
      method: 'DELETE',
      signal: controller.signal,
    })
      .finally(() => clearTimeout(abortTimer))
      .catch(() => {}) // Do nothing with this attempt
      .then(() =>
        tcpPortUsed.waitUntilFree(PORT, NITRO_PORT_FREE_CHECK_INTERVAL, 5000)
      )
      .then(() => log(`[CORTEX]::Debug: cortex process is terminated`))
      .catch((err) => {
        log(
          `[CORTEX]::Debug: Could not kill running process on port ${PORT}. Might be another process running on the same port? ${err}`
        )
        throw 'PORT_NOT_AVAILABLE'
      })
  }

  if (subprocess?.pid && process.platform !== 'darwin') {
    log(`[CORTEX]::Debug: Killing PID ${subprocess.pid}`)
    const pid = subprocess.pid
    return new Promise<void>((resolve, reject) => {
      terminate(pid, function (err) {
        if (err) {
          log('[CORTEX]::Failed to kill PID - sending request to kill')
          killRequest().then(resolve).catch(reject)
        } else {
          tcpPortUsed
            .waitUntilFree(PORT, NITRO_PORT_FREE_CHECK_INTERVAL, 5000)
            .then(() => log(`[CORTEX]::Debug: cortex process is terminated`))
            .then(() => resolve())
            .catch(() => {
              log(
                '[CORTEX]::Failed to kill PID (Port check timeout) - sending request to kill'
              )
              killRequest().then(resolve).catch(reject)
            })
        }
      })
    })
  } else {
    return killRequest()
  }
}

/**
 * Spawns a Nitro subprocess and waits until it listens on `PORT`.
 *
 * The engine path is added to `process.env.PATH` and `process.env.LD_LIBRARY_PATH`
 * (only when it is not already listed) before the binary is started.
 *
 * @param systemInfo - Optional system information. Its GPU settings are forwarded
 * to `executableNitroFile`, with `run_mode` forced to `'cpu'` when the current
 * settings have no `ngl` or `ngl === 0`.
 * @returns A promise that resolves when the port is in use. It rejects when the
 * executable cannot be resolved or spawned, when the subprocess closes first, or
 * when the port is not in use after the 30000 wait.
 */
function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
  log(`[CORTEX]::Debug: Spawning cortex subprocess...`)

  return new Promise<void>((resolve, reject) => {
    // The executor is synchronous and wrapped in try/catch so that a failure while
    // preparing or spawning rejects the promise instead of leaving it pending.
    try {
      const executableOptions = executableNitroFile(
        // If ngl is not set or equal to 0, run on CPU with correct instructions
        systemInfo?.gpuSetting
          ? {
              ...systemInfo.gpuSetting,
              run_mode:
                currentSettings?.ngl === undefined || currentSettings.ngl === 0
                  ? 'cpu'
                  : systemInfo.gpuSetting.run_mode,
            }
          : undefined
      )

      const args: string[] = ['1', LOCAL_HOST, PORT.toString()]
      // Execute the binary
      log(
        `[CORTEX]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
      )
      log(
        `[CORTEX]::Debug: Cortex engine path: ${executableOptions.enginePath}`
      )

      // Appends the engine path to a search-path variable once: no leading
      // delimiter when the variable is empty, no duplicate on repeated spawns.
      const withEnginePath = (current: string | undefined): string => {
        if (!current) return executableOptions.enginePath
        return current
          .split(path.delimiter)
          .includes(executableOptions.enginePath)
          ? current
          : `${current}${path.delimiter}${executableOptions.enginePath}`
      }

      // Add engine path to the PATH and LD_LIBRARY_PATH
      process.env.PATH = withEnginePath(process.env.PATH)
      log(`[CORTEX] PATH: ${process.env.PATH}`)
      process.env.LD_LIBRARY_PATH = withEnginePath(process.env.LD_LIBRARY_PATH)

      const child: ChildProcessWithoutNullStreams = spawn(
        executableOptions.executablePath,
        args,
        {
          cwd: path.join(path.parse(executableOptions.executablePath).dir),
          env: {
            ...process.env,
            ENGINE_PATH: executableOptions.enginePath,
            CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
            // Vulkan - Support 1 device at a time for now
            ...(executableOptions.vkVisibleDevices?.length > 0 && {
              GGML_VULKAN_DEVICE: executableOptions.vkVisibleDevices[0],
            }),
          },
        }
      )
      subprocess = child

      // Handle subprocess output
      child.stdout.on('data', (data: any) => {
        log(`[CORTEX]::Debug: ${data}`)
      })

      child.stderr.on('data', (data: any) => {
        log(`[CORTEX]::Error: ${data}`)
      })

      // Without an 'error' listener a failed spawn is thrown as an uncaught exception
      child.on('error', (err: Error) => {
        log(`[CORTEX]::Error: cortex subprocess error: ${err}`)
        reject(err)
      })

      child.on('close', (code: any) => {
        log(`[CORTEX]::Debug: cortex exited with code: ${code}`)
        // Only clear the handle if it still refers to this child, so a late close
        // of an old process cannot drop the handle of a newer one.
        if (subprocess === child) subprocess = undefined
        reject(`child process exited with code ${code}`)
      })

      tcpPortUsed
        .waitUntilUsed(PORT, NITRO_PORT_FREE_CHECK_INTERVAL, 30000)
        .then(() => {
          log(`[CORTEX]::Debug: cortex is ready`)
          resolve()
        })
        // Surface the port-wait timeout instead of leaving an unhandled rejection
        .catch(reject)
    } catch (err) {
      reject(err)
    }
  })
}

/**
 * Every module should have a dispose function
 * This will be called when the extension is unloaded and should clean up any resources
 * Also called when app is closed
 *
 * The subprocess shutdown is started but not awaited. A failure is logged so that it
 * does not surface as an unhandled promise rejection.
 */
function dispose() {
  // clean other registered resources here
  void killSubprocess().catch((err) => {
    log(`[CORTEX]::Error: Failed to stop cortex on dispose: ${err}`)
  })
}

/**
 * Nitro process info, as returned by `getCurrentNitroProcessInfo`.
 */
export interface NitroProcessInfo {
  // True while this module holds a handle to a spawned Nitro subprocess
  isRunning: boolean
}

/**
 * Reports whether this module currently tracks a Nitro subprocess.
 * @returns `{ isRunning }`, true when the subprocess handle is neither null nor undefined
 */
const getCurrentNitroProcessInfo = (): NitroProcessInfo => {
  const hasHandle = subprocess !== undefined && subprocess !== null
  return { isRunning: hasHandle }
}

/**
 * Adds the folder `<jan data folder>/engines/<name>/<version>` to
 * `process.env.PATH` and `process.env.LD_LIBRARY_PATH`.
 *
 * The folder is appended with a single delimiter, never as a leading or empty
 * entry, and only when it is not already listed, so repeated calls do not grow
 * the variables.
 *
 * @param data - Name and version of the dependency; both are used as path segments.
 */
const addAdditionalDependencies = (data: { name: string; version: string }) => {
  log(
    `[CORTEX]::Debug: Adding additional dependencies for ${data.name} ${data.version}`
  )
  const additionalPath = path.join(
    getJanDataFolderPath(),
    'engines',
    data.name,
    data.version
  )

  // Returns the search path with additionalPath listed exactly once
  const withAdditionalPath = (current: string | undefined): string => {
    if (!current) return additionalPath
    return current.split(path.delimiter).includes(additionalPath)
      ? current
      : `${current}${path.delimiter}${additionalPath}`
  }

  // Set the updated PATH
  process.env.PATH = withAdditionalPath(process.env.PATH)
  process.env.LD_LIBRARY_PATH = withAdditionalPath(process.env.LD_LIBRARY_PATH)
}

/**
 * Extracts an archive into a folder using `decompress`.
 * Failures are written to the console and not rethrown, so the returned promise
 * resolves in both cases.
 * @param zipPath - Path of the archive to extract
 * @param output - Destination folder
 */
const decompressRunner = async (zipPath: string, output: string) => {
  console.debug(`Decompressing ${zipPath} to ${output}...`)
  try {
    const extractedEntries = await decompress(zipPath, output)
    console.debug('Decompress finished!', extractedEntries)
  } catch (failure) {
    // Best effort: report the problem and let the caller continue
    console.error(`Decompress ${zipPath} failed: ${failure}`)
  }
}

// Functions exposed by this module as its default export
export default {
  loadModel,
  unloadModel,
  dispose,
  getCurrentNitroProcessInfo,
  addAdditionalDependencies,
  decompressRunner,
}