fix: #3549, #3552 - Inference on CPU is slower on Jan 0.5.3 (#3602)

2024-09-11 14:03:53 +07:00 · 2024-09-11 14:03:53 +07:00 · 5217437912
commit 5217437912
parent 2d05134cb1
12 changed files with 921 additions and 374 deletions
--- a/.github/workflows/nightly-integrate-cortex-cpp.yml
+++ b/.github/workflows/nightly-integrate-cortex-cpp.yml
@ -51,13 +51,13 @@ jobs:
        latest_prerelease_asset_count=$(get_asset_count "$latest_prerelease_name")
        if [ "$current_version_name" = "$latest_prerelease_name" ]; then
-          echo "cortex cpp remote repo doesn't have update today, skip update cortex-cpp for today nightly build"
+          echo "cortex cpp remote repo doesn't have update today, skip update cortex.cpp for today nightly build"
          echo "::set-output name=pr_created::false"
          exit 0
        fi
        if [ "$current_version_asset_count" != "$latest_prerelease_asset_count" ]; then
-          echo "Latest prerelease version has different number of assets, somethink went wrong, skip update cortex-cpp for today nightly build"
+          echo "Latest prerelease version has different number of assets, somethink went wrong, skip update cortex.cpp for today nightly build"
          echo "::set-output name=pr_created::false"
          exit 1
        fi
--- a/core/src/node/api/restful/helper/startStopModel.ts
+++ b/core/src/node/api/restful/helper/startStopModel.ts
@ -1,31 +1,13 @@
 import fs from 'fs'
 import { join } from 'path'
-import {
+import { getJanDataFolderPath, getJanExtensionsPath, log } from '../../../helper'
-  getJanDataFolderPath,
+import { ModelSettingParams } from '../../../../types'
  getJanExtensionsPath,
  getSystemResourceInfo,
  log,
 } from '../../../helper'
 import { ChildProcessWithoutNullStreams, spawn } from 'child_process'
 import { Model, ModelSettingParams, PromptTemplate } from '../../../../types'
 import {
  LOCAL_HOST,
  NITRO_DEFAULT_PORT,
  NITRO_HTTP_KILL_URL,
  NITRO_HTTP_LOAD_MODEL_URL,
  NITRO_HTTP_VALIDATE_MODEL_URL,
  SUPPORTED_MODEL_FORMAT,
 } from './consts'
 // The subprocess instance for Nitro
 let subprocess: ChildProcessWithoutNullStreams | undefined = undefined
 // TODO: move this to core type
 /**
 * Settings object serialized as the body of the Nitro load-model request:
 * the generic model setting params plus the two fields below.
 */
 interface NitroModelSettings extends ModelSettingParams {
  // Path of the model binary to load (runModel fills in the GGUF file found in the model folder)
  llama_model_path: string
  // CPU thread count (runModel defaults it to the physical core count, never below 1)
  cpu_threads: number
 }
 /**
 * Start a model
 * @param modelId
 * @param settingParams
 * @returns
 */
 export const startModel = async (modelId: string, settingParams?: ModelSettingParams) => {
  try {
    await runModel(modelId, settingParams)
@ -40,316 +22,57 @@ export const startModel = async (modelId: string, settingParams?: ModelSettingPa
  }
 }
 /**
 * Locate a model inside the Jan data folder, build its Nitro settings and
 * pass them to runNitroAndLoadModel.
 *
 * @param modelId name of the folder under `<jan data folder>/models`
 * @param settingParams optional overrides spread on top of the model.json settings
 * @throws Error when the model folder is missing, no GGUF file is present,
 *         or the prompt template cannot be split
 */
 const runModel = async (modelId: string, settingParams?: ModelSettingParams): Promise<void> => {
  const modelDir = join(getJanDataFolderPath(), 'models', modelId)
  if (!fs.existsSync(modelDir)) {
    throw new Error(`Model not found: ${modelId}`)
  }
  // First directory entry whose lower-cased name contains the supported format marker
  const ggufFileName = fs
    .readdirSync(modelDir)
    .find((entry) => entry.toLowerCase().includes(SUPPORTED_MODEL_FORMAT))
  // model.json is read before the GGUF check, so unreadable metadata fails first
  const metadata: Model = JSON.parse(fs.readFileSync(join(modelDir, 'model.json'), 'utf-8'))
  if (!ggufFileName) {
    throw new Error('No GGUF model file found')
  }
  const binaryPath = join(modelDir, ggufFileName)
  const resources = await getSystemResourceInfo()
  // Later spreads win: defaults < model.json settings < caller overrides < resolved paths
  const nitroModelSettings: NitroModelSettings = {
    // This is critical and requires real CPU physical core count (or performance core)
    cpu_threads: Math.max(1, resources.numCpuPhysicalCore),
    ...metadata.settings,
    ...settingParams,
    llama_model_path: binaryPath,
    // A relative mmproj entry is resolved against the model folder
    ...(metadata.settings.mmproj
      ? { mmproj: join(modelDir, metadata.settings.mmproj) }
      : {}),
  }
  log(`[SERVER]::Debug: Nitro model settings: ${JSON.stringify(nitroModelSettings)}`)
  // Translate settings.prompt_template into system_prompt / user_prompt / ai_prompt
  const template = metadata.settings.prompt_template
  if (template) {
    const pieces = promptTemplateConverter(template)
    if (pieces?.error) {
      throw new Error(pieces.error)
    }
    nitroModelSettings.system_prompt = pieces.system_prompt
    nitroModelSettings.user_prompt = pieces.user_prompt
    nitroModelSettings.ai_prompt = pieces.ai_prompt
  }
  await runNitroAndLoadModel(modelId, nitroModelSettings)
 }
 // TODO: move to util
 /**
 * Split a prompt template around its `{system_message}` and `{prompt}` markers.
 *
 * @param promptTemplate template text containing the markers
 * @returns `system_prompt`, `user_prompt` and `ai_prompt` when both markers are
 *          present; `user_prompt` and `ai_prompt` when only `{prompt}` is
 *          present; otherwise an object carrying an `error` message
 */
 const promptTemplateConverter = (promptTemplate: string): PromptTemplate => {
  const systemToken = '{system_message}'
  const promptToken = '{prompt}'
  const promptAt = promptTemplate.indexOf(promptToken)
  // Without the prompt marker there is nothing to split on
  if (promptAt === -1) {
    return { error: 'Cannot split prompt template' }
  }
  // Everything after the prompt marker is the assistant prefix in both remaining cases
  const ai_prompt = promptTemplate.substring(promptAt + promptToken.length)
  const systemAt = promptTemplate.indexOf(systemToken)
  if (systemAt === -1) {
    const user_prompt = promptTemplate.substring(0, promptAt)
    return { user_prompt, ai_prompt }
  }
  return {
    system_prompt: promptTemplate.substring(0, systemAt),
    user_prompt: promptTemplate.substring(systemAt + systemToken.length, promptAt),
    ai_prompt,
  }
 }
 /**
 * Restart the inference subprocess and load a model into it.
 * Steps run strictly in order: stop the current model, wait for the port to
 * be released, spawn the process, send the load request, validate the result.
 * @param modelId id forwarded to stopModel
 * @param modelSettings settings forwarded to loadLLMModel
 */
 const runNitroAndLoadModel = async (modelId: string, modelSettings: NitroModelSettings) => {
  // Stop whatever is running and wait until NITRO_DEFAULT_PORT is free again
  // (300 / 5000 are presumably retry interval and timeout in ms — confirm against tcp-port-used)
  const tcpPortUsed = require('tcp-port-used')
  await stopModel(modelId)
  await tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000)
  /**
   * There is a problem with the Windows process manager:
   * wait a while to make sure the port is free and the subprocess is killed.
   * The tested threshold is 500ms
   **/
  if (process.platform === 'win32') {
    await new Promise((resolve) => setTimeout(resolve, 500))
  }
  await spawnNitroProcess()
  await loadLLMModel(modelSettings)
  await validateModelStatus()
 }
 /**
 * Spawn the cortex binary as a child process and wire its output to the log.
 *
 * The process is started with LOCAL_HOST and NITRO_DEFAULT_PORT as arguments
 * and with CUDA_VISIBLE_DEVICES taken from executableNitroFile(). The module
 * level `subprocess` handle is set here and cleared when the process closes.
 *
 * The returned promise resolves right after spawning. Port readiness is only
 * logged, not awaited; a readiness timeout is logged as an error instead of
 * surfacing as an unhandled promise rejection.
 */
 const spawnNitroProcess = async (): Promise<void> => {
  log(`[SERVER]::Debug: Spawning cortex subprocess...`)
  const binaryFolder = join(
    getJanExtensionsPath(),
    '@janhq',
    'inference-cortex-extension',
    'dist',
    'bin'
  )
  const executableOptions = executableNitroFile()
  const tcpPortUsed = require('tcp-port-used')
  // Single source of truth for the arguments: logged and passed to spawn
  const args: string[] = ['1', LOCAL_HOST, NITRO_DEFAULT_PORT.toString()]
  // Execute the binary
  log(
    `[SERVER]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
  )
  subprocess = spawn(executableOptions.executablePath, args, {
    cwd: binaryFolder,
    env: {
      ...process.env,
      CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
    },
  })
  // Handle subprocess output
  subprocess.stdout.on('data', (data: unknown) => {
    log(`[SERVER]::Debug: ${data}`)
  })
  subprocess.stderr.on('data', (data: unknown) => {
    log(`[SERVER]::Error: ${data}`)
  })
  subprocess.on('close', (code: unknown) => {
    log(`[SERVER]::Debug: cortex exited with code: ${code}`)
    // Drop the handle so later code sees that nothing is running
    subprocess = undefined
  })
  // Fire-and-forget readiness probe; the catch keeps a timeout from becoming
  // an unhandled rejection
  tcpPortUsed
    .waitUntilUsed(NITRO_DEFAULT_PORT, 300, 30000)
    .then(() => {
      log(`[SERVER]::Debug: cortex is ready`)
    })
    .catch((err: unknown) => {
      log(`[SERVER]::Error: cortex did not become ready: ${err}`)
    })
 }
 /**
 * Result of executableNitroFile(): what to launch and which GPU to expose.
 */
 type NitroExecutableOptions = {
  // Full path of the cortex-cpp binary for the current platform
  executablePath: string
  // Value for CUDA_VISIBLE_DEVICES; empty string when no GPU is selected
  cudaVisibleDevices: string
 }
 /**
 * Work out which cortex-cpp binary to launch on this machine.
 *
 * - macOS: `mac-arm64` when process.arch is arm64, otherwise `mac-amd64`;
 *   settings.json is not read.
 * - Windows / everything else (treated as Linux): settings.json in the Jan
 *   data folder is read; `run_mode === 'cpu'` selects the cpu folder,
 *   otherwise the CUDA 12 or CUDA 11.7 folder is chosen from `cuda.version`
 *   and `gpu_highest_vram` becomes the visible CUDA device.
 *
 * @returns the executable path plus the CUDA device string ('' when unused)
 */
 const executableNitroFile = (): NitroExecutableOptions => {
  const settingsFile = join(getJanDataFolderPath(), 'settings', 'settings.json')
  const binRoot = join(
    getJanExtensionsPath(),
    '@janhq',
    'inference-cortex-extension',
    'dist',
    'bin'
  )
  const onWindows = process.platform === 'win32'

  if (!onWindows && process.platform === 'darwin') {
    // Apple Silicon and Intel builds live in separate folders
    const macFolder = process.arch === 'arm64' ? 'mac-arm64' : 'mac-amd64'
    return {
      executablePath: join(binRoot, macFolder, 'cortex-cpp'),
      cudaVisibleDevices: '',
    }
  }

  // Folder names per platform: cpu, CUDA 12.0 and CUDA 11.7 builds
  const folders = onWindows
    ? { cpu: 'win-cpu', cuda12: 'win-cuda-12-0', cuda11: 'win-cuda-11-7' }
    : { cpu: 'linux-cpu', cuda12: 'linux-cuda-12-0', cuda11: 'linux-cuda-11-7' }
  const binaryName = onWindows ? 'cortex-cpp.exe' : 'cortex-cpp'

  const gpuSettings = JSON.parse(fs.readFileSync(settingsFile, 'utf-8'))
  if (gpuSettings['run_mode'] === 'cpu') {
    return {
      executablePath: join(binRoot, folders.cpu, binaryName),
      cudaVisibleDevices: '',
    }
  }
  // Any CUDA version other than '12' falls back to the 11.7 build
  const cudaFolder = gpuSettings['cuda'].version === '12' ? folders.cuda12 : folders.cuda11
  return {
    executablePath: join(binRoot, cudaFolder, binaryName),
    cudaVisibleDevices: gpuSettings['gpu_highest_vram'],
  }
 }
 /**
 * Check that a model is loaded by sending a GET to
 * NITRO_HTTP_VALIDATE_MODEL_URL through fetch-retry
 * (configured with `retries: 5` and `retryDelay: 500`).
 *
 * @returns resolves when the response is OK and its JSON body has a truthy
 *          `model_loaded`; rejects with the string
 *          'Validate model status failed' otherwise
 */
 const validateModelStatus = async (): Promise<void> => {
  const fetchRT = require('fetch-retry')
  const retryingFetch = fetchRT(fetch)
  const res: Response = await retryingFetch(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: 'GET',
    headers: {
      'Content-Type': 'application/json',
    },
    retries: 5,
    retryDelay: 500,
  })
  log(`[SERVER]::Debug: Validate model state success with response ${JSON.stringify(res)}`)
  // Only an OK response whose body reports model_loaded counts as success
  if (res.ok) {
    const body = await res.json()
    if (body.model_loaded) {
      return
    }
  }
  return Promise.reject('Validate model status failed')
 }
 /**
 * POST the model settings as JSON to NITRO_HTTP_LOAD_MODEL_URL through
 * fetch-retry (configured with `retries: 3` and `retryDelay: 500`).
 *
 * @param settings settings serialized as the request body
 * @returns the response of the load request
 * @throws rethrows whatever the request fails with, after logging it
 */
 const loadLLMModel = async (settings: NitroModelSettings): Promise<Response> => {
  log(`[SERVER]::Debug: Loading model with params ${JSON.stringify(settings)}`)
  const fetchRT = require('fetch-retry')
  const retryingFetch = fetchRT(fetch)
  // Built outside the try block so a serialization failure is not logged as a load failure
  const request = {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 500,
  }
  const pending = retryingFetch(NITRO_HTTP_LOAD_MODEL_URL, request)
  try {
    const res = await pending
    log(`[SERVER]::Debug: Load model request with response ${JSON.stringify(res)}`)
    return res
  } catch (err: any) {
    log(`[SERVER]::Error: Load model failed with error ${err}`)
    throw err
  }
 }
 /**
 * Run a model using the installed cortex extension.
 *
 * The extension's node entry point is imported dynamically and its
 * `loadModel` is called with the model folder and model id, so this helper
 * reuses the extension's implementation instead of duplicating it.
 *
 * @param model model id; also the folder name under `<jan data folder>/models`
 * @param settingParams optional settings forwarded to the extension's loadModel
 * @returns resolves (with no value) once the extension reports the model loaded;
 *          rejects if the import or loadModel fails
 */
 const runModel = async (model: string, settingParams?: ModelSettingParams): Promise<void> => {
  const janDataFolderPath = getJanDataFolderPath()
  const modelFolder = join(janDataFolderPath, 'models', model)
  // NOTE(review): the path ends in `index.cjs` — confirm it resolves to the
  // file the extension build actually emits
  const extensionEntry = join(
    getJanExtensionsPath(),
    '@janhq',
    'inference-cortex-extension',
    'dist',
    'node',
    'index.cjs'
  )
  // Just reuse the cortex extension implementation; duplicating it would drift out of sync
  const extension = await import(extensionEntry)
  await extension.loadModel(
    {
      modelFolder,
      model,
    },
    settingParams
  )
  // The previous trailing `.then({ message: 'Model started' })` passed a
  // non-function to then(), which is ignored, so it was dropped.
  log(`[SERVER]::Debug: Model is loaded`)
 }
 /*
 * Stop model and kill nitro process.
 */
 export const stopModel = async (_modelId: string) => {
-  if (!subprocess) {
+  let module = join(
-    return {
+    getJanExtensionsPath(),
-      error: "Model isn't running",
+    '@janhq',
-    }
+    'inference-cortex-extension',
-  }
+    'dist',
-  return new Promise((resolve, reject) => {
+    'node',
-    const controller = new AbortController()
+    'index.cjs'
-    setTimeout(() => {
+  )
-      controller.abort()
+  // Just reuse the cortex extension implementation, don't duplicate then lost of sync
-      reject({
+  return import(module).then((extension) =>
-        error: 'Failed to stop model: Timedout',
+    extension
-      })
+      .unloadModel()
-    }, 5000)
+      .then(() => log(`[SERVER]::Debug: Model is unloaded`))
-    const tcpPortUsed = require('tcp-port-used')
+      .then({
    log(`[SERVER]::Debug: Request to kill cortex`)
    fetch(NITRO_HTTP_KILL_URL, {
      method: 'DELETE',
      signal: controller.signal,
    })
      .then(() => {
        subprocess?.kill()
        subprocess = undefined
      })
      .catch(() => {
        // don't need to do anything, we still kill the subprocess
      })
      .then(() => tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000))
      .then(() => log(`[SERVER]::Debug: Nitro process is terminated`))
      .then(() =>
        resolve({
        message: 'Model stopped',
      })
  )
  })
 }
--- a/extensions/inference-nitro-extension/download.bat
+++ b/extensions/inference-nitro-extension/download.bat
@ -1,3 +1,31 @@
@echo off
 set BIN_PATH=./bin
 set /p CORTEX_VERSION=<./bin/version.txt
-.\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cpu && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-vulkan && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx.tar.gz -e --strip 1 -o ./bin/win-cpu/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/win-vulkan/engines/cortex.llamacpp
+
@REM Download cortex.llamacpp binaries
@REM NOTE(review): VERSION only feeds the release tag; the archive name below
@REM repeats 0.1.25 literally, so both have to be bumped together.
 set VERSION=v0.1.25
 set DOWNLOAD_URL=https://github.com/janhq/cortex.llamacpp/releases/download/%VERSION%/cortex.llamacpp-0.1.25-windows-amd64
@REM Engine variant folders under BIN_PATH that the DLL loop below walks
 set SUBFOLDERS=win-cuda-12-0 win-cuda-11-7 win-noavx win-avx win-avx2 win-avx512 win-vulkan
@REM The cortex-cpp archive goes into BIN_PATH itself
 call .\node_modules\.bin\download -e --strip 1 -o %BIN_PATH% https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz
@REM One engine archive per variant, each into its own engines\cortex.llamacpp folder
 call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-12-0.tar.gz -e --strip 1 -o %BIN_PATH%/win-cuda-12-0/engines/cortex.llamacpp
 call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-11-7.tar.gz -e --strip 1 -o %BIN_PATH%/win-cuda-11-7/engines/cortex.llamacpp
 call .\node_modules\.bin\download %DOWNLOAD_URL%-noavx.tar.gz -e --strip 1 -o %BIN_PATH%/win-noavx/engines/cortex.llamacpp
 call .\node_modules\.bin\download %DOWNLOAD_URL%-avx.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx/engines/cortex.llamacpp
 call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx2/engines/cortex.llamacpp
 call .\node_modules\.bin\download %DOWNLOAD_URL%-avx512.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx512/engines/cortex.llamacpp
 call .\node_modules\.bin\download %DOWNLOAD_URL%-vulkan.tar.gz -e --strip 1 -o %BIN_PATH%/win-vulkan/engines/cortex.llamacpp
@REM Loop through each folder and move DLLs (excluding engine.dll)
@REM NOTE(review): every variant's DLLs land in the same BIN_PATH, so DLLs with
@REM the same name from different variants would collide - confirm this is intended.
 for %%F in (%SUBFOLDERS%) do (
    echo Processing folder: %BIN_PATH%\%%F
    @REM Move all .dll files except engine.dll
    for %%D in (%BIN_PATH%\%%F\engines\cortex.llamacpp\*.dll) do (
        if /I not "%%~nxD"=="engine.dll" (
            move "%%D" "%BIN_PATH%"
        )
    )
 )
 echo DLL files moved successfully.
--- a/extensions/inference-nitro-extension/download.sh
+++ b/extensions/inference-nitro-extension/download.sh
@ -0,0 +1,41 @@
 #!/bin/bash
 # Fetch the cortex-cpp binary and the cortex.llamacpp engine variants for the
 # current OS into ./bin. Expects ./bin/version.txt and a `download` command
 # on PATH (presumably the node_modules/.bin tool the Windows script also calls).
 # NOTE(review): no `set -e` and no exit-status checks, so a failed download
 # does not stop the script — confirm this best-effort behavior is intended.

 # Read CORTEX_VERSION
 CORTEX_VERSION=$(cat ./bin/version.txt)
 CORTEX_RELEASE_URL="https://github.com/janhq/cortex/releases/download"
 # Detect platform
 OS_TYPE=$(uname)
 if [ "$OS_TYPE" == "Linux" ]; then
    # Linux downloads: one shared cortex-cpp binary directly in ./bin
    download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz"  -e --strip 1 -o "./bin"
    chmod +x "./bin/cortex-cpp"
    ENGINE_DOWNLOAD_URL="https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64"
    # Download engines for Linux, one folder per CPU instruction set / GPU backend
    # NOTE(review): the trailing `1` argument on these lines has no visible purpose — confirm
    download "${ENGINE_DOWNLOAD_URL}-noavx.tar.gz"  -e --strip 1 -o "./bin/linux-noavx/engines/cortex.llamacpp" 1
    download "${ENGINE_DOWNLOAD_URL}-avx.tar.gz"  -e --strip 1 -o "./bin/linux-avx/engines/cortex.llamacpp" 1
    download "${ENGINE_DOWNLOAD_URL}-avx2.tar.gz"  -e --strip 1 -o "./bin/linux-avx2/engines/cortex.llamacpp" 1
    download "${ENGINE_DOWNLOAD_URL}-avx512.tar.gz"  -e --strip 1 -o "./bin/linux-avx512/engines/cortex.llamacpp" 1
    download "${ENGINE_DOWNLOAD_URL}-avx2-cuda-12-0.tar.gz"  -e --strip 1 -o "./bin/linux-cuda-12-0/engines/cortex.llamacpp" 1
    download "${ENGINE_DOWNLOAD_URL}-avx2-cuda-11-7.tar.gz"  -e --strip 1 -o "./bin/linux-cuda-11-7/engines/cortex.llamacpp" 1
    download "${ENGINE_DOWNLOAD_URL}-vulkan.tar.gz"  -e --strip 1 -o "./bin/linux-vulkan/engines/cortex.llamacpp" 1
 elif [ "$OS_TYPE" == "Darwin" ]; then
    # macOS downloads: separate cortex-cpp binaries per architecture;
    # the amd64 archive is unpacked into the mac-x64 folder
    download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz"  -e --strip 1 -o "./bin/mac-arm64" 1
    download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz"  -e --strip 1 -o "./bin/mac-x64" 1
    chmod +x "./bin/mac-arm64/cortex-cpp"
    chmod +x "./bin/mac-x64/cortex-cpp"
    ENGINE_DOWNLOAD_URL="https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac"
    # Download engines for macOS
    download "${ENGINE_DOWNLOAD_URL}-arm64.tar.gz" -e --strip 1 -o ./bin/mac-arm64/engines/cortex.llamacpp
    download "${ENGINE_DOWNLOAD_URL}-amd64.tar.gz" -e --strip 1 -o ./bin/mac-x64/engines/cortex.llamacpp
 else
    # Any other `uname` value is rejected
    echo "Unsupported operating system: $OS_TYPE"
    exit 1
 fi
--- a/extensions/inference-nitro-extension/package.json
+++ b/extensions/inference-nitro-extension/package.json
@ -2,7 +2,7 @@
  "name": "@janhq/inference-cortex-extension",
  "productName": "Cortex Inference Engine",
  "version": "1.0.15",
-  "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
+  "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
  "main": "dist/index.js",
  "node": "dist/node/index.cjs.js",
  "author": "Jan <service@jan.ai>",
@ -10,13 +10,11 @@
  "scripts": {
    "test": "jest",
    "build": "tsc --module commonjs && rollup -c rollup.config.ts",
-    "downloadnitro:linux": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-vulkan && chmod +x ./bin/linux-vulkan/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx.tar.gz -e --strip 1 -o ./bin/linux-cpu/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/linux-vulkan/engines/cortex.llamacpp",
+    "downloadnitro:linux:darwin": "./download.sh",
    "downloadnitro:darwin": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-arm64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz --strip-components=1 -C ./bin/mac-arm64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz && chmod +x ./bin/mac-arm64/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-amd64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz --strip-components=1 -C ./bin/mac-amd64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz && chmod +x ./bin/mac-amd64/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-arm64.tar.gz -e --strip 1 -o ./bin/mac-arm64/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-amd64.tar.gz -e --strip 1 -o ./bin/mac-amd64/engines/cortex.llamacpp",
    "downloadnitro:win32": "download.bat",
    "downloadnitro": "run-script-os",
    "build:publish:darwin": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && ../../.github/scripts/auto-sign.sh && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
-    "build:publish:win32": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
+    "build:publish:win32:linux": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish:linux": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish": "yarn test && run-script-os"
  },
  "exports": {
@ -49,6 +47,7 @@
  },
  "dependencies": {
    "@janhq/core": "file:../../core",
    "cpu-instructions": "^0.0.13",
    "decompress": "^4.2.1",
    "fetch-retry": "^5.0.6",
    "rxjs": "^7.8.1",
@ -68,6 +67,7 @@
    "tcp-port-used",
    "fetch-retry",
    "@janhq/core",
-    "decompress"
+    "decompress",
    "cpu-instructions"
  ]
 }
--- a/extensions/inference-nitro-extension/rollup.config.ts
+++ b/extensions/inference-nitro-extension/rollup.config.ts
@ -96,7 +96,7 @@ export default [
          llama3170bJson,
          gemma22bJson,
          gemma29bJson,
-          gemma227bJson
+          gemma227bJson,
        ]),
        NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
        DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson),
@ -117,7 +117,10 @@ export default [
      // Allow json resolution
      json(),
      //     Compile TypeScript files
-      typescript({ useTsconfigDeclarationDir: true }),
+      typescript({
        useTsconfigDeclarationDir: true,
        exclude: ['**/__tests__', '**/*.test.ts'],
      }),
      // Compile TypeScript files
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
@ -139,7 +142,7 @@ export default [
      { file: 'dist/node/index.cjs.js', format: 'cjs', sourcemap: true },
    ],
    // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash')
-    external: ['@janhq/core/node'],
+    external: ['@janhq/core/node', 'cpu-instructions'],
    watch: {
      include: 'src/node/**',
    },
@ -147,7 +150,10 @@ export default [
      // Allow json resolution
      json(),
      // Compile TypeScript files
-      typescript({ useTsconfigDeclarationDir: true }),
+      typescript({
        useTsconfigDeclarationDir: true,
        exclude: ['**/__tests__', '**/*.test.ts'],
      }),
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
      // Allow node_modules resolution, so you can use 'external' to control
@ -156,7 +162,6 @@ export default [
      resolve({
        extensions: ['.ts', '.js', '.json'],
      }),
      // Resolve source maps to the original source
      sourceMaps(),
    ],
--- a/extensions/inference-nitro-extension/src/index.ts
+++ b/extensions/inference-nitro-extension/src/index.ts
@ -73,6 +73,7 @@ export default class JanInferenceNitroExtension extends LocalOAIEngine {
    this.registerModels(models)
    super.onLoad()
    // Add additional dependencies PATH to the env
    executeOnMain(NODE, 'addAdditionalDependencies', {
      name: this.name,
      version: this.version,
--- a/extensions/inference-nitro-extension/src/node/execute.test.ts
+++ b/extensions/inference-nitro-extension/src/node/execute.test.ts
@ -1,7 +1,7 @@
 import { describe, expect, it } from '@jest/globals'
 import { executableNitroFile } from './execute'
 import { GpuSetting } from '@janhq/core'
-import { sep } from 'path'
+import { cpuInfo } from 'cpu-instructions'
 let testSettings: GpuSetting = {
  run_mode: 'cpu',
@ -22,6 +22,14 @@ let testSettings: GpuSetting = {
 }
 const originalPlatform = process.platform
 jest.mock('cpu-instructions', () => ({
  cpuInfo: {
    cpuInfo: jest.fn(),
  },
 }))
 let mock = cpuInfo.cpuInfo as jest.Mock
 mock.mockReturnValue([])
 describe('test executable nitro file', () => {
  afterAll(function () {
    Object.defineProperty(process, 'platform', {
@ -38,17 +46,19 @@ describe('test executable nitro file', () => {
    })
    expect(executableNitroFile(testSettings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`mac-arm64${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`mac-arm64`),
        executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-arm64/cortex-cpp`) : expect.anything(),
        cudaVisibleDevices: '',
        vkVisibleDevices: '',
      })
    )
    Object.defineProperty(process, 'arch', {
-      value: 'amd64',
+      value: 'x64',
    })
    expect(executableNitroFile(testSettings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`mac-amd64${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`mac-x64`),
        executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-x64/cortex-cpp`) : expect.anything(),
        cudaVisibleDevices: '',
        vkVisibleDevices: '',
      })
@ -62,14 +72,11 @@ describe('test executable nitro file', () => {
    const settings: GpuSetting = {
      ...testSettings,
      run_mode: 'cpu',
      cuda: {
        exist: true,
        version: '11',
      },
    }
    expect(executableNitroFile(settings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`win-cpu${sep}cortex-cpp.exe`),
+        enginePath: expect.stringContaining(`win`),
        executablePath: expect.stringContaining(`cortex-cpp.exe`),
        cudaVisibleDevices: '',
        vkVisibleDevices: '',
      })
@ -102,7 +109,8 @@ describe('test executable nitro file', () => {
    }
    expect(executableNitroFile(settings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`win-cuda-11-7${sep}cortex-cpp.exe`),
+        enginePath: expect.stringContaining(`win-cuda-11-7`),
        executablePath: expect.stringContaining(`cortex-cpp.exe`),
        cudaVisibleDevices: '0',
        vkVisibleDevices: '0',
      })
@ -135,7 +143,8 @@ describe('test executable nitro file', () => {
    }
    expect(executableNitroFile(settings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`win-cuda-12-0${sep}cortex-cpp.exe`),
+        enginePath: expect.stringContaining(`win-cuda-12-0`),
        executablePath: expect.stringContaining(`cortex-cpp.exe`),
        cudaVisibleDevices: '0',
        vkVisibleDevices: '0',
      })
@ -152,7 +161,8 @@ describe('test executable nitro file', () => {
    }
    expect(executableNitroFile(settings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`linux-cpu${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`linux`),
        executablePath: expect.stringContaining(`cortex-cpp`),
        cudaVisibleDevices: '',
        vkVisibleDevices: '',
      })
@ -185,7 +195,8 @@ describe('test executable nitro file', () => {
    }
    expect(executableNitroFile(settings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`linux-cuda-11-7${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`linux-cuda-11-7`),
        executablePath: expect.stringContaining(`cortex-cpp`),
        cudaVisibleDevices: '0',
        vkVisibleDevices: '0',
      })
@ -218,10 +229,203 @@ describe('test executable nitro file', () => {
    }
    expect(executableNitroFile(settings)).toEqual(
      expect.objectContaining({
-        executablePath: expect.stringContaining(`linux-cuda-12-0${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`linux-cuda-12-0`),
        executablePath: expect.stringContaining(`cortex-cpp`),
        cudaVisibleDevices: '0',
        vkVisibleDevices: '0',
      })
    )
  })
  // Every CPU instruction set reported by the mocked probe must map to its own Linux engine folder
  it(`executes on Linux CPU with different instructions`, () => {
    Object.defineProperty(process, 'platform', { value: 'linux' })
    const cpuOnlySettings: GpuSetting = { ...testSettings, run_mode: 'cpu' }
    for (const instructionSet of ['avx512', 'avx2', 'avx', 'noavx']) {
      // The probe reports exactly one instruction set per iteration
      mock.mockReturnValue([instructionSet])
      const resolved = executableNitroFile(cpuOnlySettings)
      expect(resolved).toEqual(
        expect.objectContaining({
          enginePath: expect.stringContaining(`linux-${instructionSet}`),
          executablePath: expect.stringContaining(`cortex-cpp`),
          cudaVisibleDevices: '',
          vkVisibleDevices: '',
        })
      )
    }
  })
  // Every CPU instruction set reported by the mocked probe must map to its own Windows engine folder
  it(`executes on Windows CPU with different instructions`, () => {
    Object.defineProperty(process, 'platform', { value: 'win32' })
    const cpuOnlySettings: GpuSetting = { ...testSettings, run_mode: 'cpu' }
    for (const instructionSet of ['avx512', 'avx2', 'avx', 'noavx']) {
      // The probe reports exactly one instruction set per iteration
      mock.mockReturnValue([instructionSet])
      const resolved = executableNitroFile(cpuOnlySettings)
      expect(resolved).toEqual(
        expect.objectContaining({
          enginePath: expect.stringContaining(`win-${instructionSet}`),
          executablePath: expect.stringContaining(`cortex-cpp.exe`),
          cudaVisibleDevices: '',
          vkVisibleDevices: '',
        })
      )
    }
  })
  // On Windows with CUDA 12 the engine folder must stay `win-cuda-12-0` whatever the CPU reports
  it(`executes on Windows GPU with different instructions`, () => {
    Object.defineProperty(process, 'platform', { value: 'win32' })
    const toolkit = { exist: true, version: '12' }
    const gpuSettings: GpuSetting = {
      ...testSettings,
      run_mode: 'gpu',
      cuda: { ...toolkit },
      nvidia_driver: { ...toolkit },
      gpus_in_use: ['0'],
      gpus: [{ id: '0', name: 'NVIDIA GeForce GTX 1080', vram: '80000000' }],
    }
    for (const instructionSet of ['avx512', 'avx2', 'avx', 'noavx']) {
      mock.mockReturnValue([instructionSet])
      const resolved = executableNitroFile(gpuSettings)
      expect(resolved).toEqual(
        expect.objectContaining({
          enginePath: expect.stringContaining(`win-cuda-12-0`),
          executablePath: expect.stringContaining(`cortex-cpp.exe`),
          cudaVisibleDevices: '0',
          vkVisibleDevices: '0',
        })
      )
    }
  })
  // On Linux with CUDA 12 the engine folder must stay `linux-cuda-12-0` whatever the CPU reports
  it(`executes on Linux GPU with different instructions`, () => {
    Object.defineProperty(process, 'platform', { value: 'linux' })
    const toolkit = { exist: true, version: '12' }
    const gpuSettings: GpuSetting = {
      ...testSettings,
      run_mode: 'gpu',
      cuda: { ...toolkit },
      nvidia_driver: { ...toolkit },
      gpus_in_use: ['0'],
      gpus: [{ id: '0', name: 'NVIDIA GeForce GTX 1080', vram: '80000000' }],
    }
    for (const instructionSet of ['avx512', 'avx2', 'avx', 'noavx']) {
      mock.mockReturnValue([instructionSet])
      const resolved = executableNitroFile(gpuSettings)
      expect(resolved).toEqual(
        expect.objectContaining({
          enginePath: expect.stringContaining(`linux-cuda-12-0`),
          executablePath: expect.stringContaining(`cortex-cpp`),
          cudaVisibleDevices: '0',
          vkVisibleDevices: '0',
        })
      )
    }
  })
  // With Vulkan enabled the engine folder is `linux-vulkan`, independent of CPU instructions and CUDA
  it(`executes on Linux Vulkan should not have CPU instructions included`, () => {
    Object.defineProperty(process, 'platform', { value: 'linux' })
    const toolkit = { exist: true, version: '12' }
    const vulkanSettings: GpuSetting = {
      ...testSettings,
      run_mode: 'gpu',
      vulkan: true,
      cuda: { ...toolkit },
      nvidia_driver: { ...toolkit },
      gpus_in_use: ['0'],
      gpus: [{ id: '0', name: 'NVIDIA GeForce GTX 1080', vram: '80000000' }],
    }
    for (const instructionSet of ['avx512', 'avx2', 'avx', 'noavx']) {
      mock.mockReturnValue([instructionSet])
      const resolved = executableNitroFile(vulkanSettings)
      expect(resolved).toEqual(
        expect.objectContaining({
          enginePath: expect.stringContaining(`linux-vulkan`),
          executablePath: expect.stringContaining(`cortex-cpp`),
          cudaVisibleDevices: '0',
          vkVisibleDevices: '0',
        })
      )
    }
  })
  // On macOS the engine folder depends only on the architecture, never on CPU instructions
  it(`executes on MacOS with different instructions`, () => {
    const hostArch = process.arch
    Object.defineProperty(process, 'platform', {
      value: 'darwin',
    })
    // Pin the architecture: the engine folder is derived from process.arch, so
    // without this the `mac-x64` expectation fails when the suite runs on arm64.
    Object.defineProperty(process, 'arch', {
      value: 'x64',
    })
    try {
      const settings: GpuSetting = {
        ...testSettings,
        run_mode: 'cpu',
      }
      const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
      cpuInstructions.forEach((instruction) => {
        // Whatever the CPU probe reports must not leak into the macOS engine folder
        mock.mockReturnValue([instruction])
        expect(executableNitroFile(settings)).toEqual(
          expect.objectContaining({
            enginePath: expect.stringContaining(`mac-x64`),
            // The binary path uses the host path separator, so only check it on a darwin host
            executablePath:
              originalPlatform === 'darwin'
                ? expect.stringContaining(`mac-x64/cortex-cpp`)
                : expect.anything(),
            cudaVisibleDevices: '',
            vkVisibleDevices: '',
          })
        )
      })
    } finally {
      // Restore the real architecture so later tests see the host value again
      Object.defineProperty(process, 'arch', {
        value: hostArch,
      })
    }
  })
 })
--- a/extensions/inference-nitro-extension/src/node/execute.ts
+++ b/extensions/inference-nitro-extension/src/node/execute.ts
@ -1,37 +1,59 @@
 import { GpuSetting } from '@janhq/core'
 import * as path from 'path'
 import { cpuInfo } from 'cpu-instructions'
 /**
  * Launch options resolved by `executableNitroFile` for the cortex-cpp binary.
  */
 export interface NitroExecutableOptions {
  // Folder under `bin` named after the selected OS / CPU instructions / GPU backend
  enginePath: string
  // Path of the cortex-cpp binary under `bin`
  executablePath: string
  // Comma-separated ids taken from `gpus_in_use`, or '' when no GPU settings are given
  cudaVisibleDevices: string
  // Same comma-separated id list, reported for the Vulkan device selection
  vkVisibleDevices: string
 }
-const runMode = (settings?: GpuSetting): string => {
+/**
 * The GPU backend segment used when composing the engine folder name.
 * @param settings GPU settings; when omitted no GPU backend is selected.
 * @returns 'vulkan' when `settings.vulkan` is true, 'cuda' when `run_mode` is not 'cpu',
 * and an empty string on macOS, without settings, or when `run_mode` is 'cpu'.
 */
 const gpuRunMode = (settings?: GpuSetting): string => {
  if (process.platform === 'darwin')
    // MacOS now has universal binaries
    return ''
-  if (!settings) return 'cpu'
+  if (!settings) return ''
  return settings.vulkan === true
    ? 'vulkan'
    : settings.run_mode === 'cpu'
-      ? 'cpu'
+      ? ''
      : 'cuda'
 }
 /**
 * The OS & architecture that the current process is running on.
 * Any platform other than win32 or darwin is reported as linux; on darwin every
 * architecture other than arm64 is reported as x64.
 * @returns win, mac-x64, mac-arm64, or linux
 */
 const os = (): string => {
  return process.platform === 'win32'
    ? 'win'
    : process.platform === 'darwin'
-      ? process.arch === 'arm64' ? 'mac-arm64' : 'mac-amd64'
+      ? process.arch === 'arm64'
        ? 'mac-arm64'
        : 'mac-x64'
      : 'linux'
 }
 /**
 * File suffix of the cortex.cpp binary on the host platform.
 * @returns '.exe' when running on Windows, '' on every other platform.
 */
 const extension = (): '.exe' | '' => {
  if (process.platform !== 'win32') return ''
  return '.exe'
 }
 /**
 * The CUDA version that will be set - either '11-7' or '12-0'.
 * @param settings
 * @returns
 */
 const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
  const isUsingCuda =
    settings?.vulkan !== true && settings?.run_mode === 'gpu' && os() !== 'mac'
@ -40,6 +62,21 @@ const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
  return settings?.cuda?.version === '11' ? '11-7' : '12-0'
 }
 /**
 * The CPU instructions that will be set - either 'avx512', 'avx2', 'avx', or 'noavx'.
 * The most capable instruction set reported by the CPU wins; matching is case-insensitive.
 * @returns '' on macOS (no CPU probing is done there), otherwise the selected instruction set.
 */
 const cpuInstructions = () => {
  if (process.platform === 'darwin') return ''
  // Probe the CPU once and normalise the flags, instead of re-querying for every candidate
  const supported = new Set(cpuInfo.cpuInfo().map((e) => e.toUpperCase()))
  // Candidates are ordered from most to least capable
  const preferred = ['avx512', 'avx2', 'avx'] as const
  return preferred.find((e) => supported.has(e.toUpperCase())) ?? 'noavx'
 }
 /**
 * Find which executable file to run based on the current platform.
 * @returns The name of the executable file to run.
@ -47,15 +84,26 @@ const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
 export const executableNitroFile = (
  gpuSetting?: GpuSetting
 ): NitroExecutableOptions => {
-  let binaryFolder = [os(), runMode(gpuSetting), cudaVersion(gpuSetting)]
+  let engineFolder = [
    os(),
    ...(gpuSetting?.vulkan
      ? []
      : [
          gpuRunMode(gpuSetting) !== 'cuda' ? cpuInstructions() : '',
          gpuRunMode(gpuSetting),
          cudaVersion(gpuSetting),
        ]),
    gpuSetting?.vulkan ? 'vulkan' : undefined,
  ]
    .filter((e) => !!e)
    .join('-')
  let cudaVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
  let vkVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
-  let binaryName = `cortex-cpp${extension()}`
+  let binaryName = `${process.platform === 'darwin' ? `${os()}/` : ''}cortex-cpp${extension()}`
  return {
-    executablePath: path.join(__dirname, '..', 'bin', binaryFolder, binaryName),
+    enginePath: path.join(__dirname, '..', 'bin', engineFolder),
    executablePath: path.join(__dirname, '..', 'bin', binaryName),
    cudaVisibleDevices,
    vkVisibleDevices,
  }
--- a/extensions/inference-nitro-extension/src/node/index.test.ts
+++ b/extensions/inference-nitro-extension/src/node/index.test.ts
@ -0,0 +1,465 @@
 // Stub fetch-retry: every request resolves to an OK (200) response whose JSON
 // body reports `model_loaded: true` and whose text body is empty.
 jest.mock('fetch-retry', () => ({
  default: () => () => {
    return Promise.resolve({
      ok: true,
      status: 200,
      json: () =>
        Promise.resolve({
          model_loaded: true,
        }),
      text: () => Promise.resolve(''),
    })
  },
 }))
 // Stub path: `isAbsolute` and `join` are bare jest.fn() (they return undefined),
 // `parse` always yields { dir: 'dir' } and `delimiter.concat` always yields ''.
 jest.mock('path', () => ({
  default: {
    isAbsolute: jest.fn(),
    join: jest.fn(),
    parse: () => {
      return { dir: 'dir' }
    },
    delimiter: { concat: () => '' },
  },
 }))
 // Stub decompress: extraction always resolves without touching the disk.
 jest.mock('decompress', () => ({
  default: () => {
    return Promise.resolve()
  },
 }))
 // Keep the real @janhq/core/node exports but override the data folder lookup
 // and the system resource probe with fixed values.
 jest.mock('@janhq/core/node', () => ({
  ...jest.requireActual('@janhq/core/node'),
  getJanDataFolderPath: () => '',
  getSystemResourceInfo: () => {
    return {
      cpu: {
        cores: 1,
        logicalCores: 1,
        threads: 1,
        model: 'model',
        speed: 1,
      },
      memory: {
        total: 1,
        free: 1,
      },
      gpu: {
        model: 'model',
        memory: 1,
        cuda: {
          version: 'version',
          devices: 'devices',
        },
        vulkan: {
          version: 'version',
          devices: 'devices',
        },
      },
    }
  },
 }))
 // Stub fs: directory listings are always empty.
 jest.mock('fs', () => ({
  default: {
    readdirSync: () => [],
  },
 }))
 // Stub child_process: `exec` and `spawn` return inert process-like objects whose
 // event registration methods are jest.fn(); the spawned one also carries pid '111'.
 jest.mock('child_process', () => ({
  exec: () => {
    return {
      stdout: { on: jest.fn() },
      stderr: { on: jest.fn() },
      on: jest.fn(),
    }
  },
  spawn: () => {
    return {
      stdout: { on: jest.fn() },
      stderr: { on: jest.fn() },
      on: jest.fn(),
      pid: '111',
    }
  },
 }))
 // Stub tcp-port-used: both port waits resolve to true immediately.
 jest.mock('tcp-port-used', () => ({
  default: {
    waitUntilFree: () => Promise.resolve(true),
    waitUntilUsed: () => Promise.resolve(true),
  },
 }))
 // Stub ./execute: the executable lookup returns fixed placeholder strings.
 jest.mock('./execute', () => ({
  executableNitroFile: () => {
    return {
      enginePath: 'enginePath',
      executablePath: 'executablePath',
      cudaVisibleDevices: 'cudaVisibleDevices',
      vkVisibleDevices: 'vkVisibleDevices',
    }
  },
 }))
 // Stub terminate: log the process id it was asked to kill and invoke the
 // completion callback synchronously.
 jest.mock('terminate', () => ({
  default: (id: string, func: () => void) => {
    console.log(id)
    func()
  },
 }))
 import * as execute from './execute'
 import index from './index'
 // Alias of the (mocked) ./execute module, used as the jest.spyOn target
 let executeMock = execute
 // Minimal model fixture with engine 'nitro' passed to index.loadModel by the tests
 const modelInitOptions: any = {
  modelFolder: '/path/to/model',
  model: {
    id: 'test',
    name: 'test',
    engine: 'nitro',
    version: '0.0',
    format: 'GGUF',
    object: 'model',
    sources: [],
    created: 0,
    description: 'test',
    parameters: {},
    metadata: {
      author: '',
      tags: [],
      size: 0,
    },
    settings: {
      prompt_template: '{prompt}',
      llama_model_path: 'model.gguf',
    },
  },
 }
 // Exercises index.loadModel against the module-level mocks declared at the top of this file.
 // NOTE(review): several cases below assert only inside a `catch` branch, so they
 // also pass when loadModel resolves; consider `expect.assertions(n)` or `.rejects`.
 // NOTE(review): process.platform is overridden in some cases and not restored here.
 describe('loadModel', () => {
  it('should load a model successfully', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Call the loadModel function
    const result = await index.loadModel(modelInitOptions, systemInfo)
    // Assert that the result is as expected
    expect(result).toBeUndefined()
  })
  it('should reject with an error message if the model is not a nitro model', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Temporarily switch the shared fixture to a non-nitro engine
    modelInitOptions.model.engine = 'not-nitro'
    // Call the loadModel function
    try {
      await index.loadModel(modelInitOptions, systemInfo)
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Not a cortex model')
    }
    // Put the shared fixture back so later tests see a nitro model again
    modelInitOptions.model.engine = 'nitro'
  })
  it('should reject if model load failed with an error message', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Mock the fetch-retry module to return a failed response
    // NOTE(review): `index` is imported at the top of the file; confirm this
    // in-test jest.mock actually replaces the fetch-retry instance it uses.
    jest.mock('fetch-retry', () => ({
      default: () => () => {
        return Promise.resolve({
          ok: false,
          status: 500,
          json: () =>
            Promise.resolve({
              model_loaded: false,
            }),
          text: () => Promise.resolve('Failed to load model'),
        })
      },
    }))
    // Call the loadModel function
    try {
      await index.loadModel(modelInitOptions, systemInfo)
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Failed to load model')
    }
  })
  it('should reject if port not available', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Mock the tcp-port-used module to return false
    // NOTE(review): same concern as above - verify this mock reaches the
    // already-imported `index` module.
    jest.mock('tcp-port-used', () => ({
      default: {
        waitUntilFree: () => Promise.resolve(false),
        waitUntilUsed: () => Promise.resolve(false),
      },
    }))
    // Call the loadModel function
    try {
      await index.loadModel(modelInitOptions, systemInfo)
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Port not available')
    }
  })
  it('should run on GPU model if ngl is set', async () => {
    const systemInfo: any = {
      gpuSetting: {
        run_mode: 'gpu',
      },
    }
    // Spy executableNitroFile
    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
      enginePath: '',
      executablePath: '',
      cudaVisibleDevices: '',
      vkVisibleDevices: '',
    })
    Object.defineProperty(process, 'platform', { value: 'win32' })
    // A non-zero ngl is expected to keep the configured run_mode
    await index.loadModel(
      {
        ...modelInitOptions,
        model: {
          ...modelInitOptions.model,
          settings: {
            ...modelInitOptions.model.settings,
            ngl: 40,
          },
        },
      },
      systemInfo
    )
    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
      run_mode: 'gpu',
    })
  })
  it('should run on correct CPU instructions if ngl is not set', async () => {
    const systemInfo: any = {
      gpuSetting: {
        run_mode: 'gpu',
      },
    }
    // Spy executableNitroFile
    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
      enginePath: '',
      executablePath: '',
      cudaVisibleDevices: '',
      vkVisibleDevices: '',
    })
    Object.defineProperty(process, 'platform', { value: 'win32' })
    // An undefined ngl is expected to force run_mode to 'cpu' despite the GPU setting
    await index.loadModel(
      {
        ...modelInitOptions,
        model: {
          ...modelInitOptions.model,
          settings: {
            ...modelInitOptions.model.settings,
            ngl: undefined,
          },
        },
      },
      systemInfo
    )
    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
      run_mode: 'cpu',
    })
  })
  it('should run on correct CPU instructions if ngl is 0', async () => {
    const systemInfo: any = {
      gpuSetting: {
        run_mode: 'gpu',
      },
    }
    // Spy executableNitroFile
    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
      enginePath: '',
      executablePath: '',
      cudaVisibleDevices: '',
      vkVisibleDevices: '',
    })
    Object.defineProperty(process, 'platform', { value: 'win32' })
    // An ngl of 0 is expected to force run_mode to 'cpu' despite the GPU setting
    await index.loadModel(
      {
        ...modelInitOptions,
        model: {
          ...modelInitOptions.model,
          settings: {
            ...modelInitOptions.model.settings,
            ngl: 0,
          },
        },
      },
      systemInfo
    )
    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
      run_mode: 'cpu',
    })
  })
 })
 // Exercises index.unloadModel against the module-level mocks declared at the top of this file.
 // NOTE(review): the failure cases assert only inside a `catch` branch, so they
 // also pass when unloadModel resolves; consider `expect.assertions(n)` or `.rejects`.
 describe('unloadModel', () => {
  it('should unload a model successfully', async () => {
    // Call the unloadModel function
    const result = await index.unloadModel()
    // Assert that the result is as expected
    expect(result).toBeUndefined()
  })
  it('should reject with an error message if the model is not a nitro model', async () => {
    // Call the unloadModel function
    // NOTE(review): nothing here sets up a non-nitro model before the call - confirm intent.
    try {
      await index.unloadModel()
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Not a cortex model')
    }
  })
  it('should reject if model unload failed with an error message', async () => {
    // Mock the fetch-retry module to return a failed response
    // NOTE(review): `index` is imported at the top of the file; confirm this
    // in-test jest.mock actually replaces the fetch-retry instance it uses.
    jest.mock('fetch-retry', () => ({
      default: () => () => {
        return Promise.resolve({
          ok: false,
          status: 500,
          json: () =>
            Promise.resolve({
              model_unloaded: false,
            }),
          text: () => Promise.resolve('Failed to unload model'),
        })
      },
    }))
    // Call the unloadModel function
    try {
      await index.unloadModel()
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Failed to unload model')
    }
  })
  it('should reject if port not available', async () => {
    // Mock the tcp-port-used module to return false
    // NOTE(review): same concern as above - verify this mock reaches the
    // already-imported `index` module.
    jest.mock('tcp-port-used', () => ({
      default: {
        waitUntilFree: () => Promise.resolve(false),
        waitUntilUsed: () => Promise.resolve(false),
      },
    }))
    // Call the unloadModel function
    try {
      await index.unloadModel()
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Port not available')
    }
  })
 })
 // index.dispose must resolve to undefined on both macOS and Windows
 describe('dispose', () => {
  // Overrides the platform reported by the current process
  const pretendPlatform = (name: string) =>
    Object.defineProperty(process, 'platform', { value: name })

  it('should dispose a model successfully on Mac', async () => {
    pretendPlatform('darwin')
    const outcome = await index.dispose()
    expect(outcome).toBeUndefined()
  })
  it('should kill the subprocess successfully on Windows', async () => {
    pretendPlatform('win32')
    const outcome = await index.dispose()
    expect(outcome).toBeUndefined()
  })
 })
 // The process info must report a running subprocess
 describe('getCurrentNitroProcessInfo', () => {
  it('should return the current nitro process info', async () => {
    const expectedInfo = { isRunning: true }
    const processInfo = await index.getCurrentNitroProcessInfo()
    expect(processInfo).toEqual(expectedInfo)
  })
 })
 // index.decompressRunner is expected to resolve to undefined whether or not
 // the underlying decompress call succeeds.
 describe('decompressRunner', () => {
  it('should decompress the runner successfully', async () => {
    // Same resolving stub as the module-level decompress mock
    jest.mock('decompress', () => ({
      default: () => {
        return Promise.resolve()
      },
    }))
    // Call the decompressRunner function
    const result = await index.decompressRunner('', '')
    // Assert that the result is as expected
    expect(result).toBeUndefined()
  })
  it('should not reject if decompression failed', async () => {
    // NOTE(review): `index` is imported at the top of the file; confirm this
    // rejecting stub actually replaces the decompress instance it uses.
    jest.mock('decompress', () => ({
      default: () => {
        return Promise.reject('Failed to decompress')
      },
    }))
    // Call the decompressRunner function
    const result = await index.decompressRunner('', '')
    expect(result).toBeUndefined()
  })
 })
 // Registering an engine dependency must complete and yield undefined
 describe('addAdditionalDependencies', () => {
  it('should add additional dependencies successfully', async () => {
    const dependency = { name: 'name', version: 'version' }
    const outcome = await index.addAdditionalDependencies(dependency)
    expect(outcome).toBeUndefined()
  })
 })
--- a/extensions/inference-nitro-extension/src/node/index.ts
+++ b/extensions/inference-nitro-extension/src/node/index.ts
@ -266,7 +266,7 @@ async function validateModelStatus(modelId: string): Promise<void> {
    body: JSON.stringify({
      model: modelId,
      // TODO: force to use cortex llamacpp by default
-      engine: 'cortex.llamacpp'
+      engine: 'cortex.llamacpp',
    }),
    headers: {
      'Content-Type': 'application/json',
@ -365,14 +365,37 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
  log(`[CORTEX]::Debug: Spawning cortex subprocess...`)
  return new Promise<void>(async (resolve, reject) => {
-    let executableOptions = executableNitroFile(systemInfo?.gpuSetting)
+    let executableOptions = executableNitroFile(
      // If ngl is not set or equal to 0, run on CPU with correct instructions
      systemInfo?.gpuSetting
        ? {
            ...systemInfo.gpuSetting,
            run_mode:
              currentSettings?.ngl === undefined || currentSettings.ngl === 0
                ? 'cpu'
                : systemInfo.gpuSetting.run_mode,
          }
        : undefined
    )
    const args: string[] = ['1', LOCAL_HOST, PORT.toString()]
    // Execute the binary
    log(
      `[CORTEX]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
    )
-    log(path.parse(executableOptions.executablePath).dir)
+    log(`[CORTEX]::Debug: Cortex engine path: ${executableOptions.enginePath}`)
    // Add engine path to the PATH and LD_LIBRARY_PATH
    process.env.PATH = (process.env.PATH || '').concat(
      path.delimiter,
      executableOptions.enginePath
    )
    log(`[CORTEX] PATH: ${process.env.PATH}`)
    process.env.LD_LIBRARY_PATH = (process.env.LD_LIBRARY_PATH || '').concat(
      path.delimiter,
      executableOptions.enginePath
    )
    subprocess = spawn(
      executableOptions.executablePath,
      ['1', LOCAL_HOST, PORT.toString()],
@ -380,6 +403,7 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
        cwd: path.join(path.parse(executableOptions.executablePath).dir),
        env: {
          ...process.env,
          ENGINE_PATH: executableOptions.enginePath,
          CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
          // Vulkan - Support 1 device at a time for now
          ...(executableOptions.vkVisibleDevices?.length > 0 && {
@ -440,12 +464,19 @@ const getCurrentNitroProcessInfo = (): NitroProcessInfo => {
 }
 /**
  * Appends an engine's dependency folder (`<jan data folder>/engines/<name>/<version>`)
  * to both PATH and LD_LIBRARY_PATH of the current process.
  * @param data Name and version of the engine whose dependencies should be reachable.
  */
 const addAdditionalDependencies = (data: { name: string; version: string }) => {
  log(
    `[CORTEX]::Debug: Adding additional dependencies for ${data.name} ${data.version}`
  )
  const additionalPath = path.join(
    getJanDataFolderPath(),
    'engines',
    data.name,
    data.version
  )
  // Join only non-empty segments with a single delimiter: an empty entry in a
  // search path (from a doubled or leading delimiter) would stand for the
  // current working directory.
  const appendSearchPath = (current?: string) =>
    [current, additionalPath].filter((e) => !!e).join(path.delimiter)
  // Set the updated PATH and LD_LIBRARY_PATH
  process.env.PATH = appendSearchPath(process.env.PATH)
  process.env.LD_LIBRARY_PATH = appendSearchPath(process.env.LD_LIBRARY_PATH)
 }
--- a/extensions/inference-nitro-extension/tsconfig.json
+++ b/extensions/inference-nitro-extension/tsconfig.json
@ -15,5 +15,6 @@
    "importHelpers": true,
    "typeRoots": ["node_modules/@types"]
  },
-  "include": ["src"]
+  "include": ["src"],
  "exclude": ["src/**/*.test.ts"]
 }