fix: #3549, #3552 - Inference on CPU is slower on Jan 0.5.3 (#3602)

This commit is contained in:
Louis 2024-09-11 14:03:53 +07:00 committed by GitHub
parent 2d05134cb1
commit 5217437912
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 921 additions and 374 deletions

View File

@ -51,13 +51,13 @@ jobs:
latest_prerelease_asset_count=$(get_asset_count "$latest_prerelease_name")
if [ "$current_version_name" = "$latest_prerelease_name" ]; then
echo "cortex cpp remote repo doesn't have update today, skip update cortex-cpp for today nightly build"
echo "cortex cpp remote repo doesn't have update today, skip update cortex.cpp for today nightly build"
echo "::set-output name=pr_created::false"
exit 0
fi
if [ "$current_version_asset_count" != "$latest_prerelease_asset_count" ]; then
echo "Latest prerelease version has different number of assets, somethink went wrong, skip update cortex-cpp for today nightly build"
echo "Latest prerelease version has different number of assets, somethink went wrong, skip update cortex.cpp for today nightly build"
echo "::set-output name=pr_created::false"
exit 1
fi

View File

@ -1,31 +1,13 @@
import fs from 'fs'
import { join } from 'path'
import {
getJanDataFolderPath,
getJanExtensionsPath,
getSystemResourceInfo,
log,
} from '../../../helper'
import { ChildProcessWithoutNullStreams, spawn } from 'child_process'
import { Model, ModelSettingParams, PromptTemplate } from '../../../../types'
import {
LOCAL_HOST,
NITRO_DEFAULT_PORT,
NITRO_HTTP_KILL_URL,
NITRO_HTTP_LOAD_MODEL_URL,
NITRO_HTTP_VALIDATE_MODEL_URL,
SUPPORTED_MODEL_FORMAT,
} from './consts'
// The subprocess instance for Nitro
let subprocess: ChildProcessWithoutNullStreams | undefined = undefined
// TODO: move this to core type
interface NitroModelSettings extends ModelSettingParams {
llama_model_path: string
cpu_threads: number
}
import { getJanDataFolderPath, getJanExtensionsPath, log } from '../../../helper'
import { ModelSettingParams } from '../../../../types'
/**
* Start a model
* @param modelId
* @param settingParams
* @returns
*/
export const startModel = async (modelId: string, settingParams?: ModelSettingParams) => {
try {
await runModel(modelId, settingParams)
@ -40,316 +22,57 @@ export const startModel = async (modelId: string, settingParams?: ModelSettingPa
}
}
/**
 * Locates a downloaded model on disk, assembles Nitro (cortex-cpp) model
 * settings from its model.json plus caller overrides, and starts it.
 *
 * @param modelId - folder name under <janDataFolder>/models
 * @param settingParams - optional overrides merged over model.json settings
 * @throws Error when the model folder or a GGUF weights file is missing
 */
const runModel = async (modelId: string, settingParams?: ModelSettingParams): Promise<void> => {
  const janDataFolderPath = getJanDataFolderPath()
  const modelFolderFullPath = join(janDataFolderPath, 'models', modelId)

  if (!fs.existsSync(modelFolderFullPath)) {
    throw new Error(`Model not found: ${modelId}`)
  }

  const files: string[] = fs.readdirSync(modelFolderFullPath)

  // Look for GGUF model file
  const ggufBinFile = files.find((file) => file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT))

  const modelMetadataPath = join(modelFolderFullPath, 'model.json')
  const modelMetadata: Model = JSON.parse(fs.readFileSync(modelMetadataPath, 'utf-8'))

  if (!ggufBinFile) {
    throw new Error('No GGUF model file found')
  }

  const modelBinaryPath = join(modelFolderFullPath, ggufBinFile)

  const nitroResourceProbe = await getSystemResourceInfo()
  // Spread order matters: model.json settings and then caller overrides win
  // over the probed cpu_threads default, while llama_model_path / mmproj are
  // always forced to the resolved on-disk paths.
  const nitroModelSettings: NitroModelSettings = {
    // This is critical and requires real CPU physical core count (or performance core)
    cpu_threads: Math.max(1, nitroResourceProbe.numCpuPhysicalCore),
    ...modelMetadata.settings,
    ...settingParams,
    llama_model_path: modelBinaryPath,
    // mmproj (multimodal projector) is resolved relative to the model folder
    // when model.json declares one.
    ...(modelMetadata.settings.mmproj && {
      mmproj: join(modelFolderFullPath, modelMetadata.settings.mmproj),
    }),
  }

  log(`[SERVER]::Debug: Nitro model settings: ${JSON.stringify(nitroModelSettings)}`)

  // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
  if (modelMetadata.settings.prompt_template) {
    const promptTemplate = modelMetadata.settings.prompt_template
    const prompt = promptTemplateConverter(promptTemplate)
    if (prompt?.error) {
      throw new Error(prompt.error)
    }
    nitroModelSettings.system_prompt = prompt.system_prompt
    nitroModelSettings.user_prompt = prompt.user_prompt
    nitroModelSettings.ai_prompt = prompt.ai_prompt
  }

  await runNitroAndLoadModel(modelId, nitroModelSettings)
}
// TODO: move to util
/**
 * Splits a raw prompt template into its system / user / ai segments using
 * the `{system_message}` and `{prompt}` placeholders as delimiters.
 *
 * @param promptTemplate - template string, e.g. "SYS{system_message}USR{prompt}AI"
 * @returns the split segments, or `{ error }` when `{prompt}` is absent
 */
const promptTemplateConverter = (promptTemplate: string): PromptTemplate => {
  const systemMarker = '{system_message}'
  const promptMarker = '{prompt}'

  const hasSystem = promptTemplate.includes(systemMarker)
  const hasPrompt = promptTemplate.includes(promptMarker)

  if (hasSystem && hasPrompt) {
    // Both markers present: text before {system_message} is the system
    // prompt, text between the markers is the user prompt, the rest is
    // the assistant prompt.
    const systemIndex = promptTemplate.indexOf(systemMarker)
    const promptIndex = promptTemplate.indexOf(promptMarker)
    return {
      system_prompt: promptTemplate.slice(0, systemIndex),
      user_prompt: promptTemplate.slice(systemIndex + systemMarker.length, promptIndex),
      ai_prompt: promptTemplate.slice(promptIndex + promptMarker.length),
    }
  }

  if (hasPrompt) {
    // Only {prompt} present: split around it into user and assistant parts.
    const promptIndex = promptTemplate.indexOf(promptMarker)
    return {
      user_prompt: promptTemplate.slice(0, promptIndex),
      ai_prompt: promptTemplate.slice(promptIndex + promptMarker.length),
    }
  }

  // Neither marker found — the template cannot be split.
  return { error: 'Cannot split prompt template' }
}
/**
 * Full restart-and-load cycle: stops any running model, waits for the Nitro
 * port to be released, spawns a fresh cortex subprocess, loads the model and
 * validates that it reports loaded. The await order is deliberate — each step
 * depends on the previous one completing.
 *
 * @param modelId - model being (re)started; forwarded to stopModel
 * @param modelSettings - fully-resolved settings passed to the load request
 */
const runNitroAndLoadModel = async (modelId: string, modelSettings: NitroModelSettings) => {
  // Gather system information for CPU physical cores and memory
  const tcpPortUsed = require('tcp-port-used')

  await stopModel(modelId)
  // Poll every 300ms, up to 5s, for the port to become free.
  await tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000)
  /**
   * There is a problem with Windows process manager
   * Should wait for awhile to make sure the port is free and subprocess is killed
   * The tested threshold is 500ms
   **/
  if (process.platform === 'win32') {
    await new Promise((resolve) => setTimeout(resolve, 500))
  }

  await spawnNitroProcess()
  await loadLLMModel(modelSettings)
  await validateModelStatus()
}
/**
 * Spawns the cortex-cpp subprocess and resolves once its TCP port is in use
 * (i.e. the server is accepting connections) or rejects after the 30s wait
 * times out. Wires subprocess stdout/stderr into the server log and clears
 * the module-level `subprocess` handle when the process exits.
 */
const spawnNitroProcess = async (): Promise<void> => {
  log(`[SERVER]::Debug: Spawning cortex subprocess...`)

  let binaryFolder = join(
    getJanExtensionsPath(),
    '@janhq',
    'inference-cortex-extension',
    'dist',
    'bin'
  )

  let executableOptions = executableNitroFile()
  const tcpPortUsed = require('tcp-port-used')

  const args: string[] = ['1', LOCAL_HOST, NITRO_DEFAULT_PORT.toString()]
  // Execute the binary
  log(
    `[SERVER]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
  )
  // Reuse `args` (previously the literal array was duplicated inline).
  subprocess = spawn(executableOptions.executablePath, args, {
    cwd: binaryFolder,
    env: {
      ...process.env,
      CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
    },
  })

  // Handle subprocess output
  subprocess.stdout.on('data', (data: any) => {
    log(`[SERVER]::Debug: ${data}`)
  })
  subprocess.stderr.on('data', (data: any) => {
    log(`[SERVER]::Error: ${data}`)
  })

  subprocess.on('close', (code: any) => {
    log(`[SERVER]::Debug: cortex exited with code: ${code}`)
    subprocess = undefined
  })

  // BUG FIX: this wait was previously fire-and-forget (the returned promise
  // was dropped), so this function resolved immediately and the caller could
  // issue the load-model request before cortex was listening. Returning the
  // promise makes `await spawnNitroProcess()` actually wait for readiness.
  return tcpPortUsed.waitUntilUsed(NITRO_DEFAULT_PORT, 300, 30000).then(() => {
    log(`[SERVER]::Debug: cortex is ready`)
  })
}
type NitroExecutableOptions = {
  executablePath: string
  cudaVisibleDevices: string
}

/**
 * Resolves which cortex-cpp binary to launch for the current platform.
 *
 * macOS uses per-architecture folders directly; Windows and Linux consult
 * the persisted GPU settings file to choose between the CPU build and a
 * CUDA 11.7 / 12.0 build, and surface the selected GPU via
 * CUDA_VISIBLE_DEVICES.
 *
 * @returns the executable path and the CUDA_VISIBLE_DEVICES value to export
 */
const executableNitroFile = (): NitroExecutableOptions => {
  const nvidiaInfoFilePath = join(getJanDataFolderPath(), 'settings', 'settings.json')
  const baseFolder = join(
    getJanExtensionsPath(),
    '@janhq',
    'inference-cortex-extension',
    'dist',
    'bin'
  )

  let platformFolder: string
  let binaryName = 'cortex-cpp'
  let cudaVisibleDevices = ''

  if (process.platform === 'darwin') {
    // macOS: separate builds for Apple Silicon and Intel.
    platformFolder = process.arch === 'arm64' ? 'mac-arm64' : 'mac-amd64'
  } else {
    // Windows / Linux: <prefix>-cpu, <prefix>-cuda-11-7, <prefix>-cuda-12-0
    const prefix = process.platform === 'win32' ? 'win' : 'linux'
    if (process.platform === 'win32') {
      binaryName = 'cortex-cpp.exe'
    }

    const nvidiaInfo = JSON.parse(fs.readFileSync(nvidiaInfoFilePath, 'utf-8'))
    if (nvidiaInfo['run_mode'] === 'cpu') {
      platformFolder = `${prefix}-cpu`
    } else {
      platformFolder =
        nvidiaInfo['cuda'].version === '12' ? `${prefix}-cuda-12-0` : `${prefix}-cuda-11-7`
      cudaVisibleDevices = nvidiaInfo['gpu_highest_vram']
    }
  }

  return {
    executablePath: join(baseFolder, platformFolder, binaryName),
    cudaVisibleDevices,
  }
}
/**
 * Confirms the Nitro server finished loading the model.
 *
 * Sends a GET to the validation endpoint, retrying up to 5 times with a
 * 500 ms delay between attempts, then inspects the `model_loaded` flag.
 * Resolves on success; rejects with 'Validate model status failed'
 * otherwise.
 */
const validateModelStatus = async (): Promise<void> => {
  const fetchRT = require('fetch-retry')
  const fetchRetry = fetchRT(fetch)

  const res: Response = await fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: 'GET',
    headers: {
      'Content-Type': 'application/json',
    },
    retries: 5,
    retryDelay: 500,
  })

  log(`[SERVER]::Debug: Validate model state success with response ${JSON.stringify(res)}`)

  // A 2xx response alone is not enough — the body must report the model
  // as loaded.
  if (res.ok) {
    const body = await res.json()
    if (body.model_loaded) {
      return
    }
  }
  return Promise.reject('Validate model status failed')
}
/**
 * Posts the resolved model settings to the Nitro load-model endpoint.
 *
 * Retries up to 3 times with a 500 ms delay. Logs and re-rejects on
 * failure so callers can surface the error.
 *
 * @param settings - fully-resolved Nitro model settings
 * @returns the raw fetch Response on success
 */
const loadLLMModel = async (settings: NitroModelSettings): Promise<Response> => {
  log(`[SERVER]::Debug: Loading model with params ${JSON.stringify(settings)}`)
  const fetchRT = require('fetch-retry')
  const fetchRetry = fetchRT(fetch)

  try {
    const res = await fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(settings),
      retries: 3,
      retryDelay: 500,
    })
    log(`[SERVER]::Debug: Load model request with response ${JSON.stringify(res)}`)
    return res
  } catch (err) {
    log(`[SERVER]::Error: Load model failed with error ${err}`)
    return Promise.reject(err)
  }
}
/**
 * Run a model using the installed cortex extension.
 *
 * Delegates to the extension's own loadModel implementation so this server
 * path never drifts out of sync with the extension.
 *
 * @param model - model id; also the folder name under <janDataFolder>/models
 * @param settingParams - optional setting overrides forwarded to the extension
 */
const runModel = async (model: string, settingParams?: ModelSettingParams): Promise<void> => {
  const janDataFolderPath = getJanDataFolderPath()
  const modelFolder = join(janDataFolderPath, 'models', model)
  // Renamed from `module` to avoid shadowing the CommonJS `module` global.
  const modulePath = join(
    getJanExtensionsPath(),
    '@janhq',
    'inference-cortex-extension',
    'dist',
    'node',
    'index.cjs'
  )
  // Just reuse the cortex extension implementation — don't duplicate it and
  // risk the two copies drifting out of sync.
  return import(modulePath).then((extension) =>
    extension
      .loadModel(
        {
          modelFolder,
          model,
        },
        settingParams
      )
      .then(() => log(`[SERVER]::Debug: Model is loaded`))
    // BUG FIX: a trailing `.then({ message: 'Model started' })` was removed —
    // passing a non-function to .then() is silently ignored (dead code) and
    // contradicted the declared Promise<void> return type.
  )
}
/*
 * Stop model and kill nitro process.
 */
// NOTE(review): this span interleaves lines from TWO different versions of
// stopModel — an older path that aborts via fetch(NITRO_HTTP_KILL_URL) with
// an AbortController + 5s timeout and kills the subprocess, and a newer path
// that dynamically imports the cortex extension and calls unloadModel().
// It appears to be diff residue from a rendered commit page and is NOT valid
// TypeScript as written (e.g. `let module = join(` appears inside the
// reject() call). Restore exactly one of the two versions before building.
// Code left byte-identical below.
export const stopModel = async (_modelId: string) => {
if (!subprocess) {
return {
error: "Model isn't running",
}
}
return new Promise((resolve, reject) => {
const controller = new AbortController()
setTimeout(() => {
controller.abort()
reject({
error: 'Failed to stop model: Timedout',
let module = join(
getJanExtensionsPath(),
'@janhq',
'inference-cortex-extension',
'dist',
'node',
'index.cjs'
)
// Just reuse the cortex extension implementation, don't duplicate then lost of sync
return import(module).then((extension) =>
extension
.unloadModel()
.then(() => log(`[SERVER]::Debug: Model is unloaded`))
.then({
message: 'Model stopped',
})
}, 5000)
const tcpPortUsed = require('tcp-port-used')
log(`[SERVER]::Debug: Request to kill cortex`)
fetch(NITRO_HTTP_KILL_URL, {
method: 'DELETE',
signal: controller.signal,
})
.then(() => {
subprocess?.kill()
subprocess = undefined
})
.catch(() => {
// don't need to do anything, we still kill the subprocess
})
.then(() => tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000))
.then(() => log(`[SERVER]::Debug: Nitro process is terminated`))
.then(() =>
resolve({
message: 'Model stopped',
})
)
})
)
}

View File

@ -1,3 +1,31 @@
@echo off
set BIN_PATH=./bin
set /p CORTEX_VERSION=<./bin/version.txt
.\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cpu && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-vulkan && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx.tar.gz -e --strip 1 -o ./bin/win-cpu/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/win-vulkan/engines/cortex.llamacpp
@REM Download cortex.llamacpp binaries
set VERSION=v0.1.25
set DOWNLOAD_URL=https://github.com/janhq/cortex.llamacpp/releases/download/%VERSION%/cortex.llamacpp-0.1.25-windows-amd64
set SUBFOLDERS=win-cuda-12-0 win-cuda-11-7 win-noavx win-avx win-avx2 win-avx512 win-vulkan
call .\node_modules\.bin\download -e --strip 1 -o %BIN_PATH% https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-12-0.tar.gz -e --strip 1 -o %BIN_PATH%/win-cuda-12-0/engines/cortex.llamacpp
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-11-7.tar.gz -e --strip 1 -o %BIN_PATH%/win-cuda-11-7/engines/cortex.llamacpp
call .\node_modules\.bin\download %DOWNLOAD_URL%-noavx.tar.gz -e --strip 1 -o %BIN_PATH%/win-noavx/engines/cortex.llamacpp
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx/engines/cortex.llamacpp
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx2/engines/cortex.llamacpp
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx512.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx512/engines/cortex.llamacpp
call .\node_modules\.bin\download %DOWNLOAD_URL%-vulkan.tar.gz -e --strip 1 -o %BIN_PATH%/win-vulkan/engines/cortex.llamacpp
@REM Loop through each folder and move DLLs (excluding engine.dll)
for %%F in (%SUBFOLDERS%) do (
echo Processing folder: %BIN_PATH%\%%F
@REM Move all .dll files except engine.dll
for %%D in (%BIN_PATH%\%%F\engines\cortex.llamacpp\*.dll) do (
if /I not "%%~nxD"=="engine.dll" (
move "%%D" "%BIN_PATH%"
)
)
)
echo DLL files moved successfully.

View File

@ -0,0 +1,41 @@
#!/bin/bash
# Downloads the cortex-cpp server binary and the cortex.llamacpp engine
# variants for the current platform into ./bin/<variant>/.
# Expects a `download` CLI on PATH (the npm `download` package) and
# ./bin/version.txt pinning the cortex-cpp release.

# Read CORTEX_VERSION
CORTEX_VERSION=$(cat ./bin/version.txt)
CORTEX_RELEASE_URL="https://github.com/janhq/cortex/releases/download"

# Detect platform
OS_TYPE=$(uname)

if [ "$OS_TYPE" == "Linux" ]; then
  # Linux downloads
  download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz" -e --strip 1 -o "./bin"
  chmod +x "./bin/cortex-cpp"

  ENGINE_DOWNLOAD_URL="https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64"
  # Download engines for Linux: one folder per CPU instruction set
  # (noavx/avx/avx2/avx512), per CUDA version, and Vulkan.
  # NOTE(review): the trailing bare `1` argument on these lines (absent on
  # the macOS engine downloads below) looks accidental — confirm against the
  # `download` CLI usage.
  download "${ENGINE_DOWNLOAD_URL}-noavx.tar.gz" -e --strip 1 -o "./bin/linux-noavx/engines/cortex.llamacpp" 1
  download "${ENGINE_DOWNLOAD_URL}-avx.tar.gz" -e --strip 1 -o "./bin/linux-avx/engines/cortex.llamacpp" 1
  download "${ENGINE_DOWNLOAD_URL}-avx2.tar.gz" -e --strip 1 -o "./bin/linux-avx2/engines/cortex.llamacpp" 1
  download "${ENGINE_DOWNLOAD_URL}-avx512.tar.gz" -e --strip 1 -o "./bin/linux-avx512/engines/cortex.llamacpp" 1
  download "${ENGINE_DOWNLOAD_URL}-avx2-cuda-12-0.tar.gz" -e --strip 1 -o "./bin/linux-cuda-12-0/engines/cortex.llamacpp" 1
  download "${ENGINE_DOWNLOAD_URL}-avx2-cuda-11-7.tar.gz" -e --strip 1 -o "./bin/linux-cuda-11-7/engines/cortex.llamacpp" 1
  download "${ENGINE_DOWNLOAD_URL}-vulkan.tar.gz" -e --strip 1 -o "./bin/linux-vulkan/engines/cortex.llamacpp" 1

elif [ "$OS_TYPE" == "Darwin" ]; then
  # macOS downloads: universal server builds per architecture
  download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz" -e --strip 1 -o "./bin/mac-arm64" 1
  download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz" -e --strip 1 -o "./bin/mac-x64" 1
  chmod +x "./bin/mac-arm64/cortex-cpp"
  chmod +x "./bin/mac-x64/cortex-cpp"

  ENGINE_DOWNLOAD_URL="https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac"
  # Download engines for macOS
  download "${ENGINE_DOWNLOAD_URL}-arm64.tar.gz" -e --strip 1 -o ./bin/mac-arm64/engines/cortex.llamacpp
  download "${ENGINE_DOWNLOAD_URL}-amd64.tar.gz" -e --strip 1 -o ./bin/mac-x64/engines/cortex.llamacpp

else
  echo "Unsupported operating system: $OS_TYPE"
  exit 1
fi

View File

@ -2,7 +2,7 @@
"name": "@janhq/inference-cortex-extension",
"productName": "Cortex Inference Engine",
"version": "1.0.15",
"description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
"description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
"main": "dist/index.js",
"node": "dist/node/index.cjs.js",
"author": "Jan <service@jan.ai>",
@ -10,13 +10,11 @@
"scripts": {
"test": "jest",
"build": "tsc --module commonjs && rollup -c rollup.config.ts",
"downloadnitro:linux": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-vulkan && chmod +x ./bin/linux-vulkan/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx.tar.gz -e --strip 1 -o ./bin/linux-cpu/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/linux-vulkan/engines/cortex.llamacpp",
"downloadnitro:darwin": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-arm64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz --strip-components=1 -C ./bin/mac-arm64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz && chmod +x ./bin/mac-arm64/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-amd64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz --strip-components=1 -C ./bin/mac-amd64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz && chmod +x ./bin/mac-amd64/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-arm64.tar.gz -e --strip 1 -o ./bin/mac-arm64/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-amd64.tar.gz -e --strip 1 -o ./bin/mac-amd64/engines/cortex.llamacpp",
"downloadnitro:linux:darwin": "./download.sh",
"downloadnitro:win32": "download.bat",
"downloadnitro": "run-script-os",
"build:publish:darwin": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && ../../.github/scripts/auto-sign.sh && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
"build:publish:win32": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
"build:publish:linux": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
"build:publish:win32:linux": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
"build:publish": "yarn test && run-script-os"
},
"exports": {
@ -49,6 +47,7 @@
},
"dependencies": {
"@janhq/core": "file:../../core",
"cpu-instructions": "^0.0.13",
"decompress": "^4.2.1",
"fetch-retry": "^5.0.6",
"rxjs": "^7.8.1",
@ -68,6 +67,7 @@
"tcp-port-used",
"fetch-retry",
"@janhq/core",
"decompress"
"decompress",
"cpu-instructions"
]
}

View File

@ -96,7 +96,7 @@ export default [
llama3170bJson,
gemma22bJson,
gemma29bJson,
gemma227bJson
gemma227bJson,
]),
NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson),
@ -117,7 +117,10 @@ export default [
// Allow json resolution
json(),
// Compile TypeScript files
typescript({ useTsconfigDeclarationDir: true }),
typescript({
useTsconfigDeclarationDir: true,
exclude: ['**/__tests__', '**/*.test.ts'],
}),
// Compile TypeScript files
// Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
commonjs(),
@ -139,7 +142,7 @@ export default [
{ file: 'dist/node/index.cjs.js', format: 'cjs', sourcemap: true },
],
// Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash')
external: ['@janhq/core/node'],
external: ['@janhq/core/node', 'cpu-instructions'],
watch: {
include: 'src/node/**',
},
@ -147,7 +150,10 @@ export default [
// Allow json resolution
json(),
// Compile TypeScript files
typescript({ useTsconfigDeclarationDir: true }),
typescript({
useTsconfigDeclarationDir: true,
exclude: ['**/__tests__', '**/*.test.ts'],
}),
// Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
commonjs(),
// Allow node_modules resolution, so you can use 'external' to control
@ -156,7 +162,6 @@ export default [
resolve({
extensions: ['.ts', '.js', '.json'],
}),
// Resolve source maps to the original source
sourceMaps(),
],

View File

@ -73,6 +73,7 @@ export default class JanInferenceNitroExtension extends LocalOAIEngine {
this.registerModels(models)
super.onLoad()
// Add additional dependencies PATH to the env
executeOnMain(NODE, 'addAdditionalDependencies', {
name: this.name,
version: this.version,

View File

@ -1,7 +1,7 @@
import { describe, expect, it } from '@jest/globals'
import { executableNitroFile } from './execute'
import { GpuSetting } from '@janhq/core'
import { sep } from 'path'
import { cpuInfo } from 'cpu-instructions'
let testSettings: GpuSetting = {
run_mode: 'cpu',
@ -22,6 +22,14 @@ let testSettings: GpuSetting = {
}
const originalPlatform = process.platform
jest.mock('cpu-instructions', () => ({
cpuInfo: {
cpuInfo: jest.fn(),
},
}))
let mock = cpuInfo.cpuInfo as jest.Mock
mock.mockReturnValue([])
describe('test executable nitro file', () => {
afterAll(function () {
Object.defineProperty(process, 'platform', {
@ -38,17 +46,19 @@ describe('test executable nitro file', () => {
})
expect(executableNitroFile(testSettings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`mac-arm64${sep}cortex-cpp`),
enginePath: expect.stringContaining(`mac-arm64`),
executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-arm64/cortex-cpp`) : expect.anything(),
cudaVisibleDevices: '',
vkVisibleDevices: '',
})
)
Object.defineProperty(process, 'arch', {
value: 'amd64',
value: 'x64',
})
expect(executableNitroFile(testSettings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`mac-amd64${sep}cortex-cpp`),
enginePath: expect.stringContaining(`mac-x64`),
executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-x64/cortex-cpp`) : expect.anything(),
cudaVisibleDevices: '',
vkVisibleDevices: '',
})
@ -62,14 +72,11 @@ describe('test executable nitro file', () => {
const settings: GpuSetting = {
...testSettings,
run_mode: 'cpu',
cuda: {
exist: true,
version: '11',
},
}
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`win-cpu${sep}cortex-cpp.exe`),
enginePath: expect.stringContaining(`win`),
executablePath: expect.stringContaining(`cortex-cpp.exe`),
cudaVisibleDevices: '',
vkVisibleDevices: '',
})
@ -102,7 +109,8 @@ describe('test executable nitro file', () => {
}
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`win-cuda-11-7${sep}cortex-cpp.exe`),
enginePath: expect.stringContaining(`win-cuda-11-7`),
executablePath: expect.stringContaining(`cortex-cpp.exe`),
cudaVisibleDevices: '0',
vkVisibleDevices: '0',
})
@ -135,7 +143,8 @@ describe('test executable nitro file', () => {
}
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`win-cuda-12-0${sep}cortex-cpp.exe`),
enginePath: expect.stringContaining(`win-cuda-12-0`),
executablePath: expect.stringContaining(`cortex-cpp.exe`),
cudaVisibleDevices: '0',
vkVisibleDevices: '0',
})
@ -152,7 +161,8 @@ describe('test executable nitro file', () => {
}
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`linux-cpu${sep}cortex-cpp`),
enginePath: expect.stringContaining(`linux`),
executablePath: expect.stringContaining(`cortex-cpp`),
cudaVisibleDevices: '',
vkVisibleDevices: '',
})
@ -185,7 +195,8 @@ describe('test executable nitro file', () => {
}
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`linux-cuda-11-7${sep}cortex-cpp`),
enginePath: expect.stringContaining(`linux-cuda-11-7`),
executablePath: expect.stringContaining(`cortex-cpp`),
cudaVisibleDevices: '0',
vkVisibleDevices: '0',
})
@ -218,10 +229,203 @@ describe('test executable nitro file', () => {
}
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
executablePath: expect.stringContaining(`linux-cuda-12-0${sep}cortex-cpp`),
enginePath: expect.stringContaining(`linux-cuda-12-0`),
executablePath: expect.stringContaining(`cortex-cpp`),
cudaVisibleDevices: '0',
vkVisibleDevices: '0',
})
)
})
// Generate test for different cpu instructions on Linux
it(`executes on Linux CPU with different instructions`, () => {
Object.defineProperty(process, 'platform', {
value: 'linux',
})
const settings: GpuSetting = {
...testSettings,
run_mode: 'cpu',
}
const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
cpuInstructions.forEach((instruction) => {
mock.mockReturnValue([instruction])
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
enginePath: expect.stringContaining(`linux-${instruction}`),
executablePath: expect.stringContaining(`cortex-cpp`),
cudaVisibleDevices: '',
vkVisibleDevices: '',
})
)
})
})
// Generate test for different cpu instructions on Windows
it(`executes on Windows CPU with different instructions`, () => {
Object.defineProperty(process, 'platform', {
value: 'win32',
})
const settings: GpuSetting = {
...testSettings,
run_mode: 'cpu',
}
const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
cpuInstructions.forEach((instruction) => {
mock.mockReturnValue([instruction])
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
enginePath: expect.stringContaining(`win-${instruction}`),
executablePath: expect.stringContaining(`cortex-cpp.exe`),
cudaVisibleDevices: '',
vkVisibleDevices: '',
})
)
})
})
// Generate test for different cpu instructions on Windows
it(`executes on Windows GPU with different instructions`, () => {
Object.defineProperty(process, 'platform', {
value: 'win32',
})
const settings: GpuSetting = {
...testSettings,
run_mode: 'gpu',
cuda: {
exist: true,
version: '12',
},
nvidia_driver: {
exist: true,
version: '12',
},
gpus_in_use: ['0'],
gpus: [
{
id: '0',
name: 'NVIDIA GeForce GTX 1080',
vram: '80000000',
},
],
}
const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
cpuInstructions.forEach((instruction) => {
mock.mockReturnValue([instruction])
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
enginePath: expect.stringContaining(`win-cuda-12-0`),
executablePath: expect.stringContaining(`cortex-cpp.exe`),
cudaVisibleDevices: '0',
vkVisibleDevices: '0',
})
)
})
})
// Generate test for different cpu instructions on Linux
it(`executes on Linux GPU with different instructions`, () => {
Object.defineProperty(process, 'platform', {
value: 'linux',
})
const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
const settings: GpuSetting = {
...testSettings,
run_mode: 'gpu',
cuda: {
exist: true,
version: '12',
},
nvidia_driver: {
exist: true,
version: '12',
},
gpus_in_use: ['0'],
gpus: [
{
id: '0',
name: 'NVIDIA GeForce GTX 1080',
vram: '80000000',
},
],
}
cpuInstructions.forEach((instruction) => {
mock.mockReturnValue([instruction])
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
enginePath: expect.stringContaining(`linux-cuda-12-0`),
executablePath: expect.stringContaining(`cortex-cpp`),
cudaVisibleDevices: '0',
vkVisibleDevices: '0',
})
)
})
})
// Generate test for different cpu instructions on Linux
it(`executes on Linux Vulkan should not have CPU instructions included`, () => {
Object.defineProperty(process, 'platform', {
value: 'linux',
})
const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
const settings: GpuSetting = {
...testSettings,
run_mode: 'gpu',
vulkan: true,
cuda: {
exist: true,
version: '12',
},
nvidia_driver: {
exist: true,
version: '12',
},
gpus_in_use: ['0'],
gpus: [
{
id: '0',
name: 'NVIDIA GeForce GTX 1080',
vram: '80000000',
},
],
}
cpuInstructions.forEach((instruction) => {
mock.mockReturnValue([instruction])
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
enginePath: expect.stringContaining(`linux-vulkan`),
executablePath: expect.stringContaining(`cortex-cpp`),
cudaVisibleDevices: '0',
vkVisibleDevices: '0',
})
)
})
})
// Generate test for different cpu instructions on MacOS
it(`executes on MacOS with different instructions`, () => {
Object.defineProperty(process, 'platform', {
value: 'darwin',
})
const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
cpuInstructions.forEach(() => {
Object.defineProperty(process, 'platform', {
value: 'darwin',
})
const settings: GpuSetting = {
...testSettings,
run_mode: 'cpu',
}
mock.mockReturnValue([])
expect(executableNitroFile(settings)).toEqual(
expect.objectContaining({
enginePath: expect.stringContaining(`mac-x64`),
executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-x64/cortex-cpp`) : expect.anything(),
cudaVisibleDevices: '',
vkVisibleDevices: '',
})
)
})
})
})

View File

@ -1,37 +1,59 @@
import { GpuSetting } from '@janhq/core'
import * as path from 'path'
import { cpuInfo } from 'cpu-instructions'
export interface NitroExecutableOptions {
enginePath: string
executablePath: string
cudaVisibleDevices: string
vkVisibleDevices: string
}
const runMode = (settings?: GpuSetting): string => {
/**
* The GPU runMode that will be set - either 'vulkan', 'cuda', or empty for cpu.
* @param settings
* @returns
*/
const gpuRunMode = (settings?: GpuSetting): string => {
if (process.platform === 'darwin')
// MacOS now has universal binaries
return ''
if (!settings) return 'cpu'
if (!settings) return ''
return settings.vulkan === true
? 'vulkan'
: settings.run_mode === 'cpu'
? 'cpu'
? ''
: 'cuda'
}
/**
* The OS & architecture that the current process is running on.
* @returns win, mac-x64, mac-arm64, or linux
*/
const os = (): string => {
return process.platform === 'win32'
? 'win'
: process.platform === 'darwin'
? process.arch === 'arm64' ? 'mac-arm64' : 'mac-amd64'
? process.arch === 'arm64'
? 'mac-arm64'
: 'mac-x64'
: 'linux'
}
/**
* The cortex.cpp extension based on the current platform.
* @returns .exe if on Windows, otherwise an empty string.
*/
const extension = (): '.exe' | '' => {
return process.platform === 'win32' ? '.exe' : ''
}
/**
* The CUDA version that will be set - either '11-7' or '12-0'.
* @param settings
* @returns
*/
const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
const isUsingCuda =
settings?.vulkan !== true && settings?.run_mode === 'gpu' && os() !== 'mac'
@ -40,6 +62,21 @@ const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
return settings?.cuda?.version === '11' ? '11-7' : '12-0'
}
/**
* The CPU instructions that will be set - either 'avx512', 'avx2', 'avx', or 'noavx'.
* @returns
*/
const cpuInstructions = () => {
if (process.platform === 'darwin') return ''
return cpuInfo.cpuInfo().some((e) => e.toUpperCase() === 'AVX512')
? 'avx512'
: cpuInfo.cpuInfo().some((e) => e.toUpperCase() === 'AVX2')
? 'avx2'
: cpuInfo.cpuInfo().some((e) => e.toUpperCase() === 'AVX')
? 'avx'
: 'noavx'
}
/**
* Find which executable file to run based on the current platform.
* @returns The name of the executable file to run.
@ -47,15 +84,26 @@ const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
export const executableNitroFile = (
gpuSetting?: GpuSetting
): NitroExecutableOptions => {
let binaryFolder = [os(), runMode(gpuSetting), cudaVersion(gpuSetting)]
let engineFolder = [
os(),
...(gpuSetting?.vulkan
? []
: [
gpuRunMode(gpuSetting) !== 'cuda' ? cpuInstructions() : '',
gpuRunMode(gpuSetting),
cudaVersion(gpuSetting),
]),
gpuSetting?.vulkan ? 'vulkan' : undefined,
]
.filter((e) => !!e)
.join('-')
let cudaVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
let vkVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
let binaryName = `cortex-cpp${extension()}`
let binaryName = `${process.platform === 'darwin' ? `${os()}/` : ''}cortex-cpp${extension()}`
return {
executablePath: path.join(__dirname, '..', 'bin', binaryFolder, binaryName),
enginePath: path.join(__dirname, '..', 'bin', engineFolder),
executablePath: path.join(__dirname, '..', 'bin', binaryName),
cudaVisibleDevices,
vkVisibleDevices,
}

View File

@ -0,0 +1,465 @@
// ---------------------------------------------------------------------------
// Module mocks. Declared before the imports below so ./index is loaded with
// every external dependency already stubbed out.
// ---------------------------------------------------------------------------

// fetch-retry: every HTTP request succeeds and reports the model as loaded.
jest.mock('fetch-retry', () => ({
  default: () => () => {
    return Promise.resolve({
      ok: true,
      status: 200,
      json: () =>
        Promise.resolve({
          model_loaded: true,
        }),
      text: () => Promise.resolve(''),
    })
  },
}))

// path: minimal stub — parse() always reports a fixed parent dir and
// delimiter.concat() yields '', so PATH mutation becomes a no-op in tests.
jest.mock('path', () => ({
  default: {
    isAbsolute: jest.fn(),
    join: jest.fn(),
    parse: () => {
      return { dir: 'dir' }
    },
    delimiter: { concat: () => '' },
  },
}))

// decompress: archive extraction always succeeds.
jest.mock('decompress', () => ({
  default: () => {
    return Promise.resolve()
  },
}))

// @janhq/core/node: keep the real module but pin the data folder to '' and
// report a fixed single-core / single-GPU system so spawn args are stable.
jest.mock('@janhq/core/node', () => ({
  ...jest.requireActual('@janhq/core/node'),
  getJanDataFolderPath: () => '',
  getSystemResourceInfo: () => {
    return {
      cpu: {
        cores: 1,
        logicalCores: 1,
        threads: 1,
        model: 'model',
        speed: 1,
      },
      memory: {
        total: 1,
        free: 1,
      },
      gpu: {
        model: 'model',
        memory: 1,
        cuda: {
          version: 'version',
          devices: 'devices',
        },
        vulkan: {
          version: 'version',
          devices: 'devices',
        },
      },
    }
  },
}))

// fs: the model folder listing is always empty.
jest.mock('fs', () => ({
  default: {
    readdirSync: () => [],
  },
}))

// child_process: exec/spawn return inert stream-like stubs; spawn exposes a
// fake pid so the dispose/kill path has something to terminate.
jest.mock('child_process', () => ({
  exec: () => {
    return {
      stdout: { on: jest.fn() },
      stderr: { on: jest.fn() },
      on: jest.fn(),
    }
  },
  spawn: () => {
    return {
      stdout: { on: jest.fn() },
      stderr: { on: jest.fn() },
      on: jest.fn(),
      pid: '111',
    }
  },
}))

// tcp-port-used: the port is always free and the server always comes up.
jest.mock('tcp-port-used', () => ({
  default: {
    waitUntilFree: () => Promise.resolve(true),
    waitUntilUsed: () => Promise.resolve(true),
  },
}))

// ./execute: resolve fixed executable/engine paths so no real binary lookup
// happens during the tests.
jest.mock('./execute', () => ({
  executableNitroFile: () => {
    return {
      enginePath: 'enginePath',
      executablePath: 'executablePath',
      cudaVisibleDevices: 'cudaVisibleDevices',
      vkVisibleDevices: 'vkVisibleDevices',
    }
  },
}))

// terminate: invoke the completion callback immediately so subprocess kill
// resolves synchronously.
// Fix: use primitive `string` and a concrete function type instead of the
// boxed `String` wrapper and the catch-all `Function` type.
jest.mock('terminate', () => ({
  default: (id: string, func: () => void) => {
    console.log(id)
    func()
  },
}))
import * as execute from './execute'
import index from './index'

// Alias so jest.spyOn(executeMock, 'executableNitroFile') reads clearly in
// the tests below. Never reassigned — declared const (was `let`).
const executeMock = execute

// Minimal-but-valid model payload shared by the tests. Typed `any` because
// individual tests mutate fields (e.g. `engine`, `settings.ngl`) in place.
const modelInitOptions: any = {
  modelFolder: '/path/to/model',
  model: {
    id: 'test',
    name: 'test',
    engine: 'nitro',
    version: '0.0',
    format: 'GGUF',
    object: 'model',
    sources: [],
    created: 0,
    description: 'test',
    parameters: {},
    metadata: {
      author: '',
      tags: [],
      size: 0,
    },
    settings: {
      prompt_template: '{prompt}',
      llama_model_path: 'model.gguf',
    },
  },
}
// Exercises the public loadModel entry point against the fully-mocked
// environment declared at the top of this file. Tests share (and mutate)
// modelInitOptions, so their order matters.
describe('loadModel', () => {
  it('should load a model successfully', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Call the loadModel function
    const result = await index.loadModel(modelInitOptions, systemInfo)
    // loadModel resolves with undefined on success
    expect(result).toBeUndefined()
  })
  it('should reject with an error message if the model is not a nitro model', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Temporarily mark the shared fixture as a non-nitro engine
    modelInitOptions.model.engine = 'not-nitro'
    // Call the loadModel function
    try {
      await index.loadModel(modelInitOptions, systemInfo)
    } catch (error) {
      // NOTE(review): without expect.assertions(1) this test still passes if
      // loadModel resolves instead of rejecting — confirm the rejection path.
      expect(error).toBe('Not a cortex model')
    }
    // Restore the engine so later tests see the original value
    modelInitOptions.model.engine = 'nitro'
  })
  it('should reject if model load failed with an error message', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Mock the fetch-retry module to return a failed response
    // NOTE(review): jest.mock inside a test body is not hoisted and ./index
    // was already imported with the top-of-file mock, so this factory likely
    // has no effect here — the catch branch may never run (vacuous pass).
    jest.mock('fetch-retry', () => ({
      default: () => () => {
        return Promise.resolve({
          ok: false,
          status: 500,
          json: () =>
            Promise.resolve({
              model_loaded: false,
            }),
          text: () => Promise.resolve('Failed to load model'),
        })
      },
    }))
    // Call the loadModel function
    try {
      await index.loadModel(modelInitOptions, systemInfo)
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Failed to load model')
    }
  })
  it('should reject if port not available', async () => {
    // Mock the necessary parameters and system information
    const systemInfo = {
      // Mock the system information if needed
    }
    // Mock the tcp-port-used module to return false
    // NOTE(review): same caveat as above — this in-test jest.mock does not
    // replace the already-imported module; the assertion may never execute.
    jest.mock('tcp-port-used', () => ({
      default: {
        waitUntilFree: () => Promise.resolve(false),
        waitUntilUsed: () => Promise.resolve(false),
      },
    }))
    // Call the loadModel function
    try {
      await index.loadModel(modelInitOptions, systemInfo)
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Port not available')
    }
  })
  it('should run on GPU model if ngl is set', async () => {
    const systemInfo: any = {
      gpuSetting: {
        run_mode: 'gpu',
      },
    }
    // Spy executableNitroFile so we can inspect the GpuSetting it receives
    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
      enginePath: '',
      executablePath: '',
      cudaVisibleDevices: '',
      vkVisibleDevices: '',
    })
    Object.defineProperty(process, 'platform', { value: 'win32' })
    await index.loadModel(
      {
        ...modelInitOptions,
        model: {
          ...modelInitOptions.model,
          settings: {
            ...modelInitOptions.model.settings,
            ngl: 40,
          },
        },
      },
      systemInfo
    )
    // ngl > 0 keeps the requested GPU run mode
    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
      run_mode: 'gpu',
    })
  })
  it('should run on correct CPU instructions if ngl is not set', async () => {
    const systemInfo: any = {
      gpuSetting: {
        run_mode: 'gpu',
      },
    }
    // Spy executableNitroFile
    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
      enginePath: '',
      executablePath: '',
      cudaVisibleDevices: '',
      vkVisibleDevices: '',
    })
    Object.defineProperty(process, 'platform', { value: 'win32' })
    await index.loadModel(
      {
        ...modelInitOptions,
        model: {
          ...modelInitOptions.model,
          settings: {
            ...modelInitOptions.model.settings,
            ngl: undefined,
          },
        },
      },
      systemInfo
    )
    // An undefined ngl must force CPU run mode (the slow-CPU-inference fix)
    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
      run_mode: 'cpu',
    })
  })
  it('should run on correct CPU instructions if ngl is 0', async () => {
    const systemInfo: any = {
      gpuSetting: {
        run_mode: 'gpu',
      },
    }
    // Spy executableNitroFile
    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
      enginePath: '',
      executablePath: '',
      cudaVisibleDevices: '',
      vkVisibleDevices: '',
    })
    Object.defineProperty(process, 'platform', { value: 'win32' })
    await index.loadModel(
      {
        ...modelInitOptions,
        model: {
          ...modelInitOptions.model,
          settings: {
            ...modelInitOptions.model.settings,
            ngl: 0,
          },
        },
      },
      systemInfo
    )
    // ngl === 0 (no layers offloaded) must also force CPU run mode
    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
      run_mode: 'cpu',
    })
  })
})
// Exercises unloadModel. The global fetch mock reports success, so the
// failure branches below depend on in-test jest.mock calls (see notes).
describe('unloadModel', () => {
  it('should unload a model successfully', async () => {
    // Call the unloadModel function
    const result = await index.unloadModel()
    // unloadModel resolves with undefined on success
    expect(result).toBeUndefined()
  })
  it('should reject with an error message if the model is not a nitro model', async () => {
    // Call the unloadModel function
    try {
      await index.unloadModel()
    } catch (error) {
      // NOTE(review): unloadModel succeeds under the global mocks, so this
      // catch branch is likely never reached — the test passes vacuously.
      expect(error).toBe('Not a cortex model')
    }
  })
  it('should reject if model unload failed with an error message', async () => {
    // Mock the fetch-retry module to return a failed response
    // NOTE(review): jest.mock inside a test body does not replace the
    // already-imported module; confirm this failure path is really exercised.
    jest.mock('fetch-retry', () => ({
      default: () => () => {
        return Promise.resolve({
          ok: false,
          status: 500,
          json: () =>
            Promise.resolve({
              model_unloaded: false,
            }),
          text: () => Promise.resolve('Failed to unload model'),
        })
      },
    }))
    // Call the unloadModel function
    try {
      await index.unloadModel()
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Failed to unload model')
    }
  })
  it('should reject if port not available', async () => {
    // Mock the tcp-port-used module to return false
    // NOTE(review): same in-test jest.mock caveat as above.
    jest.mock('tcp-port-used', () => ({
      default: {
        waitUntilFree: () => Promise.resolve(false),
        waitUntilUsed: () => Promise.resolve(false),
      },
    }))
    // Call the unloadModel function
    try {
      await index.unloadModel()
    } catch (error) {
      // Assert that the error message is as expected
      expect(error).toBe('Port not available')
    }
  })
})
// dispose() behaviour differs per platform, so process.platform is
// overridden before each call. These overrides persist beyond this describe.
describe('dispose', () => {
  it('should dispose a model successfully on Mac', async () => {
    Object.defineProperty(process, 'platform', {
      value: 'darwin',
    })
    // Call the dispose function
    const result = await index.dispose()
    // dispose resolves with undefined on success
    expect(result).toBeUndefined()
  })
  it('should kill the subprocess successfully on Windows', async () => {
    Object.defineProperty(process, 'platform', {
      value: 'win32',
    })
    // Call the kill-subprocess path; the mocked terminate() invokes its
    // callback immediately, so this resolves without a real process.
    const result = await index.dispose()
    // Assert that the result is as expected
    expect(result).toBeUndefined()
  })
})
// The spawned (mocked) subprocess is still tracked, so the process info
// must report it as running.
describe('getCurrentNitroProcessInfo', () => {
  it('should return the current nitro process info', async () => {
    const processInfo = await index.getCurrentNitroProcessInfo()
    expect(processInfo).toEqual({ isRunning: true })
  })
})
// decompressRunner extracts the bundled engine archive; decompress is mocked
// at the top of this file to always resolve.
describe('decompressRunner', () => {
  it('should decompress the runner successfully', async () => {
    // NOTE(review): this in-test jest.mock is redundant — the top-of-file
    // mock already makes decompress resolve, and a late jest.mock does not
    // replace an already-imported module anyway.
    jest.mock('decompress', () => ({
      default: () => {
        return Promise.resolve()
      },
    }))
    // Call the decompressRunner function
    const result = await index.decompressRunner('', '')
    // Assert that the result is as expected
    expect(result).toBeUndefined()
  })
  it('should not reject if decompression failed', async () => {
    // NOTE(review): same caveat — this rejection mock likely has no effect;
    // the test still documents that decompress failures must be swallowed.
    jest.mock('decompress', () => ({
      default: () => {
        return Promise.reject('Failed to decompress')
      },
    }))
    // Call the decompressRunner function
    const result = await index.decompressRunner('', '')
    expect(result).toBeUndefined()
  })
})
// addAdditionalDependencies should complete without a return value for a
// well-formed dependency descriptor.
describe('addAdditionalDependencies', () => {
  it('should add additional dependencies successfully', async () => {
    const dependency = { name: 'name', version: 'version' }
    const outcome = await index.addAdditionalDependencies(dependency)
    expect(outcome).toBeUndefined()
  })
})

View File

@ -263,10 +263,10 @@ async function validateModelStatus(modelId: string): Promise<void> {
log(`[CORTEX]::Debug: Validating model ${modelId}`)
return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
method: 'POST',
body: JSON.stringify({
body: JSON.stringify({
model: modelId,
// TODO: force to use cortex llamacpp by default
engine: 'cortex.llamacpp'
engine: 'cortex.llamacpp',
}),
headers: {
'Content-Type': 'application/json',
@ -365,14 +365,37 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
log(`[CORTEX]::Debug: Spawning cortex subprocess...`)
return new Promise<void>(async (resolve, reject) => {
let executableOptions = executableNitroFile(systemInfo?.gpuSetting)
let executableOptions = executableNitroFile(
// If ngl is not set or equal to 0, run on CPU with correct instructions
systemInfo?.gpuSetting
? {
...systemInfo.gpuSetting,
run_mode:
currentSettings?.ngl === undefined || currentSettings.ngl === 0
? 'cpu'
: systemInfo.gpuSetting.run_mode,
}
: undefined
)
const args: string[] = ['1', LOCAL_HOST, PORT.toString()]
// Execute the binary
log(
`[CORTEX]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
)
log(path.parse(executableOptions.executablePath).dir)
log(`[CORTEX]::Debug: Cortex engine path: ${executableOptions.enginePath}`)
// Add engine path to the PATH and LD_LIBRARY_PATH
process.env.PATH = (process.env.PATH || '').concat(
path.delimiter,
executableOptions.enginePath
)
log(`[CORTEX] PATH: ${process.env.PATH}`)
process.env.LD_LIBRARY_PATH = (process.env.LD_LIBRARY_PATH || '').concat(
path.delimiter,
executableOptions.enginePath
)
subprocess = spawn(
executableOptions.executablePath,
['1', LOCAL_HOST, PORT.toString()],
@ -380,6 +403,7 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
cwd: path.join(path.parse(executableOptions.executablePath).dir),
env: {
...process.env,
ENGINE_PATH: executableOptions.enginePath,
CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
// Vulkan - Support 1 device at a time for now
...(executableOptions.vkVisibleDevices?.length > 0 && {
@ -440,12 +464,19 @@ const getCurrentNitroProcessInfo = (): NitroProcessInfo => {
}
const addAdditionalDependencies = (data: { name: string; version: string }) => {
log(
`[CORTEX]::Debug: Adding additional dependencies for ${data.name} ${data.version}`
)
const additionalPath = path.delimiter.concat(
path.join(getJanDataFolderPath(), 'engines', data.name, data.version)
)
// Set the updated PATH
process.env.PATH = (process.env.PATH || '').concat(additionalPath)
process.env.PATH = (process.env.PATH || '').concat(
path.delimiter,
additionalPath
)
process.env.LD_LIBRARY_PATH = (process.env.LD_LIBRARY_PATH || '').concat(
path.delimiter,
additionalPath
)
}

View File

@ -15,5 +15,6 @@
"importHelpers": true,
"typeRoots": ["node_modules/@types"]
},
"include": ["src"]
"include": ["src"],
"exclude": ["src/**/*.test.ts"]
}