refactor: introduce node module in nitro extension (#1630)

parent db987e88f9
commit f4f861d0e9
@@ -15,13 +15,6 @@
    "dist"
  ],
  "author": "Jan <service@jan.ai>",
  "repository": {
    "type": "git",
    "url": ""
  },
  "engines": {
    "node": ">=6.0.0"
  },
  "exports": {
    ".": "./dist/core.umd.js",
    "./sdk": "./dist/core.umd.js",
@@ -49,53 +42,6 @@
    "build": "tsc --module commonjs && rollup -c rollup.config.ts",
    "start": "rollup -c rollup.config.ts -w"
  },
  "lint-staged": {
    "{src,test}/**/*.ts": [
      "prettier --write",
      "git add"
    ]
  },
  "config": {
    "commitizen": {
      "path": "node_modules/cz-conventional-changelog"
    }
  },
  "jest": {
    "transform": {
      ".(ts|tsx)": "ts-jest"
    },
    "testEnvironment": "node",
    "testRegex": "(/__tests__/.*|\\.(test|spec))\\.(ts|tsx|js)$",
    "moduleFileExtensions": [
      "ts",
      "tsx",
      "js"
    ],
    "coveragePathIgnorePatterns": [
      "/node_modules/",
      "/test/"
    ],
    "coverageThreshold": {
      "global": {
        "branches": 90,
        "functions": 95,
        "lines": 95,
        "statements": 95
      }
    },
    "collectCoverageFrom": [
      "src/*.{js,ts}"
    ]
  },
  "prettier": {
    "semi": false,
    "singleQuote": true
  },
  "commitlint": {
    "extends": [
      "@commitlint/config-conventional"
    ]
  },
  "devDependencies": {
    "@types/node": "^12.0.2",
    "rollup": "^2.38.5",
@@ -104,7 +50,6 @@
    "rollup-plugin-node-resolve": "^5.2.0",
    "rollup-plugin-sourcemaps": "^0.6.3",
    "rollup-plugin-typescript2": "^0.36.0",
    "ts-node": "^7.0.1",
    "tslib": "^2.6.2",
    "typescript": "^5.2.2"
  }
@@ -104,6 +104,9 @@ export type ModelSettingParams = {
  n_parallel?: number
  cpu_threads?: number
  prompt_template?: string
  system_prompt?: string
  ai_prompt?: string
  user_prompt?: string
}

/**
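The new optional fields let a model definition carry a raw prompt_template alongside its pre-split parts. A hedged sketch of a value using these fields (the template string and thread count are illustrative, not taken from the commit):

// Illustrative ModelSettingParams value exercising the new prompt fields.
const settings: ModelSettingParams = {
  n_parallel: 1,
  cpu_threads: 4,
  prompt_template: "{system_message} [INST] {prompt} [/INST]",
  // Derived from prompt_template at load time by promptTemplateConverter
  // in src/node/index.ts (added later in this diff):
  system_prompt: "",
  user_prompt: " [INST] ",
  ai_prompt: " [/INST]",
}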
@@ -3,11 +3,11 @@
  "version": "1.0.0",
  "description": "This extension embeds Nitro, a lightweight (3mb) inference engine written in C++. See nitro.jan.ai",
  "main": "dist/index.js",
  "module": "dist/module.js",
  "node": "dist/node/index.cjs.js",
  "author": "Jan <service@jan.ai>",
  "license": "AGPL-3.0",
  "scripts": {
    "build": "tsc -b . && webpack --config webpack.config.js",
    "build": "tsc --module commonjs && rollup -c rollup.config.ts",
    "downloadnitro:linux": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/nitro",
    "downloadnitro:darwin": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-arm64.tar.gz -e --strip 1 -o ./bin/mac-arm64 && chmod +x ./bin/mac-arm64/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-amd64.tar.gz -e --strip 1 -o ./bin/mac-x64 && chmod +x ./bin/mac-x64/nitro",
    "downloadnitro:win32": "download.bat",
@@ -19,24 +19,33 @@
  },
  "exports": {
    ".": "./dist/index.js",
    "./main": "./dist/module.js"
    "./main": "./dist/node/index.cjs.js"
  },
  "devDependencies": {
    "@rollup/plugin-commonjs": "^25.0.7",
    "@rollup/plugin-json": "^6.1.0",
    "@rollup/plugin-node-resolve": "^15.2.3",
    "@types/node": "^20.11.4",
    "@types/tcp-port-used": "^1.0.4",
    "cpx": "^1.5.0",
    "download-cli": "^1.1.1",
    "rimraf": "^3.0.2",
    "rollup": "^2.38.5",
    "rollup-plugin-define": "^1.0.1",
    "rollup-plugin-sourcemaps": "^0.6.3",
    "rollup-plugin-typescript2": "^0.36.0",
    "run-script-os": "^1.1.6",
    "webpack": "^5.88.2",
    "webpack-cli": "^5.1.4"
    "typescript": "^5.3.3"
  },
  "dependencies": {
    "@janhq/core": "file:../../core",
    "download-cli": "^1.1.1",
    "@rollup/plugin-replace": "^5.0.5",
    "@types/os-utils": "^0.0.4",
    "fetch-retry": "^5.0.6",
    "os-utils": "^0.0.14",
    "path-browserify": "^1.0.1",
    "rxjs": "^7.8.1",
    "tcp-port-used": "^1.0.2",
    "ts-loader": "^9.5.0",
    "ulid": "^2.3.0"
  },
  "engines": {
extensions/inference-nitro-extension/rollup.config.ts (new file, 77 lines)
@@ -0,0 +1,77 @@
import resolve from "@rollup/plugin-node-resolve";
import commonjs from "@rollup/plugin-commonjs";
import sourceMaps from "rollup-plugin-sourcemaps";
import typescript from "rollup-plugin-typescript2";
import json from "@rollup/plugin-json";
import replace from "@rollup/plugin-replace";
const packageJson = require("./package.json");

const pkg = require("./package.json");

export default [
  {
    input: `src/index.ts`,
    output: [{ file: pkg.main, format: "es", sourcemap: true }],
    // Indicate here external modules you don't want to include in your bundle (i.e.: 'lodash')
    external: [],
    watch: {
      include: "src/**",
    },
    plugins: [
      replace({
        NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
        INFERENCE_URL: JSON.stringify(
          process.env.INFERENCE_URL ||
            "http://127.0.0.1:3928/inferences/llamacpp/chat_completion"
        ),
        TROUBLESHOOTING_URL: JSON.stringify(
          "https://jan.ai/guides/troubleshooting"
        ),
      }),
      // Allow json resolution
      json(),
      // Compile TypeScript files
      typescript({ useTsconfigDeclarationDir: true }),
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
      // Allow node_modules resolution, so you can use 'external' to control
      // which external modules to include in the bundle
      // https://github.com/rollup/rollup-plugin-node-resolve#usage
      resolve({
        extensions: [".js", ".ts", ".svelte"],
      }),

      // Resolve source maps to the original source
      sourceMaps(),
    ],
  },
  {
    input: `src/node/index.ts`,
    output: [
      { file: "dist/node/index.cjs.js", format: "cjs", sourcemap: true },
    ],
    // Indicate here external modules you don't want to include in your bundle (i.e.: 'lodash')
    external: ["@janhq/core/node"],
    watch: {
      include: "src/node/**",
    },
    plugins: [
      // Allow json resolution
      json(),
      // Compile TypeScript files
      typescript({ useTsconfigDeclarationDir: true }),
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
      // Allow node_modules resolution, so you can use 'external' to control
      // which external modules to include in the bundle
      // https://github.com/rollup/rollup-plugin-node-resolve#usage
      resolve({
        extensions: [".ts", ".js", ".json"],
      }),

      // Resolve source maps to the original source
      sourceMaps(),
    ],
  },
];
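At build time, @rollup/plugin-replace rewrites the bare identifiers NODE, INFERENCE_URL and TROUBLESHOOTING_URL into string literals, taking over the job webpack.DefinePlugin did in the deleted webpack.config.js at the end of this diff. A sketch of the effect on the renderer bundle, assuming the package is named "@janhq/inference-nitro-extension" (the name field is not shown in this diff):

// In src/index.ts the call is written against the bare identifier:
await executeOnMain(NODE, "updateNvidiaInfo", {});

// After the replace() pass, dist/index.js roughly contains
// `${packageJson.name}/${packageJson.node}` inlined as a literal:
await executeOnMain(
  "@janhq/inference-nitro-extension/dist/node/index.cjs.js",
  "updateNvidiaInfo",
  {}
);

The `declare const NODE: string` change in the next hunk is what lets the TypeScript compiler accept the identifier before it is replaced.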
@@ -1,4 +1,4 @@
declare const MODULE: string;
declare const NODE: string;
declare const INFERENCE_URL: string;
declare const TROUBLESHOOTING_URL: string;
@@ -26,7 +26,6 @@ import {
} from "@janhq/core";
import { requestInference } from "./helpers/sse";
import { ulid } from "ulid";
import { join } from "path";

/**
 * A class that implements the InferenceExtension interface from the @janhq/core package.
@@ -43,7 +42,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
   */
  private static readonly _intervalHealthCheck = 5 * 1000;

  private _currentModel: Model;
  private _currentModel: Model | undefined;

  private _engineSettings: EngineSettings = {
    ctx_len: 2048,
@@ -82,7 +81,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    if (!(await fs.existsSync(JanInferenceNitroExtension._homeDir))) {
      await fs
        .mkdirSync(JanInferenceNitroExtension._homeDir)
        .catch((err) => console.debug(err));
        .catch((err: Error) => console.debug(err));
    }

    if (!(await fs.existsSync(JanInferenceNitroExtension._settingsDir)))
@@ -90,7 +89,9 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    this.writeDefaultEngineSettings();

    // Events subscription
    events.on(EventName.OnMessageSent, (data) => this.onMessageRequest(data));
    events.on(EventName.OnMessageSent, (data: MessageRequest) =>
      this.onMessageRequest(data)
    );

    events.on(EventName.OnModelInit, (model: Model) => this.onModelInit(model));

@@ -99,7 +100,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    events.on(EventName.OnInferenceStopped, () => this.onInferenceStopped());

    // Attempt to fetch nvidia info
    await executeOnMain(MODULE, "updateNvidiaInfo", {});
    await executeOnMain(NODE, "updateNvidiaInfo", {});
  }

  /**
@@ -109,10 +110,10 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
  private async writeDefaultEngineSettings() {
    try {
      const engineFile = join(
      const engineFile = await joinPath([
        JanInferenceNitroExtension._homeDir,
        JanInferenceNitroExtension._engineMetadataFileName
      );
        JanInferenceNitroExtension._engineMetadataFileName,
      ]);
      if (await fs.existsSync(engineFile)) {
        const engine = await fs.readFileSync(engineFile, "utf-8");
        this._engineSettings =
@@ -133,12 +134,12 @@ export default class JanInferenceNitroExtension implements InferenceExtension {

    const modelFullPath = await joinPath(["models", model.id]);

    const nitroInitResult = await executeOnMain(MODULE, "initModel", {
      modelFullPath: modelFullPath,
      model: model,
    const nitroInitResult = await executeOnMain(NODE, "runModel", {
      modelFullPath,
      model,
    });

    if (nitroInitResult.error === null) {
    if (nitroInitResult?.error) {
      events.emit(EventName.OnModelFail, model);
      return;
    }
@@ -155,12 +156,11 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
  private async onModelStop(model: Model) {
    if (model.engine !== "nitro") return;

    await executeOnMain(MODULE, "stopModel");
    await executeOnMain(NODE, "stopModel");
    events.emit(EventName.OnModelStopped, {});

    // stop the periodic health check
    if (this.getNitroProcesHealthIntervalId) {
      console.debug("Stop calling Nitro process health check");
      clearInterval(this.getNitroProcesHealthIntervalId);
      this.getNitroProcesHealthIntervalId = undefined;
    }
@@ -170,7 +170,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
   * Periodically check for nitro process's health.
   */
  private async periodicallyGetNitroHealth(): Promise<void> {
    const health = await executeOnMain(MODULE, "getCurrentNitroProcessInfo");
    const health = await executeOnMain(NODE, "getCurrentNitroProcessInfo");

    const isRunning = this.nitroProcessInfo?.isRunning ?? false;
    if (isRunning && health.isRunning === false) {
@@ -204,6 +204,8 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    };

    return new Promise(async (resolve, reject) => {
      if (!this._currentModel) return Promise.reject("No model loaded");

      requestInference(data.messages ?? [], this._currentModel).subscribe({
        next: (_content) => {},
        complete: async () => {
@@ -223,7 +225,9 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
   * @param {MessageRequest} data - The data for the new message request.
   */
  private async onMessageRequest(data: MessageRequest) {
    if (data.model.engine !== "nitro") return;
    if (data.model?.engine !== InferenceEngine.nitro || !this._currentModel) {
      return;
    }

    const timestamp = Date.now();
    const message: ThreadMessage = {
@@ -242,11 +246,12 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    this.isCancelled = false;
    this.controller = new AbortController();

    requestInference(
      data.messages ?? [],
      { ...this._currentModel, ...data.model },
      this.controller
    ).subscribe({
    // @ts-ignore
    const model: Model = {
      ...(this._currentModel || {}),
      ...(data.model || {}),
    };
    requestInference(data.messages ?? [], model, this.controller).subscribe({
      next: (content) => {
        const messageContent: ThreadContent = {
          type: ContentType.Text,
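The renderer half of the extension never touches the Nitro binary directly: everything process-related is funneled through executeOnMain(NODE, method, args), which asks the Electron main process to load the CommonJS bundle named by the NODE constant and call the matching export. A rough sketch of the contract, assuming the main process resolves the module's default export (the real dispatch lives in @janhq/core and is not shown in this diff):

// Hypothetical sketch of the main-process side, not the @janhq/core code.
async function executeOnMainSketch(
  modulePath: string, // the build-time NODE constant
  method: string, // e.g. "runModel", "stopModel", "getCurrentNitroProcessInfo"
  args: unknown // serialized across the IPC boundary
): Promise<any> {
  const nodeModule = require(modulePath); // loaded in the main process
  return nodeModule.default[method](args); // matches the `export default` object
}

This is why the new src/node/index.ts (below) exports a plain object of functions as its default, while the deleted module.ts (next) used module.exports for the same purpose.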
@@ -1,514 +0,0 @@
const fs = require("fs");
const path = require("path");
const { exec, spawn } = require("child_process");
const tcpPortUsed = require("tcp-port-used");
const fetchRetry = require("fetch-retry")(global.fetch);
const osUtils = require("os-utils");
const { readFileSync, writeFileSync, existsSync } = require("fs");
const { log } = require("@janhq/core/node");

// The PORT to use for the Nitro subprocess
const PORT = 3928;
const LOCAL_HOST = "127.0.0.1";
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${PORT}`;
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/loadmodel`;
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/modelstatus`;
const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`;
const SUPPORTED_MODEL_FORMAT = ".gguf";
const NVIDIA_INFO_FILE = path.join(
  require("os").homedir(),
  "jan",
  "settings",
  "settings.json"
);

// The subprocess instance for Nitro
let subprocess = undefined;
let currentModelFile: string = undefined;
let currentSettings = undefined;

let nitroProcessInfo = undefined;

/**
 * Default GPU settings
 **/
const DEFALT_SETTINGS = {
  notify: true,
  run_mode: "cpu",
  nvidia_driver: {
    exist: false,
    version: "",
  },
  cuda: {
    exist: false,
    version: "",
  },
  gpus: [],
  gpu_highest_vram: "",
};

/**
 * Stops a Nitro subprocess.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
function stopModel(): Promise<void> {
  return killSubprocess();
}

/**
 * Initializes a Nitro subprocess to load a machine learning model.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 * TODO: Should pass the absolute path of the model file instead of just the name - so we can modularize module.ts into an npm package
 * TODO: Should it be startModel instead?
 */
async function initModel(wrapper: any): Promise<ModelOperationResponse> {
  currentModelFile = wrapper.modelFullPath;
  const janRoot = path.join(require("os").homedir(), "jan");
  if (!currentModelFile.includes(janRoot)) {
    currentModelFile = path.join(janRoot, currentModelFile);
  }
  const files: string[] = fs.readdirSync(currentModelFile);

  // Look for GGUF model file
  const ggufBinFile = files.find(
    (file) =>
      file === path.basename(currentModelFile) ||
      file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT)
  );

  currentModelFile = path.join(currentModelFile, ggufBinFile);

  if (wrapper.model.engine !== "nitro") {
    return Promise.resolve({ error: "Not a nitro model" });
  } else {
    const nitroResourceProbe = await getResourcesInfo();
    // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
    if (wrapper.model.settings.prompt_template) {
      const promptTemplate = wrapper.model.settings.prompt_template;
      const prompt = promptTemplateConverter(promptTemplate);
      if (prompt.error) {
        return Promise.resolve({ error: prompt.error });
      }
      wrapper.model.settings.system_prompt = prompt.system_prompt;
      wrapper.model.settings.user_prompt = prompt.user_prompt;
      wrapper.model.settings.ai_prompt = prompt.ai_prompt;
    }

    currentSettings = {
      llama_model_path: currentModelFile,
      ...wrapper.model.settings,
      // This is critical and requires real system information
      cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
    };
    return loadModel(nitroResourceProbe);
  }
}

async function loadModel(nitroResourceProbe: any | undefined) {
  // Gather system information for CPU physical cores and memory
  if (!nitroResourceProbe) nitroResourceProbe = await getResourcesInfo();
  return killSubprocess()
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => {
      /**
       * There is a problem with the Windows process manager
       * Should wait for a while to make sure the port is free and the subprocess is killed
       * The tested threshold is 500ms
       **/
      if (process.platform === "win32") {
        return new Promise((resolve) => setTimeout(resolve, 500));
      } else {
        return Promise.resolve();
      }
    })
    .then(() => spawnNitroProcess(nitroResourceProbe))
    .then(() => loadLLMModel(currentSettings))
    .then(validateModelStatus)
    .catch((err) => {
      log(`[NITRO]::Error: ${err}`);
      // TODO: Broadcast error so app could display proper error message
      return { error: err, currentModelFile };
    });
}

function promptTemplateConverter(promptTemplate) {
  // Split the string using the markers
  const systemMarker = "{system_message}";
  const promptMarker = "{prompt}";

  if (
    promptTemplate.includes(systemMarker) &&
    promptTemplate.includes(promptMarker)
  ) {
    // Find the indices of the markers
    const systemIndex = promptTemplate.indexOf(systemMarker);
    const promptIndex = promptTemplate.indexOf(promptMarker);

    // Extract the parts of the string
    const system_prompt = promptTemplate.substring(0, systemIndex);
    const user_prompt = promptTemplate.substring(
      systemIndex + systemMarker.length,
      promptIndex
    );
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  } else if (promptTemplate.includes(promptMarker)) {
    // Extract the parts of the string for the case where only promptMarker is present
    const promptIndex = promptTemplate.indexOf(promptMarker);
    const user_prompt = promptTemplate.substring(0, promptIndex);
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );
    const system_prompt = "";

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  }

  // Return an error if none of the conditions are met
  return { error: "Cannot split prompt template" };
}

/**
 * Loads a LLM model into the Nitro subprocess by sending a HTTP POST request.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 */
function loadLLMModel(settings): Promise<Response> {
  log(`[NITRO]::Debug: Loading model with params ${JSON.stringify(settings)}`);
  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 500,
  }).catch((err) => {
    log(`[NITRO]::Error: Load model failed with error ${err}`);
  });
}

/**
 * Validates the status of a model.
 * @returns {Promise<ModelOperationResponse>} A promise that resolves to an object.
 * If the model is loaded successfully, the object is empty.
 * If the model is not loaded successfully, the object contains an error message.
 */
async function validateModelStatus(): Promise<ModelOperationResponse> {
  // Send a GET request to the validation URL.
  // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
  return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: "GET",
    headers: {
      "Content-Type": "application/json",
    },
    retries: 5,
    retryDelay: 500,
  }).then(async (res: Response) => {
    // If the response is OK, check model_loaded status.
    if (res.ok) {
      const body = await res.json();
      // If the model is loaded, return an empty object.
      // Otherwise, return an object with an error message.
      if (body.model_loaded) {
        return { error: undefined };
      }
    }
    return { error: "Model loading failed" };
  });
}

/**
 * Terminates the Nitro subprocess.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
async function killSubprocess(): Promise<void> {
  const controller = new AbortController();
  setTimeout(() => controller.abort(), 5000);
  log(`[NITRO]::Debug: Request to kill Nitro`);

  return fetch(NITRO_HTTP_KILL_URL, {
    method: "DELETE",
    signal: controller.signal,
  })
    .then(() => {
      subprocess?.kill();
      subprocess = undefined;
    })
    .catch(() => {})
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => log(`[NITRO]::Debug: Nitro process is terminated`));
}

/**
 * Spawns a Nitro subprocess.
 * @param nitroResourceProbe - The Nitro resource probe.
 * @returns A promise that resolves when the Nitro subprocess is started.
 */
function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
  log(`[NITRO]::Debug: Spawning Nitro subprocess...`);

  return new Promise(async (resolve, reject) => {
    let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
    let cudaVisibleDevices = "";
    let binaryName;
    if (process.platform === "win32") {
      let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      if (nvidiaInfo["run_mode"] === "cpu") {
        binaryFolder = path.join(binaryFolder, "win-cpu");
      } else {
        if (nvidiaInfo["cuda"].version === "12") {
          binaryFolder = path.join(binaryFolder, "win-cuda-12-0");
        } else {
          binaryFolder = path.join(binaryFolder, "win-cuda-11-7");
        }
        cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
      }
      binaryName = "nitro.exe";
    } else if (process.platform === "darwin") {
      if (process.arch === "arm64") {
        binaryFolder = path.join(binaryFolder, "mac-arm64");
      } else {
        binaryFolder = path.join(binaryFolder, "mac-x64");
      }
      binaryName = "nitro";
    } else {
      let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      if (nvidiaInfo["run_mode"] === "cpu") {
        binaryFolder = path.join(binaryFolder, "linux-cpu");
      } else {
        if (nvidiaInfo["cuda"].version === "12") {
          binaryFolder = path.join(binaryFolder, "linux-cuda-12-0");
        } else {
          binaryFolder = path.join(binaryFolder, "linux-cuda-11-7");
        }
        cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
      }
      binaryName = "nitro";
    }

    const binaryPath = path.join(binaryFolder, binaryName);
    // Execute the binary
    subprocess = spawn(binaryPath, ["1", LOCAL_HOST, PORT.toString()], {
      cwd: binaryFolder,
      env: {
        ...process.env,
        CUDA_VISIBLE_DEVICES: cudaVisibleDevices,
      },
    });

    // Handle subprocess output
    subprocess.stdout.on("data", (data) => {
      log(`[NITRO]::Debug: ${data}`);
    });

    subprocess.stderr.on("data", (data) => {
      log(`[NITRO]::Error: ${data}`);
    });

    subprocess.on("close", (code) => {
      log(`[NITRO]::Debug: Nitro exited with code: ${code}`);
      subprocess = null;
      reject(`child process exited with code ${code}`);
    });

    tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
      resolve(nitroResourceProbe);
    });
  });
}

/**
 * Get the system resources information
 * TODO: Move to Core so that it can be reused
 */
function getResourcesInfo(): Promise<ResourcesInfo> {
  return new Promise(async (resolve) => {
    const cpu = await osUtils.cpuCount();
    log(`[NITRO]::CPU information - ${cpu}`);
    const response: ResourcesInfo = {
      numCpuPhysicalCore: cpu,
      memAvailable: 0,
    };
    resolve(response);
  });
}

/**
 * This will retrieve GPU information and persist settings.json
 * Will be called when the extension is loaded to turn on GPU acceleration if supported
 */
async function updateNvidiaInfo() {
  if (process.platform !== "darwin") {
    await Promise.all([
      updateNvidiaDriverInfo(),
      updateCudaExistence(),
      updateGpuInfo(),
    ]);
  }
}

/**
 * Retrieve current nitro process
 */
const getCurrentNitroProcessInfo = (): Promise<any> => {
  nitroProcessInfo = {
    isRunning: subprocess != null,
  };
  return nitroProcessInfo;
};

/**
 * Every module should have a dispose function
 * This will be called when the extension is unloaded and should clean up any resources
 * Also called when app is closed
 */
function dispose() {
  // clean other registered resources here
  killSubprocess();
}

/**
 * Validate nvidia and cuda for linux and windows
 */
async function updateNvidiaDriverInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        const firstLine = stdout.split("\n")[0].trim();
        data["nvidia_driver"].exist = true;
        data["nvidia_driver"].version = firstLine;
      } else {
        data["nvidia_driver"].exist = false;
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}

/**
 * Check if file exists in paths
 */
function checkFileExistenceInPaths(file: string, paths: string[]): boolean {
  return paths.some((p) => existsSync(path.join(p, file)));
}

/**
 * Validate cuda for linux and windows
 */
function updateCudaExistence() {
  let filesCuda12: string[];
  let filesCuda11: string[];
  let paths: string[];
  let cudaVersion: string = "";

  if (process.platform === "win32") {
    filesCuda12 = ["cublas64_12.dll", "cudart64_12.dll", "cublasLt64_12.dll"];
    filesCuda11 = ["cublas64_11.dll", "cudart64_11.dll", "cublasLt64_11.dll"];
    paths = process.env.PATH ? process.env.PATH.split(path.delimiter) : [];
  } else {
    filesCuda12 = ["libcudart.so.12", "libcublas.so.12", "libcublasLt.so.12"];
    filesCuda11 = ["libcudart.so.11.0", "libcublas.so.11", "libcublasLt.so.11"];
    paths = process.env.LD_LIBRARY_PATH
      ? process.env.LD_LIBRARY_PATH.split(path.delimiter)
      : [];
    paths.push("/usr/lib/x86_64-linux-gnu/");
  }

  let cudaExists = filesCuda12.every(
    (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
  );

  if (!cudaExists) {
    cudaExists = filesCuda11.every(
      (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
    );
    if (cudaExists) {
      cudaVersion = "11";
    }
  } else {
    cudaVersion = "12";
  }

  let data;
  try {
    data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
  } catch (error) {
    data = DEFALT_SETTINGS;
  }

  data["cuda"].exist = cudaExists;
  data["cuda"].version = cudaVersion;
  if (cudaExists) {
    data.run_mode = "gpu";
  }
  writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
}

/**
 * Get GPU information
 */
async function updateGpuInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=index,memory.total --format=csv,noheader,nounits",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        // Get GPU info and gpu has higher memory first
        let highestVram = 0;
        let highestVramId = "0";
        let gpus = stdout
          .trim()
          .split("\n")
          .map((line) => {
            let [id, vram] = line.split(", ");
            vram = vram.replace(/\r/g, "");
            if (parseFloat(vram) > highestVram) {
              highestVram = parseFloat(vram);
              highestVramId = id;
            }
            return { id, vram };
          });

        data["gpus"] = gpus;
        data["gpu_highest_vram"] = highestVramId;
      } else {
        data["gpus"] = [];
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}

module.exports = {
  initModel,
  stopModel,
  killSubprocess,
  dispose,
  updateNvidiaInfo,
  getCurrentNitroProcessInfo,
};
extensions/inference-nitro-extension/src/node/execute.ts (new file, 65 lines)
@@ -0,0 +1,65 @@
import { readFileSync } from "fs";
import * as path from "path";
import { NVIDIA_INFO_FILE } from "./nvidia";

export interface NitroExecutableOptions {
  executablePath: string;
  cudaVisibleDevices: string;
}
/**
 * Find which executable file to run based on the current platform.
 * @returns The name of the executable file to run.
 */
export const executableNitroFile = (): NitroExecutableOptions => {
  let binaryFolder = path.join(__dirname, "..", "bin"); // Current directory by default
  let cudaVisibleDevices = "";
  let binaryName = "nitro";
  /**
   * The binary folder is different for each platform.
   */
  if (process.platform === "win32") {
    /**
     * For Windows: win-cpu, win-cuda-11-7, win-cuda-12-0
     */
    let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
    if (nvidiaInfo["run_mode"] === "cpu") {
      binaryFolder = path.join(binaryFolder, "win-cpu");
    } else {
      if (nvidiaInfo["cuda"].version === "12") {
        binaryFolder = path.join(binaryFolder, "win-cuda-12-0");
      } else {
        binaryFolder = path.join(binaryFolder, "win-cuda-11-7");
      }
      cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
    }
    binaryName = "nitro.exe";
  } else if (process.platform === "darwin") {
    /**
     * For macOS: mac-arm64 (Apple Silicon), mac-x64 (Intel)
     */
    if (process.arch === "arm64") {
      binaryFolder = path.join(binaryFolder, "mac-arm64");
    } else {
      binaryFolder = path.join(binaryFolder, "mac-x64");
    }
  } else {
    /**
     * For Linux: linux-cpu, linux-cuda-11-7, linux-cuda-12-0
     */
    let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
    if (nvidiaInfo["run_mode"] === "cpu") {
      binaryFolder = path.join(binaryFolder, "linux-cpu");
    } else {
      if (nvidiaInfo["cuda"].version === "12") {
        binaryFolder = path.join(binaryFolder, "linux-cuda-12-0");
      } else {
        binaryFolder = path.join(binaryFolder, "linux-cuda-11-7");
      }
      cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
    }
  }
  return {
    executablePath: path.join(binaryFolder, binaryName),
    cudaVisibleDevices,
  };
};
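executableNitroFile folds the detection results that nvidia.ts persists into a single binary choice. A sketch of a possible return value on a Windows machine whose settings.json recorded run_mode "gpu" with CUDA 12 and GPU 0 as the largest-VRAM device (the path and index here are illustrative, not real output):

// Illustrative result of executableNitroFile(), not captured from a real machine.
const options: NitroExecutableOptions = {
  executablePath: "...\\dist\\bin\\win-cuda-12-0\\nitro.exe",
  cudaVisibleDevices: "0", // index of the GPU with the most VRAM
};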
extensions/inference-nitro-extension/src/node/index.ts (new file, 379 lines)
@@ -0,0 +1,379 @@
import fs from "fs";
import path from "path";
import { ChildProcessWithoutNullStreams, spawn } from "child_process";
import tcpPortUsed from "tcp-port-used";
import fetchRT from "fetch-retry";
import osUtils from "os-utils";
import { log } from "@janhq/core/node";
import { getNitroProcessInfo, updateNvidiaInfo } from "./nvidia";
import { Model, InferenceEngine, ModelSettingParams } from "@janhq/core";
import { executableNitroFile } from "./execute";
import { homedir } from "os";
// Polyfill fetch with retry
const fetchRetry = fetchRT(fetch);

/**
 * The options object for the model init operation.
 */
interface ModelInitOptions {
  modelFullPath: string;
  model: Model;
}

/**
 * The response object of Prompt Template parsing.
 */
interface PromptTemplate {
  system_prompt?: string;
  ai_prompt?: string;
  user_prompt?: string;
  error?: string;
}

/**
 * Model setting args for Nitro model load.
 */
interface ModelSettingArgs extends ModelSettingParams {
  llama_model_path: string;
  cpu_threads: number;
}

// The PORT to use for the Nitro subprocess
const PORT = 3928;
// The HOST address to use for the Nitro subprocess
const LOCAL_HOST = "127.0.0.1";
// The URL for the Nitro subprocess
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${PORT}`;
// The URL for the Nitro subprocess to load a model
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/loadmodel`;
// The URL for the Nitro subprocess to validate a model
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/modelstatus`;
// The URL for the Nitro subprocess to kill itself
const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`;

// The supported model format
// TODO: Should be an array to support more models
const SUPPORTED_MODEL_FORMAT = ".gguf";

// The subprocess instance for Nitro
let subprocess: ChildProcessWithoutNullStreams | undefined = undefined;
// The current model file url
let currentModelFile: string = "";
// The current model settings
let currentSettings: ModelSettingArgs | undefined = undefined;

/**
 * Stops a Nitro subprocess.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
function stopModel(): Promise<void> {
  return killSubprocess();
}

/**
 * Initializes a Nitro subprocess to load a machine learning model.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 * TODO: Should pass the absolute path of the model file instead of just the name - so we can modularize module.ts into an npm package
 */
async function runModel(
  wrapper: ModelInitOptions
): Promise<ModelOperationResponse | void> {
  if (wrapper.model.engine !== InferenceEngine.nitro) {
    // Not a nitro model
    return Promise.resolve();
  }

  currentModelFile = wrapper.modelFullPath;
  const janRoot = path.join(homedir(), "jan");
  if (!currentModelFile.includes(janRoot)) {
    currentModelFile = path.join(janRoot, currentModelFile);
  }
  const files: string[] = fs.readdirSync(currentModelFile);

  // Look for GGUF model file
  const ggufBinFile = files.find(
    (file) =>
      file === path.basename(currentModelFile) ||
      file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT)
  );

  if (!ggufBinFile) return Promise.reject("No GGUF model file found");

  currentModelFile = path.join(currentModelFile, ggufBinFile);

  if (wrapper.model.engine !== InferenceEngine.nitro) {
    return Promise.reject("Not a nitro model");
  } else {
    const nitroResourceProbe = await getResourcesInfo();
    // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
    if (wrapper.model.settings.prompt_template) {
      const promptTemplate = wrapper.model.settings.prompt_template;
      const prompt = promptTemplateConverter(promptTemplate);
      if (prompt?.error) {
        return Promise.reject(prompt.error);
      }
      wrapper.model.settings.system_prompt = prompt.system_prompt;
      wrapper.model.settings.user_prompt = prompt.user_prompt;
      wrapper.model.settings.ai_prompt = prompt.ai_prompt;
    }

    currentSettings = {
      llama_model_path: currentModelFile,
      ...wrapper.model.settings,
      // This is critical and requires real system information
      cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
    };
    return runNitroAndLoadModel();
  }
}

/**
 * 1. Spawn Nitro process
 * 2. Load model into Nitro subprocess
 * 3. Validate model status
 * @returns
 */
async function runNitroAndLoadModel() {
  // Gather system information for CPU physical cores and memory
  return killSubprocess()
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => {
      /**
       * There is a problem with the Windows process manager
       * Should wait for a while to make sure the port is free and the subprocess is killed
       * The tested threshold is 500ms
       **/
      if (process.platform === "win32") {
        return new Promise((resolve) => setTimeout(resolve, 500));
      } else {
        return Promise.resolve();
      }
    })
    .then(spawnNitroProcess)
    .then(() => loadLLMModel(currentSettings))
    .then(validateModelStatus)
    .catch((err) => {
      // TODO: Broadcast error so app could display proper error message
      log(`[NITRO]::Error: ${err}`);
      return { error: err };
    });
}

/**
 * Parse prompt template into args settings
 * @param promptTemplate Template as string
 * @returns
 */
function promptTemplateConverter(promptTemplate: string): PromptTemplate {
  // Split the string using the markers
  const systemMarker = "{system_message}";
  const promptMarker = "{prompt}";

  if (
    promptTemplate.includes(systemMarker) &&
    promptTemplate.includes(promptMarker)
  ) {
    // Find the indices of the markers
    const systemIndex = promptTemplate.indexOf(systemMarker);
    const promptIndex = promptTemplate.indexOf(promptMarker);

    // Extract the parts of the string
    const system_prompt = promptTemplate.substring(0, systemIndex);
    const user_prompt = promptTemplate.substring(
      systemIndex + systemMarker.length,
      promptIndex
    );
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  } else if (promptTemplate.includes(promptMarker)) {
    // Extract the parts of the string for the case where only promptMarker is present
    const promptIndex = promptTemplate.indexOf(promptMarker);
    const user_prompt = promptTemplate.substring(0, promptIndex);
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );

    // Return the split parts
    return { user_prompt, ai_prompt };
  }

  // Return an error if none of the conditions are met
  return { error: "Cannot split prompt template" };
}

/**
 * Loads a LLM model into the Nitro subprocess by sending a HTTP POST request.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 */
function loadLLMModel(settings: any): Promise<Response> {
  log(`[NITRO]::Debug: Loading model with params ${JSON.stringify(settings)}`);
  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 500,
  })
    .then((res) => {
      log(
        `[NITRO]::Debug: Load model success with response ${JSON.stringify(
          res
        )}`
      );
      return Promise.resolve(res);
    })
    .catch((err) => {
      log(`[NITRO]::Error: Load model failed with error ${err}`);
      return Promise.reject();
    });
}

/**
 * Validates the status of a model.
 * @returns {Promise<void>} A promise that resolves when the model is loaded,
 * or rejects with an error message if it is not.
 */
async function validateModelStatus(): Promise<void> {
  // Send a GET request to the validation URL.
  // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
  return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: "GET",
    headers: {
      "Content-Type": "application/json",
    },
    retries: 5,
    retryDelay: 500,
  }).then(async (res: Response) => {
    log(
      `[NITRO]::Debug: Validate model state success with response ${JSON.stringify(
        res
      )}`
    );
    // If the response is OK, check model_loaded status.
    if (res.ok) {
      const body = await res.json();
      // If the model is loaded, resolve.
      // Otherwise, reject with an error message.
      if (body.model_loaded) {
        return Promise.resolve();
      }
    }
    return Promise.reject("Validate model status failed");
  });
}

/**
 * Terminates the Nitro subprocess.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
async function killSubprocess(): Promise<void> {
  const controller = new AbortController();
  setTimeout(() => controller.abort(), 5000);
  log(`[NITRO]::Debug: Request to kill Nitro`);

  return fetch(NITRO_HTTP_KILL_URL, {
    method: "DELETE",
    signal: controller.signal,
  })
    .then(() => {
      subprocess?.kill();
      subprocess = undefined;
    })
    .catch(() => {})
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => log(`[NITRO]::Debug: Nitro process is terminated`));
}

/**
 * Spawns a Nitro subprocess.
 * @returns A promise that resolves when the Nitro subprocess is started.
 */
function spawnNitroProcess(): Promise<any> {
  log(`[NITRO]::Debug: Spawning Nitro subprocess...`);

  return new Promise<void>(async (resolve, reject) => {
    let binaryFolder = path.join(__dirname, "..", "bin"); // Current directory by default
    let executableOptions = executableNitroFile();

    const args: string[] = ["1", LOCAL_HOST, PORT.toString()];
    // Execute the binary
    log(
      `[NITRO]::Debug: Spawn nitro at path: ${executableOptions.executablePath}, and args: ${args}`
    );
    subprocess = spawn(
      executableOptions.executablePath,
      ["1", LOCAL_HOST, PORT.toString()],
      {
        cwd: binaryFolder,
        env: {
          ...process.env,
          CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
        },
      }
    );

    // Handle subprocess output
    subprocess.stdout.on("data", (data: any) => {
      log(`[NITRO]::Debug: ${data}`);
    });

    subprocess.stderr.on("data", (data: any) => {
      log(`[NITRO]::Error: ${data}`);
    });

    subprocess.on("close", (code: any) => {
      log(`[NITRO]::Debug: Nitro exited with code: ${code}`);
      subprocess = undefined;
      reject(`child process exited with code ${code}`);
    });

    tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
      log(`[NITRO]::Debug: Nitro is ready`);
      resolve();
    });
  });
}

/**
 * Get the system resources information
 * TODO: Move to Core so that it can be reused
 */
function getResourcesInfo(): Promise<ResourcesInfo> {
  return new Promise(async (resolve) => {
    const cpu = await osUtils.cpuCount();
    log(`[NITRO]::CPU information - ${cpu}`);
    const response: ResourcesInfo = {
      numCpuPhysicalCore: cpu,
      memAvailable: 0,
    };
    resolve(response);
  });
}

/**
 * Every module should have a dispose function
 * This will be called when the extension is unloaded and should clean up any resources
 * Also called when app is closed
 */
function dispose() {
  // clean other registered resources here
  killSubprocess();
}

export default {
  runModel,
  stopModel,
  killSubprocess,
  dispose,
  updateNvidiaInfo,
  getCurrentNitroProcessInfo: () => getNitroProcessInfo(subprocess),
};
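To make promptTemplateConverter above concrete: with a Llama-2-chat style template, the three parts fall out of simple index arithmetic around the two markers. A worked example (the template string is illustrative):

// Worked example for promptTemplateConverter; the template is made up.
const parts = promptTemplateConverter(
  "{system_message} [INST] {prompt} [/INST]"
);
// parts.system_prompt === ""          (text before {system_message})
// parts.user_prompt   === " [INST] "  (text between the two markers)
// parts.ai_prompt     === " [/INST]"  (text after {prompt})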
extensions/inference-nitro-extension/src/node/nvidia.ts (new file, 201 lines)
@@ -0,0 +1,201 @@
import { writeFileSync, existsSync, readFileSync } from "fs";
import { exec } from "child_process";
import path from "path";
import { homedir } from "os";

/**
 * Default GPU settings
 **/
const DEFALT_SETTINGS = {
  notify: true,
  run_mode: "cpu",
  nvidia_driver: {
    exist: false,
    version: "",
  },
  cuda: {
    exist: false,
    version: "",
  },
  gpus: [],
  gpu_highest_vram: "",
};

/**
 * Path to the settings file
 **/
export const NVIDIA_INFO_FILE = path.join(
  homedir(),
  "jan",
  "settings",
  "settings.json"
);

/**
 * Current nitro process
 */
let nitroProcessInfo: NitroProcessInfo | undefined = undefined;

/**
 * Nitro process info
 */
export interface NitroProcessInfo {
  isRunning: boolean;
}

/**
 * This will retrieve GPU information and persist settings.json
 * Will be called when the extension is loaded to turn on GPU acceleration if supported
 */
export async function updateNvidiaInfo() {
  if (process.platform !== "darwin") {
    await Promise.all([
      updateNvidiaDriverInfo(),
      updateCudaExistence(),
      updateGpuInfo(),
    ]);
  }
}

/**
 * Retrieve current nitro process
 */
export const getNitroProcessInfo = (subprocess: any): NitroProcessInfo => {
  nitroProcessInfo = {
    isRunning: subprocess != null,
  };
  return nitroProcessInfo;
};

/**
 * Validate nvidia and cuda for linux and windows
 */
export async function updateNvidiaDriverInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        const firstLine = stdout.split("\n")[0].trim();
        data["nvidia_driver"].exist = true;
        data["nvidia_driver"].version = firstLine;
      } else {
        data["nvidia_driver"].exist = false;
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}

/**
 * Check if file exists in paths
 */
export function checkFileExistenceInPaths(
  file: string,
  paths: string[]
): boolean {
  return paths.some((p) => existsSync(path.join(p, file)));
}

/**
 * Validate cuda for linux and windows
 */
export function updateCudaExistence() {
  let filesCuda12: string[];
  let filesCuda11: string[];
  let paths: string[];
  let cudaVersion: string = "";

  if (process.platform === "win32") {
    filesCuda12 = ["cublas64_12.dll", "cudart64_12.dll", "cublasLt64_12.dll"];
    filesCuda11 = ["cublas64_11.dll", "cudart64_11.dll", "cublasLt64_11.dll"];
    paths = process.env.PATH ? process.env.PATH.split(path.delimiter) : [];
  } else {
    filesCuda12 = ["libcudart.so.12", "libcublas.so.12", "libcublasLt.so.12"];
    filesCuda11 = ["libcudart.so.11.0", "libcublas.so.11", "libcublasLt.so.11"];
    paths = process.env.LD_LIBRARY_PATH
      ? process.env.LD_LIBRARY_PATH.split(path.delimiter)
      : [];
    paths.push("/usr/lib/x86_64-linux-gnu/");
  }

  let cudaExists = filesCuda12.every(
    (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
  );

  if (!cudaExists) {
    cudaExists = filesCuda11.every(
      (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
    );
    if (cudaExists) {
      cudaVersion = "11";
    }
  } else {
    cudaVersion = "12";
  }

  let data;
  try {
    data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
  } catch (error) {
    data = DEFALT_SETTINGS;
  }

  data["cuda"].exist = cudaExists;
  data["cuda"].version = cudaVersion;
  if (cudaExists) {
    data.run_mode = "gpu";
  }
  writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
}

/**
 * Get GPU information
 */
export async function updateGpuInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=index,memory.total --format=csv,noheader,nounits",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        // Get GPU info and gpu has higher memory first
        let highestVram = 0;
        let highestVramId = "0";
        let gpus = stdout
          .trim()
          .split("\n")
          .map((line) => {
            let [id, vram] = line.split(", ");
            vram = vram.replace(/\r/g, "");
            if (parseFloat(vram) > highestVram) {
              highestVram = parseFloat(vram);
              highestVramId = id;
            }
            return { id, vram };
          });

        data["gpus"] = gpus;
        data["gpu_highest_vram"] = highestVramId;
      } else {
        data["gpus"] = [];
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}
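After the three update functions run, ~/jan/settings/settings.json holds the merged detection results that execute.ts reads when picking a binary. A sketch of the file for a hypothetical single-GPU CUDA 12 machine (every value is illustrative; only the shape, which follows DEFALT_SETTINGS above, is fixed):

// Illustrative settings.json contents, not captured from a real machine.
const exampleSettings = {
  notify: true,
  run_mode: "gpu", // flipped from "cpu" by updateCudaExistence()
  nvidia_driver: { exist: true, version: "545.29" },
  cuda: { exist: true, version: "12" },
  gpus: [{ id: "0", vram: "24576" }], // from nvidia-smi, memory.total in MiB
  gpu_highest_vram: "0", // the GPU's index, not a VRAM amount
};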
@@ -1,15 +1,19 @@
{
  "compilerOptions": {
    "target": "es2016",
    "module": "ES6",
    "moduleResolution": "node",

    "outDir": "./dist",
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "strict": false,
    "skipLibCheck": true,
    "rootDir": "./src"
    "target": "es5",
    "module": "ES2020",
    "lib": ["es2015", "es2016", "es2017", "dom"],
    "strict": true,
    "sourceMap": true,
    "declaration": true,
    "allowSyntheticDefaultImports": true,
    "experimentalDecorators": true,
    "emitDecoratorMetadata": true,
    "declarationDir": "dist/types",
    "outDir": "dist",
    "importHelpers": true,
    "typeRoots": ["node_modules/@types"]
  },
  "include": ["./src"]
  "include": ["src"]
}
@@ -1,43 +0,0 @@
const path = require("path");
const webpack = require("webpack");
const packageJson = require("./package.json");

module.exports = {
  experiments: { outputModule: true },
  entry: "./src/index.ts", // Adjust the entry point to match your project's main file
  mode: "production",
  module: {
    rules: [
      {
        test: /\.tsx?$/,
        use: "ts-loader",
        exclude: /node_modules/,
      },
    ],
  },
  plugins: [
    new webpack.DefinePlugin({
      MODULE: JSON.stringify(`${packageJson.name}/${packageJson.module}`),
      INFERENCE_URL: JSON.stringify(
        process.env.INFERENCE_URL ||
          "http://127.0.0.1:3928/inferences/llamacpp/chat_completion"
      ),
      TROUBLESHOOTING_URL: JSON.stringify("https://jan.ai/guides/troubleshooting")
    }),
  ],
  output: {
    filename: "index.js", // Adjust the output file name as needed
    path: path.resolve(__dirname, "dist"),
    library: { type: "module" }, // Specify ESM output format
  },
  resolve: {
    extensions: [".ts", ".js"],
    fallback: {
      path: require.resolve("path-browserify"),
    },
  },
  optimization: {
    minimize: false,
  },
  // Add loaders and other configuration as needed for your project
};