Merge pull request #814 from janhq/feat/inference_engines

feat: Multiple inference engines for nitro and openai
Author: hiro · 2023-12-09 01:09:47 +07:00
Commit: ee16683d0a
66 changed files with 999 additions and 263 deletions

.gitignore

@ -17,7 +17,7 @@ package-lock.json
 core/lib/**
 # Nitro binary files
-extensions/inference-extension/nitro/*/nitro
-extensions/inference-extension/nitro/*/*.exe
-extensions/inference-extension/nitro/*/*.dll
-extensions/inference-extension/nitro/*/*.metal
+extensions/inference-nitro-extension/bin/*/nitro
+extensions/inference-nitro-extension/bin/*/*.exe
+extensions/inference-nitro-extension/bin/*/*.dll
+extensions/inference-nitro-extension/bin/*/*.metal


@ -8,6 +8,18 @@ export enum EventName {
   OnMessageResponse = "OnMessageResponse",
   /** The `OnMessageUpdate` event is emitted when a message is updated. */
   OnMessageUpdate = "OnMessageUpdate",
+  /** The `OnModelInit` event is emitted when a model begins initializing. */
+  OnModelInit = "OnModelInit",
+  /** The `OnModelReady` event is emitted when a model is ready. */
+  OnModelReady = "OnModelReady",
+  /** The `OnModelFail` event is emitted when a model fails to load. */
+  OnModelFail = "OnModelFail",
+  /** The `OnModelStop` event is emitted when a model is being stopped. */
+  OnModelStop = "OnModelStop",
+  /** The `OnModelStopped` event is emitted when a model has stopped successfully. */
+  OnModelStopped = "OnModelStopped",
+  /** The `OnInferenceStopped` event is emitted when an inference is stopped. */
+  OnInferenceStopped = "OnInferenceStopped",
 }
 /**

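The new enum values give the UI and the engine extensions a shared model lifecycle. A minimal sketch of subscribing to them, assuming the `events` bus exported by `@janhq/core` (the function name and logging are illustrative, not part of this change):

```typescript
import { events, EventName, Model } from "@janhq/core";

// Illustrative only: log the model lifecycle events introduced above.
export function registerModelLifecycleLogging(): void {
  events.on(EventName.OnModelReady, (model: Model) =>
    console.debug(`model ${model.id} is ready`)
  );
  events.on(EventName.OnModelFail, (model: Model) =>
    console.error(`model ${model.id} failed to load`)
  );
  events.on(EventName.OnModelStopped, (model: Model) =>
    console.debug(`model ${model.id} has stopped`)
  );
}
```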

@ -5,26 +5,10 @@ import { BaseExtension } from "../extension";
  * Inference extension. Start, stop and inference models.
  */
 export abstract class InferenceExtension extends BaseExtension {
-  /**
-   * Initializes the model for the extension.
-   * @param modelId - The ID of the model to initialize.
-   */
-  abstract initModel(modelId: string, settings?: ModelSettingParams): Promise<void>;
-  /**
-   * Stops the model for the extension.
-   */
-  abstract stopModel(): Promise<void>;
-  /**
-   * Stops the streaming inference.
-   */
-  abstract stopInference(): Promise<void>;
   /**
    * Processes an inference request.
    * @param data - The data for the inference request.
    * @returns The result of the inference request.
    */
-  abstract inferenceRequest(data: MessageRequest): Promise<ThreadMessage>;
+  abstract inference(data: MessageRequest): Promise<ThreadMessage>;
 }


@ -5,52 +5,52 @@
  * @returns {Promise<any>} A Promise that resolves when the file is written successfully.
  */
 const writeFile: (path: string, data: string) => Promise<any> = (path, data) =>
-  global.core.api?.writeFile(path, data);
+  global.core.api?.writeFile(path, data)
 /**
  * Checks whether the path is a directory.
  * @param path - The path to check.
  * @returns {boolean} A boolean indicating whether the path is a directory.
  */
-const isDirectory = (path: string): Promise<boolean> =>
-  global.core.api?.isDirectory(path);
+const isDirectory = (path: string): Promise<boolean> => global.core.api?.isDirectory(path)
 /**
  * Reads the contents of a file at the specified path.
  * @param {string} path - The path of the file to read.
  * @returns {Promise<any>} A Promise that resolves with the contents of the file.
  */
-const readFile: (path: string) => Promise<any> = (path) =>
-  global.core.api?.readFile(path);
+const readFile: (path: string) => Promise<any> = (path) => global.core.api?.readFile(path)
+/**
+ * Check whether the file exists
+ * @param {string} path
+ * @returns {boolean} A boolean indicating whether the path is a file.
+ */
+const exists = (path: string): Promise<boolean> => global.core.api?.exists(path)
 /**
  * List the directory files
  * @param {string} path - The path of the directory to list files.
  * @returns {Promise<any>} A Promise that resolves with the contents of the directory.
  */
-const listFiles: (path: string) => Promise<any> = (path) =>
-  global.core.api?.listFiles(path);
+const listFiles: (path: string) => Promise<any> = (path) => global.core.api?.listFiles(path)
 /**
  * Creates a directory at the specified path.
  * @param {string} path - The path of the directory to create.
  * @returns {Promise<any>} A Promise that resolves when the directory is created successfully.
  */
-const mkdir: (path: string) => Promise<any> = (path) =>
-  global.core.api?.mkdir(path);
+const mkdir: (path: string) => Promise<any> = (path) => global.core.api?.mkdir(path)
 /**
  * Removes a directory at the specified path.
  * @param {string} path - The path of the directory to remove.
  * @returns {Promise<any>} A Promise that resolves when the directory is removed successfully.
  */
-const rmdir: (path: string) => Promise<any> = (path) =>
-  global.core.api?.rmdir(path);
+const rmdir: (path: string) => Promise<any> = (path) => global.core.api?.rmdir(path)
 /**
  * Deletes a file from the local file system.
  * @param {string} path - The path of the file to delete.
  * @returns {Promise<any>} A Promise that resolves when the file is deleted.
  */
-const deleteFile: (path: string) => Promise<any> = (path) =>
-  global.core.api?.deleteFile(path);
+const deleteFile: (path: string) => Promise<any> = (path) => global.core.api?.deleteFile(path)
 /**
  * Appends data to a file at the specified path.
@ -58,10 +58,10 @@ const deleteFile: (path: string) => Promise<any> = (path) =>
  * @param data data to append
  */
 const appendFile: (path: string, data: string) => Promise<any> = (path, data) =>
-  global.core.api?.appendFile(path, data);
+  global.core.api?.appendFile(path, data)
 const copyFile: (src: string, dest: string) => Promise<any> = (src, dest) =>
-  global.core.api?.copyFile(src, dest);
+  global.core.api?.copyFile(src, dest)
 /**
  * Reads a file line by line.
@ -69,12 +69,13 @@ const copyFile: (src: string, dest: string) => Promise<any> = (src, dest) =>
  * @returns {Promise<any>} A promise that resolves to the lines of the file.
  */
 const readLineByLine: (path: string) => Promise<any> = (path) =>
-  global.core.api?.readLineByLine(path);
+  global.core.api?.readLineByLine(path)
 export const fs = {
   isDirectory,
   writeFile,
   readFile,
+  exists,
   listFiles,
   mkdir,
   rmdir,
@ -82,4 +83,4 @@ export const fs = {
   appendFile,
   readLineByLine,
   copyFile,
-};
+}


@ -41,8 +41,8 @@ export type MessageRequest = {
   /** Messages for constructing a chat completion request **/
   messages?: ChatCompletionMessage[];
-  /** Runtime parameters for constructing a chat completion request **/
-  parameters?: ModelRuntimeParam;
+  /** Settings for constructing a chat completion request **/
+  model?: ModelInfo;
 };
 /**
@ -153,7 +153,8 @@ export type ThreadAssistantInfo = {
 export type ModelInfo = {
   id: string;
   settings: ModelSettingParams;
-  parameters: ModelRuntimeParam;
+  parameters: ModelRuntimeParams;
+  engine?: InferenceEngine;
 };
 /**
@ -166,6 +167,17 @@ export type ThreadState = {
   error?: Error;
   lastMessage?: string;
 };
+
+/**
+ * Represents the inference engine.
+ * @stored
+ */
+enum InferenceEngine {
+  nitro = "nitro",
+  openai = "openai",
+  nvidia_triton = "nvidia_triton",
+  hf_endpoint = "hf_endpoint",
+}
 /**
  * Model type defines the shape of a model object.
@ -228,12 +240,16 @@ export interface Model {
   /**
    * The model runtime parameters.
    */
-  parameters: ModelRuntimeParam;
+  parameters: ModelRuntimeParams;
   /**
    * Metadata of the model.
    */
   metadata: ModelMetadata;
+  /**
+   * The model engine.
+   */
+  engine: InferenceEngine;
 }
 export type ModelMetadata = {
@ -268,7 +284,7 @@ export type ModelSettingParams = {
 /**
  * The available model runtime parameters.
  */
-export type ModelRuntimeParam = {
+export type ModelRuntimeParams = {
   temperature?: number;
   token_limit?: number;
   top_k?: number;

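With `MessageRequest.model` in place, each message carries enough information for an extension to decide whether it should handle it. A sketch of the resulting shape; the values are examples and the `settings`/`parameters` blocks are abbreviated:

```typescript
// Illustrative only: a MessageRequest after this change. Extensions compare
// request.model.engine against their own engine and ignore everything else.
const request = {
  id: "01HEXAMPLE",                                 // ulid, as used elsewhere in this PR
  threadId: "thread-123",
  messages: [{ role: "user", content: "Hello" }],   // ChatCompletionMessage[], shape assumed
  model: {
    id: "mistral-ins-7b-q4",
    settings: { ctx_len: 2048 },                    // ModelSettingParams (partial)
    parameters: { temperature: 0.7 },               // ModelRuntimeParams (partial)
    engine: "nitro",                                // InferenceEngine.nitro
  },
};
```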

@ -289,7 +289,7 @@ components:
       engine:
         type: string
         description: "The engine used by the model."
-        example: "llamacpp"
+        enum: [nitro, openai, hf_inference]
       quantization:
         type: string
         description: "Quantization parameter of the model."


@ -50,6 +50,19 @@ export function handleFsIPCs() {
     })
   })
+
+  /**
+   * Checks whether a file exists in the user data directory.
+   * @param event - The event object.
+   * @param path - The path of the file to check.
+   * @returns A promise that resolves with a boolean indicating whether the file exists.
+   */
+  ipcMain.handle('exists', async (_event, path: string) => {
+    return new Promise((resolve, reject) => {
+      const fullPath = join(userSpacePath, path)
+      fs.existsSync(fullPath) ? resolve(true) : resolve(false)
+    })
+  })
+
   /**
    * Writes data to a file in the user data directory.
    * @param event - The event object.


@ -27,6 +27,12 @@ export function fsInvokers() {
      */
     readFile: (path: string) => ipcRenderer.invoke('readFile', path),
+
+    /**
+     * Checks whether a file or directory exists at the specified path.
+     * @param {string} path - The path to check.
+     */
+    exists: (path: string) => ipcRenderer.invoke('exists', path),
     /**
      * Writes data to a file at the specified path.
      * @param {string} path - The path of the file to write to.

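Together with the handler above, the new `exists` call flows extension code → preload invoker → `ipcMain` handler → `fs.existsSync`. A sketch of the extension-side usage, assuming the `fs` wrapper exported by `@janhq/core`; the helper name is illustrative:

```typescript
import { fs } from "@janhq/core";

// Illustrative only: read a JSON config if it exists, otherwise create it with defaults.
async function readOrCreateJson<T>(path: string, defaults: T): Promise<T> {
  if (await fs.exists(path)) {
    return JSON.parse(await fs.readFile(path));
  }
  await fs.writeFile(path, JSON.stringify(defaults, null, 2));
  return defaults;
}
```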

@ -1,3 +0,0 @@
@echo off
set /p NITRO_VERSION=<./nitro/version.txt
.\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64-cuda.tar.gz -e --strip 1 -o ./nitro/win-cuda && .\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64.tar.gz -e --strip 1 -o ./nitro/win-cpu


@ -1,57 +0,0 @@
{
"name": "@janhq/inference-extension",
"version": "1.0.0",
"description": "Inference Extension, powered by @janhq/nitro, bring a high-performance Llama model inference in pure C++.",
"main": "dist/index.js",
"module": "dist/module.js",
"author": "Jan <service@jan.ai>",
"license": "AGPL-3.0",
"scripts": {
"build": "tsc -b . && webpack --config webpack.config.js",
"downloadnitro:linux": "NITRO_VERSION=$(cat ./nitro/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./nitro/linux-cpu && chmod +x ./nitro/linux-cpu/nitro && chmod +x ./nitro/linux-start.sh && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda.tar.gz -e --strip 1 -o ./nitro/linux-cuda && chmod +x ./nitro/linux-cuda/nitro && chmod +x ./nitro/linux-start.sh",
"downloadnitro:darwin": "NITRO_VERSION=$(cat ./nitro/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-arm64.tar.gz -e --strip 1 -o ./nitro/mac-arm64 && chmod +x ./nitro/mac-arm64/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-amd64.tar.gz -e --strip 1 -o ./nitro/mac-x64 && chmod +x ./nitro/mac-x64/nitro",
"downloadnitro:win32": "download.bat",
"downloadnitro": "run-script-os",
"build:publish:darwin": "rimraf *.tgz --glob && npm run build && npm run downloadnitro && ../../.github/scripts/auto-sign.sh && cpx \"nitro/**\" \"dist/nitro\" && npm pack && cpx *.tgz ../../electron/pre-install",
"build:publish:win32": "rimraf *.tgz --glob && npm run build && npm run downloadnitro && cpx \"nitro/**\" \"dist/nitro\" && npm pack && cpx *.tgz ../../electron/pre-install",
"build:publish:linux": "rimraf *.tgz --glob && npm run build && npm run downloadnitro && cpx \"nitro/**\" \"dist/nitro\" && npm pack && cpx *.tgz ../../electron/pre-install",
"build:publish": "run-script-os"
},
"exports": {
".": "./dist/index.js",
"./main": "./dist/module.js"
},
"devDependencies": {
"cpx": "^1.5.0",
"rimraf": "^3.0.2",
"run-script-os": "^1.1.6",
"webpack": "^5.88.2",
"webpack-cli": "^5.1.4"
},
"dependencies": {
"@janhq/core": "file:../../core",
"download-cli": "^1.1.1",
"electron-log": "^5.0.1",
"fetch-retry": "^5.0.6",
"kill-port": "^2.0.1",
"path-browserify": "^1.0.1",
"rxjs": "^7.8.1",
"tcp-port-used": "^1.0.2",
"ts-loader": "^9.5.0",
"ulid": "^2.3.0"
},
"engines": {
"node": ">=18.0.0"
},
"files": [
"dist/*",
"package.json",
"README.md"
],
"bundleDependencies": [
"tcp-port-used",
"kill-port",
"fetch-retry",
"electron-log"
]
}


@ -1,2 +0,0 @@
declare const MODULE: string;
declare const INFERENCE_URL: string;


@ -0,0 +1,3 @@
@echo off
set /p NITRO_VERSION=<./bin/version.txt
.\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64-cuda.tar.gz -e --strip 1 -o ./bin/win-cuda && .\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64.tar.gz -e --strip 1 -o ./bin/win-cpu


@ -0,0 +1,57 @@
{
"name": "@janhq/inference-nitro-extension",
"version": "1.0.0",
"description": "Inference Engine for Nitro Extension, powered by @janhq/nitro, bringing high-performance Llama model inference in pure C++.",
"main": "dist/index.js",
"module": "dist/module.js",
"author": "Jan <service@jan.ai>",
"license": "AGPL-3.0",
"scripts": {
"build": "tsc -b . && webpack --config webpack.config.js",
"downloadnitro:linux": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/nitro && chmod +x ./bin/linux-start.sh && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda.tar.gz -e --strip 1 -o ./bin/linux-cuda && chmod +x ./bin/linux-cuda/nitro && chmod +x ./bin/linux-start.sh",
"downloadnitro:darwin": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-arm64.tar.gz -e --strip 1 -o ./bin/mac-arm64 && chmod +x ./bin/mac-arm64/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-amd64.tar.gz -e --strip 1 -o ./bin/mac-x64 && chmod +x ./bin/mac-x64/nitro",
"downloadnitro:win32": "download.bat",
"downloadnitro": "run-script-os",
"build:publish:darwin": "rimraf *.tgz --glob && npm run build && npm run downloadnitro && ../../.github/scripts/auto-sign.sh && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../electron/pre-install",
"build:publish:win32": "rimraf *.tgz --glob && npm run build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../electron/pre-install",
"build:publish:linux": "rimraf *.tgz --glob && npm run build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../electron/pre-install",
"build:publish": "run-script-os"
},
"exports": {
".": "./dist/index.js",
"./main": "./dist/module.js"
},
"devDependencies": {
"cpx": "^1.5.0",
"rimraf": "^3.0.2",
"run-script-os": "^1.1.6",
"webpack": "^5.88.2",
"webpack-cli": "^5.1.4"
},
"dependencies": {
"@janhq/core": "file:../../core",
"download-cli": "^1.1.1",
"electron-log": "^5.0.1",
"fetch-retry": "^5.0.6",
"kill-port": "^2.0.1",
"path-browserify": "^1.0.1",
"rxjs": "^7.8.1",
"tcp-port-used": "^1.0.2",
"ts-loader": "^9.5.0",
"ulid": "^2.3.0"
},
"engines": {
"node": ">=18.0.0"
},
"files": [
"dist/*",
"package.json",
"README.md"
],
"bundleDependencies": [
"tcp-port-used",
"kill-port",
"fetch-retry",
"electron-log"
]
}


@ -0,0 +1,26 @@
declare const MODULE: string;
declare const INFERENCE_URL: string;
/**
* The parameters for the initModel function.
* @property settings - The settings for the machine learning model.
* @property settings.ctx_len - The context length.
* @property settings.ngl - The number of GPU layers to offload.
* @property settings.cont_batching - Whether to use continuous batching.
* @property settings.embedding - Whether to use embedding.
*/
interface EngineSettings {
ctx_len: number;
ngl: number;
cont_batching: boolean;
embedding: boolean;
}
/**
* The response from a model operation (initModel / stopModel).
* @property error - An error message if the operation fails.
*/
interface ModelOperationResponse {
error?: any;
modelFile?: string;
}

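For reference, the defaults the extension writes into `engines/nitro.json` later in this PR match this interface; expressed as a TypeScript literal (the comment on `ngl` is an assumption):

```typescript
// Illustrative only: default EngineSettings persisted to engines/nitro.json.
const defaultNitroSettings: EngineSettings = {
  ctx_len: 2048,        // context length
  ngl: 100,             // assumed: number of GPU layers to offload
  cont_batching: false, // continuous batching
  embedding: false,     // embedding mode
};
```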

@ -1,3 +1,4 @@
+import { Model } from "@janhq/core";
 import { Observable } from "rxjs";
 /**
  * Sends a request to the inference server to generate a response based on the recent messages.
@ -6,21 +7,23 @@ import { Observable } from "rxjs";
  */
 export function requestInference(
   recentMessages: any[],
+  engine: EngineSettings,
+  model: Model,
   controller?: AbortController
 ): Observable<string> {
   return new Observable((subscriber) => {
     const requestBody = JSON.stringify({
       messages: recentMessages,
+      model: model.id,
       stream: true,
-      model: "gpt-3.5-turbo",
-      max_tokens: 2048,
+      // ...model.parameters,
     });
     fetch(INFERENCE_URL, {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
+        Accept: "text/event-stream",
         "Access-Control-Allow-Origin": "*",
-        Accept: "text/event-stream",
       },
       body: requestBody,
       signal: controller?.signal,

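A sketch of how the streaming helper is consumed with cancellation; the `declare`d values stand in for the extension's state (see `handleMessageRequest` in the extension's `index.ts` below):

```typescript
declare const recentMessages: any[];
declare const engineSettings: EngineSettings;
declare const currentModel: Model;

// Illustrative only: subscribe to the stream and keep the controller for cancellation.
const controller = new AbortController();
requestInference(recentMessages, engineSettings, currentModel, controller).subscribe({
  next: (content) => console.debug("partial response:", content),
  complete: () => console.debug("stream finished"),
  error: (err) => console.error("stream failed:", err),
});
// Calling controller.abort() cancels the underlying fetch through the signal passed above.
```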

@ -19,6 +19,8 @@ import {
   events,
   executeOnMain,
   getUserSpace,
+  fs,
+  Model,
 } from "@janhq/core";
 import { InferenceExtension } from "@janhq/core";
 import { requestInference } from "./helpers/sse";
@ -30,7 +32,19 @@ import { join } from "path";
  * The class provides methods for initializing and stopping a model, and for making inference requests.
  * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
  */
-export default class JanInferenceExtension implements InferenceExtension {
+export default class JanInferenceNitroExtension implements InferenceExtension {
+  private static readonly _homeDir = "engines";
+  private static readonly _engineMetadataFileName = "nitro.json";
+  private static _currentModel: Model;
+  private static _engineSettings: EngineSettings = {
+    ctx_len: 2048,
+    ngl: 100,
+    cont_batching: false,
+    embedding: false,
+  };
   controller = new AbortController();
   isCancelled = false;
   /**
@ -45,51 +59,88 @@ export default class JanInferenceExtension implements InferenceExtension {
    * Subscribes to events emitted by the @janhq/core package.
    */
   onLoad(): void {
+    fs.mkdir(JanInferenceNitroExtension._homeDir);
+    this.writeDefaultEngineSettings();
+    // Events subscription
     events.on(EventName.OnMessageSent, (data) =>
-      JanInferenceExtension.handleMessageRequest(data, this)
+      JanInferenceNitroExtension.handleMessageRequest(data, this)
     );
+    events.on(EventName.OnModelInit, (model: Model) => {
+      JanInferenceNitroExtension.handleModelInit(model);
+    });
+    events.on(EventName.OnModelStop, (model: Model) => {
+      JanInferenceNitroExtension.handleModelStop(model);
+    });
+    events.on(EventName.OnInferenceStopped, () => {
+      JanInferenceNitroExtension.handleInferenceStopped(this);
+    });
   }
   /**
    * Stops the model inference.
    */
-  onUnload(): void {
-    this.stopModel();
+  onUnload(): void {}
+
+  private async writeDefaultEngineSettings() {
+    try {
+      const engineFile = join(
+        JanInferenceNitroExtension._homeDir,
+        JanInferenceNitroExtension._engineMetadataFileName
+      );
+      if (await fs.exists(engineFile)) {
+        JanInferenceNitroExtension._engineSettings = JSON.parse(
+          await fs.readFile(engineFile)
+        );
+      } else {
+        await fs.writeFile(
+          engineFile,
+          JSON.stringify(JanInferenceNitroExtension._engineSettings, null, 2)
+        );
+      }
+    } catch (err) {
+      console.error(err);
+    }
   }
-  /**
-   * Initializes the model with the specified file name.
-   * @param {string} modelId - The ID of the model to initialize.
-   * @returns {Promise<void>} A promise that resolves when the model is initialized.
-   */
-  async initModel(
-    modelId: string,
-    settings?: ModelSettingParams
-  ): Promise<void> {
+
+  private static async handleModelInit(model: Model) {
+    if (model.engine !== "nitro") {
+      return;
+    }
     const userSpacePath = await getUserSpace();
-    const modelFullPath = join(userSpacePath, "models", modelId, modelId);
-    return executeOnMain(MODULE, "initModel", {
-      modelFullPath,
-      settings,
+    const modelFullPath = join(userSpacePath, "models", model.id, model.id);
+    const nitroInitResult = await executeOnMain(MODULE, "initModel", {
+      modelFullPath: modelFullPath,
+      model: model,
     });
+    if (nitroInitResult.error === null) {
+      events.emit(EventName.OnModelFail, model);
+    } else {
+      JanInferenceNitroExtension._currentModel = model;
+      events.emit(EventName.OnModelReady, model);
+    }
   }
-  /**
-   * Stops the model.
-   * @returns {Promise<void>} A promise that resolves when the model is stopped.
-   */
-  async stopModel(): Promise<void> {
-    return executeOnMain(MODULE, "killSubprocess");
+
+  private static async handleModelStop(model: Model) {
+    if (model.engine !== "nitro") {
+      return;
+    } else {
+      await executeOnMain(MODULE, "stopModel");
+      events.emit(EventName.OnModelStopped, model);
+    }
   }
-  /**
-   * Stops streaming inference.
-   * @returns {Promise<void>} A promise that resolves when the streaming is stopped.
-   */
-  async stopInference(): Promise<void> {
-    this.isCancelled = true;
-    this.controller?.abort();
+
+  private static async handleInferenceStopped(
+    instance: JanInferenceNitroExtension
+  ) {
+    instance.isCancelled = true;
+    instance.controller?.abort();
   }
   /**
@ -97,7 +148,7 @@ export default class JanInferenceExtension implements InferenceExtension {
    * @param {MessageRequest} data - The data for the inference request.
    * @returns {Promise<any>} A promise that resolves with the inference response.
    */
-  async inferenceRequest(data: MessageRequest): Promise<ThreadMessage> {
+  async inference(data: MessageRequest): Promise<ThreadMessage> {
     const timestamp = Date.now();
     const message: ThreadMessage = {
       thread_id: data.threadId,
@ -111,7 +162,11 @@ export default class JanInferenceExtension implements InferenceExtension {
     };
     return new Promise(async (resolve, reject) => {
-      requestInference(data.messages ?? []).subscribe({
+      requestInference(
+        data.messages ?? [],
+        JanInferenceNitroExtension._engineSettings,
+        JanInferenceNitroExtension._currentModel
+      ).subscribe({
         next: (_content) => {},
         complete: async () => {
           resolve(message);
@ -131,8 +186,11 @@ export default class JanInferenceExtension implements InferenceExtension {
    */
   private static async handleMessageRequest(
     data: MessageRequest,
-    instance: JanInferenceExtension
+    instance: JanInferenceNitroExtension
   ) {
+    if (data.model.engine !== "nitro") {
+      return;
+    }
     const timestamp = Date.now();
     const message: ThreadMessage = {
       id: ulid(),
@ -150,7 +208,12 @@ export default class JanInferenceExtension implements InferenceExtension {
     instance.isCancelled = false;
     instance.controller = new AbortController();
-    requestInference(data.messages, instance.controller).subscribe({
+    requestInference(
+      data.messages ?? [],
+      JanInferenceNitroExtension._engineSettings,
+      JanInferenceNitroExtension._currentModel,
+      instance.controller
+    ).subscribe({
       next: (content) => {
         const messageContent: ThreadContent = {
           type: ContentType.Text,

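The event round-trip this class now participates in, summarized as a sketch; the emit sites correspond to `useActiveModel` and `MessageToolbar` later in this PR:

```typescript
declare const model: Model;

// Illustrative only: UI-side emits and the extension handlers they reach.
events.emit(EventName.OnModelInit, model);     // -> handleModelInit -> OnModelReady or OnModelFail
events.emit(EventName.OnModelStop, model);     // -> handleModelStop -> OnModelStopped
events.emit(EventName.OnInferenceStopped, {}); // -> handleInferenceStopped aborts the active stream
```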

@ -20,51 +20,51 @@ let subprocess = null;
 let currentModelFile = null;
 /**
- * The response from the initModel function.
- * @property error - An error message if the model fails to load.
+ * Stops a Nitro subprocess.
+ * @param wrapper - The model wrapper.
+ * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
  */
-interface InitModelResponse {
-  error?: any;
-  modelFile?: string;
+function stopModel(): Promise<ModelOperationResponse> {
+  return new Promise((resolve, reject) => {
+    checkAndUnloadNitro();
+    resolve({ error: undefined });
+  });
 }
 /**
  * Initializes a Nitro subprocess to load a machine learning model.
- * @param modelFile - The name of the machine learning model file.
+ * @param wrapper - The model wrapper.
  * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
  * TODO: Should pass the absolute path of the model file instead of just the name, so we can modularize module.ts into an npm package
  * TODO: Should it be startModel instead?
  */
-function initModel(wrapper: any): Promise<InitModelResponse> {
-  // 1. Check if the model file exists
+function initModel(wrapper: any): Promise<ModelOperationResponse> {
   currentModelFile = wrapper.modelFullPath;
-  log.info("Started to load model " + wrapper.modelFullPath);
-  const settings = {
-    llama_model_path: currentModelFile,
-    ctx_len: 2048,
-    ngl: 100,
-    cont_batching: false,
-    embedding: false, // Always enable embedding mode on
-    ...wrapper.settings,
-  };
-  log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
-  return (
-    // 1. Check if the port is used, if used, attempt to unload model / kill nitro process
-    validateModelVersion()
-      .then(checkAndUnloadNitro)
-      // 2. Spawn the Nitro subprocess
-      .then(spawnNitroProcess)
-      // 4. Load the model into the Nitro subprocess (HTTP POST request)
-      .then(() => loadLLMModel(settings))
-      // 5. Check if the model is loaded successfully
-      .then(validateModelStatus)
-      .catch((err) => {
-        log.error("error: " + JSON.stringify(err));
-        return { error: err, currentModelFile };
-      })
-  );
+  if (wrapper.model.engine !== "nitro") {
+    return Promise.resolve({ error: "Not a nitro model" });
+  } else {
+    log.info("Started to load model " + wrapper.model.modelFullPath);
+    const settings = {
+      llama_model_path: currentModelFile,
+      ...wrapper.model.settings,
+    };
+    log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
+    return (
+      // 1. Check if the port is used, if used, attempt to unload model / kill nitro process
+      validateModelVersion()
+        .then(checkAndUnloadNitro)
+        // 2. Spawn the Nitro subprocess
+        .then(spawnNitroProcess)
+        // 4. Load the model into the Nitro subprocess (HTTP POST request)
+        .then(() => loadLLMModel(settings))
+        // 5. Check if the model is loaded successfully
+        .then(validateModelStatus)
+        .catch((err) => {
+          log.error("error: " + JSON.stringify(err));
+          return { error: err, currentModelFile };
+        })
+    );
+  }
 }
 /**
@ -91,11 +91,11 @@ function loadLLMModel(settings): Promise<Response> {
 /**
  * Validates the status of a model.
- * @returns {Promise<InitModelResponse>} A promise that resolves to an object.
+ * @returns {Promise<ModelOperationResponse>} A promise that resolves to an object.
  * If the model is loaded successfully, the object is empty.
  * If the model is not loaded successfully, the object contains an error message.
  */
-async function validateModelStatus(): Promise<InitModelResponse> {
+async function validateModelStatus(): Promise<ModelOperationResponse> {
   // Send a GET request to the validation URL.
   // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
   return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
@ -142,8 +142,8 @@ function killSubprocess(): Promise<void> {
  * Check port is used or not, if used, attempt to unload model
  * If unload failed, kill the port
  */
-function checkAndUnloadNitro() {
-  return tcpPortUsed.check(PORT, LOCAL_HOST).then((inUse) => {
+async function checkAndUnloadNitro() {
+  return tcpPortUsed.check(PORT, LOCAL_HOST).then(async (inUse) => {
     // If inUse - try unload or kill process, otherwise do nothing
     if (inUse) {
       // Attempt to unload model
@ -168,7 +168,7 @@ function checkAndUnloadNitro() {
  */
 async function spawnNitroProcess(): Promise<void> {
   return new Promise((resolve, reject) => {
-    let binaryFolder = path.join(__dirname, "nitro"); // Current directory by default
+    let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
     let binaryName;
     if (process.platform === "win32") {

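A sketch of the `wrapper` argument this module now receives from `executeOnMain(MODULE, "initModel", ...)` in the extension; the values are examples:

```typescript
// Illustrative only: shape consumed via wrapper.modelFullPath, wrapper.model.engine
// and wrapper.model.settings in initModel above.
const wrapper = {
  modelFullPath: "/home/user/jan/models/mistral-ins-7b-q4/mistral-ins-7b-q4", // example absolute path
  model: {
    id: "mistral-ins-7b-q4",
    engine: "nitro",
    settings: { ctx_len: 2048, ngl: 100 },
  },
};
```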

@ -0,0 +1,78 @@
# Jan inference plugin
Created using Jan app example
# Create a Jan Plugin using TypeScript
Use this template to bootstrap the creation of a TypeScript Jan plugin. 🚀
## Create Your Own Plugin
To create your own plugin, you can use this repository as a template! Just follow the instructions below:
1. Click the Use this template button at the top of the repository
2. Select Create a new repository
3. Select an owner and name for your new repository
4. Click Create repository
5. Clone your new repository
## Initial Setup
After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your plugin.
> [!NOTE]
>
> You'll need to have a reasonably modern version of
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
> [`nodenv`](https://github.com/nodenv/nodenv) or
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
> root of your repository to install the version specified in
> [`package.json`](./package.json). Otherwise, 20.x or later should work!
1. :hammer_and_wrench: Install the dependencies
```bash
npm install
```
1. :building_construction: Package the TypeScript for distribution
```bash
npm run bundle
```
1. :white_check_mark: Check your artifact
There will be a tgz file in your plugin directory now
## Update the Plugin Metadata
The [`package.json`](package.json) file defines metadata about your plugin, such as
plugin name, main entry, description and version.
When you copy this repository, update `package.json` with the name and description of your plugin.
## Update the Plugin Code
The [`src/`](./src/) directory is the heart of your plugin! This contains the
source code that will be run when your plugin extension functions are invoked. You can replace the
contents of this directory with your own code.
There are a few things to keep in mind when writing your plugin code:
- Most Jan Plugin Extension functions are processed asynchronously.
In `index.ts`, you will see that the extension function will return a `Promise<any>`.
```typescript
import { core } from "@janhq/core";
function onStart(): Promise<any> {
return core.invokePluginFunc(MODULE_PATH, "run", 0);
}
```
For more information about the Jan Plugin Core module, see the
[documentation](https://github.com/janhq/jan/blob/main/core/README.md).
So, what are you waiting for? Go ahead and start customizing your plugin!


@ -0,0 +1,41 @@
{
"name": "@janhq/inference-openai-extension",
"version": "1.0.0",
"description": "Inference Engine for OpenAI Extension that can be used with any OpenAI-compatible API",
"main": "dist/index.js",
"module": "dist/module.js",
"author": "Jan <service@jan.ai>",
"license": "AGPL-3.0",
"scripts": {
"build": "tsc -b . && webpack --config webpack.config.js",
"build:publish": "rimraf *.tgz --glob && npm run build && npm pack && cpx *.tgz ../../electron/pre-install"
},
"exports": {
".": "./dist/index.js",
"./main": "./dist/module.js"
},
"devDependencies": {
"cpx": "^1.5.0",
"rimraf": "^3.0.2",
"webpack": "^5.88.2",
"webpack-cli": "^5.1.4"
},
"dependencies": {
"@janhq/core": "file:../../core",
"fetch-retry": "^5.0.6",
"path-browserify": "^1.0.1",
"ts-loader": "^9.5.0",
"ulid": "^2.3.0"
},
"engines": {
"node": ">=18.0.0"
},
"files": [
"dist/*",
"package.json",
"README.md"
],
"bundleDependencies": [
"fetch-retry"
]
}


@ -0,0 +1,27 @@
import { Model } from "@janhq/core";
declare const MODULE: string;
declare interface EngineSettings {
full_url?: string;
api_key?: string;
}
enum OpenAIChatCompletionModelName {
"gpt-3.5-turbo-instruct" = "gpt-3.5-turbo-instruct",
"gpt-3.5-turbo-instruct-0914" = "gpt-3.5-turbo-instruct-0914",
"gpt-4-1106-preview" = "gpt-4-1106-preview",
"gpt-3.5-turbo-0613" = "gpt-3.5-turbo-0613",
"gpt-3.5-turbo-0301" = "gpt-3.5-turbo-0301",
"gpt-3.5-turbo" = "gpt-3.5-turbo",
"gpt-3.5-turbo-16k-0613" = "gpt-3.5-turbo-16k-0613",
"gpt-3.5-turbo-1106" = "gpt-3.5-turbo-1106",
"gpt-4-vision-preview" = "gpt-4-vision-preview",
"gpt-4" = "gpt-4",
"gpt-4-0314" = "gpt-4-0314",
"gpt-4-0613" = "gpt-4-0613",
}
declare type OpenAIModel = Omit<Model, "id"> & {
id: OpenAIChatCompletionModelName;
};

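For reference, the settings file this maps to (`engines/openai.json`, written by the extension below) covers both the default OpenAI endpoint and an Azure OpenAI deployment; the Azure URL here is hypothetical:

```typescript
// Illustrative only: default settings written to engines/openai.json.
const openAISettings: EngineSettings = {
  full_url: "https://api.openai.com/v1/chat/completions",
  api_key: "sk-<your key here>",
};

// Azure OpenAI variant: helpers/sse.ts takes the deployment name from
// path segment 5 of full_url when the host contains "openai.azure.com".
const azureSettings: EngineSettings = {
  full_url:
    "https://my-resource.openai.azure.com/openai/deployments/my-deployment/chat/completions?api-version=2023-07-01-preview", // hypothetical URL
  api_key: "<azure api key>",
};
```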

@ -0,0 +1,68 @@
import { Observable } from "rxjs";
import { EngineSettings, OpenAIModel } from "../@types/global";
/**
* Sends a request to the inference server to generate a response based on the recent messages.
* @param recentMessages - An array of recent messages to use as context for the inference.
* @param engine - The engine settings to use for the inference.
* @param model - The model to use for the inference.
* @returns An Observable that emits the generated response as a string.
*/
export function requestInference(
recentMessages: any[],
engine: EngineSettings,
model: OpenAIModel,
controller?: AbortController
): Observable<string> {
return new Observable((subscriber) => {
let model_id: string = model.id
if (engine.full_url.includes("openai.azure.com")){
model_id = engine.full_url.split("/")[5]
}
const requestBody = JSON.stringify({
messages: recentMessages,
stream: true,
model: model_id
// ...model.parameters,
});
fetch(`${engine.full_url}`, {
method: "POST",
headers: {
"Content-Type": "application/json",
Accept: "text/event-stream",
"Access-Control-Allow-Origin": "*",
Authorization: `Bearer ${engine.api_key}`,
"api-key": `${engine.api_key}`,
},
body: requestBody,
signal: controller?.signal,
})
.then(async (response) => {
const stream = response.body;
const decoder = new TextDecoder("utf-8");
const reader = stream?.getReader();
let content = "";
while (true && reader) {
const { done, value } = await reader.read();
if (done) {
break;
}
const text = decoder.decode(value);
const lines = text.trim().split("\n");
for (const line of lines) {
if (line.startsWith("data: ") && !line.includes("data: [DONE]")) {
const data = JSON.parse(line.replace("data: ", ""));
content += data.choices[0]?.delta?.content ?? "";
if (content.startsWith("assistant: ")) {
content = content.replace("assistant: ", "");
}
subscriber.next(content);
}
}
}
subscriber.complete();
})
.catch((err) => subscriber.error(err));
});
}

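The parser above consumes standard OpenAI-style server-sent events; a sketch of one chunk and the field it extracts:

```typescript
// Illustrative only: each SSE line starts with "data: " and carries a JSON delta;
// "data: [DONE]" marks the end of the stream and is skipped by the check above.
const sampleLine =
  'data: {"choices":[{"delta":{"content":"Hello"},"index":0,"finish_reason":null}]}';
const parsed = JSON.parse(sampleLine.replace("data: ", ""));
const delta: string = parsed.choices[0]?.delta?.content ?? ""; // "Hello"
```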

@ -0,0 +1,231 @@
/**
* @file This file exports a class that implements the InferenceExtension interface from the @janhq/core package.
* The class provides methods for initializing and stopping a model, and for making inference requests.
* It also subscribes to events emitted by the @janhq/core package and handles new message requests.
* @version 1.0.0
* @module inference-openai-extension/src/index
*/
import {
ChatCompletionRole,
ContentType,
EventName,
MessageRequest,
MessageStatus,
ModelSettingParams,
ExtensionType,
ThreadContent,
ThreadMessage,
events,
fs,
} from "@janhq/core";
import { InferenceExtension } from "@janhq/core";
import { requestInference } from "./helpers/sse";
import { ulid } from "ulid";
import { join } from "path";
import { EngineSettings, OpenAIModel } from "./@types/global";
/**
* A class that implements the InferenceExtension interface from the @janhq/core package.
* The class provides methods for initializing and stopping a model, and for making inference requests.
* It also subscribes to events emitted by the @janhq/core package and handles new message requests.
*/
export default class JanInferenceOpenAIExtension implements InferenceExtension {
private static readonly _homeDir = "engines";
private static readonly _engineMetadataFileName = "openai.json";
private static _currentModel: OpenAIModel;
private static _engineSettings: EngineSettings = {
full_url: "https://api.openai.com/v1/chat/completions",
api_key: "sk-<your key here>",
};
controller = new AbortController();
isCancelled = false;
/**
* Returns the type of the extension.
* @returns {ExtensionType} The type of the extension.
*/
// TODO: To fix
type(): ExtensionType {
return undefined;
}
/**
* Subscribes to events emitted by the @janhq/core package.
*/
onLoad(): void {
fs.mkdir(JanInferenceOpenAIExtension._homeDir);
JanInferenceOpenAIExtension.writeDefaultEngineSettings();
// Events subscription
events.on(EventName.OnMessageSent, (data) =>
JanInferenceOpenAIExtension.handleMessageRequest(data, this)
);
events.on(EventName.OnModelInit, (model: OpenAIModel) => {
JanInferenceOpenAIExtension.handleModelInit(model);
});
events.on(EventName.OnModelStop, (model: OpenAIModel) => {
JanInferenceOpenAIExtension.handleModelStop(model);
});
events.on(EventName.OnInferenceStopped, () => {
JanInferenceOpenAIExtension.handleInferenceStopped(this);
});
}
/**
* Stops the model inference.
*/
onUnload(): void {}
static async writeDefaultEngineSettings() {
try {
const engineFile = join(
JanInferenceOpenAIExtension._homeDir,
JanInferenceOpenAIExtension._engineMetadataFileName
);
if (await fs.exists(engineFile)) {
JanInferenceOpenAIExtension._engineSettings = JSON.parse(
await fs.readFile(engineFile)
);
} else {
await fs.writeFile(
engineFile,
JSON.stringify(JanInferenceOpenAIExtension._engineSettings, null, 2)
);
}
} catch (err) {
console.error(err);
}
}
/**
* Makes a single response inference request.
* @param {MessageRequest} data - The data for the inference request.
* @returns {Promise<any>} A promise that resolves with the inference response.
*/
async inference(data: MessageRequest): Promise<ThreadMessage> {
const timestamp = Date.now();
const message: ThreadMessage = {
thread_id: data.threadId,
created: timestamp,
updated: timestamp,
status: MessageStatus.Ready,
id: "",
role: ChatCompletionRole.Assistant,
object: "thread.message",
content: [],
};
return new Promise(async (resolve, reject) => {
requestInference(
data.messages ?? [],
JanInferenceOpenAIExtension._engineSettings,
JanInferenceOpenAIExtension._currentModel
).subscribe({
next: (_content) => {},
complete: async () => {
resolve(message);
},
error: async (err) => {
reject(err);
},
});
});
}
private static async handleModelInit(model: OpenAIModel) {
if (model.engine !== "openai") {
return;
} else {
JanInferenceOpenAIExtension._currentModel = model;
JanInferenceOpenAIExtension.writeDefaultEngineSettings();
// Todo: Check model list with API key
events.emit(EventName.OnModelReady, model);
}
}
private static async handleModelStop(model: OpenAIModel) {
if (model.engine !== "openai") {
return;
}
events.emit(EventName.OnModelStopped, model);
}
private static async handleInferenceStopped(
instance: JanInferenceOpenAIExtension
) {
instance.isCancelled = true;
instance.controller?.abort();
}
/**
* Handles a new message request by making an inference request and emitting events.
* Function registered in event manager, should be static to avoid binding issues.
* Pass instance as a reference.
* @param {MessageRequest} data - The data for the new message request.
*/
private static async handleMessageRequest(
data: MessageRequest,
instance: JanInferenceOpenAIExtension
) {
if (data.model.engine !== "openai") {
return;
}
const timestamp = Date.now();
const message: ThreadMessage = {
id: ulid(),
thread_id: data.threadId,
assistant_id: data.assistantId,
role: ChatCompletionRole.Assistant,
content: [],
status: MessageStatus.Pending,
created: timestamp,
updated: timestamp,
object: "thread.message",
};
events.emit(EventName.OnMessageResponse, message);
instance.isCancelled = false;
instance.controller = new AbortController();
requestInference(
data?.messages ?? [],
this._engineSettings,
JanInferenceOpenAIExtension._currentModel,
instance.controller
).subscribe({
next: (content) => {
const messageContent: ThreadContent = {
type: ContentType.Text,
text: {
value: content.trim(),
annotations: [],
},
};
message.content = [messageContent];
events.emit(EventName.OnMessageUpdate, message);
},
complete: async () => {
message.status = MessageStatus.Ready;
events.emit(EventName.OnMessageUpdate, message);
},
error: async (err) => {
const messageContent: ThreadContent = {
type: ContentType.Text,
text: {
value: "Error occurred: " + err.message,
annotations: [],
},
};
message.content = [messageContent];
message.status = MessageStatus.Ready;
events.emit(EventName.OnMessageUpdate, message);
},
});
}
}


@ -0,0 +1,15 @@
{
"compilerOptions": {
"target": "es2016",
"module": "ES6",
"moduleResolution": "node",
"outDir": "./dist",
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"strict": false,
"skipLibCheck": true,
"rootDir": "./src"
},
"include": ["./src"]
}


@ -0,0 +1,42 @@
const path = require("path");
const webpack = require("webpack");
const packageJson = require("./package.json");
module.exports = {
experiments: { outputModule: true },
entry: "./src/index.ts", // Adjust the entry point to match your project's main file
mode: "production",
module: {
rules: [
{
test: /\.tsx?$/,
use: "ts-loader",
exclude: /node_modules/,
},
],
},
plugins: [
new webpack.DefinePlugin({
MODULE: JSON.stringify(`${packageJson.name}/${packageJson.module}`),
INFERENCE_URL: JSON.stringify(
process.env.INFERENCE_URL ||
"http://127.0.0.1:3928/inferences/llamacpp/chat_completion"
),
}),
],
output: {
filename: "index.js", // Adjust the output file name as needed
path: path.resolve(__dirname, "dist"),
library: { type: "module" }, // Specify ESM output format
},
resolve: {
extensions: [".ts", ".js"],
fallback: {
path: require.resolve("path-browserify"),
},
},
optimization: {
minimize: false,
},
// Add loaders and other configuration as needed for your project
};


@ -19,6 +19,7 @@
     "author": "NousResearch, The Bloke",
     "tags": ["34B", "Finetuned"],
     "size": 24320000000
-  }
+  },
+  "engine": "nitro"
 }


@ -1,3 +1,4 @@
 {
   "source_url": "https://huggingface.co/TheBloke/deepseek-coder-1.3b-instruct-GGUF/resolve/main/deepseek-coder-1.3b-instruct.Q8_0.gguf",
   "id": "deepseek-coder-1.3b",
@ -19,5 +20,6 @@
     "author": "Deepseek, The Bloke",
     "tags": ["Tiny", "Foundational Model"],
     "size": 1430000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "Deepseek, The Bloke",
     "tags": ["34B", "Foundational Model"],
     "size": 26040000000
-  }
+  },
+  "engine": "nitro"
 }


@ -0,0 +1,20 @@
{
"source_url": "https://openai.com",
"id": "gpt-3.5-turbo-16k-0613",
"object": "model",
"name": "OpenAI GPT 3.5 Turbo 16k 0613",
"version": 1.0,
"description": "OpenAI GPT 3.5 Turbo 16k 0613 model is extremely good",
"format": "api",
"settings": {},
"parameters": {
"max_tokens": 4096
},
"metadata": {
"author": "OpenAI",
"tags": ["General", "Big Context Length"]
},
"engine": "openai",
"state": "ready"
}


@ -0,0 +1,18 @@
{
"source_url": "https://openai.com",
"id": "gpt-3.5-turbo",
"object": "model",
"name": "OpenAI GPT 3.5 Turbo",
"version": 1.0,
"description": "OpenAI GPT 3.5 Turbo model is extremely good",
"format": "api",
"settings": {},
"parameters": {},
"metadata": {
"author": "OpenAI",
"tags": ["General", "Big Context Length"]
},
"engine": "openai",
"state": "ready"
}

models/gpt-4/model.json

@ -0,0 +1,20 @@
{
"source_url": "https://openai.com",
"id": "gpt-4",
"object": "model",
"name": "OpenAI GPT 4",
"version": 1.0,
"description": "OpenAI GPT 4 model is extremely good",
"format": "api",
"settings": {},
"parameters": {
"max_tokens": 4096
},
"metadata": {
"author": "OpenAI",
"tags": ["General", "Big Context Length"]
},
"engine": "openai",
"state": "ready"
}


@ -19,6 +19,7 @@
     "author": "MetaAI, The Bloke",
     "tags": ["70B", "Foundational Model"],
     "size": 43920000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "MetaAI, The Bloke",
     "tags": ["7B", "Foundational Model"],
     "size": 4080000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "MetaAI, The Bloke",
     "tags": ["7B", "Foundational Model"],
     "size": 4780000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "Lizpreciatior, The Bloke",
     "tags": ["70B", "Finetuned"],
     "size": 48750000000
-  }
+  },
+  "engine": "nitro"
 }


@ -20,6 +20,7 @@
     "tags": ["Featured", "7B", "Foundational Model"],
     "size": 4370000000,
     "cover": "https://raw.githubusercontent.com/janhq/jan/main/models/mistral-ins-7b-q4/cover.png"
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "MistralAI, The Bloke",
     "tags": ["7B", "Foundational Model"],
     "size": 5130000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "Intel, The Bloke",
     "tags": ["Recommended", "7B", "Finetuned"],
     "size": 4370000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "NeverSleep, The Bloke",
     "tags": ["34B", "Finetuned"],
     "size": 12040000000
-  }
+  },
+  "engine": "nitro"
 }


@ -20,5 +20,6 @@
     "tags": ["Featured", "7B", "Merged"],
     "size": 4370000000,
     "cover": "https://raw.githubusercontent.com/janhq/jan/main/models/openhermes-neural-7b/cover.png"
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "Microsoft, The Bloke",
     "tags": ["13B", "Finetuned"],
     "size": 9230000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "Phind, The Bloke",
     "tags": ["34B", "Finetuned"],
     "size": 24320000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,5 +19,6 @@
     "author": "Pansophic, The Bloke",
     "tags": ["Tiny", "Finetuned"],
     "size": 1710000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "Berkeley-nest, The Bloke",
     "tags": ["Recommended", "7B","Finetuned"],
     "size": 4370000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "KoboldAI, The Bloke",
     "tags": ["13B", "Finetuned"],
     "size": 9230000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,5 +19,6 @@
     "author": "TinyLlama",
     "tags": ["Tiny", "Foundation Model"],
     "size": 637000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "WizardLM, The Bloke",
     "tags": ["Recommended", "13B", "Finetuned"],
     "size": 9230000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "01-ai, The Bloke",
     "tags": ["34B", "Foundational Model"],
     "size": 24320000000
-  }
+  },
+  "engine": "nitro"
 }


@ -19,6 +19,7 @@
     "author": "HuggingFaceH4, The Bloke",
     "tags": ["7B", "Finetuned"],
     "size": 4370000000
-  }
+  },
+  "engine": "nitro"
 }

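Putting the fragments above together, a downloaded nitro model's `model.json` now ends with the `engine` field. An assembled example, shown as an object literal; the field values are illustrative and the `format`, `settings`, and `parameters` entries are assumptions:

```typescript
// Illustrative only: a complete nitro model.json after this PR.
const exampleNitroModel = {
  source_url: "https://huggingface.co/TheBloke/example-GGUF/resolve/main/example.Q4_K_M.gguf", // example
  id: "example-7b-q4",
  object: "model",
  name: "Example 7B Q4",
  version: 1.0,
  description: "Example local model served by nitro",
  format: "gguf",                                     // assumed value
  settings: { ctx_len: 2048 },                        // assumed
  parameters: { temperature: 0.7, max_tokens: 2048 }, // assumed
  metadata: { author: "Example", tags: ["7B"], size: 4370000000 },
  engine: "nitro",
};
```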

@ -7,10 +7,16 @@ import {
   ThreadMessage,
   ExtensionType,
   MessageStatus,
+  Model,
 } from '@janhq/core'
 import { ConversationalExtension } from '@janhq/core'
 import { useAtomValue, useSetAtom } from 'jotai'
+import { activeModelAtom, stateModelAtom } from '@/hooks/useActiveModel'
+import { useGetDownloadedModels } from '@/hooks/useGetDownloadedModels'
+import { toaster } from '../Toast'
 import { extensionManager } from '@/extension'
 import {
   addNewMessageAtom,
@ -24,19 +30,61 @@ import {
 export default function EventHandler({ children }: { children: ReactNode }) {
   const addNewMessage = useSetAtom(addNewMessageAtom)
   const updateMessage = useSetAtom(updateMessageAtom)
+  const { downloadedModels } = useGetDownloadedModels()
+  const setActiveModel = useSetAtom(activeModelAtom)
+  const setStateModel = useSetAtom(stateModelAtom)
   const updateThreadWaiting = useSetAtom(updateThreadWaitingForResponseAtom)
   const threads = useAtomValue(threadsAtom)
+  const modelsRef = useRef(downloadedModels)
   const threadsRef = useRef(threads)
   useEffect(() => {
     threadsRef.current = threads
   }, [threads])
+
+  useEffect(() => {
+    modelsRef.current = downloadedModels
+  }, [downloadedModels])
+
   async function handleNewMessageResponse(message: ThreadMessage) {
     addNewMessage(message)
   }
+
+  async function handleModelReady(model: Model) {
+    setActiveModel(model)
+    toaster({
+      title: 'Success!',
+      description: `Model ${model.id} has been started.`,
+    })
+    setStateModel(() => ({
+      state: 'stop',
+      loading: false,
+      model: model.id,
+    }))
+  }
+
+  async function handleModelStopped(model: Model) {
+    setTimeout(async () => {
+      setActiveModel(undefined)
+      setStateModel({ state: 'start', loading: false, model: '' })
+      toaster({
+        title: 'Success!',
+        description: `Model ${model.id} has been stopped.`,
+      })
    }, 500)
+  }
+
+  async function handleModelFail(res: any) {
+    const errorMessage = `${res.error}`
+    alert(errorMessage)
+    setStateModel(() => ({
+      state: 'start',
+      loading: false,
+      model: res.modelId,
+    }))
+  }
+
   async function handleMessageResponseUpdate(message: ThreadMessage) {
     updateMessage(
       message.id,
@ -73,6 +121,9 @@ export default function EventHandler({ children }: { children: ReactNode }) {
     if (window.core.events) {
       events.on(EventName.OnMessageResponse, handleNewMessageResponse)
       events.on(EventName.OnMessageUpdate, handleMessageResponseUpdate)
+      events.on(EventName.OnModelReady, handleModelReady)
+      events.on(EventName.OnModelFail, handleModelFail)
+      events.on(EventName.OnModelStopped, handleModelStopped)
     }
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [])


@ -1,5 +1,8 @@
 /* eslint-disable @typescript-eslint/no-explicit-any */
-import { ExtensionType, InferenceExtension } from '@janhq/core'
+import {
+  EventName,
+  events,
+} from '@janhq/core'
 import { Model, ModelSettingParams } from '@janhq/core'
 import { atom, useAtom } from 'jotai'
@ -9,9 +12,13 @@ import { useGetDownloadedModels } from './useGetDownloadedModels'
 import { extensionManager } from '@/extension'
-const activeModelAtom = atom<Model | undefined>(undefined)
-const stateModelAtom = atom({ state: 'start', loading: false, model: '' })
+export const activeModelAtom = atom<Model | undefined>(undefined)
+export const stateModelAtom = atom({
+  state: 'start',
+  loading: false,
+  model: '',
+})
 export function useActiveModel() {
   const [activeModel, setActiveModel] = useAtom(activeModelAtom)
@ -47,59 +54,14 @@
       return
     }
-    const currentTime = Date.now()
-    const res = await initModel(modelId, model?.settings)
-    if (res && res.error) {
-      const errorMessage = `${res.error}`
-      alert(errorMessage)
-      setStateModel(() => ({
-        state: 'start',
-        loading: false,
-        model: modelId,
-      }))
-    } else {
-      console.debug(
-        `Model ${modelId} successfully initialized! Took ${
-          Date.now() - currentTime
-        }ms`
-      )
-      setActiveModel(model)
-      toaster({
-        title: 'Success!',
-        description: `Model ${modelId} has been started.`,
-      })
-      setStateModel(() => ({
-        state: 'stop',
-        loading: false,
-        model: modelId,
-      }))
-    }
+    events.emit(EventName.OnModelInit, model)
   }

   const stopModel = async (modelId: string) => {
+    const model = downloadedModels.find((e) => e.id === modelId)
     setStateModel({ state: 'stop', loading: true, model: modelId })
-    setTimeout(async () => {
-      extensionManager
-        .get<InferenceExtension>(ExtensionType.Inference)
-        ?.stopModel()
-      setActiveModel(undefined)
-      setStateModel({ state: 'start', loading: false, model: '' })
-      toaster({
-        title: 'Success!',
-        description: `Model ${modelId} has been stopped.`,
-      })
-    }, 500)
+    events.emit(EventName.OnModelStop, model)
   }

   return { activeModel, startModel, stopModel, stateModel }
 }
-
-const initModel = async (
-  modelId: string,
-  settings?: ModelSettingParams
-): Promise<any> => {
-  return extensionManager
-    .get<InferenceExtension>(ExtensionType.Inference)
-    ?.initModel(modelId, settings)
-}

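After this refactor a component keeps the same hook API, but start and stop are fire-and-forget event emits; the state updates arrive back through `EventHandler`. A sketch of usage inside a component:

```typescript
// Illustrative only: inside a React component.
const { activeModel, startModel, stopModel, stateModel } = useActiveModel();

startModel("mistral-ins-7b-q4"); // emits OnModelInit; OnModelReady/OnModelFail update the atoms
// ...
stopModel("mistral-ins-7b-q4");  // emits OnModelStop; OnModelStopped clears activeModel
```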

@ -67,6 +67,7 @@ export const useCreateNewThread = () => {
         top_p: 0,
         stream: false,
       },
+      engine: undefined
     },
     instructions: assistant.instructions,
   }


@ -50,7 +50,6 @@ export default function useSendChatMessage() {
   const [queuedMessage, setQueuedMessage] = useState(false)
   const modelRef = useRef<Model | undefined>()
   useEffect(() => {
     modelRef.current = activeModel
   }, [activeModel])
@ -91,18 +90,35 @@
       id: ulid(),
       messages: messages,
       threadId: activeThread.id,
+      model: activeThread.assistants[0].model ?? selectedModel,
     }
     const modelId = selectedModel?.id ?? activeThread.assistants[0].model.id
     if (activeModel?.id !== modelId) {
       setQueuedMessage(true)
-      await startModel(modelId)
+      startModel(modelId)
+      await WaitForModelStarting(modelId)
       setQueuedMessage(false)
     }
     events.emit(EventName.OnMessageSent, messageRequest)
   }

+  // TODO: Refactor @louis
+  const WaitForModelStarting = async (modelId: string) => {
+    return new Promise<void>((resolve) => {
+      setTimeout(async () => {
+        if (modelRef.current?.id !== modelId) {
+          console.log('waiting for model to start')
+          await WaitForModelStarting(modelId)
+          resolve()
+        } else {
+          resolve()
+        }
+      }, 200)
+    })
+  }
+
   const sendChatMessage = async () => {
     if (!currentPrompt || currentPrompt.trim().length === 0) {
       return
@ -132,6 +148,7 @@
               id: selectedModel.id,
               settings: selectedModel.settings,
               parameters: selectedModel.parameters,
+              engine: selectedModel.engine,
             },
           },
         ],
@ -178,7 +195,7 @@
       id: msgId,
       threadId: activeThread.id,
       messages,
-      parameters: activeThread.assistants[0].model.parameters,
+      model: selectedModel ?? activeThread.assistants[0].model,
     }
     const timestamp = Date.now()
     const threadMessage: ThreadMessage = {
@ -210,7 +227,8 @@
     if (activeModel?.id !== modelId) {
       setQueuedMessage(true)
-      await startModel(modelId)
+      startModel(modelId)
+      await WaitForModelStarting(modelId)
       setQueuedMessage(false)
     }
     events.emit(EventName.OnMessageSent, messageRequest)


@ -30,9 +30,8 @@ const MessageToolbar = ({ message }: { message: ThreadMessage }) => {
   const { resendChatMessage } = useSendChatMessage()
   const onStopInferenceClick = async () => {
-    await extensionManager
-      .get<InferenceExtension>(ExtensionType.Inference)
-      ?.stopInference()
+    events.emit(EventName.OnInferenceStopped, {})
     setTimeout(() => {
       events.emit(EventName.OnMessageUpdate, {
         ...message,


@ -1,4 +1,5 @@
 export const toGigabytes = (input: number) => {
+  if (!input) return ''
   if (input > 1024 ** 3) {
     return (input / 1000 ** 3).toFixed(2) + 'GB'
   } else if (input > 1024 ** 2) {