Merge pull request #931 from janhq/fix/nitro_perf_cpu

feat: Hotfix for Nitro loading on CPU with hyper-threading support
2023-12-12 08:15:02 +07:00 · 2023-12-12 08:15:02 +07:00 · fef97f6736
commit fef97f6736
parent 121dc119f1 14f83ddb70
6 changed files with 43 additions and 19 deletions
--- a/core/src/types/index.ts
+++ b/core/src/types/index.ts
@ -275,6 +275,7 @@ export type ModelSettingParams = {
  ngl?: number;
  embedding?: boolean;
  n_parallel?: number;
  cpu_threads?: number;
  system_prompt?: string;
  user_prompt?: string;
  ai_prompt?: string;
--- a/extensions/inference-nitro-extension/package.json
+++ b/extensions/inference-nitro-extension/package.json
@ -36,6 +36,7 @@
    "kill-port": "^2.0.1",
    "path-browserify": "^1.0.1",
    "rxjs": "^7.8.1",
    "systeminformation": "^5.21.20",
    "tcp-port-used": "^1.0.2",
    "ts-loader": "^9.5.0",
    "ulid": "^2.3.0"
@ -52,6 +53,7 @@
    "tcp-port-used",
    "kill-port",
    "fetch-retry",
-    "electron-log"
+    "electron-log",
    "systeminformation"
  ]
 }
--- a/extensions/inference-nitro-extension/src/@types/global.d.ts
+++ b/extensions/inference-nitro-extension/src/@types/global.d.ts
@ -12,6 +12,7 @@ declare const INFERENCE_URL: string;
 interface EngineSettings {
  ctx_len: number;
  ngl: number;
  cpu_threads: number;
  cont_batching: boolean;
  embedding: boolean;
 }
@ -24,3 +25,8 @@ interface ModelOperationResponse {
  error?: any;
  modelFile?: string;
 }
 /**
 * Snapshot of the host resources gathered before the Nitro process is started.
 */
 interface ResourcesInfo {
  /** Physical CPU core count, taken from systeminformation's `cpu().physicalCores`. */
  numCpuPhysicalCore: number;
  /** Available memory, taken from systeminformation's `mem().available` (presumably bytes - confirm against the library docs). */
  memAvailable: number;
 }
--- a/extensions/inference-nitro-extension/src/index.ts
+++ b/extensions/inference-nitro-extension/src/index.ts
@ -12,7 +12,6 @@ import {
  EventName,
  MessageRequest,
  MessageStatus,
  ModelSettingParams,
  ExtensionType,
  ThreadContent,
  ThreadMessage,
@ -41,6 +40,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
  private static _engineSettings: EngineSettings = {
    ctx_len: 2048,
    ngl: 100,
    cpu_threads: 1,
    cont_batching: false,
    embedding: false,
  };
--- a/extensions/inference-nitro-extension/src/module.ts
+++ b/extensions/inference-nitro-extension/src/module.ts
@ -4,6 +4,7 @@ const path = require("path");
 const { spawn } = require("child_process");
 const tcpPortUsed = require("tcp-port-used");
 const fetchRetry = require("fetch-retry")(global.fetch);
 const si = require("systeminformation");
 const log = require("electron-log");
@ -38,15 +39,21 @@ function stopModel(): Promise<ModelOperationResponse> {
 * TODO: Should pass the absolute path of the model file instead of just the name, so we can modularize module.ts into an npm package
 * TODO: Should it be startModel instead?
 */
-function initModel(wrapper: any): Promise<ModelOperationResponse> {
+async function initModel(wrapper: any): Promise<ModelOperationResponse> {
  currentModelFile = wrapper.modelFullPath;
  if (wrapper.model.engine !== "nitro") {
    return Promise.resolve({ error: "Not a nitro model" });
  } else {
-    log.info("Started to load model " + wrapper.model.modelFullPath);
+    // Gather system information for CPU physical cores and memory
    const nitroResourceProbe = await getResourcesInfo();
    console.log(
      "Nitro with physical core: " + nitroResourceProbe.numCpuPhysicalCore
    );
    const settings = {
      llama_model_path: currentModelFile,
      ...wrapper.model.settings,
      // This is critical and requires real system information
      cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
    };
    log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
    return (
@ -54,7 +61,7 @@ function initModel(wrapper: any): Promise<ModelOperationResponse> {
      validateModelVersion()
        .then(checkAndUnloadNitro)
        // 2. Spawn the Nitro subprocess
-        .then(spawnNitroProcess)
+        .then(await spawnNitroProcess(nitroResourceProbe))
        // 4. Load the model into the Nitro subprocess (HTTP POST request)
        .then(() => loadLLMModel(settings))
        // 5. Check if the model is loaded successfully
@ -166,16 +173,14 @@ async function checkAndUnloadNitro() {
 * Using child-process to spawn the process
 * Should run exactly the Nitro binary that matches the current platform
 */
-async function spawnNitroProcess(): Promise<void> {
+async function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
-  return new Promise((resolve, reject) => {
+  return new Promise(async (resolve, reject) => {
    let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
    let binaryName;
    if (process.platform === "win32") {
      // Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
      binaryName = "win-start.bat";
    } else if (process.platform === "darwin") {
      // Mac OS platform
      if (process.arch === "arm64") {
        binaryFolder = path.join(binaryFolder, "mac-arm64");
      } else {
@ -183,15 +188,13 @@ async function spawnNitroProcess(): Promise<void> {
      }
      binaryName = "nitro";
    } else {
-      // Linux
+      binaryName = "linux-start.sh";
      // Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
      binaryName = "linux-start.sh"; // For other platforms
    }
    const binaryPath = path.join(binaryFolder, binaryName);
    // Execute the binary
-    subprocess = spawn(binaryPath, [1, "127.0.0.1", PORT], {
+    subprocess = spawn(binaryPath, [1, LOCAL_HOST, PORT], {
      cwd: binaryFolder,
    });
@ -211,7 +214,7 @@ async function spawnNitroProcess(): Promise<void> {
      reject(`Nitro process exited. ${code ?? ""}`);
    });
    tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
-      resolve();
+      resolve(nitroResourceProbe);
    });
  });
 }
@ -263,17 +266,30 @@ function validateModelVersion(): Promise<void> {
  });
 }
 /**
 * Releases the resources registered by this module.
 * It is a module-specific teardown hook and should be called when the application is closed.
 */
 function dispose() {
  // Delegates to killSubprocess(); clean up any other registered resources here as well
  killSubprocess();
 }
 /**
 * Get the system resources information
 *
 * Asks `systeminformation` for the CPU and memory details and reports the
 * physical core count together with the available memory.
 *
 * @returns A promise resolving to the probed `ResourcesInfo`.
 * @throws The returned promise rejects if either the CPU or the memory probe rejects,
 *         so callers see the failure instead of waiting forever.
 */
 async function getResourcesInfo(): Promise<ResourcesInfo> {
  // No `new Promise(async ...)` wrapper: an error thrown inside an async executor
  // is lost and leaves the outer promise pending. Awaiting directly propagates it.
  // The two probes are independent, so they run concurrently.
  const [cpu, mem] = await Promise.all([si.cpu(), si.mem()]);
  return {
    numCpuPhysicalCore: cpu.physicalCores,
    memAvailable: mem.available,
  };
 }
 // Functions this module makes available to whoever requires it.
 module.exports = { initModel, stopModel, killSubprocess, dispose };
--- a/extensions/inference-openai-extension/src/index.ts
+++ b/extensions/inference-openai-extension/src/index.ts
@ -12,7 +12,6 @@ import {
  EventName,
  MessageRequest,
  MessageStatus,
  ModelSettingParams,
  ExtensionType,
  ThreadContent,
  ThreadMessage,