diff --git a/core/src/types/index.ts b/core/src/types/index.ts
index d5b51cfc0..7314a4ae3 100644
--- a/core/src/types/index.ts
+++ b/core/src/types/index.ts
@@ -275,6 +275,7 @@ export type ModelSettingParams = {
   ngl?: number;
   embedding?: boolean;
   n_parallel?: number;
+  cpu_threads?: number;
   system_prompt?: string;
   user_prompt?: string;
   ai_prompt?: string;
diff --git a/extensions/inference-nitro-extension/package.json b/extensions/inference-nitro-extension/package.json
index ef74fff08..ecbbf17a8 100644
--- a/extensions/inference-nitro-extension/package.json
+++ b/extensions/inference-nitro-extension/package.json
@@ -36,6 +36,7 @@
     "kill-port": "^2.0.1",
     "path-browserify": "^1.0.1",
     "rxjs": "^7.8.1",
+    "systeminformation": "^5.21.20",
     "tcp-port-used": "^1.0.2",
     "ts-loader": "^9.5.0",
     "ulid": "^2.3.0"
@@ -52,6 +53,7 @@
     "tcp-port-used",
     "kill-port",
     "fetch-retry",
-    "electron-log"
+    "electron-log",
+    "systeminformation"
   ]
 }
diff --git a/extensions/inference-nitro-extension/src/@types/global.d.ts b/extensions/inference-nitro-extension/src/@types/global.d.ts
index 642f10909..62eb65e52 100644
--- a/extensions/inference-nitro-extension/src/@types/global.d.ts
+++ b/extensions/inference-nitro-extension/src/@types/global.d.ts
@@ -12,6 +12,7 @@ declare const INFERENCE_URL: string;
 interface EngineSettings {
   ctx_len: number;
   ngl: number;
+  cpu_threads: number;
   cont_batching: boolean;
   embedding: boolean;
 }
@@ -24,3 +25,8 @@ interface ModelOperationResponse {
   error?: any;
   modelFile?: string;
 }
+
+interface ResourcesInfo {
+  numCpuPhysicalCore: number;
+  memAvailable: number;
+}
\ No newline at end of file
diff --git a/extensions/inference-nitro-extension/src/index.ts b/extensions/inference-nitro-extension/src/index.ts
index e5f3f4360..f2fbf0d34 100644
--- a/extensions/inference-nitro-extension/src/index.ts
+++ b/extensions/inference-nitro-extension/src/index.ts
@@ -12,7 +12,6 @@ import {
   EventName,
   MessageRequest,
   MessageStatus,
-  ModelSettingParams,
   ExtensionType,
   ThreadContent,
   ThreadMessage,
@@ -41,6 +40,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
   private static _engineSettings: EngineSettings = {
     ctx_len: 2048,
     ngl: 100,
+    cpu_threads: 1,
     cont_batching: false,
     embedding: false,
   };
diff --git a/extensions/inference-nitro-extension/src/module.ts b/extensions/inference-nitro-extension/src/module.ts
index d36553f40..047581dbe 100644
--- a/extensions/inference-nitro-extension/src/module.ts
+++ b/extensions/inference-nitro-extension/src/module.ts
@@ -4,6 +4,7 @@ const path = require("path");
 const { spawn } = require("child_process");
 const tcpPortUsed = require("tcp-port-used");
 const fetchRetry = require("fetch-retry")(global.fetch);
+const si = require("systeminformation");
 
 const log = require("electron-log");
 
@@ -38,15 +39,21 @@ function stopModel(): Promise {
  * TODO: Should pass absolute of the model file instead of just the name - So we can modurize the module.ts to npm package
  * TODO: Should it be startModel instead?
  */
-function initModel(wrapper: any): Promise<ModelOperationResponse> {
+async function initModel(wrapper: any): Promise<ModelOperationResponse> {
   currentModelFile = wrapper.modelFullPath;
   if (wrapper.model.engine !== "nitro") {
     return Promise.resolve({ error: "Not a nitro model" });
   } else {
-    log.info("Started to load model " + wrapper.model.modelFullPath);
+    // Gather system information for CPU physical cores and memory
+    const nitroResourceProbe = await getResourcesInfo();
+    console.log(
+      "Nitro with physical core: " + nitroResourceProbe.numCpuPhysicalCore
+    );
     const settings = {
       llama_model_path: currentModelFile,
       ...wrapper.model.settings,
+      // This is critical and requires real system information
+      cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
     };
     log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
     return (
@@ -54,7 +61,7 @@ function initModel(wrapper: any): Promise<ModelOperationResponse> {
       validateModelVersion()
         .then(checkAndUnloadNitro)
         // 2. Spawn the Nitro subprocess
-        .then(spawnNitroProcess)
+        .then(await spawnNitroProcess(nitroResourceProbe))
         // 4. Load the model into the Nitro subprocess (HTTP POST request)
         .then(() => loadLLMModel(settings))
         // 5. Check if the model is loaded successfully
@@ -166,16 +173,14 @@ async function checkAndUnloadNitro() {
  * Using child-process to spawn the process
  * Should run exactly platform specified Nitro binary version
  */
-async function spawnNitroProcess(): Promise<void> {
-  return new Promise((resolve, reject) => {
+async function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
+  return new Promise(async (resolve, reject) => {
     let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
     let binaryName;
 
     if (process.platform === "win32") {
-      // Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
       binaryName = "win-start.bat";
     } else if (process.platform === "darwin") {
-      // Mac OS platform
       if (process.arch === "arm64") {
         binaryFolder = path.join(binaryFolder, "mac-arm64");
       } else {
@@ -183,15 +188,13 @@ async function spawnNitroProcess(): Promise<void> {
       }
       binaryName = "nitro";
     } else {
-      // Linux
-      // Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
-      binaryName = "linux-start.sh"; // For other platforms
+      binaryName = "linux-start.sh";
     }
 
     const binaryPath = path.join(binaryFolder, binaryName);
 
     // Execute the binary
-    subprocess = spawn(binaryPath, [1, "127.0.0.1", PORT], {
+    subprocess = spawn(binaryPath, [1, LOCAL_HOST, PORT], {
       cwd: binaryFolder,
     });
 
@@ -211,7 +214,7 @@ async function spawnNitroProcess(): Promise<void> {
       reject(`Nitro process exited. ${code ?? ""}`);
     });
     tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
-      resolve();
+      resolve(nitroResourceProbe);
     });
   });
 }
@@ -263,17 +266,30 @@ function validateModelVersion(): Promise {
   });
 }
 
-/**
- * Cleans up any registered resources.
- * Its module specific function, should be called when application is closed
- */
 function dispose() {
   // clean other registered resources here
   killSubprocess();
 }
 
+/**
+ * Get the system resources information
+ */
+async function getResourcesInfo(): Promise<ResourcesInfo> {
+  return new Promise(async (resolve) => {
+    const cpu = await si.cpu();
+    const mem = await si.mem();
+
+    const response = {
+      numCpuPhysicalCore: cpu.physicalCores,
+      memAvailable: mem.available,
+    };
+    resolve(response);
+  });
+}
+
 module.exports = {
   initModel,
+  stopModel,
   killSubprocess,
   dispose,
 };
diff --git a/extensions/inference-openai-extension/src/index.ts b/extensions/inference-openai-extension/src/index.ts
index 7e3e6e71e..6bab563dd 100644
--- a/extensions/inference-openai-extension/src/index.ts
+++ b/extensions/inference-openai-extension/src/index.ts
@@ -12,7 +12,6 @@ import {
   EventName,
   MessageRequest,
   MessageStatus,
-  ModelSettingParams,
   ExtensionType,
   ThreadContent,
   ThreadMessage,
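For context, a minimal sketch of the probe-and-configure flow the module.ts changes introduce, assuming only the `systeminformation` calls used above (`si.cpu().physicalCores`, `si.mem().available`); the helper names `probeResources` and `buildNitroSettings` are illustrative and not part of this diff.

```typescript
// Sketch only: probe physical cores with systeminformation, then fold the
// result into the Nitro model settings, mirroring initModel above.
import si from "systeminformation";

interface ResourcesInfo {
  numCpuPhysicalCore: number;
  memAvailable: number;
}

async function probeResources(): Promise<ResourcesInfo> {
  const cpu = await si.cpu(); // cpu.physicalCores: number
  const mem = await si.mem(); // mem.available: bytes
  return { numCpuPhysicalCore: cpu.physicalCores, memAvailable: mem.available };
}

async function buildNitroSettings(
  modelPath: string,
  modelSettings: Record<string, unknown>
) {
  const probe = await probeResources();
  return {
    llama_model_path: modelPath,
    ...modelSettings,
    // Same default as the diff: one thread per physical core.
    cpu_threads: probe.numCpuPhysicalCore,
  };
}
```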