Merge pull request #931 from janhq/fix/nitro_perf_cpu
feat: Hotfix for Nitro loading on CPU with hyper-threading support
This commit is contained in:
commit
fef97f6736
@ -275,6 +275,7 @@ export type ModelSettingParams = {
|
|||||||
ngl?: number;
|
ngl?: number;
|
||||||
embedding?: boolean;
|
embedding?: boolean;
|
||||||
n_parallel?: number;
|
n_parallel?: number;
|
||||||
|
cpu_threads?: number;
|
||||||
system_prompt?: string;
|
system_prompt?: string;
|
||||||
user_prompt?: string;
|
user_prompt?: string;
|
||||||
ai_prompt?: string;
|
ai_prompt?: string;
|
||||||
|
|||||||
@ -36,6 +36,7 @@
|
|||||||
"kill-port": "^2.0.1",
|
"kill-port": "^2.0.1",
|
||||||
"path-browserify": "^1.0.1",
|
"path-browserify": "^1.0.1",
|
||||||
"rxjs": "^7.8.1",
|
"rxjs": "^7.8.1",
|
||||||
|
"systeminformation": "^5.21.20",
|
||||||
"tcp-port-used": "^1.0.2",
|
"tcp-port-used": "^1.0.2",
|
||||||
"ts-loader": "^9.5.0",
|
"ts-loader": "^9.5.0",
|
||||||
"ulid": "^2.3.0"
|
"ulid": "^2.3.0"
|
||||||
@ -52,6 +53,7 @@
|
|||||||
"tcp-port-used",
|
"tcp-port-used",
|
||||||
"kill-port",
|
"kill-port",
|
||||||
"fetch-retry",
|
"fetch-retry",
|
||||||
"electron-log"
|
"electron-log",
|
||||||
|
"systeminformation"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@ -12,6 +12,7 @@ declare const INFERENCE_URL: string;
|
|||||||
interface EngineSettings {
|
interface EngineSettings {
|
||||||
ctx_len: number;
|
ctx_len: number;
|
||||||
ngl: number;
|
ngl: number;
|
||||||
|
cpu_threads: number;
|
||||||
cont_batching: boolean;
|
cont_batching: boolean;
|
||||||
embedding: boolean;
|
embedding: boolean;
|
||||||
}
|
}
|
||||||
@ -24,3 +25,8 @@ interface ModelOperationResponse {
|
|||||||
error?: any;
|
error?: any;
|
||||||
modelFile?: string;
|
modelFile?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface ResourcesInfo {
|
||||||
|
numCpuPhysicalCore: number;
|
||||||
|
memAvailable: number;
|
||||||
|
}
|
||||||
@ -12,7 +12,6 @@ import {
|
|||||||
EventName,
|
EventName,
|
||||||
MessageRequest,
|
MessageRequest,
|
||||||
MessageStatus,
|
MessageStatus,
|
||||||
ModelSettingParams,
|
|
||||||
ExtensionType,
|
ExtensionType,
|
||||||
ThreadContent,
|
ThreadContent,
|
||||||
ThreadMessage,
|
ThreadMessage,
|
||||||
@ -41,6 +40,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
|
|||||||
private static _engineSettings: EngineSettings = {
|
private static _engineSettings: EngineSettings = {
|
||||||
ctx_len: 2048,
|
ctx_len: 2048,
|
||||||
ngl: 100,
|
ngl: 100,
|
||||||
|
cpu_threads: 1,
|
||||||
cont_batching: false,
|
cont_batching: false,
|
||||||
embedding: false,
|
embedding: false,
|
||||||
};
|
};
|
||||||
|
|||||||
@ -4,6 +4,7 @@ const path = require("path");
|
|||||||
const { spawn } = require("child_process");
|
const { spawn } = require("child_process");
|
||||||
const tcpPortUsed = require("tcp-port-used");
|
const tcpPortUsed = require("tcp-port-used");
|
||||||
const fetchRetry = require("fetch-retry")(global.fetch);
|
const fetchRetry = require("fetch-retry")(global.fetch);
|
||||||
|
const si = require("systeminformation");
|
||||||
|
|
||||||
const log = require("electron-log");
|
const log = require("electron-log");
|
||||||
|
|
||||||
@ -38,15 +39,21 @@ function stopModel(): Promise<ModelOperationResponse> {
|
|||||||
* TODO: Should pass the absolute path of the model file instead of just the name - so we can modularize module.ts into an npm package
|
* TODO: Should pass the absolute path of the model file instead of just the name - so we can modularize module.ts into an npm package
|
||||||
* TODO: Should it be startModel instead?
|
* TODO: Should it be startModel instead?
|
||||||
*/
|
*/
|
||||||
function initModel(wrapper: any): Promise<ModelOperationResponse> {
|
async function initModel(wrapper: any): Promise<ModelOperationResponse> {
|
||||||
currentModelFile = wrapper.modelFullPath;
|
currentModelFile = wrapper.modelFullPath;
|
||||||
if (wrapper.model.engine !== "nitro") {
|
if (wrapper.model.engine !== "nitro") {
|
||||||
return Promise.resolve({ error: "Not a nitro model" });
|
return Promise.resolve({ error: "Not a nitro model" });
|
||||||
} else {
|
} else {
|
||||||
log.info("Started to load model " + wrapper.model.modelFullPath);
|
// Gather system information for CPU physical cores and memory
|
||||||
|
const nitroResourceProbe = await getResourcesInfo();
|
||||||
|
console.log(
|
||||||
|
"Nitro with physical core: " + nitroResourceProbe.numCpuPhysicalCore
|
||||||
|
);
|
||||||
const settings = {
|
const settings = {
|
||||||
llama_model_path: currentModelFile,
|
llama_model_path: currentModelFile,
|
||||||
...wrapper.model.settings,
|
...wrapper.model.settings,
|
||||||
|
// This is critical and requires real system information
|
||||||
|
cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
|
||||||
};
|
};
|
||||||
log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
|
log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
|
||||||
return (
|
return (
|
||||||
@ -54,7 +61,7 @@ function initModel(wrapper: any): Promise<ModelOperationResponse> {
|
|||||||
validateModelVersion()
|
validateModelVersion()
|
||||||
.then(checkAndUnloadNitro)
|
.then(checkAndUnloadNitro)
|
||||||
// 2. Spawn the Nitro subprocess
|
// 2. Spawn the Nitro subprocess
|
||||||
.then(spawnNitroProcess)
|
.then(await spawnNitroProcess(nitroResourceProbe))
|
||||||
// 4. Load the model into the Nitro subprocess (HTTP POST request)
|
// 4. Load the model into the Nitro subprocess (HTTP POST request)
|
||||||
.then(() => loadLLMModel(settings))
|
.then(() => loadLLMModel(settings))
|
||||||
// 5. Check if the model is loaded successfully
|
// 5. Check if the model is loaded successfully
|
||||||
@ -166,16 +173,14 @@ async function checkAndUnloadNitro() {
|
|||||||
* Using child-process to spawn the process
|
* Using child-process to spawn the process
|
||||||
* Should run exactly platform specified Nitro binary version
|
* Should run exactly platform specified Nitro binary version
|
||||||
*/
|
*/
|
||||||
async function spawnNitroProcess(): Promise<void> {
|
async function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise(async (resolve, reject) => {
|
||||||
let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
|
let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
|
||||||
let binaryName;
|
let binaryName;
|
||||||
|
|
||||||
if (process.platform === "win32") {
|
if (process.platform === "win32") {
|
||||||
// Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
|
|
||||||
binaryName = "win-start.bat";
|
binaryName = "win-start.bat";
|
||||||
} else if (process.platform === "darwin") {
|
} else if (process.platform === "darwin") {
|
||||||
// Mac OS platform
|
|
||||||
if (process.arch === "arm64") {
|
if (process.arch === "arm64") {
|
||||||
binaryFolder = path.join(binaryFolder, "mac-arm64");
|
binaryFolder = path.join(binaryFolder, "mac-arm64");
|
||||||
} else {
|
} else {
|
||||||
@ -183,15 +188,13 @@ async function spawnNitroProcess(): Promise<void> {
|
|||||||
}
|
}
|
||||||
binaryName = "nitro";
|
binaryName = "nitro";
|
||||||
} else {
|
} else {
|
||||||
// Linux
|
binaryName = "linux-start.sh";
|
||||||
// Todo: Need to check for CUDA support to switch between CUDA and non-CUDA binaries
|
|
||||||
binaryName = "linux-start.sh"; // For other platforms
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const binaryPath = path.join(binaryFolder, binaryName);
|
const binaryPath = path.join(binaryFolder, binaryName);
|
||||||
|
|
||||||
// Execute the binary
|
// Execute the binary
|
||||||
subprocess = spawn(binaryPath, [1, "127.0.0.1", PORT], {
|
subprocess = spawn(binaryPath, [1, LOCAL_HOST, PORT], {
|
||||||
cwd: binaryFolder,
|
cwd: binaryFolder,
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -211,7 +214,7 @@ async function spawnNitroProcess(): Promise<void> {
|
|||||||
reject(`Nitro process exited. ${code ?? ""}`);
|
reject(`Nitro process exited. ${code ?? ""}`);
|
||||||
});
|
});
|
||||||
tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
|
tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
|
||||||
resolve();
|
resolve(nitroResourceProbe);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -263,17 +266,30 @@ function validateModelVersion(): Promise<void> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Cleans up any registered resources.
|
|
||||||
* Its module specific function, should be called when application is closed
|
|
||||||
*/
|
|
||||||
function dispose() {
|
function dispose() {
|
||||||
// clean other registered resources here
|
// clean other registered resources here
|
||||||
killSubprocess();
|
killSubprocess();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the system resources information
|
||||||
|
*/
|
||||||
|
async function getResourcesInfo(): Promise<ResourcesInfo> {
|
||||||
|
return new Promise(async (resolve) => {
|
||||||
|
const cpu = await si.cpu();
|
||||||
|
const mem = await si.mem();
|
||||||
|
|
||||||
|
const response = {
|
||||||
|
numCpuPhysicalCore: cpu.physicalCores,
|
||||||
|
memAvailable: mem.available,
|
||||||
|
};
|
||||||
|
resolve(response);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
initModel,
|
initModel,
|
||||||
|
stopModel,
|
||||||
killSubprocess,
|
killSubprocess,
|
||||||
dispose,
|
dispose,
|
||||||
};
|
};
|
||||||
|
|||||||
@ -12,7 +12,6 @@ import {
|
|||||||
EventName,
|
EventName,
|
||||||
MessageRequest,
|
MessageRequest,
|
||||||
MessageStatus,
|
MessageStatus,
|
||||||
ModelSettingParams,
|
|
||||||
ExtensionType,
|
ExtensionType,
|
||||||
ThreadContent,
|
ThreadContent,
|
||||||
ThreadMessage,
|
ThreadMessage,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user