const fs = require("fs");
const fsPromises = fs.promises;
const path = require("path");
const { exec, spawn } = require("child_process");
const tcpPortUsed = require("tcp-port-used");
const fetchRetry = require("fetch-retry")(global.fetch);
const osUtils = require("os-utils");
const { readFileSync, writeFileSync, existsSync } = require("fs");

// The port for the Nitro subprocess to listen on
const PORT = 3928;
const LOCAL_HOST = "127.0.0.1";
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${PORT}`;
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/loadmodel`;
const NITRO_HTTP_UNLOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/unloadModel`;
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/modelstatus`;
const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`;
const SUPPORTED_MODEL_FORMAT = ".gguf";

// Jan settings file where NVIDIA/CUDA detection results are persisted
const NVIDIA_INFO_FILE = path.join(
  require("os").homedir(),
  "jan",
  "settings",
  "settings.json"
);

const DEFAULT_SETTINGS = {
  notify: true,
  run_mode: "cpu",
  nvidia_driver: {
    exist: false,
    version: "",
  },
  cuda: {
    exist: false,
    version: "",
  },
  gpus: [],
  gpu_highest_vram: "",
};

// The subprocess instance for Nitro
let subprocess = undefined;
let currentModelFile: string = undefined;
let currentSettings = undefined;

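// ModelOperationResponse and ResourcesInfo are referenced below but not
// declared in this file; these are assumed shapes inferred from how the
// functions here use them, and the project's real declarations may differ.
interface ModelOperationResponse {
  error?: any;
  currentModelFile?: string;
}

interface ResourcesInfo {
  numCpuPhysicalCore: number;
  memAvailable: number;
}
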
/**
 * Stops the Nitro subprocess by delegating to killSubprocess.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
function stopModel(): Promise<void> {
  return killSubprocess();
}

/**
 * Validates the NVIDIA driver on Linux and Windows by querying nvidia-smi,
 * then persists the detected driver version to the settings file.
 */
function updateNvidiaDriverInfo(): Promise<void> {
  // Wrap exec in a promise so callers can await completion of the callback
  return new Promise((resolve) => {
    exec(
      "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
      (error, stdout) => {
        let data;
        try {
          data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf8"));
        } catch (err) {
          // Fall back to a fresh copy so the shared defaults are never mutated
          data = structuredClone(DEFAULT_SETTINGS);
        }

        if (!error) {
          const firstLine = stdout.split("\n")[0].trim();
          data["nvidia_driver"].exist = true;
          data["nvidia_driver"].version = firstLine;
        } else {
          data["nvidia_driver"].exist = false;
        }

        writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
        resolve();
      }
    );
  });
}

/**
 * Checks whether a file exists in any of the given directories.
 */
function checkFileExistenceInPaths(file: string, paths: string[]): boolean {
  return paths.some((p) => existsSync(path.join(p, file)));
}

/**
 * Checks whether the CUDA 12 runtime libraries are present on the system
 * or bundled with the extension, and records the result in the settings file.
 */
function updateCudaExistence(): void {
  let files: string[];
  let paths: string[];

  if (process.platform === "win32") {
    files = ["cublas64_12.dll", "cudart64_12.dll", "cublasLt64_12.dll"];
    paths = process.env.PATH ? process.env.PATH.split(path.delimiter) : [];
    const nitro_cuda_path = path.join(__dirname, "bin", "win-cuda");
    paths.push(nitro_cuda_path);
  } else {
    files = ["libcudart.so.12", "libcublas.so.12", "libcublasLt.so.12"];
    paths = process.env.LD_LIBRARY_PATH
      ? process.env.LD_LIBRARY_PATH.split(path.delimiter)
      : [];
    const nitro_cuda_path = path.join(__dirname, "bin", "linux-cuda");
    paths.push(nitro_cuda_path);
    paths.push("/usr/lib/x86_64-linux-gnu/");
  }

  const cudaExists = files.every(
    (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
  );

  let data;
  try {
    data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf8"));
  } catch (err) {
    data = structuredClone(DEFAULT_SETTINGS);
  }

  data["cuda"].exist = cudaExists;
  if (cudaExists) {
    data.run_mode = "gpu";
  }
  writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
}

/**
 * Queries nvidia-smi for the installed GPUs, records each GPU's VRAM,
 * and remembers the GPU with the highest VRAM.
 */
function updateGpuInfo(): Promise<void> {
  return new Promise((resolve) => {
    exec(
      "nvidia-smi --query-gpu=index,memory.total --format=csv,noheader,nounits",
      (error, stdout) => {
        let data;
        try {
          data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf8"));
        } catch (err) {
          data = structuredClone(DEFAULT_SETTINGS);
        }

        if (!error) {
          // Parse the GPU list, tracking the GPU with the highest VRAM
          let highestVram = 0;
          let highestVramId = "0";
          const gpus = stdout
            .trim()
            .split("\n")
            .map((line) => {
              let [id, vram] = line.split(", ");
              vram = vram.replace(/\r/g, "");
              if (parseFloat(vram) > highestVram) {
                highestVram = parseFloat(vram);
                highestVramId = id;
              }
              return { id, vram };
            });

          data["gpus"] = gpus;
          data["gpu_highest_vram"] = highestVramId;
        } else {
          data["gpus"] = [];
        }

        writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
        resolve();
      }
    );
  });
}

/**
 * Refreshes NVIDIA driver, CUDA, and GPU information on non-macOS platforms.
 */
async function updateNvidiaInfo() {
  if (process.platform !== "darwin") {
    await Promise.all([
      updateNvidiaDriverInfo(),
      updateCudaExistence(),
      updateGpuInfo(),
    ]);
  }
}

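// Illustrative settings.json contents after detection on a CUDA-capable
// Linux machine (driver version, VRAM, and GPU id values are hypothetical):
// {
//   "notify": true,
//   "run_mode": "gpu",
//   "nvidia_driver": { "exist": true, "version": "535.129.03" },
//   "cuda": { "exist": true, "version": "" },
//   "gpus": [{ "id": "0", "vram": "24576" }],
//   "gpu_highest_vram": "0"
// }
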
/**
 * Initializes a Nitro subprocess to load a machine learning model.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 * TODO: Should pass the absolute path of the model file instead of just the name - so module.ts can be modularized into an npm package
 * TODO: Should it be startModel instead?
 */
async function initModel(wrapper: any): Promise<ModelOperationResponse> {
  currentModelFile = wrapper.modelFullPath;
  const janRoot = path.join(require("os").homedir(), "jan");
  if (!currentModelFile.includes(janRoot)) {
    currentModelFile = path.join(janRoot, currentModelFile);
  }
  const files: string[] = fs.readdirSync(currentModelFile);

  // Look for the GGUF model file inside the model directory
  const ggufBinFile = files.find(
    (file) =>
      file === path.basename(currentModelFile) ||
      file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT)
  );
  if (!ggufBinFile) {
    return Promise.resolve({ error: "No GGUF model file found" });
  }

  currentModelFile = path.join(currentModelFile, ggufBinFile);

  if (wrapper.model.engine !== "nitro") {
    return Promise.resolve({ error: "Not a nitro model" });
  } else {
    const nitroResourceProbe = await getResourcesInfo();
    // Convert settings.prompt_template into system_prompt, user_prompt, and ai_prompt
    if (wrapper.model.settings.prompt_template) {
      const promptTemplate = wrapper.model.settings.prompt_template;
      const prompt = promptTemplateConverter(promptTemplate);
      if (prompt.error) {
        return Promise.resolve({ error: prompt.error });
      }
      wrapper.model.settings.system_prompt = prompt.system_prompt;
      wrapper.model.settings.user_prompt = prompt.user_prompt;
      wrapper.model.settings.ai_prompt = prompt.ai_prompt;
    }

    currentSettings = {
      llama_model_path: currentModelFile,
      ...wrapper.model.settings,
      // This is critical and requires real system information
      cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
    };
    return loadModel(nitroResourceProbe);
  }
}

/**
 * Kills any running Nitro subprocess, spawns a fresh one, loads the model,
 * and validates that the model actually loaded.
 */
async function loadModel(nitroResourceProbe: any | undefined) {
  // Gather system information for CPU physical cores and memory
  if (!nitroResourceProbe) nitroResourceProbe = await getResourcesInfo();
  return (
    killSubprocess()
      .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
      // Wait 500ms on Windows to make sure the port is actually free
      .then(() => {
        if (process.platform === "win32") {
          return sleep(500);
        } else {
          return sleep(0);
        }
      })
      .then(() => spawnNitroProcess(nitroResourceProbe))
      .then(() => loadLLMModel(currentSettings))
      .then(validateModelStatus)
      .catch((err) => {
        console.error("error: ", err);
        // TODO: Broadcast error so the app can display a proper error message
        return { error: err, currentModelFile };
      })
  );
}

// Sleep helper used to give Windows time to release the port
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Splits a prompt template around the {system_message} and {prompt} markers
 * into system_prompt, user_prompt, and ai_prompt segments.
 */
function promptTemplateConverter(promptTemplate: string) {
  // Split the string using the markers
  const systemMarker = "{system_message}";
  const promptMarker = "{prompt}";

  if (
    promptTemplate.includes(systemMarker) &&
    promptTemplate.includes(promptMarker)
  ) {
    // Find the indices of the markers
    const systemIndex = promptTemplate.indexOf(systemMarker);
    const promptIndex = promptTemplate.indexOf(promptMarker);

    // Extract the parts of the string
    const system_prompt = promptTemplate.substring(0, systemIndex);
    const user_prompt = promptTemplate.substring(
      systemIndex + systemMarker.length,
      promptIndex
    );
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  } else if (promptTemplate.includes(promptMarker)) {
    // Extract the parts of the string for the case where only promptMarker is present
    const promptIndex = promptTemplate.indexOf(promptMarker);
    const user_prompt = promptTemplate.substring(0, promptIndex);
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );
    const system_prompt = "";

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  }

  // Return an error if none of the conditions are met
  return { error: "Cannot split prompt template" };
}

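// Worked example (hypothetical template, traced through the logic above):
//   promptTemplateConverter("### System: {system_message} ### User: {prompt} ### Assistant:")
// returns:
//   system_prompt: "### System: "
//   user_prompt:   " ### User: "
//   ai_prompt:     " ### Assistant:"
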
/**
 * Loads a LLM model into the Nitro subprocess by sending an HTTP POST request.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 */
function loadLLMModel(settings: any): Promise<Response> {
  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 500,
  });
}

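// Illustrative JSON body for the loadmodel request, as assembled in
// initModel; the path and thread count are hypothetical, and any other
// model settings are spread into the payload as well:
// {
//   "llama_model_path": "/home/user/jan/models/my-model/model.gguf",
//   "system_prompt": "### System: ",
//   "user_prompt": " ### User: ",
//   "ai_prompt": " ### Assistant:",
//   "cpu_threads": 8
// }
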
/**
 * Validates the status of a model.
 * @returns {Promise<ModelOperationResponse>} A promise that resolves to an object.
 * If the model is loaded successfully, the object's error field is undefined.
 * If the model is not loaded successfully, the object contains an error message.
 */
async function validateModelStatus(): Promise<ModelOperationResponse> {
  // Send a GET request to the validation URL.
  // Retry the request up to 5 times if it fails, with a delay of 500 milliseconds between retries.
  return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: "GET",
    headers: {
      "Content-Type": "application/json",
    },
    retries: 5,
    retryDelay: 500,
  }).then(async (res: Response) => {
    // If the response is OK, check the model_loaded status.
    if (res.ok) {
      const body = await res.json();
      // If the model is loaded, return an object with an undefined error.
      // Otherwise, return an object with an error message.
      if (body.model_loaded) {
        return { error: undefined };
      }
    }
    return { error: "Model loading failed" };
  });
}

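// Illustrative /modelstatus response body that the check above reads; only
// the model_loaded field is consumed here, other fields may be present:
// { "model_loaded": true }
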
/**
 * Terminates the Nitro subprocess.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
async function killSubprocess(): Promise<void> {
  // Give the HTTP kill request at most 5 seconds before aborting it
  const controller = new AbortController();
  setTimeout(() => controller.abort(), 5000);
  console.debug("Start requesting to kill Nitro...");
  return fetch(NITRO_HTTP_KILL_URL, {
    method: "DELETE",
    signal: controller.signal,
  })
    .then(() => {
      subprocess?.kill();
      subprocess = undefined;
    })
    .catch(() => {})
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => console.debug("Nitro is killed"));
}

/**
 * Spawns a Nitro subprocess.
 * Looks up the platform-specific Nitro binary under bin/ (win-cpu, win-cuda,
 * mac-arm64, mac-x64, linux-cpu, or linux-cuda) and executes it with
 * child_process.spawn.
 * @param nitroResourceProbe - The Nitro resource probe.
 * @returns A promise that resolves when the Nitro subprocess is started.
 */
function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
  console.debug("Starting Nitro subprocess...");
  return new Promise((resolve, reject) => {
    let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
    let cudaVisibleDevices = "";
    let binaryName;
    if (process.platform === "win32") {
      const nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf8"));
      if (nvidiaInfo["run_mode"] === "cpu") {
        binaryFolder = path.join(binaryFolder, "win-cpu");
      } else {
        binaryFolder = path.join(binaryFolder, "win-cuda");
        cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
      }
      binaryName = "nitro.exe";
    } else if (process.platform === "darwin") {
      if (process.arch === "arm64") {
        binaryFolder = path.join(binaryFolder, "mac-arm64");
      } else {
        binaryFolder = path.join(binaryFolder, "mac-x64");
      }
      binaryName = "nitro";
    } else {
      const nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf8"));
      if (nvidiaInfo["run_mode"] === "cpu") {
        binaryFolder = path.join(binaryFolder, "linux-cpu");
      } else {
        binaryFolder = path.join(binaryFolder, "linux-cuda");
        cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
      }
      binaryName = "nitro";
    }

    const binaryPath = path.join(binaryFolder, binaryName);
    // Execute the binary; spawn requires string arguments
    subprocess = spawn(binaryPath, ["1", LOCAL_HOST, String(PORT)], {
      cwd: binaryFolder,
      env: {
        ...process.env,
        CUDA_VISIBLE_DEVICES: cudaVisibleDevices,
      },
    });

    // Handle subprocess output
    subprocess.stdout.on("data", (data) => {
      console.debug(`stdout: ${data}`);
    });

    subprocess.stderr.on("data", (data) => {
      console.error(`stderr: ${data}`);
    });

    subprocess.on("close", (code) => {
      console.debug(`child process exited with code ${code}`);
      subprocess = null;
      reject(`child process exited with code ${code}`);
    });

    // Resolve once the server is actually listening on the port
    tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
      resolve(nitroResourceProbe);
    }, reject);
  });
}

/**
 * Get the system resources information.
 * TODO: Move to Core so that it can be reused
 */
async function getResourcesInfo(): Promise<ResourcesInfo> {
  // Note: os-utils cpuCount() reports logical cores, not physical ones
  const cpu = osUtils.cpuCount();
  console.debug("cpu: ", cpu);
  return {
    numCpuPhysicalCore: cpu,
    memAvailable: 0,
  };
}

function dispose() {
  // Clean other registered resources here
  killSubprocess();
}

module.exports = {
  initModel,
  stopModel,
  killSubprocess,
  dispose,
  updateNvidiaInfo,
};
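
// Illustrative usage from a caller; the wrapper shape is hypothetical,
// inferred from the fields initModel reads:
//
//   const nitro = require("./module");
//   await nitro.updateNvidiaInfo();
//   const res = await nitro.initModel({
//     modelFullPath: "models/my-model",
//     model: {
//       engine: "nitro",
//       settings: { prompt_template: "{system_message}{prompt}" },
//     },
//   });
//   if (res?.error) console.error(res.error);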