refactor: introduce node module in nitro extension (#1630)

parent db987e88f9
commit f4f861d0e9
@@ -15,13 +15,6 @@
    "dist"
  ],
  "author": "Jan <service@jan.ai>",
  "repository": {
    "type": "git",
    "url": ""
  },
  "engines": {
    "node": ">=6.0.0"
  },
  "exports": {
    ".": "./dist/core.umd.js",
    "./sdk": "./dist/core.umd.js",
@@ -49,53 +42,6 @@
    "build": "tsc --module commonjs && rollup -c rollup.config.ts",
    "start": "rollup -c rollup.config.ts -w"
  },
  "lint-staged": {
    "{src,test}/**/*.ts": [
      "prettier --write",
      "git add"
    ]
  },
  "config": {
    "commitizen": {
      "path": "node_modules/cz-conventional-changelog"
    }
  },
  "jest": {
    "transform": {
      ".(ts|tsx)": "ts-jest"
    },
    "testEnvironment": "node",
    "testRegex": "(/__tests__/.*|\\.(test|spec))\\.(ts|tsx|js)$",
    "moduleFileExtensions": [
      "ts",
      "tsx",
      "js"
    ],
    "coveragePathIgnorePatterns": [
      "/node_modules/",
      "/test/"
    ],
    "coverageThreshold": {
      "global": {
        "branches": 90,
        "functions": 95,
        "lines": 95,
        "statements": 95
      }
    },
    "collectCoverageFrom": [
      "src/*.{js,ts}"
    ]
  },
  "prettier": {
    "semi": false,
    "singleQuote": true
  },
  "commitlint": {
    "extends": [
      "@commitlint/config-conventional"
    ]
  },
  "devDependencies": {
    "@types/node": "^12.0.2",
    "rollup": "^2.38.5",
@@ -104,7 +50,6 @@
    "rollup-plugin-node-resolve": "^5.2.0",
    "rollup-plugin-sourcemaps": "^0.6.3",
    "rollup-plugin-typescript2": "^0.36.0",
    "ts-node": "^7.0.1",
    "tslib": "^2.6.2",
    "typescript": "^5.2.2"
  }
@@ -104,6 +104,9 @@ export type ModelSettingParams = {
  n_parallel?: number
  cpu_threads?: number
  prompt_template?: string
  system_prompt?: string
  ai_prompt?: string
  user_prompt?: string
}

/**
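The new optional fields let a model definition carry a raw prompt_template alongside its pre-split parts. A hedged sketch of a value using these fields (the template string and thread count are illustrative, not taken from the commit):

// Illustrative ModelSettingParams value exercising the new prompt fields.
const settings: ModelSettingParams = {
  n_parallel: 1,
  cpu_threads: 4,
  prompt_template: "{system_message} [INST] {prompt} [/INST]",
  // Derived from prompt_template at load time by promptTemplateConverter
  // in src/node/index.ts (added later in this diff):
  system_prompt: "",
  user_prompt: " [INST] ",
  ai_prompt: " [/INST]",
}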
@@ -3,11 +3,11 @@
  "version": "1.0.0",
  "description": "This extension embeds Nitro, a lightweight (3mb) inference engine written in C++. See nitro.jan.ai",
  "main": "dist/index.js",
  "module": "dist/module.js",
  "node": "dist/node/index.cjs.js",
  "author": "Jan <service@jan.ai>",
  "license": "AGPL-3.0",
  "scripts": {
    "build": "tsc -b . && webpack --config webpack.config.js",
    "build": "tsc --module commonjs && rollup -c rollup.config.ts",
    "downloadnitro:linux": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/nitro",
    "downloadnitro:darwin": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-arm64.tar.gz -e --strip 1 -o ./bin/mac-arm64 && chmod +x ./bin/mac-arm64/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-amd64.tar.gz -e --strip 1 -o ./bin/mac-x64 && chmod +x ./bin/mac-x64/nitro",
    "downloadnitro:win32": "download.bat",
@@ -19,24 +19,33 @@
  },
  "exports": {
    ".": "./dist/index.js",
    "./main": "./dist/module.js"
    "./main": "./dist/node/index.cjs.js"
  },
  "devDependencies": {
    "@rollup/plugin-commonjs": "^25.0.7",
    "@rollup/plugin-json": "^6.1.0",
    "@rollup/plugin-node-resolve": "^15.2.3",
    "@types/node": "^20.11.4",
    "@types/tcp-port-used": "^1.0.4",
    "cpx": "^1.5.0",
    "download-cli": "^1.1.1",
    "rimraf": "^3.0.2",
    "rollup": "^2.38.5",
    "rollup-plugin-define": "^1.0.1",
    "rollup-plugin-sourcemaps": "^0.6.3",
    "rollup-plugin-typescript2": "^0.36.0",
    "run-script-os": "^1.1.6",
    "webpack": "^5.88.2",
    "webpack-cli": "^5.1.4"
    "typescript": "^5.3.3"
  },
  "dependencies": {
    "@janhq/core": "file:../../core",
    "download-cli": "^1.1.1",
    "@rollup/plugin-replace": "^5.0.5",
    "@types/os-utils": "^0.0.4",
    "fetch-retry": "^5.0.6",
    "os-utils": "^0.0.14",
    "path-browserify": "^1.0.1",
    "rxjs": "^7.8.1",
    "tcp-port-used": "^1.0.2",
    "ts-loader": "^9.5.0",
    "ulid": "^2.3.0"
  },
  "engines": {
extensions/inference-nitro-extension/rollup.config.ts (new file, 77 lines)
@@ -0,0 +1,77 @@
import resolve from "@rollup/plugin-node-resolve";
import commonjs from "@rollup/plugin-commonjs";
import sourceMaps from "rollup-plugin-sourcemaps";
import typescript from "rollup-plugin-typescript2";
import json from "@rollup/plugin-json";
import replace from "@rollup/plugin-replace";
const packageJson = require("./package.json");

const pkg = require("./package.json");

export default [
  {
    input: `src/index.ts`,
    output: [{ file: pkg.main, format: "es", sourcemap: true }],
    // Indicate here external modules you don't want to include in your bundle (i.e.: 'lodash')
    external: [],
    watch: {
      include: "src/**",
    },
    plugins: [
      replace({
        NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
        INFERENCE_URL: JSON.stringify(
          process.env.INFERENCE_URL ||
            "http://127.0.0.1:3928/inferences/llamacpp/chat_completion"
        ),
        TROUBLESHOOTING_URL: JSON.stringify(
          "https://jan.ai/guides/troubleshooting"
        ),
      }),
      // Allow json resolution
      json(),
      // Compile TypeScript files
      typescript({ useTsconfigDeclarationDir: true }),
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
      // Allow node_modules resolution, so you can use 'external' to control
      // which external modules to include in the bundle
      // https://github.com/rollup/rollup-plugin-node-resolve#usage
      resolve({
        extensions: [".js", ".ts", ".svelte"],
      }),

      // Resolve source maps to the original source
      sourceMaps(),
    ],
  },
  {
    input: `src/node/index.ts`,
    output: [
      { file: "dist/node/index.cjs.js", format: "cjs", sourcemap: true },
    ],
    // Indicate here external modules you don't want to include in your bundle (i.e.: 'lodash')
    external: ["@janhq/core/node"],
    watch: {
      include: "src/node/**",
    },
    plugins: [
      // Allow json resolution
      json(),
      // Compile TypeScript files
      typescript({ useTsconfigDeclarationDir: true }),
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
      // Allow node_modules resolution, so you can use 'external' to control
      // which external modules to include in the bundle
      // https://github.com/rollup/rollup-plugin-node-resolve#usage
      resolve({
        extensions: [".ts", ".js", ".json"],
      }),

      // Resolve source maps to the original source
      sourceMaps(),
    ],
  },
];
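At build time, @rollup/plugin-replace rewrites the bare identifiers NODE, INFERENCE_URL and TROUBLESHOOTING_URL into string literals, taking over the job webpack.DefinePlugin did in the deleted webpack.config.js at the end of this diff. A sketch of the effect on the renderer bundle, assuming the package is named "@janhq/inference-nitro-extension" (the name field is not shown in this diff):

// In src/index.ts the call is written against the bare identifier:
await executeOnMain(NODE, "updateNvidiaInfo", {});

// After the replace() pass, dist/index.js roughly contains
// `${packageJson.name}/${packageJson.node}` inlined as a literal:
await executeOnMain(
  "@janhq/inference-nitro-extension/dist/node/index.cjs.js",
  "updateNvidiaInfo",
  {}
);

The `declare const NODE: string` change in the next hunk is what lets the TypeScript compiler accept the identifier before it is replaced.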
@@ -1,4 +1,4 @@
declare const MODULE: string;
declare const NODE: string;
declare const INFERENCE_URL: string;
declare const TROUBLESHOOTING_URL: string;
@@ -26,7 +26,6 @@ import {
} from "@janhq/core";
import { requestInference } from "./helpers/sse";
import { ulid } from "ulid";
import { join } from "path";

/**
 * A class that implements the InferenceExtension interface from the @janhq/core package.
@@ -43,7 +42,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
   */
  private static readonly _intervalHealthCheck = 5 * 1000;

  private _currentModel: Model;
  private _currentModel: Model | undefined;

  private _engineSettings: EngineSettings = {
    ctx_len: 2048,
@@ -82,7 +81,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    if (!(await fs.existsSync(JanInferenceNitroExtension._homeDir))) {
      await fs
        .mkdirSync(JanInferenceNitroExtension._homeDir)
        .catch((err) => console.debug(err));
        .catch((err: Error) => console.debug(err));
    }

    if (!(await fs.existsSync(JanInferenceNitroExtension._settingsDir)))
@@ -90,7 +89,9 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    this.writeDefaultEngineSettings();

    // Events subscription
    events.on(EventName.OnMessageSent, (data) => this.onMessageRequest(data));
    events.on(EventName.OnMessageSent, (data: MessageRequest) =>
      this.onMessageRequest(data)
    );

    events.on(EventName.OnModelInit, (model: Model) => this.onModelInit(model));

@@ -99,7 +100,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    events.on(EventName.OnInferenceStopped, () => this.onInferenceStopped());

    // Attempt to fetch nvidia info
    await executeOnMain(MODULE, "updateNvidiaInfo", {});
    await executeOnMain(NODE, "updateNvidiaInfo", {});
  }

  /**
@@ -109,10 +110,10 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
  private async writeDefaultEngineSettings() {
    try {
      const engineFile = join(
      const engineFile = await joinPath([
        JanInferenceNitroExtension._homeDir,
        JanInferenceNitroExtension._engineMetadataFileName
      );
        JanInferenceNitroExtension._engineMetadataFileName,
      ]);
      if (await fs.existsSync(engineFile)) {
        const engine = await fs.readFileSync(engineFile, "utf-8");
        this._engineSettings =
@@ -133,12 +134,12 @@ export default class JanInferenceNitroExtension implements InferenceExtension {

    const modelFullPath = await joinPath(["models", model.id]);

    const nitroInitResult = await executeOnMain(MODULE, "initModel", {
      modelFullPath: modelFullPath,
      model: model,
    const nitroInitResult = await executeOnMain(NODE, "runModel", {
      modelFullPath,
      model,
    });

    if (nitroInitResult.error === null) {
    if (nitroInitResult?.error) {
      events.emit(EventName.OnModelFail, model);
      return;
    }
@@ -155,12 +156,11 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
  private async onModelStop(model: Model) {
    if (model.engine !== "nitro") return;

    await executeOnMain(MODULE, "stopModel");
    await executeOnMain(NODE, "stopModel");
    events.emit(EventName.OnModelStopped, {});

    // stop the periodic health check
    if (this.getNitroProcesHealthIntervalId) {
      console.debug("Stop calling Nitro process health check");
      clearInterval(this.getNitroProcesHealthIntervalId);
      this.getNitroProcesHealthIntervalId = undefined;
    }
@@ -170,7 +170,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
   * Periodically check for nitro process's health.
   */
  private async periodicallyGetNitroHealth(): Promise<void> {
    const health = await executeOnMain(MODULE, "getCurrentNitroProcessInfo");
    const health = await executeOnMain(NODE, "getCurrentNitroProcessInfo");

    const isRunning = this.nitroProcessInfo?.isRunning ?? false;
    if (isRunning && health.isRunning === false) {
@@ -204,6 +204,8 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    };

    return new Promise(async (resolve, reject) => {
      if (!this._currentModel) return Promise.reject("No model loaded");

      requestInference(data.messages ?? [], this._currentModel).subscribe({
        next: (_content) => {},
        complete: async () => {
@@ -223,7 +225,9 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
   * @param {MessageRequest} data - The data for the new message request.
   */
  private async onMessageRequest(data: MessageRequest) {
    if (data.model.engine !== "nitro") return;
    if (data.model?.engine !== InferenceEngine.nitro || !this._currentModel) {
      return;
    }

    const timestamp = Date.now();
    const message: ThreadMessage = {
@@ -242,11 +246,12 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
    this.isCancelled = false;
    this.controller = new AbortController();

    requestInference(
      data.messages ?? [],
      { ...this._currentModel, ...data.model },
      this.controller
    ).subscribe({
    // @ts-ignore
    const model: Model = {
      ...(this._currentModel || {}),
      ...(data.model || {}),
    };
    requestInference(data.messages ?? [], model, this.controller).subscribe({
      next: (content) => {
        const messageContent: ThreadContent = {
          type: ContentType.Text,
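The renderer half of the extension never touches the Nitro binary directly: everything process-related is funneled through executeOnMain(NODE, method, args), which asks the Electron main process to load the CommonJS bundle named by the NODE constant and call the matching export. A rough sketch of the contract, assuming the main process resolves the module's default export (the real dispatch lives in @janhq/core and is not shown in this diff):

// Hypothetical sketch of the main-process side, not the @janhq/core code.
async function executeOnMainSketch(
  modulePath: string, // the build-time NODE constant
  method: string, // e.g. "runModel", "stopModel", "getCurrentNitroProcessInfo"
  args: unknown // serialized across the IPC boundary
): Promise<any> {
  const nodeModule = require(modulePath); // loaded in the main process
  return nodeModule.default[method](args); // matches the `export default` object
}

This is why the new src/node/index.ts (below) exports a plain object of functions as its default, while the deleted module.ts (next) used module.exports for the same purpose.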
@@ -1,514 +0,0 @@
const fs = require("fs");
const path = require("path");
const { exec, spawn } = require("child_process");
const tcpPortUsed = require("tcp-port-used");
const fetchRetry = require("fetch-retry")(global.fetch);
const osUtils = require("os-utils");
const { readFileSync, writeFileSync, existsSync } = require("fs");
const { log } = require("@janhq/core/node");

// The PORT to use for the Nitro subprocess
const PORT = 3928;
const LOCAL_HOST = "127.0.0.1";
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${PORT}`;
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/loadmodel`;
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/modelstatus`;
const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`;
const SUPPORTED_MODEL_FORMAT = ".gguf";
const NVIDIA_INFO_FILE = path.join(
  require("os").homedir(),
  "jan",
  "settings",
  "settings.json"
);

// The subprocess instance for Nitro
let subprocess = undefined;
let currentModelFile: string = undefined;
let currentSettings = undefined;

let nitroProcessInfo = undefined;

/**
 * Default GPU settings
 **/
const DEFALT_SETTINGS = {
  notify: true,
  run_mode: "cpu",
  nvidia_driver: {
    exist: false,
    version: "",
  },
  cuda: {
    exist: false,
    version: "",
  },
  gpus: [],
  gpu_highest_vram: "",
};

/**
 * Stops a Nitro subprocess.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
function stopModel(): Promise<void> {
  return killSubprocess();
}

/**
 * Initializes a Nitro subprocess to load a machine learning model.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 * TODO: Should pass the absolute path of the model file instead of just the name - so we can modularize module.ts into an npm package
 * TODO: Should it be startModel instead?
 */
async function initModel(wrapper: any): Promise<ModelOperationResponse> {
  currentModelFile = wrapper.modelFullPath;
  const janRoot = path.join(require("os").homedir(), "jan");
  if (!currentModelFile.includes(janRoot)) {
    currentModelFile = path.join(janRoot, currentModelFile);
  }
  const files: string[] = fs.readdirSync(currentModelFile);

  // Look for GGUF model file
  const ggufBinFile = files.find(
    (file) =>
      file === path.basename(currentModelFile) ||
      file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT)
  );

  currentModelFile = path.join(currentModelFile, ggufBinFile);

  if (wrapper.model.engine !== "nitro") {
    return Promise.resolve({ error: "Not a nitro model" });
  } else {
    const nitroResourceProbe = await getResourcesInfo();
    // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
    if (wrapper.model.settings.prompt_template) {
      const promptTemplate = wrapper.model.settings.prompt_template;
      const prompt = promptTemplateConverter(promptTemplate);
      if (prompt.error) {
        return Promise.resolve({ error: prompt.error });
      }
      wrapper.model.settings.system_prompt = prompt.system_prompt;
      wrapper.model.settings.user_prompt = prompt.user_prompt;
      wrapper.model.settings.ai_prompt = prompt.ai_prompt;
    }

    currentSettings = {
      llama_model_path: currentModelFile,
      ...wrapper.model.settings,
      // This is critical and requires real system information
      cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
    };
    return loadModel(nitroResourceProbe);
  }
}

async function loadModel(nitroResourceProbe: any | undefined) {
  // Gather system information for CPU physical cores and memory
  if (!nitroResourceProbe) nitroResourceProbe = await getResourcesInfo();
  return killSubprocess()
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => {
      /**
       * There is a problem with the Windows process manager
       * Should wait for a while to make sure the port is free and the subprocess is killed
       * The tested threshold is 500ms
       **/
      if (process.platform === "win32") {
        return new Promise((resolve) => setTimeout(resolve, 500));
      } else {
        return Promise.resolve();
      }
    })
    .then(() => spawnNitroProcess(nitroResourceProbe))
    .then(() => loadLLMModel(currentSettings))
    .then(validateModelStatus)
    .catch((err) => {
      log(`[NITRO]::Error: ${err}`);
      // TODO: Broadcast error so app could display proper error message
      return { error: err, currentModelFile };
    });
}

function promptTemplateConverter(promptTemplate) {
  // Split the string using the markers
  const systemMarker = "{system_message}";
  const promptMarker = "{prompt}";

  if (
    promptTemplate.includes(systemMarker) &&
    promptTemplate.includes(promptMarker)
  ) {
    // Find the indices of the markers
    const systemIndex = promptTemplate.indexOf(systemMarker);
    const promptIndex = promptTemplate.indexOf(promptMarker);

    // Extract the parts of the string
    const system_prompt = promptTemplate.substring(0, systemIndex);
    const user_prompt = promptTemplate.substring(
      systemIndex + systemMarker.length,
      promptIndex
    );
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  } else if (promptTemplate.includes(promptMarker)) {
    // Extract the parts of the string for the case where only promptMarker is present
    const promptIndex = promptTemplate.indexOf(promptMarker);
    const user_prompt = promptTemplate.substring(0, promptIndex);
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );
    const system_prompt = "";

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  }

  // Return an error if none of the conditions are met
  return { error: "Cannot split prompt template" };
}

/**
 * Loads a LLM model into the Nitro subprocess by sending a HTTP POST request.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 */
function loadLLMModel(settings): Promise<Response> {
  log(`[NITRO]::Debug: Loading model with params ${JSON.stringify(settings)}`);
  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 500,
  }).catch((err) => {
    log(`[NITRO]::Error: Load model failed with error ${err}`);
  });
}

/**
 * Validates the status of a model.
 * @returns {Promise<ModelOperationResponse>} A promise that resolves to an object.
 * If the model is loaded successfully, the object is empty.
 * If the model is not loaded successfully, the object contains an error message.
 */
async function validateModelStatus(): Promise<ModelOperationResponse> {
  // Send a GET request to the validation URL.
  // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
  return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: "GET",
    headers: {
      "Content-Type": "application/json",
    },
    retries: 5,
    retryDelay: 500,
  }).then(async (res: Response) => {
    // If the response is OK, check model_loaded status.
    if (res.ok) {
      const body = await res.json();
      // If the model is loaded, return an empty object.
      // Otherwise, return an object with an error message.
      if (body.model_loaded) {
        return { error: undefined };
      }
    }
    return { error: "Model loading failed" };
  });
}

/**
 * Terminates the Nitro subprocess.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
async function killSubprocess(): Promise<void> {
  const controller = new AbortController();
  setTimeout(() => controller.abort(), 5000);
  log(`[NITRO]::Debug: Request to kill Nitro`);

  return fetch(NITRO_HTTP_KILL_URL, {
    method: "DELETE",
    signal: controller.signal,
  })
    .then(() => {
      subprocess?.kill();
      subprocess = undefined;
    })
    .catch(() => {})
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => log(`[NITRO]::Debug: Nitro process is terminated`));
}

/**
 * Spawns a Nitro subprocess.
 * @param nitroResourceProbe - The Nitro resource probe.
 * @returns A promise that resolves when the Nitro subprocess is started.
 */
function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
  log(`[NITRO]::Debug: Spawning Nitro subprocess...`);

  return new Promise(async (resolve, reject) => {
    let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
    let cudaVisibleDevices = "";
    let binaryName;
    if (process.platform === "win32") {
      let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      if (nvidiaInfo["run_mode"] === "cpu") {
        binaryFolder = path.join(binaryFolder, "win-cpu");
      } else {
        if (nvidiaInfo["cuda"].version === "12") {
          binaryFolder = path.join(binaryFolder, "win-cuda-12-0");
        } else {
          binaryFolder = path.join(binaryFolder, "win-cuda-11-7");
        }
        cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
      }
      binaryName = "nitro.exe";
    } else if (process.platform === "darwin") {
      if (process.arch === "arm64") {
        binaryFolder = path.join(binaryFolder, "mac-arm64");
      } else {
        binaryFolder = path.join(binaryFolder, "mac-x64");
      }
      binaryName = "nitro";
    } else {
      let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      if (nvidiaInfo["run_mode"] === "cpu") {
        binaryFolder = path.join(binaryFolder, "linux-cpu");
      } else {
        if (nvidiaInfo["cuda"].version === "12") {
          binaryFolder = path.join(binaryFolder, "linux-cuda-12-0");
        } else {
          binaryFolder = path.join(binaryFolder, "linux-cuda-11-7");
        }
        cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
      }
      binaryName = "nitro";
    }

    const binaryPath = path.join(binaryFolder, binaryName);
    // Execute the binary
    subprocess = spawn(binaryPath, ["1", LOCAL_HOST, PORT.toString()], {
      cwd: binaryFolder,
      env: {
        ...process.env,
        CUDA_VISIBLE_DEVICES: cudaVisibleDevices,
      },
    });

    // Handle subprocess output
    subprocess.stdout.on("data", (data) => {
      log(`[NITRO]::Debug: ${data}`);
    });

    subprocess.stderr.on("data", (data) => {
      log(`[NITRO]::Error: ${data}`);
    });

    subprocess.on("close", (code) => {
      log(`[NITRO]::Debug: Nitro exited with code: ${code}`);
      subprocess = null;
      reject(`child process exited with code ${code}`);
    });

    tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
      resolve(nitroResourceProbe);
    });
  });
}

/**
 * Get the system resources information
 * TODO: Move to Core so that it can be reused
 */
function getResourcesInfo(): Promise<ResourcesInfo> {
  return new Promise(async (resolve) => {
    const cpu = await osUtils.cpuCount();
    log(`[NITRO]::CPU information - ${cpu}`);
    const response: ResourcesInfo = {
      numCpuPhysicalCore: cpu,
      memAvailable: 0,
    };
    resolve(response);
  });
}

/**
 * This will retrieve GPU information and persist settings.json
 * Will be called when the extension is loaded to turn on GPU acceleration if supported
 */
async function updateNvidiaInfo() {
  if (process.platform !== "darwin") {
    await Promise.all([
      updateNvidiaDriverInfo(),
      updateCudaExistence(),
      updateGpuInfo(),
    ]);
  }
}

/**
 * Retrieve current nitro process
 */
const getCurrentNitroProcessInfo = (): Promise<any> => {
  nitroProcessInfo = {
    isRunning: subprocess != null,
  };
  return nitroProcessInfo;
};

/**
 * Every module should have a dispose function
 * This will be called when the extension is unloaded and should clean up any resources
 * Also called when app is closed
 */
function dispose() {
  // clean other registered resources here
  killSubprocess();
}

/**
 * Validate nvidia and cuda for linux and windows
 */
async function updateNvidiaDriverInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        const firstLine = stdout.split("\n")[0].trim();
        data["nvidia_driver"].exist = true;
        data["nvidia_driver"].version = firstLine;
      } else {
        data["nvidia_driver"].exist = false;
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}

/**
 * Check if file exists in paths
 */
function checkFileExistenceInPaths(file: string, paths: string[]): boolean {
  return paths.some((p) => existsSync(path.join(p, file)));
}

/**
 * Validate cuda for linux and windows
 */
function updateCudaExistence() {
  let filesCuda12: string[];
  let filesCuda11: string[];
  let paths: string[];
  let cudaVersion: string = "";

  if (process.platform === "win32") {
    filesCuda12 = ["cublas64_12.dll", "cudart64_12.dll", "cublasLt64_12.dll"];
    filesCuda11 = ["cublas64_11.dll", "cudart64_11.dll", "cublasLt64_11.dll"];
    paths = process.env.PATH ? process.env.PATH.split(path.delimiter) : [];
  } else {
    filesCuda12 = ["libcudart.so.12", "libcublas.so.12", "libcublasLt.so.12"];
    filesCuda11 = ["libcudart.so.11.0", "libcublas.so.11", "libcublasLt.so.11"];
    paths = process.env.LD_LIBRARY_PATH
      ? process.env.LD_LIBRARY_PATH.split(path.delimiter)
      : [];
    paths.push("/usr/lib/x86_64-linux-gnu/");
  }

  let cudaExists = filesCuda12.every(
    (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
  );

  if (!cudaExists) {
    cudaExists = filesCuda11.every(
      (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
    );
    if (cudaExists) {
      cudaVersion = "11";
    }
  } else {
    cudaVersion = "12";
  }

  let data;
  try {
    data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
  } catch (error) {
    data = DEFALT_SETTINGS;
  }

  data["cuda"].exist = cudaExists;
  data["cuda"].version = cudaVersion;
  if (cudaExists) {
    data.run_mode = "gpu";
  }
  writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
}

/**
 * Get GPU information
 */
async function updateGpuInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=index,memory.total --format=csv,noheader,nounits",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        // Get GPU info and gpu has higher memory first
        let highestVram = 0;
        let highestVramId = "0";
        let gpus = stdout
          .trim()
          .split("\n")
          .map((line) => {
            let [id, vram] = line.split(", ");
            vram = vram.replace(/\r/g, "");
            if (parseFloat(vram) > highestVram) {
              highestVram = parseFloat(vram);
              highestVramId = id;
            }
            return { id, vram };
          });

        data["gpus"] = gpus;
        data["gpu_highest_vram"] = highestVramId;
      } else {
        data["gpus"] = [];
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}

module.exports = {
  initModel,
  stopModel,
  killSubprocess,
  dispose,
  updateNvidiaInfo,
  getCurrentNitroProcessInfo,
};
extensions/inference-nitro-extension/src/node/execute.ts (new file, 65 lines)
@@ -0,0 +1,65 @@
import { readFileSync } from "fs";
import * as path from "path";
import { NVIDIA_INFO_FILE } from "./nvidia";

export interface NitroExecutableOptions {
  executablePath: string;
  cudaVisibleDevices: string;
}
/**
 * Find which executable file to run based on the current platform.
 * @returns The name of the executable file to run.
 */
export const executableNitroFile = (): NitroExecutableOptions => {
  let binaryFolder = path.join(__dirname, "..", "bin"); // Current directory by default
  let cudaVisibleDevices = "";
  let binaryName = "nitro";
  /**
   * The binary folder is different for each platform.
   */
  if (process.platform === "win32") {
    /**
     * For Windows: win-cpu, win-cuda-11-7, win-cuda-12-0
     */
    let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
    if (nvidiaInfo["run_mode"] === "cpu") {
      binaryFolder = path.join(binaryFolder, "win-cpu");
    } else {
      if (nvidiaInfo["cuda"].version === "12") {
        binaryFolder = path.join(binaryFolder, "win-cuda-12-0");
      } else {
        binaryFolder = path.join(binaryFolder, "win-cuda-11-7");
      }
      cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
    }
    binaryName = "nitro.exe";
  } else if (process.platform === "darwin") {
    /**
     * For macOS: mac-arm64 (Apple Silicon), mac-x64 (Intel)
     */
    if (process.arch === "arm64") {
      binaryFolder = path.join(binaryFolder, "mac-arm64");
    } else {
      binaryFolder = path.join(binaryFolder, "mac-x64");
    }
  } else {
    /**
     * For Linux: linux-cpu, linux-cuda-11-7, linux-cuda-12-0
     */
    let nvidiaInfo = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
    if (nvidiaInfo["run_mode"] === "cpu") {
      binaryFolder = path.join(binaryFolder, "linux-cpu");
    } else {
      if (nvidiaInfo["cuda"].version === "12") {
        binaryFolder = path.join(binaryFolder, "linux-cuda-12-0");
      } else {
        binaryFolder = path.join(binaryFolder, "linux-cuda-11-7");
      }
      cudaVisibleDevices = nvidiaInfo["gpu_highest_vram"];
    }
  }
  return {
    executablePath: path.join(binaryFolder, binaryName),
    cudaVisibleDevices,
  };
};
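executableNitroFile folds the detection results that nvidia.ts persists into a single binary choice. A sketch of a possible return value on a Windows machine whose settings.json recorded run_mode "gpu" with CUDA 12 and GPU 0 as the largest-VRAM device (the path and index here are illustrative, not real output):

// Illustrative result of executableNitroFile(), not captured from a real machine.
const options: NitroExecutableOptions = {
  executablePath: "...\\dist\\bin\\win-cuda-12-0\\nitro.exe",
  cudaVisibleDevices: "0", // index of the GPU with the most VRAM
};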
extensions/inference-nitro-extension/src/node/index.ts (new file, 379 lines)
@@ -0,0 +1,379 @@
import fs from "fs";
import path from "path";
import { ChildProcessWithoutNullStreams, spawn } from "child_process";
import tcpPortUsed from "tcp-port-used";
import fetchRT from "fetch-retry";
import osUtils from "os-utils";
import { log } from "@janhq/core/node";
import { getNitroProcessInfo, updateNvidiaInfo } from "./nvidia";
import { Model, InferenceEngine, ModelSettingParams } from "@janhq/core";
import { executableNitroFile } from "./execute";
import { homedir } from "os";
// Polyfill fetch with retry
const fetchRetry = fetchRT(fetch);

/**
 * The options object for the model init operation.
 */
interface ModelInitOptions {
  modelFullPath: string;
  model: Model;
}

/**
 * The response object of Prompt Template parsing.
 */
interface PromptTemplate {
  system_prompt?: string;
  ai_prompt?: string;
  user_prompt?: string;
  error?: string;
}

/**
 * Model setting args for Nitro model load.
 */
interface ModelSettingArgs extends ModelSettingParams {
  llama_model_path: string;
  cpu_threads: number;
}

// The PORT to use for the Nitro subprocess
const PORT = 3928;
// The HOST address to use for the Nitro subprocess
const LOCAL_HOST = "127.0.0.1";
// The URL for the Nitro subprocess
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${PORT}`;
// The URL for the Nitro subprocess to load a model
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/loadmodel`;
// The URL for the Nitro subprocess to validate a model
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/modelstatus`;
// The URL for the Nitro subprocess to kill itself
const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`;

// The supported model format
// TODO: Should be an array to support more models
const SUPPORTED_MODEL_FORMAT = ".gguf";

// The subprocess instance for Nitro
let subprocess: ChildProcessWithoutNullStreams | undefined = undefined;
// The current model file url
let currentModelFile: string = "";
// The current model settings
let currentSettings: ModelSettingArgs | undefined = undefined;

/**
 * Stops a Nitro subprocess.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
function stopModel(): Promise<void> {
  return killSubprocess();
}

/**
 * Initializes a Nitro subprocess to load a machine learning model.
 * @param wrapper - The model wrapper.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 * TODO: Should pass the absolute path of the model file instead of just the name - so we can modularize module.ts into an npm package
 */
async function runModel(
  wrapper: ModelInitOptions
): Promise<ModelOperationResponse | void> {
  if (wrapper.model.engine !== InferenceEngine.nitro) {
    // Not a nitro model
    return Promise.resolve();
  }

  currentModelFile = wrapper.modelFullPath;
  const janRoot = path.join(homedir(), "jan");
  if (!currentModelFile.includes(janRoot)) {
    currentModelFile = path.join(janRoot, currentModelFile);
  }
  const files: string[] = fs.readdirSync(currentModelFile);

  // Look for GGUF model file
  const ggufBinFile = files.find(
    (file) =>
      file === path.basename(currentModelFile) ||
      file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT)
  );

  if (!ggufBinFile) return Promise.reject("No GGUF model file found");

  currentModelFile = path.join(currentModelFile, ggufBinFile);

  if (wrapper.model.engine !== InferenceEngine.nitro) {
    return Promise.reject("Not a nitro model");
  } else {
    const nitroResourceProbe = await getResourcesInfo();
    // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
    if (wrapper.model.settings.prompt_template) {
      const promptTemplate = wrapper.model.settings.prompt_template;
      const prompt = promptTemplateConverter(promptTemplate);
      if (prompt?.error) {
        return Promise.reject(prompt.error);
      }
      wrapper.model.settings.system_prompt = prompt.system_prompt;
      wrapper.model.settings.user_prompt = prompt.user_prompt;
      wrapper.model.settings.ai_prompt = prompt.ai_prompt;
    }

    currentSettings = {
      llama_model_path: currentModelFile,
      ...wrapper.model.settings,
      // This is critical and requires real system information
      cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
    };
    return runNitroAndLoadModel();
  }
}

/**
 * 1. Spawn Nitro process
 * 2. Load model into Nitro subprocess
 * 3. Validate model status
 * @returns
 */
async function runNitroAndLoadModel() {
  // Gather system information for CPU physical cores and memory
  return killSubprocess()
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => {
      /**
       * There is a problem with the Windows process manager
       * Should wait for a while to make sure the port is free and the subprocess is killed
       * The tested threshold is 500ms
       **/
      if (process.platform === "win32") {
        return new Promise((resolve) => setTimeout(resolve, 500));
      } else {
        return Promise.resolve();
      }
    })
    .then(spawnNitroProcess)
    .then(() => loadLLMModel(currentSettings))
    .then(validateModelStatus)
    .catch((err) => {
      // TODO: Broadcast error so app could display proper error message
      log(`[NITRO]::Error: ${err}`);
      return { error: err };
    });
}

/**
 * Parse prompt template into args settings
 * @param promptTemplate Template as string
 * @returns
 */
function promptTemplateConverter(promptTemplate: string): PromptTemplate {
  // Split the string using the markers
  const systemMarker = "{system_message}";
  const promptMarker = "{prompt}";

  if (
    promptTemplate.includes(systemMarker) &&
    promptTemplate.includes(promptMarker)
  ) {
    // Find the indices of the markers
    const systemIndex = promptTemplate.indexOf(systemMarker);
    const promptIndex = promptTemplate.indexOf(promptMarker);

    // Extract the parts of the string
    const system_prompt = promptTemplate.substring(0, systemIndex);
    const user_prompt = promptTemplate.substring(
      systemIndex + systemMarker.length,
      promptIndex
    );
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt };
  } else if (promptTemplate.includes(promptMarker)) {
    // Extract the parts of the string for the case where only promptMarker is present
    const promptIndex = promptTemplate.indexOf(promptMarker);
    const user_prompt = promptTemplate.substring(0, promptIndex);
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    );

    // Return the split parts
    return { user_prompt, ai_prompt };
  }

  // Return an error if none of the conditions are met
  return { error: "Cannot split prompt template" };
}

/**
 * Loads a LLM model into the Nitro subprocess by sending a HTTP POST request.
 * @returns A Promise that resolves when the model is loaded successfully, or rejects with an error message if the model is not found or fails to load.
 */
function loadLLMModel(settings: any): Promise<Response> {
  log(`[NITRO]::Debug: Loading model with params ${JSON.stringify(settings)}`);
  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 500,
  })
    .then((res) => {
      log(
        `[NITRO]::Debug: Load model success with response ${JSON.stringify(
          res
        )}`
      );
      return Promise.resolve(res);
    })
    .catch((err) => {
      log(`[NITRO]::Error: Load model failed with error ${err}`);
      return Promise.reject();
    });
}

/**
 * Validates the status of a model.
 * @returns {Promise<void>} A promise that resolves when the model is loaded,
 * or rejects with an error message if it is not.
 */
async function validateModelStatus(): Promise<void> {
  // Send a GET request to the validation URL.
  // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
  return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
    method: "GET",
    headers: {
      "Content-Type": "application/json",
    },
    retries: 5,
    retryDelay: 500,
  }).then(async (res: Response) => {
    log(
      `[NITRO]::Debug: Validate model state success with response ${JSON.stringify(
        res
      )}`
    );
    // If the response is OK, check model_loaded status.
    if (res.ok) {
      const body = await res.json();
      // If the model is loaded, resolve.
      // Otherwise, reject with an error message.
      if (body.model_loaded) {
        return Promise.resolve();
      }
    }
    return Promise.reject("Validate model status failed");
  });
}

/**
 * Terminates the Nitro subprocess.
 * @returns A Promise that resolves when the subprocess is terminated successfully, or rejects with an error message if the subprocess fails to terminate.
 */
async function killSubprocess(): Promise<void> {
  const controller = new AbortController();
  setTimeout(() => controller.abort(), 5000);
  log(`[NITRO]::Debug: Request to kill Nitro`);

  return fetch(NITRO_HTTP_KILL_URL, {
    method: "DELETE",
    signal: controller.signal,
  })
    .then(() => {
      subprocess?.kill();
      subprocess = undefined;
    })
    .catch(() => {})
    .then(() => tcpPortUsed.waitUntilFree(PORT, 300, 5000))
    .then(() => log(`[NITRO]::Debug: Nitro process is terminated`));
}

/**
 * Spawns a Nitro subprocess.
 * @returns A promise that resolves when the Nitro subprocess is started.
 */
function spawnNitroProcess(): Promise<any> {
  log(`[NITRO]::Debug: Spawning Nitro subprocess...`);

  return new Promise<void>(async (resolve, reject) => {
    let binaryFolder = path.join(__dirname, "..", "bin"); // Current directory by default
    let executableOptions = executableNitroFile();

    const args: string[] = ["1", LOCAL_HOST, PORT.toString()];
    // Execute the binary
    log(
      `[NITRO]::Debug: Spawn nitro at path: ${executableOptions.executablePath}, and args: ${args}`
    );
    subprocess = spawn(
      executableOptions.executablePath,
      ["1", LOCAL_HOST, PORT.toString()],
      {
        cwd: binaryFolder,
        env: {
          ...process.env,
          CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
        },
      }
    );

    // Handle subprocess output
    subprocess.stdout.on("data", (data: any) => {
      log(`[NITRO]::Debug: ${data}`);
    });

    subprocess.stderr.on("data", (data: any) => {
      log(`[NITRO]::Error: ${data}`);
    });

    subprocess.on("close", (code: any) => {
      log(`[NITRO]::Debug: Nitro exited with code: ${code}`);
      subprocess = undefined;
      reject(`child process exited with code ${code}`);
    });

    tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
      log(`[NITRO]::Debug: Nitro is ready`);
      resolve();
    });
  });
}

/**
 * Get the system resources information
 * TODO: Move to Core so that it can be reused
 */
function getResourcesInfo(): Promise<ResourcesInfo> {
  return new Promise(async (resolve) => {
    const cpu = await osUtils.cpuCount();
    log(`[NITRO]::CPU information - ${cpu}`);
    const response: ResourcesInfo = {
      numCpuPhysicalCore: cpu,
      memAvailable: 0,
    };
    resolve(response);
  });
}

/**
 * Every module should have a dispose function
 * This will be called when the extension is unloaded and should clean up any resources
 * Also called when app is closed
 */
function dispose() {
  // clean other registered resources here
  killSubprocess();
}

export default {
  runModel,
  stopModel,
  killSubprocess,
  dispose,
  updateNvidiaInfo,
  getCurrentNitroProcessInfo: () => getNitroProcessInfo(subprocess),
};
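To make promptTemplateConverter above concrete: with a Llama-2-chat style template, the three parts fall out of simple index arithmetic around the two markers. A worked example (the template string is illustrative):

// Worked example for promptTemplateConverter; the template is made up.
const parts = promptTemplateConverter(
  "{system_message} [INST] {prompt} [/INST]"
);
// parts.system_prompt === ""          (text before {system_message})
// parts.user_prompt   === " [INST] "  (text between the two markers)
// parts.ai_prompt     === " [/INST]"  (text after {prompt})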
extensions/inference-nitro-extension/src/node/nvidia.ts (new file, 201 lines)
@@ -0,0 +1,201 @@
import { writeFileSync, existsSync, readFileSync } from "fs";
import { exec } from "child_process";
import path from "path";
import { homedir } from "os";

/**
 * Default GPU settings
 **/
const DEFALT_SETTINGS = {
  notify: true,
  run_mode: "cpu",
  nvidia_driver: {
    exist: false,
    version: "",
  },
  cuda: {
    exist: false,
    version: "",
  },
  gpus: [],
  gpu_highest_vram: "",
};

/**
 * Path to the settings file
 **/
export const NVIDIA_INFO_FILE = path.join(
  homedir(),
  "jan",
  "settings",
  "settings.json"
);

/**
 * Current nitro process
 */
let nitroProcessInfo: NitroProcessInfo | undefined = undefined;

/**
 * Nitro process info
 */
export interface NitroProcessInfo {
  isRunning: boolean;
}

/**
 * This will retrieve GPU information and persist settings.json
 * Will be called when the extension is loaded to turn on GPU acceleration if supported
 */
export async function updateNvidiaInfo() {
  if (process.platform !== "darwin") {
    await Promise.all([
      updateNvidiaDriverInfo(),
      updateCudaExistence(),
      updateGpuInfo(),
    ]);
  }
}

/**
 * Retrieve current nitro process
 */
export const getNitroProcessInfo = (subprocess: any): NitroProcessInfo => {
  nitroProcessInfo = {
    isRunning: subprocess != null,
  };
  return nitroProcessInfo;
};

/**
 * Validate nvidia and cuda for linux and windows
 */
export async function updateNvidiaDriverInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        const firstLine = stdout.split("\n")[0].trim();
        data["nvidia_driver"].exist = true;
        data["nvidia_driver"].version = firstLine;
      } else {
        data["nvidia_driver"].exist = false;
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}

/**
 * Check if file exists in paths
 */
export function checkFileExistenceInPaths(
  file: string,
  paths: string[]
): boolean {
  return paths.some((p) => existsSync(path.join(p, file)));
}

/**
 * Validate cuda for linux and windows
 */
export function updateCudaExistence() {
  let filesCuda12: string[];
  let filesCuda11: string[];
  let paths: string[];
  let cudaVersion: string = "";

  if (process.platform === "win32") {
    filesCuda12 = ["cublas64_12.dll", "cudart64_12.dll", "cublasLt64_12.dll"];
    filesCuda11 = ["cublas64_11.dll", "cudart64_11.dll", "cublasLt64_11.dll"];
    paths = process.env.PATH ? process.env.PATH.split(path.delimiter) : [];
  } else {
    filesCuda12 = ["libcudart.so.12", "libcublas.so.12", "libcublasLt.so.12"];
    filesCuda11 = ["libcudart.so.11.0", "libcublas.so.11", "libcublasLt.so.11"];
    paths = process.env.LD_LIBRARY_PATH
      ? process.env.LD_LIBRARY_PATH.split(path.delimiter)
      : [];
    paths.push("/usr/lib/x86_64-linux-gnu/");
  }

  let cudaExists = filesCuda12.every(
    (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
  );

  if (!cudaExists) {
    cudaExists = filesCuda11.every(
      (file) => existsSync(file) || checkFileExistenceInPaths(file, paths)
    );
    if (cudaExists) {
      cudaVersion = "11";
    }
  } else {
    cudaVersion = "12";
  }

  let data;
  try {
    data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
  } catch (error) {
    data = DEFALT_SETTINGS;
  }

  data["cuda"].exist = cudaExists;
  data["cuda"].version = cudaVersion;
  if (cudaExists) {
    data.run_mode = "gpu";
  }
  writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
}

/**
 * Get GPU information
 */
export async function updateGpuInfo(): Promise<void> {
  exec(
    "nvidia-smi --query-gpu=index,memory.total --format=csv,noheader,nounits",
    (error, stdout) => {
      let data;
      try {
        data = JSON.parse(readFileSync(NVIDIA_INFO_FILE, "utf-8"));
      } catch (error) {
        data = DEFALT_SETTINGS;
      }

      if (!error) {
        // Get GPU info and gpu has higher memory first
        let highestVram = 0;
        let highestVramId = "0";
        let gpus = stdout
          .trim()
          .split("\n")
          .map((line) => {
            let [id, vram] = line.split(", ");
            vram = vram.replace(/\r/g, "");
            if (parseFloat(vram) > highestVram) {
              highestVram = parseFloat(vram);
              highestVramId = id;
            }
            return { id, vram };
          });

        data["gpus"] = gpus;
        data["gpu_highest_vram"] = highestVramId;
      } else {
        data["gpus"] = [];
      }

      writeFileSync(NVIDIA_INFO_FILE, JSON.stringify(data, null, 2));
      Promise.resolve();
    }
  );
}
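After the three update functions run, ~/jan/settings/settings.json holds the merged detection results that execute.ts reads when picking a binary. A sketch of the file for a hypothetical single-GPU CUDA 12 machine (every value is illustrative; only the shape, which follows DEFALT_SETTINGS above, is fixed):

// Illustrative settings.json contents, not captured from a real machine.
const exampleSettings = {
  notify: true,
  run_mode: "gpu", // flipped from "cpu" by updateCudaExistence()
  nvidia_driver: { exist: true, version: "545.29" },
  cuda: { exist: true, version: "12" },
  gpus: [{ id: "0", vram: "24576" }], // from nvidia-smi, memory.total in MiB
  gpu_highest_vram: "0", // the GPU's index, not a VRAM amount
};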
@@ -1,15 +1,19 @@
{
  "compilerOptions": {
    "target": "es2016",
    "module": "ES6",
    "moduleResolution": "node",

    "outDir": "./dist",
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "strict": false,
    "skipLibCheck": true,
    "rootDir": "./src"
    "target": "es5",
    "module": "ES2020",
    "lib": ["es2015", "es2016", "es2017", "dom"],
    "strict": true,
    "sourceMap": true,
    "declaration": true,
    "allowSyntheticDefaultImports": true,
    "experimentalDecorators": true,
    "emitDecoratorMetadata": true,
    "declarationDir": "dist/types",
    "outDir": "dist",
    "importHelpers": true,
    "typeRoots": ["node_modules/@types"]
  },
  "include": ["./src"]
  "include": ["src"]
}
@@ -1,43 +0,0 @@
const path = require("path");
const webpack = require("webpack");
const packageJson = require("./package.json");

module.exports = {
  experiments: { outputModule: true },
  entry: "./src/index.ts", // Adjust the entry point to match your project's main file
  mode: "production",
  module: {
    rules: [
      {
        test: /\.tsx?$/,
        use: "ts-loader",
        exclude: /node_modules/,
      },
    ],
  },
  plugins: [
    new webpack.DefinePlugin({
      MODULE: JSON.stringify(`${packageJson.name}/${packageJson.module}`),
      INFERENCE_URL: JSON.stringify(
        process.env.INFERENCE_URL ||
          "http://127.0.0.1:3928/inferences/llamacpp/chat_completion"
      ),
      TROUBLESHOOTING_URL: JSON.stringify("https://jan.ai/guides/troubleshooting")
    }),
  ],
  output: {
    filename: "index.js", // Adjust the output file name as needed
    path: path.resolve(__dirname, "dist"),
    library: { type: "module" }, // Specify ESM output format
  },
  resolve: {
    extensions: [".ts", ".js"],
    fallback: {
      path: require.resolve("path-browserify"),
    },
  },
  optimization: {
    minimize: false,
  },
  // Add loaders and other configuration as needed for your project
};