chore: deprecate tensorrt-llm extension (#4453)

Louis 2025-01-15 15:43:16 +07:00 committed by GitHub
parent 58bb1b4939
commit 06ee10be1b
GPG Key ID: B5690EEEBB952194
10 changed files with 0 additions and 1121 deletions


@@ -1,79 +0,0 @@
# TensorRT-LLM Extension
Created using the Jan extension example
# Create a Jan Extension using TypeScript
Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀
## Create Your Own Extension
To create your own extension, you can use this repository as a template! Just follow the instructions below:
1. Click the Use this template button at the top of the repository
2. Select Create a new repository
3. Select an owner and name for your new repository
4. Click Create repository
5. Clone your new repository
## Initial Setup
After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.
> [!NOTE]
>
> You'll need to have a reasonably modern version of
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
> [`nodenv`](https://github.com/nodenv/nodenv) or
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
> root of your repository to install the version specified in
> [`package.json`](./package.json). Otherwise, 20.x or later should work!
1. :hammer_and_wrench: Install the dependencies
```bash
npm install
```
1. :building_construction: Package the TypeScript for distribution
```bash
npm run bundle
```
1. :white_check_mark: Check your artifact
There will be a `.tgz` file in your extension directory now.
## Update the Extension Metadata
The [`package.json`](package.json) file defines metadata about your extension, such as
the extension name, main entry, description, and version.
When you copy this repository, update `package.json` with the name and description of your extension.
## Update the Extension Code
The [`src/`](./src/) directory is the heart of your extension! This contains the
source code that will be run when your extension functions are invoked. You can replace the
contents of this directory with your own code.
There are a few things to keep in mind when writing your extension code:
- Most Jan Extension functions are processed asynchronously.
In `index.ts`, you will see that extension functions return a `Promise<any>`.
```typescript
import { events, MessageEvent, MessageRequest } from '@janhq/core'
function onStart(): Promise<any> {
  return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
    this.inference(data)
  )
}
```
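In the real extension, a handler like this lives on the class exported from `src/index.ts`. A rough sketch of that shape, assuming the `LocalOAIEngine` base class used elsewhere in this diff (the class name, provider id, and endpoint below are illustrative placeholders, not the actual template):
```typescript
import { events, MessageEvent, MessageRequest, LocalOAIEngine } from '@janhq/core'

// Illustrative sketch only; the real class in this repository is
// TensorRTLLMExtension in src/index.ts, shown later in this diff.
export default class ExampleEngineExtension extends LocalOAIEngine {
  override provider = 'example-provider' // hypothetical provider id
  override inferenceUrl = 'http://127.0.0.1:3929/v1/chat/completions' // hypothetical endpoint
  override nodeModule = 'node'

  override async onLoad(): Promise<void> {
    super.onLoad()
    // Forward sent messages to the engine, mirroring the snippet above.
    events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
      this.inference(data)
    )
  }
}
```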
For more information about the Jan Extension Core module, see the
[documentation](https://github.com/janhq/jan/blob/main/core/README.md).
So, what are you waiting for? Go ahead and start customizing your extension!


@@ -1,9 +0,0 @@
/** @type {import('ts-jest').JestConfigWithTsJest} */
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  transform: {
    'node_modules/@janhq/core/.+\\.(j|t)s?$': 'ts-jest',
  },
  transformIgnorePatterns: ['node_modules/(?!@janhq/core/.*)'],
}


@@ -1,78 +0,0 @@
{
"name": "@janhq/tensorrt-llm-extension",
"productName": "TensorRT-LLM Inference Engine",
"version": "0.0.3",
"description": "This extension enables Nvidia's TensorRT-LLM for the fastest GPU acceleration. See the [setup guide](https://jan.ai/guides/providers/tensorrt-llm/) for next steps.",
"main": "dist/index.js",
"node": "dist/node/index.cjs.js",
"author": "Jan <service@jan.ai>",
"license": "AGPL-3.0",
"config": {
"host": "127.0.0.1",
"port": "3929"
},
"compatibility": {
"platform": [
"win32"
],
"app": [
"0.1.0"
]
},
"tensorrtVersion": "0.1.8",
"provider": "nitro-tensorrt-llm",
"scripts": {
"test": "jest",
"build": "rolldown -c rolldown.config.mjs",
"build:publish": "rimraf *.tgz --glob || true && yarn build && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install"
},
"exports": {
".": "./dist/index.js",
"./main": "./dist/node/index.cjs.js"
},
"devDependencies": {
"@types/decompress": "4.2.7",
"@types/jest": "^29.5.12",
"@types/node": "^20.11.4",
"@types/os-utils": "^0.0.4",
"@types/tcp-port-used": "^1.0.4",
"cpx": "^1.5.0",
"download-cli": "^1.1.1",
"jest": "^29.7.0",
"jest-junit": "^16.0.0",
"jest-runner": "^29.7.0",
"rimraf": "^3.0.2",
"rolldown": "1.0.0-beta.1",
"run-script-os": "^1.1.6",
"ts-jest": "^29.2.5",
"typescript": "^5.2.2"
},
"dependencies": {
"@janhq/core": "../../core/package.tgz",
"decompress": "^4.2.1",
"fetch-retry": "^5.0.6",
"rxjs": "^7.8.1",
"tcp-port-used": "^1.0.2",
"terminate": "^2.6.1",
"ulidx": "^2.3.0"
},
"engines": {
"node": ">=18.0.0"
},
"files": [
"dist/*",
"package.json",
"README.md"
],
"bundleDependencies": [
"tcp-port-used",
"fetch-retry",
"decompress",
"@janhq/core",
"terminate"
],
"installConfig": {
"hoistingLimits": "workspaces"
},
"packageManager": "yarn@4.5.3"
}


@@ -1,156 +0,0 @@
[
{
"sources": [
{
"filename": "config.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/config.json"
},
{
"filename": "mistral_float16_tp1_rank0.engine",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/mistral_float16_tp1_rank0.engine"
},
{
"filename": "tokenizer.model",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/tokenizer.model"
},
{
"filename": "special_tokens_map.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/special_tokens_map.json"
},
{
"filename": "tokenizer.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/tokenizer.json"
},
{
"filename": "tokenizer_config.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/tokenizer_config.json"
},
{
"filename": "model.cache",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/model.cache"
}
],
"id": "llamacorn-1.1b-chat-fp16",
"object": "model",
"name": "LlamaCorn 1.1B Chat FP16",
"version": "1.0",
"description": "LlamaCorn is a refined version of TinyLlama-1.1B, optimized for conversational quality, running on consumer devices through TensorRT-LLM",
"format": "TensorRT-LLM",
"settings": {
"ctx_len": 2048,
"text_model": false
},
"parameters": {
"max_tokens": 4096
},
"metadata": {
"author": "LLama",
"tags": ["TensorRT-LLM", "1B", "Finetuned"],
"size": 2151000000
},
"engine": "nitro-tensorrt-llm"
},
{
"sources": [
{
"filename": "config.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/config.json"
},
{
"filename": "mistral_float16_tp1_rank0.engine",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/mistral_float16_tp1_rank0.engine"
},
{
"filename": "tokenizer.model",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/tokenizer.model"
},
{
"filename": "special_tokens_map.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/special_tokens_map.json"
},
{
"filename": "tokenizer.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/tokenizer.json"
},
{
"filename": "tokenizer_config.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/tokenizer_config.json"
},
{
"filename": "model.cache",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/model.cache"
}
],
"id": "tinyjensen-1.1b-chat-fp16",
"object": "model",
"name": "TinyJensen 1.1B Chat FP16",
"version": "1.0",
"description": "Do you want to chat with Jensen Huan? Here you are",
"format": "TensorRT-LLM",
"settings": {
"ctx_len": 2048,
"text_model": false
},
"parameters": {
"max_tokens": 4096
},
"metadata": {
"author": "LLama",
"tags": ["TensorRT-LLM", "1B", "Finetuned"],
"size": 2151000000
},
"engine": "nitro-tensorrt-llm"
},
{
"sources": [
{
"filename": "config.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/config.json"
},
{
"filename": "mistral_float16_tp1_rank0.engine",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/mistral_float16_tp1_rank0.engine"
},
{
"filename": "tokenizer.model",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/tokenizer.model"
},
{
"filename": "special_tokens_map.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/special_tokens_map.json"
},
{
"filename": "tokenizer.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/tokenizer.json"
},
{
"filename": "tokenizer_config.json",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/tokenizer_config.json"
},
{
"filename": "model.cache",
"url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/model.cache"
}
],
"id": "mistral-7b-instruct-int4",
"object": "model",
"name": "Mistral 7B Instruct v0.1 INT4",
"version": "1.0",
"description": "Mistral 7B Instruct v0.1 INT4",
"format": "TensorRT-LLM",
"settings": {
"ctx_len": 2048,
"text_model": false,
"prompt_template": "[INST] {prompt} [/INST]"
},
"parameters": {
"max_tokens": 4096
},
"metadata": {
"author": "MistralAI",
"tags": ["TensorRT-LLM", "7B", "Finetuned"],
"size": 3840000000
},
"engine": "nitro-tensorrt-llm"
}
]


@@ -1,59 +0,0 @@
import { defineConfig } from 'rolldown'
import packageJson from './package.json' with { type: 'json' }
import modelsJson from './resources/models.json' with { type: 'json' }
export default defineConfig([
{
input: 'src/index.ts',
output: {
format: 'esm',
file: 'dist/index.js',
},
platform: 'browser',
define: {
MODELS: JSON.stringify(modelsJson),
TENSORRT_VERSION: JSON.stringify(packageJson.tensorrtVersion),
PROVIDER: JSON.stringify(packageJson.provider),
DOWNLOAD_RUNNER_URL:
process.platform === 'win32'
? JSON.stringify(
'https://github.com/janhq/cortex.tensorrt-llm/releases/download/windows-v<version>-tensorrt-llm-v0.7.1/nitro-windows-v<version>-tensorrt-llm-v0.7.1-amd64-all-arch.tar.gz'
)
: JSON.stringify(
'https://github.com/janhq/cortex.tensorrt-llm/releases/download/linux-v<version>/nitro-linux-v<version>-amd64-tensorrt-llm-<gpuarch>.tar.gz'
),
NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
INFERENCE_URL: JSON.stringify(
process.env.INFERENCE_URL ||
`${packageJson.config?.protocol ?? 'http'}://${packageJson.config?.host}:${packageJson.config?.port}/v1/chat/completions`
),
COMPATIBILITY: JSON.stringify(packageJson.compatibility),
},
},
{
input: 'src/node/index.ts',
external: ['@janhq/core/node'],
output: {
format: 'cjs',
file: 'dist/node/index.cjs.js',
sourcemap: false,
inlineDynamicImports: true,
},
replace: {
TENSORRT_VERSION: JSON.stringify(packageJson.tensorrtVersion),
PROVIDER: JSON.stringify(packageJson.provider),
LOAD_MODEL_URL: JSON.stringify(
`${packageJson.config?.protocol ?? 'http'}://${packageJson.config?.host}:${packageJson.config?.port}/inferences/tensorrtllm/loadmodel`
),
TERMINATE_ENGINE_URL: JSON.stringify(
`${packageJson.config?.protocol ?? 'http'}://${packageJson.config?.host}:${packageJson.config?.port}/processmanager/destroy`
),
ENGINE_HOST: JSON.stringify(packageJson.config?.host ?? '127.0.0.1'),
ENGINE_PORT: JSON.stringify(packageJson.config?.port ?? '3928'),
},
resolve: {
extensions: ['.js', '.ts', '.json'],
},
platform: 'node',
},
])


@@ -1,11 +0,0 @@
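// Build-time constants: these identifiers are injected by the `define` / `replace`
// options in the rolldown config shown earlier in this diff.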
declare const NODE: string
declare const INFERENCE_URL: string
declare const LOAD_MODEL_URL: string
declare const TERMINATE_ENGINE_URL: string
declare const ENGINE_HOST: string
declare const ENGINE_PORT: string
declare const DOWNLOAD_RUNNER_URL: string
declare const TENSORRT_VERSION: string
declare const COMPATIBILITY: object
declare const PROVIDER: string
declare const MODELS: Array<any>


@@ -1,186 +0,0 @@
import TensorRTLLMExtension from '../src/index'
import {
executeOnMain,
systemInformation,
fs,
baseName,
joinPath,
downloadFile,
} from '@janhq/core'
jest.mock('@janhq/core', () => ({
...jest.requireActual('@janhq/core/node'),
LocalOAIEngine: jest.fn().mockImplementation(function () {
// @ts-ignore
this.registerModels = () => {
return Promise.resolve()
}
// @ts-ignore
return this
}),
systemInformation: jest.fn(),
fs: {
existsSync: jest.fn(),
mkdir: jest.fn(),
},
joinPath: jest.fn(),
baseName: jest.fn(),
downloadFile: jest.fn(),
executeOnMain: jest.fn(),
showToast: jest.fn(),
events: {
emit: jest.fn(),
// @ts-ignore
on: (event, func) => {
func({ fileName: './' })
},
off: jest.fn(),
},
}))
// @ts-ignore
global.COMPATIBILITY = {
platform: ['win32'],
}
// @ts-ignore
global.PROVIDER = 'tensorrt-llm'
// @ts-ignore
global.INFERENCE_URL = 'http://localhost:5000'
// @ts-ignore
global.NODE = 'node'
// @ts-ignore
global.MODELS = []
// @ts-ignore
global.TENSORRT_VERSION = ''
// @ts-ignore
global.DOWNLOAD_RUNNER_URL = ''
describe('TensorRTLLMExtension', () => {
let extension: TensorRTLLMExtension
beforeEach(() => {
// @ts-ignore
extension = new TensorRTLLMExtension()
jest.clearAllMocks()
})
describe('compatibility', () => {
it('should return the correct compatibility', () => {
const result = extension.compatibility()
expect(result).toEqual({
platform: ['win32'],
})
})
})
describe('install', () => {
it('should install if compatible', async () => {
const mockSystemInfo: any = {
osInfo: { platform: 'win32' },
gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
}
;(executeOnMain as jest.Mock).mockResolvedValue({})
;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
;(fs.existsSync as jest.Mock).mockResolvedValue(false)
;(fs.mkdir as jest.Mock).mockResolvedValue(undefined)
;(baseName as jest.Mock).mockResolvedValue('./')
;(joinPath as jest.Mock).mockResolvedValue('./')
;(downloadFile as jest.Mock).mockResolvedValue({})
await extension.install()
expect(executeOnMain).toHaveBeenCalled()
})
it('should not install if not compatible', async () => {
const mockSystemInfo: any = {
osInfo: { platform: 'linux' },
gpuSetting: { gpus: [{ arch: 'pascal', name: 'NVIDIA GPU' }] },
}
;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
jest.spyOn(extension, 'registerModels').mockReturnValue(Promise.resolve())
await extension.install()
expect(executeOnMain).not.toHaveBeenCalled()
})
})
describe('installationState', () => {
it('should return NotCompatible if not compatible', async () => {
const mockSystemInfo: any = {
osInfo: { platform: 'linux' },
gpuSetting: { gpus: [{ arch: 'pascal', name: 'NVIDIA GPU' }] },
}
;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
const result = await extension.installationState()
expect(result).toBe('NotCompatible')
})
it('should return Installed if executable exists', async () => {
const mockSystemInfo: any = {
osInfo: { platform: 'win32' },
gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
}
;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
;(fs.existsSync as jest.Mock).mockResolvedValue(true)
const result = await extension.installationState()
expect(result).toBe('Installed')
})
it('should return NotInstalled if executable does not exist', async () => {
const mockSystemInfo: any = {
osInfo: { platform: 'win32' },
gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
}
;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
;(fs.existsSync as jest.Mock).mockResolvedValue(false)
const result = await extension.installationState()
expect(result).toBe('NotInstalled')
})
})
describe('isCompatible', () => {
it('should return true for compatible system', () => {
const mockInfo: any = {
osInfo: { platform: 'win32' },
gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
}
const result = extension.isCompatible(mockInfo)
expect(result).toBe(true)
})
it('should return false for incompatible system', () => {
const mockInfo: any = {
osInfo: { platform: 'linux' },
gpuSetting: { gpus: [{ arch: 'pascal', name: 'AMD GPU' }] },
}
const result = extension.isCompatible(mockInfo)
expect(result).toBe(false)
})
})
})
describe('GitHub Release File URL Test', () => {
const url = 'https://github.com/janhq/cortex.tensorrt-llm/releases/download/windows-v0.1.8-tensorrt-llm-v0.7.1/nitro-windows-v0.1.8-tensorrt-llm-v0.7.1-amd64-all-arch.tar.gz';
it('should return a status code 200 for the release file URL', async () => {
const response = await fetch(url, { method: 'HEAD' });
expect(response.status).toBe(200);
});
it('should not return a 404 status', async () => {
const response = await fetch(url, { method: 'HEAD' });
expect(response.status).not.toBe(404);
});
});


@@ -1,197 +0,0 @@
/**
* @module tensorrt-llm-extension/src/index
*/
import {
Compatibility,
DownloadEvent,
DownloadRequest,
DownloadState,
InstallationState,
baseName,
downloadFile,
events,
executeOnMain,
joinPath,
showToast,
systemInformation,
LocalOAIEngine,
fs,
MessageRequest,
ModelEvent,
getJanDataFolderPath,
SystemInformation,
Model,
} from '@janhq/core'
/**
* TensorRTLLMExtension - Implementation of LocalOAIEngine
* @extends LocalOAIEngine
* Provides pre-populated models for TensorRT-LLM
*/
export default class TensorRTLLMExtension extends LocalOAIEngine {
/**
* Override custom function names for loading and unloading the model,
* which are implemented in the node module
*/
override provider = PROVIDER
override inferenceUrl = INFERENCE_URL
override nodeModule = NODE
private supportedGpuArch = ['ampere', 'ada']
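// Mirrors supportedGpuArch() in src/node/index.ts (shown later in this diff).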
override compatibility() {
return COMPATIBILITY as unknown as Compatibility
}
override async onLoad(): Promise<void> {
super.onLoad()
if ((await this.installationState()) === 'Installed') {
const models = MODELS as unknown as Model[]
this.registerModels(models)
}
}
override async install(): Promise<void> {
await this.removePopulatedModels()
const info = await systemInformation()
if (!this.isCompatible(info)) return
const janDataFolderPath = await getJanDataFolderPath()
const engineVersion = TENSORRT_VERSION
const executableFolderPath = await joinPath([
janDataFolderPath,
'engines',
this.provider,
engineVersion,
info.gpuSetting?.gpus[0].arch,
])
if (!(await fs.existsSync(executableFolderPath))) {
await fs.mkdir(executableFolderPath)
}
const placeholderUrl = DOWNLOAD_RUNNER_URL
const tensorrtVersion = TENSORRT_VERSION
const url = placeholderUrl
.replace(/<version>/g, tensorrtVersion)
.replace(/<gpuarch>/g, info.gpuSetting!.gpus[0]!.arch!)
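// Illustrative: with tensorrtVersion 0.1.8 on win32 this resolves to
// https://github.com/janhq/cortex.tensorrt-llm/releases/download/windows-v0.1.8-tensorrt-llm-v0.7.1/nitro-windows-v0.1.8-tensorrt-llm-v0.7.1-amd64-all-arch.tar.gz,
// the same URL exercised by the release-file test above.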
const tarball = await baseName(url)
const tarballFullPath = await joinPath([executableFolderPath, tarball])
const downloadRequest: DownloadRequest = {
url,
localPath: tarballFullPath,
extensionId: this.name,
downloadType: 'extension',
}
downloadFile(downloadRequest)
const onFileDownloadSuccess = async (state: DownloadState) => {
// if other download, ignore
if (state.fileName !== tarball) return
events.off(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
await executeOnMain(
this.nodeModule,
'decompressRunner',
tarballFullPath,
executableFolderPath
)
events.emit(DownloadEvent.onFileUnzipSuccess, state)
// Prepopulate models as soon as it's ready
const models = MODELS as unknown as Model[]
this.registerModels(models).then(() => {
showToast(
'Extension installed successfully.',
'New models are added to Model Hub.'
)
})
}
events.on(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
}
private async removePopulatedModels(): Promise<void> {
const models = MODELS as unknown as Model[]
console.debug(`removePopulatedModels`, JSON.stringify(models))
const janDataFolderPath = await getJanDataFolderPath()
const modelFolderPath = await joinPath([janDataFolderPath, 'models'])
for (const model of models) {
const modelPath = await joinPath([modelFolderPath, model.id])
try {
await fs.rm(modelPath)
} catch (err) {
console.error(`Error removing model ${modelPath}`, err)
}
}
events.emit(ModelEvent.OnModelsUpdate, {})
}
override async loadModel(model: Model): Promise<void> {
if ((await this.installationState()) === 'Installed')
return super.loadModel(model)
throw new Error('EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension')
}
override async installationState(): Promise<InstallationState> {
const info = await systemInformation()
if (!this.isCompatible(info)) return 'NotCompatible'
const firstGpu = info.gpuSetting?.gpus[0]
const janDataFolderPath = await getJanDataFolderPath()
const engineVersion = TENSORRT_VERSION
const enginePath = await joinPath([
janDataFolderPath,
'engines',
this.provider,
engineVersion,
firstGpu.arch,
info.osInfo.platform === 'win32' ? 'nitro.exe' : 'nitro',
])
// For now, we just check the executable of nitro x tensor rt
return (await fs.existsSync(enginePath)) ? 'Installed' : 'NotInstalled'
}
override stopInference() {
if (!this.loadedModel) return
showToast(
'Unable to Stop Inference',
'The model does not support stopping inference.'
)
return Promise.resolve()
}
override async inference(data: MessageRequest) {
if (!this.loadedModel) return
// TensorRT LLM Extension supports streaming only
if (data.model && data.model.parameters) data.model.parameters.stream = true
super.inference(data)
}
isCompatible(info: SystemInformation): info is Required<SystemInformation> & {
gpuSetting: { gpus: { arch: string }[] }
} {
const firstGpu = info.gpuSetting?.gpus[0]
return (
!!info.osInfo &&
!!info.gpuSetting &&
!!firstGpu &&
info.gpuSetting.gpus.length > 0 &&
this.compatibility().platform.includes(info.osInfo.platform) &&
!!firstGpu.arch &&
firstGpu.name.toLowerCase().includes('nvidia') &&
this.supportedGpuArch.includes(firstGpu.arch)
)
}
}


@@ -1,325 +0,0 @@
import path from 'path'
import { ChildProcessWithoutNullStreams, spawn } from 'child_process'
import tcpPortUsed from 'tcp-port-used'
import fetchRT from 'fetch-retry'
import {
log,
getJanDataFolderPath,
SystemInformation,
PromptTemplate,
} from '@janhq/core/node'
import decompress from 'decompress'
import terminate from 'terminate'
// Polyfill fetch with retry
const fetchRetry = fetchRT(fetch)
const supportedPlatform = (): string[] => ['win32', 'linux']
const supportedGpuArch = (): string[] => ['ampere', 'ada']
const PORT_CHECK_INTERVAL = 100
/**
* The parameters for the model load operation.
*/
interface ModelLoadParams {
engine_path: string
ctx_len: number
}
// The subprocess instance for Engine
let subprocess: ChildProcessWithoutNullStreams | undefined = undefined
/**
* Initializes an engine subprocess to load a machine learning model.
* @param params - The model load settings.
*/
async function loadModel(
params: any,
systemInfo?: SystemInformation
): Promise<{ error: Error | undefined }> {
// modelFolder is the absolute path to the running model folder
// e.g. ~/jan/models/llama-2
let modelFolder = params.modelFolder
if (params.model.settings?.prompt_template) {
const promptTemplate = params.model.settings.prompt_template
const prompt = promptTemplateConverter(promptTemplate)
if (prompt?.error) {
return Promise.reject(prompt.error)
}
params.model.settings.system_prompt = prompt.system_prompt
params.model.settings.user_prompt = prompt.user_prompt
params.model.settings.ai_prompt = prompt.ai_prompt
}
const settings: ModelLoadParams = {
engine_path: modelFolder,
ctx_len: params.model.settings.ctx_len ?? 2048,
...params.model.settings,
}
if (!systemInfo) {
throw new Error('Cannot get system info. Unable to start nitro x tensorrt.')
}
return runEngineAndLoadModel(settings, systemInfo)
}
/**
* Stops a Engine subprocess.
*/
function unloadModel(): Promise<void> {
const controller = new AbortController()
setTimeout(() => controller.abort(), 5000)
debugLog(`Request to kill engine`)
const killRequest = () => {
return fetch(TERMINATE_ENGINE_URL, {
method: 'DELETE',
signal: controller.signal,
})
.then(() => {
subprocess = undefined
})
.catch(() => {}) // Do nothing with this attempt
.then(() =>
tcpPortUsed.waitUntilFree(
parseInt(ENGINE_PORT),
PORT_CHECK_INTERVAL,
5000
)
) // Wait for port available
.then(() => debugLog(`Engine process is terminated`))
.catch((err) => {
debugLog(
`Could not kill running process on port ${ENGINE_PORT}. Might be another process running on the same port? ${err}`
)
throw 'PORT_NOT_AVAILABLE'
})
}
if (subprocess?.pid) {
log(`[CORTEX]:: Killing PID ${subprocess.pid}`)
const pid = subprocess.pid
return new Promise((resolve, reject) => {
terminate(pid, function (err) {
if (err) {
return killRequest()
} else {
return tcpPortUsed
.waitUntilFree(parseInt(ENGINE_PORT), PORT_CHECK_INTERVAL, 5000)
.then(() => resolve())
.then(() => log(`[CORTEX]:: cortex process is terminated`))
.catch(() => {
killRequest()
})
}
})
})
} else {
return killRequest()
}
}
/**
* 1. Spawn engine process
* 2. Load model into engine subprocess
* @returns
*/
async function runEngineAndLoadModel(
settings: ModelLoadParams,
systemInfo: SystemInformation
) {
return unloadModel()
.then(() => runEngine(systemInfo))
.then(() => loadModelRequest(settings))
.catch((err) => {
// TODO: Broadcast error so app could display proper error message
debugLog(`${err}`, 'Error')
return { error: err }
})
}
/**
* Loads an LLM model into the engine subprocess by sending an HTTP POST request.
*/
async function loadModelRequest(
settings: ModelLoadParams
): Promise<{ error: Error | undefined }> {
debugLog(`Loading model with params ${JSON.stringify(settings)}`)
return fetchRetry(LOAD_MODEL_URL, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(settings),
retries: 3,
retryDelay: 500,
})
.then((res) => {
debugLog(`Load model success with response ${JSON.stringify(res)}`)
return Promise.resolve({ error: undefined })
})
.catch((err) => {
debugLog(`Load model failed with error ${err}`, 'Error')
return Promise.resolve({ error: err })
})
}
/**
* Spawns engine subprocess.
*/
async function runEngine(systemInfo: SystemInformation): Promise<void> {
debugLog(`Spawning engine subprocess...`)
if (systemInfo.gpuSetting == null) {
return Promise.reject(
'No GPU information found. Please check your GPU setting.'
)
}
if (systemInfo.gpuSetting?.gpus.length === 0) {
return Promise.reject('No GPU found. Please check your GPU setting.')
}
if (systemInfo.osInfo == null) {
return Promise.reject(
'No OS information found. Please check your OS setting.'
)
}
const platform = systemInfo.osInfo.platform
if (platform == null || supportedPlatform().includes(platform) === false) {
return Promise.reject(
'No OS architecture found. Please check your OS setting.'
)
}
const gpu = systemInfo.gpuSetting?.gpus[0]
if (gpu.name.toLowerCase().includes('nvidia') === false) {
return Promise.reject('No Nvidia GPU found. Please check your GPU setting.')
}
const gpuArch = gpu.arch
if (gpuArch == null || supportedGpuArch().includes(gpuArch) === false) {
return Promise.reject(
`Your GPU: ${gpu.name} is not supported. Only ${supportedGpuArch().join(
', '
)} series are supported.`
)
}
const janDataFolderPath = await getJanDataFolderPath()
const tensorRtVersion = TENSORRT_VERSION
const provider = PROVIDER
return new Promise<void>((resolve, reject) => {
// Current directory by default
const executableFolderPath = path.join(
janDataFolderPath,
'engines',
provider,
tensorRtVersion,
gpuArch
)
const nitroExecutablePath = path.join(
executableFolderPath,
platform === 'win32' ? 'nitro.exe' : 'nitro'
)
const args: string[] = ['1', ENGINE_HOST, ENGINE_PORT]
// Execute the binary
debugLog(`Spawn nitro at path: ${nitroExecutablePath}, and args: ${args}`)
subprocess = spawn(nitroExecutablePath, args, {
cwd: executableFolderPath,
env: {
...process.env,
},
})
// Handle subprocess output
subprocess.stdout.on('data', (data: any) => {
debugLog(`${data}`)
})
subprocess.stderr.on('data', (data: any) => {
debugLog(`${data}`)
})
subprocess.on('close', (code: any) => {
debugLog(`Engine exited with code: ${code}`)
subprocess = undefined
reject(`child process exited with code ${code}`)
})
tcpPortUsed
.waitUntilUsed(parseInt(ENGINE_PORT), PORT_CHECK_INTERVAL, 30000)
.then(() => {
debugLog(`Engine is ready`)
resolve()
})
})
}
function debugLog(message: string, level: string = 'Debug') {
log(`[TENSORRT_LLM_NITRO]::${level}:${message}`)
}
const decompressRunner = async (zipPath: string, output: string) => {
console.debug(`Decompressing ${zipPath} to ${output}...`)
try {
const files = await decompress(zipPath, output)
console.debug('Decompress finished!', files)
} catch (err) {
console.error(`Decompress ${zipPath} failed: ${err}`)
}
}
/**
* Parse prompt template into args settings
* @param promptTemplate Template as string
* @returns
*/
function promptTemplateConverter(promptTemplate: string): PromptTemplate {
// Split the string using the markers
const systemMarker = '{system_message}'
const promptMarker = '{prompt}'
if (
promptTemplate.includes(systemMarker) &&
promptTemplate.includes(promptMarker)
) {
// Find the indices of the markers
const systemIndex = promptTemplate.indexOf(systemMarker)
const promptIndex = promptTemplate.indexOf(promptMarker)
// Extract the parts of the string
const system_prompt = promptTemplate.substring(0, systemIndex)
const user_prompt = promptTemplate.substring(
systemIndex + systemMarker.length,
promptIndex
)
const ai_prompt = promptTemplate.substring(
promptIndex + promptMarker.length
)
// Return the split parts
return { system_prompt, user_prompt, ai_prompt }
} else if (promptTemplate.includes(promptMarker)) {
// Extract the parts of the string for the case where only promptMarker is present
const promptIndex = promptTemplate.indexOf(promptMarker)
const user_prompt = promptTemplate.substring(0, promptIndex)
const ai_prompt = promptTemplate.substring(
promptIndex + promptMarker.length
)
// Return the split parts
return { user_prompt, ai_prompt }
}
// Return an error if none of the conditions are met
return { error: 'Cannot split prompt template' }
}
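// Example (derived from the template in resources/models.json):
// promptTemplateConverter('[INST] {prompt} [/INST]') returns
// { user_prompt: '[INST] ', ai_prompt: ' [/INST]' }, since only the {prompt} marker is present.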
export default {
supportedPlatform,
supportedGpuArch,
decompressRunner,
loadModel,
unloadModel,
dispose: unloadModel,
}


@@ -1,21 +0,0 @@
{
"compilerOptions": {
"moduleResolution": "node",
"target": "ES2015",
"module": "ES2020",
"lib": ["es2015", "es2016", "es2017", "dom"],
"strict": true,
"sourceMap": true,
"declaration": true,
"allowSyntheticDefaultImports": true,
"experimentalDecorators": true,
"emitDecoratorMetadata": true,
"declarationDir": "dist/types",
"outDir": "dist",
"importHelpers": true,
"resolveJsonModule": true,
"typeRoots": ["node_modules/@types"]
},
"include": ["src"],
"exclude": ["**/*.test.ts"]
}