chore: deprecate tensorrt-llm extension (#4453)
parent 58bb1b4939
commit 06ee10be1b
@@ -1,79 +0,0 @@

# TensorRT-LLM Extension

Created using Jan extension example

# Create a Jan Extension using Typescript

Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀

## Create Your Own Extension

To create your own extension, you can use this repository as a template! Just follow the instructions below:

1. Click the Use this template button at the top of the repository
2. Select Create a new repository
3. Select an owner and name for your new repository
4. Click Create repository
5. Clone your new repository

## Initial Setup

After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.

> [!NOTE]
>
> You'll need to have a reasonably modern version of
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
> [`nodenv`](https://github.com/nodenv/nodenv) or
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
> root of your repository to install the version specified in
> [`package.json`](./package.json). Otherwise, 20.x or later should work!

1. :hammer_and_wrench: Install the dependencies

   ```bash
   npm install
   ```

1. :building_construction: Package the TypeScript for distribution

   ```bash
   npm run bundle
   ```

1. :white_check_mark: Check your artifact

   There will be a `.tgz` file in your extension directory now

## Update the Extension Metadata

The [`package.json`](package.json) file defines metadata about your extension, such as
extension name, main entry, description and version.

When you copy this repository, update `package.json` with the name and description of your extension.

## Update the Extension Code

The [`src/`](./src/) directory is the heart of your extension! This contains the
source code that will be run when your extension functions are invoked. You can replace the
contents of this directory with your own code.

There are a few things to keep in mind when writing your extension code:

- Most Jan Extension functions are processed asynchronously.
  In `index.ts`, you will see that the extension function will return a `Promise<any>`.

  ```typescript
  import { events, MessageEvent, MessageRequest } from '@janhq/core'

  function onStart(): Promise<any> {
    return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
      this.inference(data)
    )
  }
  ```
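
Results are typically reported back over the same event bus. As a minimal sketch (assuming the `events.emit` API and the `MessageEvent.OnMessageUpdate` member exported by `@janhq/core`), a handler can broadcast an updated message once inference produces output:

```typescript
import { events, MessageEvent, ThreadMessage } from '@janhq/core'

// Publish an updated message so the rest of the app can react to it.
function publishUpdate(message: ThreadMessage) {
  events.emit(MessageEvent.OnMessageUpdate, message)
}
```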

For more information about the Jan Extension Core module, see the
[documentation](https://github.com/janhq/jan/blob/main/core/README.md).

So, what are you waiting for? Go ahead and start customizing your extension!
@@ -1,9 +0,0 @@

/** @type {import('ts-jest').JestConfigWithTsJest} */
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
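  // Transform @janhq/core sources with ts-jest instead of skipping them with
  // the rest of node_modules (see transformIgnorePatterns below).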
  transform: {
    'node_modules/@janhq/core/.+\\.(j|t)s?$': 'ts-jest',
  },
  transformIgnorePatterns: ['node_modules/(?!@janhq/core/.*)'],
}
@@ -1,78 +0,0 @@

{
  "name": "@janhq/tensorrt-llm-extension",
  "productName": "TensorRT-LLM Inference Engine",
  "version": "0.0.3",
  "description": "This extension enables Nvidia's TensorRT-LLM for the fastest GPU acceleration. See the [setup guide](https://jan.ai/guides/providers/tensorrt-llm/) for next steps.",
  "main": "dist/index.js",
  "node": "dist/node/index.cjs.js",
  "author": "Jan <service@jan.ai>",
  "license": "AGPL-3.0",
  "config": {
    "host": "127.0.0.1",
    "port": "3929"
  },
  "compatibility": {
    "platform": [
      "win32"
    ],
    "app": [
      "0.1.0"
    ]
  },
  "tensorrtVersion": "0.1.8",
  "provider": "nitro-tensorrt-llm",
  "scripts": {
    "test": "jest",
    "build": "rolldown -c rolldown.config.mjs",
    "build:publish": "rimraf *.tgz --glob || true && yarn build && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install"
  },
  "exports": {
    ".": "./dist/index.js",
    "./main": "./dist/node/index.cjs.js"
  },
  "devDependencies": {
    "@types/decompress": "4.2.7",
    "@types/jest": "^29.5.12",
    "@types/node": "^20.11.4",
    "@types/os-utils": "^0.0.4",
    "@types/tcp-port-used": "^1.0.4",
    "cpx": "^1.5.0",
    "download-cli": "^1.1.1",
    "jest": "^29.7.0",
    "jest-junit": "^16.0.0",
    "jest-runner": "^29.7.0",
    "rimraf": "^3.0.2",
    "rolldown": "1.0.0-beta.1",
    "run-script-os": "^1.1.6",
    "ts-jest": "^29.2.5",
    "typescript": "^5.2.2"
  },
  "dependencies": {
    "@janhq/core": "../../core/package.tgz",
    "decompress": "^4.2.1",
    "fetch-retry": "^5.0.6",
    "rxjs": "^7.8.1",
    "tcp-port-used": "^1.0.2",
    "terminate": "^2.6.1",
    "ulidx": "^2.3.0"
  },
  "engines": {
    "node": ">=18.0.0"
  },
  "files": [
    "dist/*",
    "package.json",
    "README.md"
  ],
  "bundleDependencies": [
    "tcp-port-used",
    "fetch-retry",
    "decompress",
    "@janhq/core",
    "terminate"
  ],
  "installConfig": {
    "hoistingLimits": "workspaces"
  },
  "packageManager": "yarn@4.5.3"
}
@@ -1,156 +0,0 @@

[
  {
    "sources": [
      {
        "filename": "config.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/config.json"
      },
      {
        "filename": "mistral_float16_tp1_rank0.engine",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/mistral_float16_tp1_rank0.engine"
      },
      {
        "filename": "tokenizer.model",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/tokenizer.model"
      },
      {
        "filename": "special_tokens_map.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/special_tokens_map.json"
      },
      {
        "filename": "tokenizer.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/tokenizer.json"
      },
      {
        "filename": "tokenizer_config.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/tokenizer_config.json"
      },
      {
        "filename": "model.cache",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/LlamaCorn-1.1B-Chat-fp16/model.cache"
      }
    ],
    "id": "llamacorn-1.1b-chat-fp16",
    "object": "model",
    "name": "LlamaCorn 1.1B Chat FP16",
    "version": "1.0",
    "description": "LlamaCorn is a refined version of TinyLlama-1.1B, optimized for conversational quality, running on consumer devices through TensorRT-LLM",
    "format": "TensorRT-LLM",
    "settings": {
      "ctx_len": 2048,
      "text_model": false
    },
    "parameters": {
      "max_tokens": 4096
    },
    "metadata": {
      "author": "LLama",
      "tags": ["TensorRT-LLM", "1B", "Finetuned"],
      "size": 2151000000
    },
    "engine": "nitro-tensorrt-llm"
  },
  {
    "sources": [
      {
        "filename": "config.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/config.json"
      },
      {
        "filename": "mistral_float16_tp1_rank0.engine",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/mistral_float16_tp1_rank0.engine"
      },
      {
        "filename": "tokenizer.model",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/tokenizer.model"
      },
      {
        "filename": "special_tokens_map.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/special_tokens_map.json"
      },
      {
        "filename": "tokenizer.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/tokenizer.json"
      },
      {
        "filename": "tokenizer_config.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/tokenizer_config.json"
      },
      {
        "filename": "model.cache",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/TinyJensen-1.1B-Chat-fp16/model.cache"
      }
    ],
    "id": "tinyjensen-1.1b-chat-fp16",
    "object": "model",
    "name": "TinyJensen 1.1B Chat FP16",
    "version": "1.0",
    "description": "Do you want to chat with Jensen Huang? Here you are",
    "format": "TensorRT-LLM",
    "settings": {
      "ctx_len": 2048,
      "text_model": false
    },
    "parameters": {
      "max_tokens": 4096
    },
    "metadata": {
      "author": "LLama",
      "tags": ["TensorRT-LLM", "1B", "Finetuned"],
      "size": 2151000000
    },
    "engine": "nitro-tensorrt-llm"
  },
  {
    "sources": [
      {
        "filename": "config.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/config.json"
      },
      {
        "filename": "mistral_float16_tp1_rank0.engine",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/mistral_float16_tp1_rank0.engine"
      },
      {
        "filename": "tokenizer.model",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/tokenizer.model"
      },
      {
        "filename": "special_tokens_map.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/special_tokens_map.json"
      },
      {
        "filename": "tokenizer.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/tokenizer.json"
      },
      {
        "filename": "tokenizer_config.json",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/tokenizer_config.json"
      },
      {
        "filename": "model.cache",
        "url": "https://catalog.jan.ai/dist/models/<gpuarch>/<os>/tensorrt-llm-v0.7.1/Mistral-7B-Instruct-v0.1-int4/model.cache"
      }
    ],
    "id": "mistral-7b-instruct-int4",
    "object": "model",
    "name": "Mistral 7B Instruct v0.1 INT4",
    "version": "1.0",
    "description": "Mistral 7B Instruct v0.1 INT4",
    "format": "TensorRT-LLM",
    "settings": {
      "ctx_len": 2048,
      "text_model": false,
      "prompt_template": "[INST] {prompt} [/INST]"
    },
    "parameters": {
      "max_tokens": 4096
    },
    "metadata": {
      "author": "MistralAI",
      "tags": ["TensorRT-LLM", "7B", "Finetuned"],
      "size": 3840000000
    },
    "engine": "nitro-tensorrt-llm"
  }
]
@@ -1,59 +0,0 @@

import { defineConfig } from 'rolldown'
import packageJson from './package.json' with { type: 'json' }
import modelsJson from './resources/models.json' with { type: 'json' }

export default defineConfig([
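  // Two bundles: a browser/renderer entry (package.json "main") and a Node
  // entry (package.json "node") that the app runs via executeOnMain.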
  {
    input: 'src/index.ts',
    output: {
      format: 'esm',
      file: 'dist/index.js',
    },
    platform: 'browser',
    define: {
      MODELS: JSON.stringify(modelsJson),
      TENSORRT_VERSION: JSON.stringify(packageJson.tensorrtVersion),
      PROVIDER: JSON.stringify(packageJson.provider),
      DOWNLOAD_RUNNER_URL:
        process.platform === 'win32'
          ? JSON.stringify(
              'https://github.com/janhq/cortex.tensorrt-llm/releases/download/windows-v<version>-tensorrt-llm-v0.7.1/nitro-windows-v<version>-tensorrt-llm-v0.7.1-amd64-all-arch.tar.gz'
            )
          : JSON.stringify(
              'https://github.com/janhq/cortex.tensorrt-llm/releases/download/linux-v<version>/nitro-linux-v<version>-amd64-tensorrt-llm-<gpuarch>.tar.gz'
            ),
      NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
      INFERENCE_URL: JSON.stringify(
        process.env.INFERENCE_URL ||
          `${packageJson.config?.protocol ?? 'http'}://${packageJson.config?.host}:${packageJson.config?.port}/v1/chat/completions`
      ),
      COMPATIBILITY: JSON.stringify(packageJson.compatibility),
    },
  },
  {
    input: 'src/node/index.ts',
    external: ['@janhq/core/node'],
    output: {
      format: 'cjs',
      file: 'dist/node/index.cjs.js',
      sourcemap: false,
      inlineDynamicImports: true,
    },
    replace: {
      TENSORRT_VERSION: JSON.stringify(packageJson.tensorrtVersion),
      PROVIDER: JSON.stringify(packageJson.provider),
      LOAD_MODEL_URL: JSON.stringify(
        `${packageJson.config?.protocol ?? 'http'}://${packageJson.config?.host}:${packageJson.config?.port}/inferences/tensorrtllm/loadmodel`
      ),
      TERMINATE_ENGINE_URL: JSON.stringify(
        `${packageJson.config?.protocol ?? 'http'}://${packageJson.config?.host}:${packageJson.config?.port}/processmanager/destroy`
      ),
      ENGINE_HOST: JSON.stringify(packageJson.config?.host ?? '127.0.0.1'),
      ENGINE_PORT: JSON.stringify(packageJson.config?.port ?? '3928'),
    },
    resolve: {
      extensions: ['.js', '.ts', '.json'],
    },
    platform: 'node',
  },
])
@@ -1,11 +0,0 @@

declare const NODE: string
declare const INFERENCE_URL: string
declare const LOAD_MODEL_URL: string
declare const TERMINATE_ENGINE_URL: string
declare const ENGINE_HOST: string
declare const ENGINE_PORT: string
declare const DOWNLOAD_RUNNER_URL: string
declare const TENSORRT_VERSION: string
declare const COMPATIBILITY: object
declare const PROVIDER: string
declare const MODELS: Array<any>
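
// None of these identifiers exists as a real global at runtime: rolldown
// substitutes each one with a literal at bundle time, via the `define` map
// (browser bundle) and the `replace` map (Node bundle) in rolldown.config.mjs.
// For example, TENSORRT_VERSION becomes the JSON-stringified "tensorrtVersion"
// field of package.json.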
@@ -1,186 +0,0 @@

import TensorRTLLMExtension from '../src/index'
import {
  executeOnMain,
  systemInformation,
  fs,
  baseName,
  joinPath,
  downloadFile,
} from '@janhq/core'

jest.mock('@janhq/core', () => ({
  ...jest.requireActual('@janhq/core/node'),
  LocalOAIEngine: jest.fn().mockImplementation(function () {
    // @ts-ignore
    this.registerModels = () => {
      return Promise.resolve()
    }
    // @ts-ignore
    return this
  }),
  systemInformation: jest.fn(),
  fs: {
    existsSync: jest.fn(),
    mkdir: jest.fn(),
  },
  joinPath: jest.fn(),
  baseName: jest.fn(),
  downloadFile: jest.fn(),
  executeOnMain: jest.fn(),
  showToast: jest.fn(),
  events: {
    emit: jest.fn(),
    // @ts-ignore
    on: (event, func) => {
      func({ fileName: './' })
    },
    off: jest.fn(),
  },
}))
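
// These globals stand in for the build-time constants that rolldown injects
// via its `define` map (see rolldown.config.mjs).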
// @ts-ignore
global.COMPATIBILITY = {
  platform: ['win32'],
}
// @ts-ignore
global.PROVIDER = 'tensorrt-llm'
// @ts-ignore
global.INFERENCE_URL = 'http://localhost:5000'
// @ts-ignore
global.NODE = 'node'
// @ts-ignore
global.MODELS = []
// @ts-ignore
global.TENSORRT_VERSION = ''
// @ts-ignore
global.DOWNLOAD_RUNNER_URL = ''

describe('TensorRTLLMExtension', () => {
  let extension: TensorRTLLMExtension

  beforeEach(() => {
    // @ts-ignore
    extension = new TensorRTLLMExtension()
    jest.clearAllMocks()
  })

  describe('compatibility', () => {
    it('should return the correct compatibility', () => {
      const result = extension.compatibility()
      expect(result).toEqual({
        platform: ['win32'],
      })
    })
  })

  describe('install', () => {
    it('should install if compatible', async () => {
      const mockSystemInfo: any = {
        osInfo: { platform: 'win32' },
        gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
      }
      ;(executeOnMain as jest.Mock).mockResolvedValue({})
      ;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
      ;(fs.existsSync as jest.Mock).mockResolvedValue(false)
      ;(fs.mkdir as jest.Mock).mockResolvedValue(undefined)
      ;(baseName as jest.Mock).mockResolvedValue('./')
      ;(joinPath as jest.Mock).mockResolvedValue('./')
      ;(downloadFile as jest.Mock).mockResolvedValue({})

      await extension.install()

      expect(executeOnMain).toHaveBeenCalled()
    })

    it('should not install if not compatible', async () => {
      const mockSystemInfo: any = {
        osInfo: { platform: 'linux' },
        gpuSetting: { gpus: [{ arch: 'pascal', name: 'NVIDIA GPU' }] },
      }
      ;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)

      jest.spyOn(extension, 'registerModels').mockReturnValue(Promise.resolve())
      await extension.install()

      expect(executeOnMain).not.toHaveBeenCalled()
    })
  })

  describe('installationState', () => {
    it('should return NotCompatible if not compatible', async () => {
      const mockSystemInfo: any = {
        osInfo: { platform: 'linux' },
        gpuSetting: { gpus: [{ arch: 'pascal', name: 'NVIDIA GPU' }] },
      }
      ;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)

      const result = await extension.installationState()

      expect(result).toBe('NotCompatible')
    })

    it('should return Installed if executable exists', async () => {
      const mockSystemInfo: any = {
        osInfo: { platform: 'win32' },
        gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
      }
      ;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
      ;(fs.existsSync as jest.Mock).mockResolvedValue(true)

      const result = await extension.installationState()

      expect(result).toBe('Installed')
    })

    it('should return NotInstalled if executable does not exist', async () => {
      const mockSystemInfo: any = {
        osInfo: { platform: 'win32' },
        gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
      }
      ;(systemInformation as jest.Mock).mockResolvedValue(mockSystemInfo)
      ;(fs.existsSync as jest.Mock).mockResolvedValue(false)

      const result = await extension.installationState()

      expect(result).toBe('NotInstalled')
    })
  })

  describe('isCompatible', () => {
    it('should return true for compatible system', () => {
      const mockInfo: any = {
        osInfo: { platform: 'win32' },
        gpuSetting: { gpus: [{ arch: 'ampere', name: 'NVIDIA GPU' }] },
      }

      const result = extension.isCompatible(mockInfo)

      expect(result).toBe(true)
    })

    it('should return false for incompatible system', () => {
      const mockInfo: any = {
        osInfo: { platform: 'linux' },
        gpuSetting: { gpus: [{ arch: 'pascal', name: 'AMD GPU' }] },
      }

      const result = extension.isCompatible(mockInfo)

      expect(result).toBe(false)
    })
  })
})

describe('GitHub Release File URL Test', () => {
  const url =
    'https://github.com/janhq/cortex.tensorrt-llm/releases/download/windows-v0.1.8-tensorrt-llm-v0.7.1/nitro-windows-v0.1.8-tensorrt-llm-v0.7.1-amd64-all-arch.tar.gz'

  it('should return a status code 200 for the release file URL', async () => {
    const response = await fetch(url, { method: 'HEAD' })
    expect(response.status).toBe(200)
  })

  it('should not return a 404 status', async () => {
    const response = await fetch(url, { method: 'HEAD' })
    expect(response.status).not.toBe(404)
  })
})
@@ -1,197 +0,0 @@

/**
 * @module tensorrt-llm-extension/src/index
 */

import {
  Compatibility,
  DownloadEvent,
  DownloadRequest,
  DownloadState,
  InstallationState,
  baseName,
  downloadFile,
  events,
  executeOnMain,
  joinPath,
  showToast,
  systemInformation,
  LocalOAIEngine,
  fs,
  MessageRequest,
  ModelEvent,
  getJanDataFolderPath,
  SystemInformation,
  Model,
} from '@janhq/core'

/**
 * TensorRTLLMExtension - Implementation of LocalOAIEngine
 * @extends BaseOAILocalInferenceProvider
 * Provides pre-populated models for TensorRT-LLM
 */
export default class TensorRTLLMExtension extends LocalOAIEngine {
  /**
   * Overrides the custom function names for loading and unloading the model,
   * which are implemented in the node module.
   */
  override provider = PROVIDER
  override inferenceUrl = INFERENCE_URL
  override nodeModule = NODE

  private supportedGpuArch = ['ampere', 'ada']

  override compatibility() {
    return COMPATIBILITY as unknown as Compatibility
  }

  override async onLoad(): Promise<void> {
    super.onLoad()

    if ((await this.installationState()) === 'Installed') {
      const models = MODELS as unknown as Model[]
      this.registerModels(models)
    }
  }

  override async install(): Promise<void> {
    await this.removePopulatedModels()

    const info = await systemInformation()

    if (!this.isCompatible(info)) return

    const janDataFolderPath = await getJanDataFolderPath()
    const engineVersion = TENSORRT_VERSION

    const executableFolderPath = await joinPath([
      janDataFolderPath,
      'engines',
      this.provider,
      engineVersion,
      info.gpuSetting?.gpus[0].arch,
    ])

    if (!(await fs.existsSync(executableFolderPath))) {
      await fs.mkdir(executableFolderPath)
    }

    const placeholderUrl = DOWNLOAD_RUNNER_URL
    const tensorrtVersion = TENSORRT_VERSION

    const url = placeholderUrl
      .replace(/<version>/g, tensorrtVersion)
      .replace(/<gpuarch>/g, info.gpuSetting!.gpus[0]!.arch!)

    const tarball = await baseName(url)

    const tarballFullPath = await joinPath([executableFolderPath, tarball])
    const downloadRequest: DownloadRequest = {
      url,
      localPath: tarballFullPath,
      extensionId: this.name,
      downloadType: 'extension',
    }
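    // Fire-and-forget: the download's completion is observed through the
    // DownloadEvent.onFileDownloadSuccess handler registered below.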
    downloadFile(downloadRequest)

    const onFileDownloadSuccess = async (state: DownloadState) => {
      // if other download, ignore
      if (state.fileName !== tarball) return
      events.off(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
      await executeOnMain(
        this.nodeModule,
        'decompressRunner',
        tarballFullPath,
        executableFolderPath
      )
      events.emit(DownloadEvent.onFileUnzipSuccess, state)

      // Prepopulate models as soon as the engine is ready
      const models = MODELS as unknown as Model[]
      this.registerModels(models).then(() => {
        showToast(
          'Extension installed successfully.',
          'New models are added to Model Hub.'
        )
      })
    }
    events.on(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
  }

  private async removePopulatedModels(): Promise<void> {
    const models = MODELS as unknown as Model[]
    console.debug(`removePopulatedModels`, JSON.stringify(models))
    const janDataFolderPath = await getJanDataFolderPath()
    const modelFolderPath = await joinPath([janDataFolderPath, 'models'])

    for (const model of models) {
      const modelPath = await joinPath([modelFolderPath, model.id])

      try {
        await fs.rm(modelPath)
      } catch (err) {
        console.error(`Error removing model ${modelPath}`, err)
      }
    }
    events.emit(ModelEvent.OnModelsUpdate, {})
  }

  override async loadModel(model: Model): Promise<void> {
    if ((await this.installationState()) === 'Installed')
      return super.loadModel(model)

    throw new Error('EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension')
  }

  override async installationState(): Promise<InstallationState> {
    const info = await systemInformation()

    if (!this.isCompatible(info)) return 'NotCompatible'
    const firstGpu = info.gpuSetting?.gpus[0]
    const janDataFolderPath = await getJanDataFolderPath()
    const engineVersion = TENSORRT_VERSION

    const enginePath = await joinPath([
      janDataFolderPath,
      'engines',
      this.provider,
      engineVersion,
      firstGpu.arch,
      info.osInfo.platform === 'win32' ? 'nitro.exe' : 'nitro',
    ])

    // For now, we just check for the nitro x TensorRT executable
    return (await fs.existsSync(enginePath)) ? 'Installed' : 'NotInstalled'
  }

  override stopInference() {
    if (!this.loadedModel) return
    showToast(
      'Unable to Stop Inference',
      'The model does not support stopping inference.'
    )
    return Promise.resolve()
  }

  override async inference(data: MessageRequest) {
    if (!this.loadedModel) return
    // The TensorRT-LLM extension supports streaming only
    if (data.model && data.model.parameters) data.model.parameters.stream = true
    super.inference(data)
  }

  isCompatible(info: SystemInformation): info is Required<SystemInformation> & {
    gpuSetting: { gpus: { arch: string }[] }
  } {
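    // Type predicate: on success, narrows `info` so callers can safely read
    // osInfo, gpuSetting and gpus[0].arch without further checks.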
    const firstGpu = info.gpuSetting?.gpus[0]
    return (
      !!info.osInfo &&
      !!info.gpuSetting &&
      !!firstGpu &&
      info.gpuSetting.gpus.length > 0 &&
      this.compatibility().platform.includes(info.osInfo.platform) &&
      !!firstGpu.arch &&
      firstGpu.name.toLowerCase().includes('nvidia') &&
      this.supportedGpuArch.includes(firstGpu.arch)
    )
  }
}
@@ -1,325 +0,0 @@

import path from 'path'
import { ChildProcessWithoutNullStreams, spawn } from 'child_process'
import tcpPortUsed from 'tcp-port-used'
import fetchRT from 'fetch-retry'
import {
  log,
  getJanDataFolderPath,
  SystemInformation,
  PromptTemplate,
} from '@janhq/core/node'
import decompress from 'decompress'
import terminate from 'terminate'

// Polyfill fetch with retry
const fetchRetry = fetchRT(fetch)

const supportedPlatform = (): string[] => ['win32', 'linux']
const supportedGpuArch = (): string[] => ['ampere', 'ada']
const PORT_CHECK_INTERVAL = 100

/**
 * Parameters for a model-load request sent to the engine.
 */
interface ModelLoadParams {
  engine_path: string
  ctx_len: number
}

// The subprocess instance for the engine
let subprocess: ChildProcessWithoutNullStreams | undefined = undefined

/**
 * Initializes an engine subprocess to load a machine learning model.
 * @param params - The model load settings.
 */
async function loadModel(
  params: any,
  systemInfo?: SystemInformation
): Promise<{ error: Error | undefined }> {
  // modelFolder is the absolute path to the running model folder
  // e.g. ~/jan/models/llama-2
  let modelFolder = params.modelFolder

  if (params.model.settings?.prompt_template) {
    const promptTemplate = params.model.settings.prompt_template
    const prompt = promptTemplateConverter(promptTemplate)
    if (prompt?.error) {
      return Promise.reject(prompt.error)
    }
    params.model.settings.system_prompt = prompt.system_prompt
    params.model.settings.user_prompt = prompt.user_prompt
    params.model.settings.ai_prompt = prompt.ai_prompt
  }

  const settings: ModelLoadParams = {
    engine_path: modelFolder,
    ctx_len: params.model.settings.ctx_len ?? 2048,
    ...params.model.settings,
  }
  if (!systemInfo) {
    throw new Error('Cannot get system info. Unable to start nitro x TensorRT.')
  }
  return runEngineAndLoadModel(settings, systemInfo)
}

/**
 * Stops the engine subprocess.
 */
function unloadModel(): Promise<void> {
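  // Prefer killing the spawned process by PID; otherwise fall back to the
  // engine's HTTP terminate endpoint, then wait for the port to be freed.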
  const controller = new AbortController()
  setTimeout(() => controller.abort(), 5000)
  debugLog(`Request to kill engine`)

  const killRequest = () => {
    return fetch(TERMINATE_ENGINE_URL, {
      method: 'DELETE',
      signal: controller.signal,
    })
      .then(() => {
        subprocess = undefined
      })
      .catch(() => {}) // Do nothing with this attempt
      .then(() =>
        tcpPortUsed.waitUntilFree(
          parseInt(ENGINE_PORT),
          PORT_CHECK_INTERVAL,
          5000
        )
      ) // Wait for the port to become available
      .then(() => debugLog(`Engine process is terminated`))
      .catch((err) => {
        debugLog(
          `Could not kill running process on port ${ENGINE_PORT}. Might be another process running on the same port? ${err}`
        )
        throw 'PORT_NOT_AVAILABLE'
      })
  }

  if (subprocess?.pid) {
    log(`[CORTEX]:: Killing PID ${subprocess.pid}`)
    const pid = subprocess.pid
    return new Promise((resolve, reject) => {
      terminate(pid, function (err) {
        if (err) {
          return killRequest()
        } else {
          return tcpPortUsed
            .waitUntilFree(parseInt(ENGINE_PORT), PORT_CHECK_INTERVAL, 5000)
            .then(() => resolve())
            .then(() => log(`[CORTEX]:: cortex process is terminated`))
            .catch(() => {
              killRequest()
            })
        }
      })
    })
  } else {
    return killRequest()
  }
}

/**
 * 1. Spawn engine process
 * 2. Load model into engine subprocess
 * @returns
 */
async function runEngineAndLoadModel(
  settings: ModelLoadParams,
  systemInfo: SystemInformation
) {
  return unloadModel()
    .then(() => runEngine(systemInfo))
    .then(() => loadModelRequest(settings))
    .catch((err) => {
      // TODO: Broadcast error so the app can display a proper error message
      debugLog(`${err}`, 'Error')
      return { error: err }
    })
}

/**
 * Loads an LLM model into the engine subprocess by sending an HTTP POST request.
 */
async function loadModelRequest(
  settings: ModelLoadParams
): Promise<{ error: Error | undefined }> {
  debugLog(`Loading model with params ${JSON.stringify(settings)}`)
  return fetchRetry(LOAD_MODEL_URL, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(settings),
    retries: 3,
    retryDelay: 500,
  })
    .then((res) => {
      debugLog(`Load model success with response ${JSON.stringify(res)}`)
      return Promise.resolve({ error: undefined })
    })
    .catch((err) => {
      debugLog(`Load model failed with error ${err}`, 'Error')
      return Promise.resolve({ error: err })
    })
}

/**
 * Spawns the engine subprocess.
 */
async function runEngine(systemInfo: SystemInformation): Promise<void> {
  debugLog(`Spawning engine subprocess...`)
  if (systemInfo.gpuSetting == null) {
    return Promise.reject(
      'No GPU information found. Please check your GPU setting.'
    )
  }

  if (systemInfo.gpuSetting?.gpus.length === 0) {
    return Promise.reject('No GPU found. Please check your GPU setting.')
  }

  if (systemInfo.osInfo == null) {
    return Promise.reject(
      'No OS information found. Please check your OS setting.'
    )
  }
  const platform = systemInfo.osInfo.platform
  if (platform == null || supportedPlatform().includes(platform) === false) {
    return Promise.reject(
      'Unsupported OS platform. Please check your OS setting.'
    )
  }

  const gpu = systemInfo.gpuSetting?.gpus[0]
  if (gpu.name.toLowerCase().includes('nvidia') === false) {
    return Promise.reject('No Nvidia GPU found. Please check your GPU setting.')
  }
  const gpuArch = gpu.arch
  if (gpuArch == null || supportedGpuArch().includes(gpuArch) === false) {
    return Promise.reject(
      `Your GPU: ${gpu.name} is not supported. Only ${supportedGpuArch().join(
        ', '
      )} series are supported.`
    )
  }
  const janDataFolderPath = await getJanDataFolderPath()
  const tensorRtVersion = TENSORRT_VERSION
  const provider = PROVIDER

  return new Promise<void>((resolve, reject) => {
    // Current directory by default

    const executableFolderPath = path.join(
      janDataFolderPath,
      'engines',
      provider,
      tensorRtVersion,
      gpuArch
    )
    const nitroExecutablePath = path.join(
      executableFolderPath,
      platform === 'win32' ? 'nitro.exe' : 'nitro'
    )

    const args: string[] = ['1', ENGINE_HOST, ENGINE_PORT]
    // Execute the binary
    debugLog(`Spawn nitro at path: ${nitroExecutablePath}, and args: ${args}`)
    subprocess = spawn(nitroExecutablePath, args, {
      cwd: executableFolderPath,
      env: {
        ...process.env,
      },
    })

    // Handle subprocess output
    subprocess.stdout.on('data', (data: any) => {
      debugLog(`${data}`)
    })

    subprocess.stderr.on('data', (data: any) => {
      debugLog(`${data}`)
    })

    subprocess.on('close', (code: any) => {
      debugLog(`Engine exited with code: ${code}`)
      subprocess = undefined
      reject(`child process exited with code ${code}`)
    })

    tcpPortUsed
      .waitUntilUsed(parseInt(ENGINE_PORT), PORT_CHECK_INTERVAL, 30000)
      .then(() => {
        debugLog(`Engine is ready`)
        resolve()
      })
  })
}

function debugLog(message: string, level: string = 'Debug') {
  log(`[TENSORRT_LLM_NITRO]::${level}:${message}`)
}

const decompressRunner = async (zipPath: string, output: string) => {
  console.debug(`Decompressing ${zipPath} to ${output}...`)
  try {
    const files = await decompress(zipPath, output)
    console.debug('Decompress finished!', files)
  } catch (err) {
    console.error(`Decompress ${zipPath} failed: ${err}`)
  }
}

/**
 * Parses a prompt template into prompt settings.
 * @param promptTemplate Template as string
 * @returns
 */
function promptTemplateConverter(promptTemplate: string): PromptTemplate {
  // Split the string using the markers
  const systemMarker = '{system_message}'
  const promptMarker = '{prompt}'

  if (
    promptTemplate.includes(systemMarker) &&
    promptTemplate.includes(promptMarker)
  ) {
    // Find the indices of the markers
    const systemIndex = promptTemplate.indexOf(systemMarker)
    const promptIndex = promptTemplate.indexOf(promptMarker)

    // Extract the parts of the string
    const system_prompt = promptTemplate.substring(0, systemIndex)
    const user_prompt = promptTemplate.substring(
      systemIndex + systemMarker.length,
      promptIndex
    )
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    )

    // Return the split parts
    return { system_prompt, user_prompt, ai_prompt }
  } else if (promptTemplate.includes(promptMarker)) {
    // Extract the parts of the string for the case where only promptMarker is present
    const promptIndex = promptTemplate.indexOf(promptMarker)
    const user_prompt = promptTemplate.substring(0, promptIndex)
    const ai_prompt = promptTemplate.substring(
      promptIndex + promptMarker.length
    )

    // Return the split parts
    return { user_prompt, ai_prompt }
  }

  // Return an error if none of the conditions are met
  return { error: 'Cannot split prompt template' }
}
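
// Example: for the Mistral template shipped in resources/models.json,
//   promptTemplateConverter('[INST] {prompt} [/INST]')
// returns { user_prompt: '[INST] ', ai_prompt: ' [/INST]' },
// so each incoming message is wrapped as `[INST] <text> [/INST]`.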

export default {
  supportedPlatform,
  supportedGpuArch,
  decompressRunner,
  loadModel,
  unloadModel,
  dispose: unloadModel,
}
@@ -1,21 +0,0 @@

{
  "compilerOptions": {
    "moduleResolution": "node",
    "target": "ES2015",
    "module": "ES2020",
    "lib": ["es2015", "es2016", "es2017", "dom"],
    "strict": true,
    "sourceMap": true,
    "declaration": true,
    "allowSyntheticDefaultImports": true,
    "experimentalDecorators": true,
    "emitDecoratorMetadata": true,
    "declarationDir": "dist/types",
    "outDir": "dist",
    "importHelpers": true,
    "resolveJsonModule": true,
    "typeRoots": ["node_modules/@types"]
  },
  "include": ["src"],
  "exclude": ["**/*.test.ts"]
}