diff --git a/README.md b/README.md
index b25a917d7..adebb8ea1 100644
--- a/README.md
+++ b/README.md
@@ -76,31 +76,31 @@ Jan is an open-source ChatGPT alternative that runs 100% offline on your compute
-
+ jan.AppImage
diff --git a/docs/docs/guides/providers/README.mdx b/docs/docs/guides/providers/README.mdx
new file mode 100644
index 000000000..aa3bfea1f
--- /dev/null
+++ b/docs/docs/guides/providers/README.mdx
@@ -0,0 +1,8 @@
+---
+title: Inference Providers
+slug: /guides/providers
+---
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
diff --git a/docs/docs/guides/providers/image.png b/docs/docs/guides/providers/image.png
new file mode 100644
index 000000000..5f1f7104e
Binary files /dev/null and b/docs/docs/guides/providers/image.png differ
diff --git a/docs/docs/guides/providers/llama-cpp.md b/docs/docs/guides/providers/llama-cpp.md
new file mode 100644
index 000000000..d2b0daa2a
--- /dev/null
+++ b/docs/docs/guides/providers/llama-cpp.md
@@ -0,0 +1,10 @@
+---
+title: llama.cpp
+slug: /guides/providers/llama-cpp
+---
+
+## Overview
+
+[Nitro](https://github.com/janhq/nitro) is an inference server built on top of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides an OpenAI-compatible API, request queueing, and scaling.
+
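+For illustration, here is a minimal sketch of calling Nitro's OpenAI-compatible chat endpoint directly. The port (3928) is Nitro's default, and the route is an assumption based on Nitro's documentation at the time of writing, so both may differ across versions; Jan normally manages these requests for you.
+
+```ts
+// Minimal sketch: send a chat request straight to the local Nitro server.
+// Assumes Nitro is already running with a model loaded on its default port (3928).
+async function chatWithNitro(prompt: string) {
+  const res = await fetch(
+    'http://127.0.0.1:3928/inferences/llamacpp/chat_completion',
+    {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        messages: [{ role: 'user', content: prompt }],
+        stream: false, // ask for a single JSON response instead of a token stream
+      }),
+    }
+  )
+  return res.json()
+}
+
+chatWithNitro('Hello!').then(console.log)
+```
+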
+Nitro is the default AI engine downloaded with Jan. There is no additional setup needed.
\ No newline at end of file
diff --git a/docs/docs/guides/providers/tensorrt-llm.md b/docs/docs/guides/providers/tensorrt-llm.md
new file mode 100644
index 000000000..52da83b36
--- /dev/null
+++ b/docs/docs/guides/providers/tensorrt-llm.md
@@ -0,0 +1,87 @@
+---
+title: TensorRT-LLM
+slug: /guides/providers/tensorrt-llm
+---
+
+Users with Nvidia GPUs can get **20-40% faster\* token speeds** on their laptops or desktops by using [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). An added benefit is that these models run in FP16, which is also more accurate than quantized models.
+
+This guide walks you through how to install Jan's official [TensorRT-LLM Extension](https://github.com/janhq/nitro-tensorrt-llm). This extension uses [Nitro-TensorRT-LLM](https://github.com/janhq/nitro-tensorrt-llm) as the AI engine instead of the default [Nitro-Llama-CPP](https://github.com/janhq/nitro). It includes an efficient C++ server that natively executes the [TRT-LLM C++ runtime](https://nvidia.github.io/TensorRT-LLM/gpt_runtime.html), and it adds features and performance improvements such as OpenAI compatibility, tokenizer improvements, and request queueing.
+
+*Compared to using the llama.cpp engine.
+
+:::warning
+This feature is currently only available to Windows users. Linux support is coming soon.
+
+Additionally, we have only prebuilt a few demo models. You can always build your desired models directly on your machine. [Read here](#build-your-own-tensorrt-models).
+
+:::
+
+## Requirements
+
+- A Windows PC
+- Nvidia GPU(s): Ada or Ampere series (i.e., RTX 4000 and 3000 series). More architectures will be supported soon.
+- 3GB+ of disk space to download TRT-LLM artifacts and a Nitro binary
+- Jan v0.4.9+ or Jan v0.4.8-321+ (nightly)
+- Nvidia Driver v535+ ([installation guide](https://jan.ai/guides/common-error/not-using-gpu/#1-ensure-gpu-mode-requirements))
+- CUDA Toolkit v12.2+ ([installation guide](https://jan.ai/guides/common-error/not-using-gpu/#1-ensure-gpu-mode-requirements))
+
+## Install TensorRT-Extension
+
+1. Go to Settings > Extensions
+2. Click **Install** next to the TensorRT-LLM Extension
+3. Check that the files were downloaded correctly:
+
+```sh
+ls ~\jan\extensions\@janhq\tensorrt-llm-extension\dist\bin
+# Your Extension Folder should now include `nitro.exe`, among other artifacts needed to run TRT-LLM
+```
+
+## Download a Compatible Model
+
+TensorRT-LLM can only run models in `TensorRT` format. These models, also known as "TensorRT engines," are prebuilt for each target OS and GPU architecture.
+
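+For reference, a TensorRT engine in Jan's model registry is a bundle of per-model artifacts (the compiled `rank0.engine`, tokenizer files, and configs) plus metadata. The sketch below mirrors the shape of an entry in this extension's `models.json`; the id and URLs are placeholders, not a published schema.
+
+```ts
+// Hypothetical shape of a TensorRT-LLM model entry, modeled on this extension's models.json.
+const exampleEntry = {
+  sources: [
+    // artifacts that make up the prebuilt engine, downloaded per OS + GPU architecture
+    { filename: 'config.json', url: 'https://example.com/my-model/config.json' },
+    { filename: 'rank0.engine', url: 'https://example.com/my-model/rank0.engine' },
+    { filename: 'tokenizer.model', url: 'https://example.com/my-model/tokenizer.model' },
+  ],
+  id: 'my-trt-llm-model',
+  format: 'TensorRT-LLM',
+  settings: { ctx_len: 2048 },
+  parameters: { max_tokens: 4096 },
+  engine: 'nitro-tensorrt-llm', // entries must target the TensorRT-LLM engine
+}
+```
+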
+We offer a handful of precompiled models for Ampere and Ada cards that you can immediately download and play with:
+
+1. Restart the application and go to the Hub
+2. Look for models with the `TensorRT-LLM` label in the recommended models list. Click **Download**. This step might take some time. 🙏
+
+
+
+3. Click **Use** and start chatting!
+4. You may need to allow Nitro through your network firewall.
+
+
+
+:::warning
+If you are on our nightly builds, you may have to reinstall the TensorRT-LLM extension each time you update the app. We're working on better extension lifecycles - stay tuned.
+:::
+
+## Configure Settings
+
+You can customize the default parameters for how Jan runs TensorRT-LLM.
+
+:::info
+coming soon
+:::
+
+## Troubleshooting
+
+### Incompatible Extension vs Engine versions
+
+For now, the model versions are pinned to the extension versions.
+
+### Uninstall Extension
+
+1. Quit the app
+2. Go to Settings > Extensions
+3. Delete the entire Extensions folder.
+4. Reopen the app; only the default extensions should be restored.
+
+### Install Nitro-TensorRT-LLM manually
+
+To manually build the artifacts needed to run the server and TensorRT-LLM, you can reference the source code. [Read here](https://github.com/janhq/nitro-tensorrt-llm?tab=readme-ov-file#quickstart).
+
+### Build your own TensorRT models
+
+:::info
+coming soon
+:::
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 4c45cadbe..b95e4044f 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -199,6 +199,19 @@ const sidebars = {
"guides/models/integrate-remote",
]
},
+ {
+ type: "category",
+ label: "Inference Providers",
+ className: "head_SubMenu",
+ link: {
+ type: 'doc',
+ id: "guides/providers/README",
+ },
+ items: [
+ "guides/providers/llama-cpp",
+ "guides/providers/tensorrt-llm",
+ ]
+ },
{
type: "category",
label: "Extensions",
diff --git a/extensions/monitoring-extension/src/node/index.ts b/extensions/monitoring-extension/src/node/index.ts
index 1d65704de..25f151112 100644
--- a/extensions/monitoring-extension/src/node/index.ts
+++ b/extensions/monitoring-extension/src/node/index.ts
@@ -2,17 +2,17 @@ import { GpuSetting, GpuSettingInfo, ResourceInfo } from '@janhq/core'
import { getJanDataFolderPath, log } from '@janhq/core/node'
import { mem, cpu } from 'node-os-utils'
import { exec } from 'child_process'
-import { writeFileSync, existsSync, readFileSync } from 'fs'
+import { writeFileSync, existsSync, readFileSync, mkdirSync } from 'fs'
import path from 'path'
+/**
+ * Path to the settings directory
+ **/
+export const SETTINGS_DIR = path.join(getJanDataFolderPath(), 'settings')
/**
* Path to the settings file
**/
-export const GPU_INFO_FILE = path.join(
- getJanDataFolderPath(),
- 'settings',
- 'settings.json'
-)
+export const GPU_INFO_FILE = path.join(SETTINGS_DIR, 'settings.json')
/**
* Default GPU settings
@@ -136,6 +136,11 @@ export const updateNvidiaInfo = async () => {
try {
JSON.parse(readFileSync(GPU_INFO_FILE, 'utf-8'))
} catch (error) {
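+    // Ensure the settings directory exists before writing the default settings file below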
+ if (!existsSync(SETTINGS_DIR)) {
+ mkdirSync(SETTINGS_DIR, {
+ recursive: true,
+ })
+ }
writeFileSync(GPU_INFO_FILE, JSON.stringify(DEFAULT_SETTINGS, null, 2))
}
diff --git a/extensions/tensorrt-llm-extension/models.json b/extensions/tensorrt-llm-extension/models.json
index bc6a78256..30f345f47 100644
--- a/extensions/tensorrt-llm-extension/models.json
+++ b/extensions/tensorrt-llm-extension/models.json
@@ -33,10 +33,57 @@
"description": "LlamaCorn is a refined version of TinyLlama-1.1B, optimized for conversational quality, running on consumer devices through TensorRT-LLM",
"format": "TensorRT-LLM",
"settings": {
- "ctx_len": 2048
+ "ctx_len": 2048,
+ "text_model": false
+ },
+ "parameters": {
+ "max_tokens": 4096
+ },
+ "metadata": {
+ "author": "LLama",
+ "tags": ["TensorRT-LLM", "1B", "Finetuned"],
+ "size": 2151000000
+ },
+ "engine": "nitro-tensorrt-llm"
+ },
+ {
+ "sources": [
+ {
+ "filename": "config.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/config.json"
+ },
+ {
+ "filename": "rank0.engine",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/rank0.engine"
+ },
+ {
+ "filename": "tokenizer.model",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.model"
+ },
+ {
+ "filename": "special_tokens_map.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/special_tokens_map.json"
+ },
+ {
+ "filename": "tokenizer.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.json"
+ },
+ {
+ "filename": "tokenizer_config.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer_config.json"
+ }
+ ],
+ "id": "tinyjensen-1.1b-chat-fp16",
+ "object": "model",
+ "name": "TinyJensen 1.1B Chat FP16",
+ "version": "1.0",
+    "description": "Do you want to chat with Jensen Huang? Here you are",
+ "format": "TensorRT-LLM",
+ "settings": {
+ "ctx_len": 2048,
+ "text_model": false
},
"parameters": {
- "stream": true,
"max_tokens": 4096
},
"metadata": {
diff --git a/extensions/tensorrt-llm-extension/package.json b/extensions/tensorrt-llm-extension/package.json
index 01ff3e2c6..96ede4a56 100644
--- a/extensions/tensorrt-llm-extension/package.json
+++ b/extensions/tensorrt-llm-extension/package.json
@@ -1,6 +1,6 @@
{
"name": "@janhq/tensorrt-llm-extension",
- "version": "0.0.2",
+ "version": "0.0.3",
"description": "Enables accelerated inference leveraging Nvidia's TensorRT-LLM for optimal GPU hardware optimizations. Compatible with models in TensorRT-LLM format. Requires Nvidia GPU driver and CUDA Toolkit installation.",
"main": "dist/index.js",
"node": "dist/node/index.cjs.js",
@@ -8,7 +8,7 @@
"license": "AGPL-3.0",
"config": {
"host": "127.0.0.1",
- "port": "3928"
+ "port": "3929"
},
"compatibility": {
"platform": [
diff --git a/extensions/tensorrt-llm-extension/src/index.ts b/extensions/tensorrt-llm-extension/src/index.ts
index 076951c3f..02c676841 100644
--- a/extensions/tensorrt-llm-extension/src/index.ts
+++ b/extensions/tensorrt-llm-extension/src/index.ts
@@ -19,6 +19,8 @@ import {
systemInformations,
LocalOAIEngine,
fs,
+ MessageRequest,
+ ModelEvent,
} from '@janhq/core'
import models from '../models.json'
@@ -126,6 +128,21 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
events.on(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
}
+  async onModelInit(model: Model): Promise<void> {
+ if (model.engine !== this.provider) return
+
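+    // Only initialize the model when the TensorRT-LLM runtime is installed; otherwise emit a failure event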
+ if ((await this.installationState()) === 'Installed')
+ return super.onModelInit(model)
+ else {
+ events.emit(ModelEvent.OnModelFail, {
+ ...model,
+ error: {
+ message: 'EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension',
+ },
+ })
+ }
+ }
+
  override async installationState(): Promise<InstallationState> {
// For now, we just check the executable of nitro x tensor rt
const isNitroExecutableAvailable = await executeOnMain(
@@ -144,4 +161,11 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
)
return Promise.resolve()
}
+
+ inference(data: MessageRequest): void {
+ if (!this.isRunning) return
+ // TensorRT LLM Extension supports streaming only
+ if (data.model) data.model.parameters.stream = true
+ super.inference(data)
+ }
}
diff --git a/web/hooks/useCreateNewThread.ts b/web/hooks/useCreateNewThread.ts
index 247c65c55..55faded37 100644
--- a/web/hooks/useCreateNewThread.ts
+++ b/web/hooks/useCreateNewThread.ts
@@ -74,11 +74,15 @@ export const useCreateNewThread = () => {
const defaultModel = model ?? recommendedModel ?? downloadedModels[0]
-    // check last thread message, if there empty last message use can not create thread
-    const lastMessage = threads[0]?.metadata?.lastMessage
-    if (!lastMessage && threads.length) {
-      return null
+    // When a model is passed in (e.g. creating a thread from the Model Hub), skip this check and allow the new thread.
+    if (!model) {
+      // Otherwise, check the last thread's message; if it is empty, the user cannot create another thread.
+      const lastMessage = threads[0]?.metadata?.lastMessage
+
+      if (!lastMessage && threads.length) {
+        return null
+      }
}
// modify assistant tools when experimental on, retieval toggle enabled in default
diff --git a/web/screens/Chat/ChatInput/index.tsx b/web/screens/Chat/ChatInput/index.tsx
index c90a12cd2..8707e8bcd 100644
--- a/web/screens/Chat/ChatInput/index.tsx
+++ b/web/screens/Chat/ChatInput/index.tsx
@@ -244,16 +244,13 @@ const ChatInput: React.FC = () => {