[Tail of an earlier patch, damaged in extraction: a README.md diff over the "Experimental (Nightly Build)" download table (jan.exe, Mac Intel, Mac M1/M2, jan.deb, jan.AppImage); the HTML link markup did not survive.]
From 1fe3dff875de61b53420058200741d5eca7fab7b Mon Sep 17 00:00:00 2001
From: Nicole Zhu
Date: Thu, 14 Mar 2024 21:08:50 +0800
Subject: [PATCH 07/13] docs: update slugs again
---
docs/docs/guides/engines/llama-cpp.md | 12 ------------
docs/docs/guides/{engines => providers}/README.mdx | 4 ++--
docs/docs/guides/{engines => providers}/image.png | Bin
docs/docs/guides/providers/llama-cpp.md | 10 ++++++++++
.../guides/{engines => providers}/tensorrt-llm.md | 4 ++--
docs/sidebars.js | 8 ++++----
6 files changed, 18 insertions(+), 20 deletions(-)
delete mode 100644 docs/docs/guides/engines/llama-cpp.md
rename docs/docs/guides/{engines => providers}/README.mdx (54%)
rename docs/docs/guides/{engines => providers}/image.png (100%)
create mode 100644 docs/docs/guides/providers/llama-cpp.md
rename docs/docs/guides/{engines => providers}/tensorrt-llm.md (95%)
diff --git a/docs/docs/guides/engines/llama-cpp.md b/docs/docs/guides/engines/llama-cpp.md
deleted file mode 100644
index bc485df6c..000000000
--- a/docs/docs/guides/engines/llama-cpp.md
+++ /dev/null
@@ -1,12 +0,0 @@
----
-title: Llama-CPP Extension
-slug: /guides/engines/llama-cpp
----
-
-## Overview
-
-[LlamaCPP](https://github.com/ggerganov/llama.cpp) is the default AI engine downloaded with Jan. It is served through Nitro, a C++ inference server, that handles additional UX and hardware optimizations.
-
-The source code for Nitro-llama-cpp is [here](https://github.com/janhq/nitro).
-
-There is no additional setup needed.
\ No newline at end of file
diff --git a/docs/docs/guides/engines/README.mdx b/docs/docs/guides/providers/README.mdx
similarity index 54%
rename from docs/docs/guides/engines/README.mdx
rename to docs/docs/guides/providers/README.mdx
index 3a7cdcc44..aa3bfea1f 100644
--- a/docs/docs/guides/engines/README.mdx
+++ b/docs/docs/guides/providers/README.mdx
@@ -1,6 +1,6 @@
---
-title: Extensions
-slug: /guides/engines
+title: Inference Providers
+slug: /guides/providers
---
import DocCardList from "@theme/DocCardList";
diff --git a/docs/docs/guides/engines/image.png b/docs/docs/guides/providers/image.png
similarity index 100%
rename from docs/docs/guides/engines/image.png
rename to docs/docs/guides/providers/image.png
diff --git a/docs/docs/guides/providers/llama-cpp.md b/docs/docs/guides/providers/llama-cpp.md
new file mode 100644
index 000000000..3a21e80a7
--- /dev/null
+++ b/docs/docs/guides/providers/llama-cpp.md
@@ -0,0 +1,10 @@
+---
+title: llama.cpp
+slug: /guides/providers/llama-cpp
+---
+
+## Overview
+
+[Nitro](https://github.com/janhq/nitro) is an inference server on top of [llama.cpp](https://github.com/ggerganov/llama.cpp). OpenAI-compatible API, queue, & scaling.
+
+Nitro is the default AI engine downloaded with Jan. There is no additional setup needed.
\ No newline at end of file
diff --git a/docs/docs/guides/engines/tensorrt-llm.md b/docs/docs/guides/providers/tensorrt-llm.md
similarity index 95%
rename from docs/docs/guides/engines/tensorrt-llm.md
rename to docs/docs/guides/providers/tensorrt-llm.md
index 177cc0cf4..4b0edec2a 100644
--- a/docs/docs/guides/engines/tensorrt-llm.md
+++ b/docs/docs/guides/providers/tensorrt-llm.md
@@ -1,6 +1,6 @@
---
-title: TensorRT-LLM Extension
-slug: /guides/engines/tensorrt-llm
+title: TensorRT-LLM
+slug: /guides/providers/tensorrt-llm
---
Users with Nvidia GPUs can get 20-40% faster* token speeds on their laptop or desktops by using [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 8deafeaa1..b95e4044f 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -201,15 +201,15 @@ const sidebars = {
},
{
type: "category",
- label: "AI Engines",
+ label: "Inference Providers",
className: "head_SubMenu",
link: {
type: 'doc',
- id: "guides/engines/README",
+ id: "guides/providers/README",
},
items: [
- "guides/engines/llama-cpp",
- "guides/engines/tensorrt-llm",
+ "guides/providers/llama-cpp",
+ "guides/providers/tensorrt-llm",
]
},
{
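A side note on the slug move in this patch: renaming `/guides/engines/*` to `/guides/providers/*` breaks inbound links to the old URLs. If the docs site uses `@docusaurus/plugin-client-redirects` (an assumption; this series does not show the plugin config), the old slugs could be kept alive with a mapping along these lines:

```ts
// docusaurus.config.ts (hypothetical file name): a hedged sketch, not part
// of this patch, redirecting the retired /guides/engines/* slugs to their
// /guides/providers/* replacements via @docusaurus/plugin-client-redirects.
export default {
  plugins: [
    [
      '@docusaurus/plugin-client-redirects',
      {
        redirects: [
          { from: '/guides/engines', to: '/guides/providers' },
          { from: '/guides/engines/llama-cpp', to: '/guides/providers/llama-cpp' },
          { from: '/guides/engines/tensorrt-llm', to: '/guides/providers/tensorrt-llm' },
        ],
      },
    ],
  ],
}
```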
From 70fc24f6f1857f19d0d837fe95086a373443f210 Mon Sep 17 00:00:00 2001
From: Nicole Zhu
Date: Thu, 14 Mar 2024 21:12:16 +0800
Subject: [PATCH 08/13] docs: nits
---
docs/docs/guides/providers/llama-cpp.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/docs/guides/providers/llama-cpp.md b/docs/docs/guides/providers/llama-cpp.md
index 3a21e80a7..d2b0daa2a 100644
--- a/docs/docs/guides/providers/llama-cpp.md
+++ b/docs/docs/guides/providers/llama-cpp.md
@@ -5,6 +5,6 @@ slug: /guides/providers/llama-cpp
## Overview
-[Nitro](https://github.com/janhq/nitro) is an inference server on top of [llama.cpp](https://github.com/ggerganov/llama.cpp). OpenAI-compatible API, queue, & scaling.
+[Nitro](https://github.com/janhq/nitro) is an inference server on top of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides an OpenAI-compatible API, queue, & scaling.
Nitro is the default AI engine downloaded with Jan. There is no additional setup needed.
\ No newline at end of file
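Since the new page leads with Nitro's OpenAI-compatible API, a minimal usage sketch may help. The host and port (127.0.0.1:3928) come from defaults visible elsewhere in this series; the `/v1/chat/completions` route and the model id are assumptions, as these patches do not show Nitro's routing:

```ts
// A hedged sketch: POST an OpenAI-style chat completion to a locally
// running Nitro server. The route and model id are assumptions.
async function askNitro(prompt: string): Promise<string> {
  const res = await fetch('http://127.0.0.1:3928/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'tinyllama-1.1b', // hypothetical model id
      messages: [{ role: 'user', content: prompt }],
    }),
  })
  const data = await res.json()
  return data.choices[0].message.content
}
```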
From 4b6f218639b5c3f5dc0f231ea88af386c0b1c9c0 Mon Sep 17 00:00:00 2001
From: Nicole Zhu
Date: Thu, 14 Mar 2024 21:45:31 +0800
Subject: [PATCH 09/13] docs: nits
---
docs/docs/guides/providers/tensorrt-llm.md | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/docs/docs/guides/providers/tensorrt-llm.md b/docs/docs/guides/providers/tensorrt-llm.md
index 4b0edec2a..52da83b36 100644
--- a/docs/docs/guides/providers/tensorrt-llm.md
+++ b/docs/docs/guides/providers/tensorrt-llm.md
@@ -3,14 +3,17 @@ title: TensorRT-LLM
slug: /guides/providers/tensorrt-llm
---
-Users with Nvidia GPUs can get 20-40% faster* token speeds on their laptop or desktops by using [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
+Users with Nvidia GPUs can get **20-40% faster\* token speeds** on their laptops or desktops by using [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). The greater benefit is that you are running models in FP16, which is also more accurate than quantized models.
This guide walks you through how to install Jan's official [TensorRT-LLM Extension](https://github.com/janhq/nitro-tensorrt-llm). This extension uses [Nitro-TensorRT-LLM](https://github.com/janhq/nitro-tensorrt-llm) as the AI engine, instead of the default [Nitro-Llama-CPP](https://github.com/janhq/nitro). It includes an efficient C++ server to natively execute the [TRT-LLM C++ runtime](https://nvidia.github.io/TensorRT-LLM/gpt_runtime.html). It also comes with additional feature and performance improvements like OpenAI compatibility, tokenizer improvements, and queues.
*Compared to using LlamaCPP engine.
-:::info
+:::warning
This feature is only available for Windows users. Linux is coming soon.
+
+Additionally, we only prebuilt a few demo models. You can always build your desired models directly on your machine. [Read here](#build-your-own-tensorrt-models).
+
:::
## Requirements
@@ -48,8 +51,8 @@ We offer a handful of precompiled models for Ampere and Ada cards that you can i

-:::info
-Due to our limited resources, we only prebuilt a few demo models. You can always build your desired models directly on your machine. [Read here](#build-your-own-tensorrt-models).
+:::warning
+If you are on our nightly builds, you may have to reinstall the TensorRT-LLM extension each time you update the app. We're working on better extension lifecycles - stay tuned.
:::
## Configure Settings
From 758afdbeb44589076201aabdbdcf618bca3a581d Mon Sep 17 00:00:00 2001
From: Louis
Date: Thu, 14 Mar 2024 22:11:55 +0700
Subject: [PATCH 10/13] fix: incompatible GPU error message (#2357)
* fix: incompatible GPU error message
* fix: change port
---
extensions/tensorrt-llm-extension/models.json | 47 +++++++++++++++++++
.../tensorrt-llm-extension/package.json | 4 +-
.../tensorrt-llm-extension/src/index.ts | 15 ++++++
web/screens/Chat/ErrorMessage/index.tsx | 21 +++++++++
.../CoreExtensions/TensorRtExtensionItem.tsx | 3 +-
5 files changed, 87 insertions(+), 3 deletions(-)
diff --git a/extensions/tensorrt-llm-extension/models.json b/extensions/tensorrt-llm-extension/models.json
index 31bb11a9e..30f345f47 100644
--- a/extensions/tensorrt-llm-extension/models.json
+++ b/extensions/tensorrt-llm-extension/models.json
@@ -45,5 +45,52 @@
"size": 2151000000
},
"engine": "nitro-tensorrt-llm"
+ },
+ {
+ "sources": [
+ {
+ "filename": "config.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/config.json"
+ },
+ {
+ "filename": "rank0.engine",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/rank0.engine"
+ },
+ {
+ "filename": "tokenizer.model",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.model"
+ },
+ {
+ "filename": "special_tokens_map.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/special_tokens_map.json"
+ },
+ {
+ "filename": "tokenizer.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.json"
+ },
+ {
+ "filename": "tokenizer_config.json",
+ "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer_config.json"
+ }
+ ],
+ "id": "tinyjensen-1.1b-chat-fp16",
+ "object": "model",
+ "name": "TinyJensen 1.1B Chat FP16",
+ "version": "1.0",
+ "description": "Do you want to chat with Jensen Huan? Here you are",
+ "format": "TensorRT-LLM",
+ "settings": {
+ "ctx_len": 2048,
+ "text_model": false
+ },
+ "parameters": {
+ "max_tokens": 4096
+ },
+ "metadata": {
+ "author": "LLama",
+ "tags": ["TensorRT-LLM", "1B", "Finetuned"],
+ "size": 2151000000
+ },
+ "engine": "nitro-tensorrt-llm"
}
]
diff --git a/extensions/tensorrt-llm-extension/package.json b/extensions/tensorrt-llm-extension/package.json
index 01ff3e2c6..96ede4a56 100644
--- a/extensions/tensorrt-llm-extension/package.json
+++ b/extensions/tensorrt-llm-extension/package.json
@@ -1,6 +1,6 @@
{
"name": "@janhq/tensorrt-llm-extension",
- "version": "0.0.2",
+ "version": "0.0.3",
"description": "Enables accelerated inference leveraging Nvidia's TensorRT-LLM for optimal GPU hardware optimizations. Compatible with models in TensorRT-LLM format. Requires Nvidia GPU driver and CUDA Toolkit installation.",
"main": "dist/index.js",
"node": "dist/node/index.cjs.js",
@@ -8,7 +8,7 @@
"license": "AGPL-3.0",
"config": {
"host": "127.0.0.1",
- "port": "3928"
+ "port": "3929"
},
"compatibility": {
"platform": [
diff --git a/extensions/tensorrt-llm-extension/src/index.ts b/extensions/tensorrt-llm-extension/src/index.ts
index e3014b447..cd85601dd 100644
--- a/extensions/tensorrt-llm-extension/src/index.ts
+++ b/extensions/tensorrt-llm-extension/src/index.ts
@@ -20,6 +20,7 @@ import {
LocalOAIEngine,
fs,
MessageRequest,
+ ModelEvent,
} from '@janhq/core'
import models from '../models.json'
@@ -127,6 +128,20 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
events.on(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
}
+ async onModelInit(model: Model): Promise<void> {
+ if ((await this.installationState()) === 'Installed')
+ return super.onModelInit(model)
+ else {
+ events.emit(ModelEvent.OnModelFail, {
+ ...model,
+ error: {
+ message: 'EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension',
+ },
+ })
+ return
+ }
+ }
+
override async installationState(): Promise<InstallationState> {
// For now, we just check the executable of nitro x tensor rt
const isNitroExecutableAvailable = await executeOnMain(
diff --git a/web/screens/Chat/ErrorMessage/index.tsx b/web/screens/Chat/ErrorMessage/index.tsx
index 25cec1cb9..5be87a59d 100644
--- a/web/screens/Chat/ErrorMessage/index.tsx
+++ b/web/screens/Chat/ErrorMessage/index.tsx
@@ -7,11 +7,14 @@ import ModalTroubleShooting, {
modalTroubleShootingAtom,
} from '@/containers/ModalTroubleShoot'
+import { MainViewState } from '@/constants/screens'
+
import { loadModelErrorAtom } from '@/hooks/useActiveModel'
import useSendChatMessage from '@/hooks/useSendChatMessage'
import { getErrorTitle } from '@/utils/errorMessage'
+import { mainViewStateAtom } from '@/helpers/atoms/App.atom'
import { getCurrentChatMessagesAtom } from '@/helpers/atoms/ChatMessage.atom'
const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
@@ -19,6 +22,7 @@ const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
const { resendChatMessage } = useSendChatMessage()
const setModalTroubleShooting = useSetAtom(modalTroubleShootingAtom)
const loadModelError = useAtomValue(loadModelErrorAtom)
+ const setMainState = useSetAtom(mainViewStateAtom)
const PORT_NOT_AVAILABLE = 'PORT_NOT_AVAILABLE'
const regenerateMessage = async () => {
@@ -70,6 +74,23 @@ const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
[JSX hunk damaged in extraction: the new branch for loadModelError?.includes('EXTENSION_IS_NOT_INSTALLED') renders "Model is currently unavailable. Please switch to a different model or install the TensorRT-LLM extension to continue using it.", with the extension name as a clickable element that presumably calls setMainState (imported above); the tags themselves did not survive.]
) : (
[diff header lost in extraction; per the diffstat above, this hunk edits .../CoreExtensions/TensorRtExtensionItem.tsx]
= ({ item }) => {
- {compatibility ? (
+ {compatibility &&
+ !compatibility['platform']?.includes(PLATFORM) ? (
Only available on{' '}
{compatibility?.platform
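The error message added in this patch follows a `CODE::detail` convention ('EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension'), which the UI matches with `includes()`. A small self-contained sketch of that convention; the helper name is hypothetical, not part of the patch:

```ts
// Hypothetical helper (not in the patch): split the 'CODE::detail' error
// string emitted by onModelInit above into its two parts.
function parseLoadModelError(raw: string): { code: string; detail?: string } {
  const [code, detail] = raw.split('::')
  return { code, detail }
}

// With the message the extension emits:
parseLoadModelError('EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension')
// -> { code: 'EXTENSION_IS_NOT_INSTALLED', detail: 'TensorRT-LLM extension' }
```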
From e40d0481b747bcfac8c34a95e65536e4a11809b0 Mon Sep 17 00:00:00 2001
From: Service Account
Date: Thu, 14 Mar 2024 16:28:47 +0000
Subject: [PATCH 11/13] janhq/jan: Update README.md with nightly build artifact
URL
---
README.md | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 6f6044b30..adebb8ea1 100644
--- a/README.md
+++ b/README.md
@@ -76,31 +76,31 @@ Jan is an open-source ChatGPT alternative that runs 100% offline on your compute
[README.md diff body damaged in extraction: five rows of the "Experimental (Nightly Build)" download table (jan.exe, Mac Intel, Mac M1/M2, jan.deb, jan.AppImage) each swap their download URL for the new nightly-build artifact URL; the HTML link markup did not survive.]
From 58e12f35c978cc7ce40839f277f4ae3cbf7bc234 Mon Sep 17 00:00:00 2001
From: Louis
Date: Thu, 14 Mar 2024 23:59:42 +0700
Subject: [PATCH 12/13] fix: wrong engine handling (#2363)
---
extensions/tensorrt-llm-extension/src/index.ts | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/extensions/tensorrt-llm-extension/src/index.ts b/extensions/tensorrt-llm-extension/src/index.ts
index cd85601dd..02c676841 100644
--- a/extensions/tensorrt-llm-extension/src/index.ts
+++ b/extensions/tensorrt-llm-extension/src/index.ts
@@ -129,6 +129,8 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
}
async onModelInit(model: Model): Promise<void> {
+ if (model.engine !== this.provider) return
+
if ((await this.installationState()) === 'Installed')
return super.onModelInit(model)
else {
@@ -138,7 +140,6 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
message: 'EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension',
},
})
- return
}
}
@@ -162,6 +163,7 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
}
inference(data: MessageRequest): void {
+ if (!this.isRunning) return
// TensorRT LLM Extension supports streaming only
if (data.model) data.model.parameters.stream = true
super.inference(data)
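Assembled from the hunks in patches 10 and 12 (a method excerpt, not a separate implementation), the guarded initializer ends up reading roughly like this:

```ts
// Composite of the two patches: ignore models meant for other engines,
// then fail loudly if the extension binaries are missing.
async onModelInit(model: Model): Promise<void> {
  if (model.engine !== this.provider) return // wrong engine: not ours to handle

  if ((await this.installationState()) === 'Installed') {
    return super.onModelInit(model)
  }
  events.emit(ModelEvent.OnModelFail, {
    ...model,
    error: {
      message: 'EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension',
    },
  })
}
```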
From 3e27e9711021fa7a996040c9e88266f9fb054cea Mon Sep 17 00:00:00 2001
From: NamH
Date: Fri, 15 Mar 2024 10:33:59 +0700
Subject: [PATCH 13/13] fix: use model from model hub not load correct model in
thread screen (#2368)
Signed-off-by: James
Co-authored-by: James
---
web/hooks/useCreateNewThread.ts | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/web/hooks/useCreateNewThread.ts b/web/hooks/useCreateNewThread.ts
index 247c65c55..55faded37 100644
--- a/web/hooks/useCreateNewThread.ts
+++ b/web/hooks/useCreateNewThread.ts
@@ -74,11 +74,15 @@ export const useCreateNewThread = () => {
const defaultModel = model ?? recommendedModel ?? downloadedModels[0]
- // check last thread message, if there empty last message use can not create thread
- const lastMessage = threads[0]?.metadata?.lastMessage
+ if (!model) {
+ // if a model was passed in, the user is creating the thread from the Model Hub. Allow them.
- if (!lastMessage && threads.length) {
- return null
+ // check the last thread's message; if it is empty, the user cannot create another thread
+ const lastMessage = threads[0]?.metadata?.lastMessage
+
+ if (!lastMessage && threads.length) {
+ return null
+ }
}
// modify assistant tools when experimental on, retrieval toggle enabled in default
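Restated outside the hook (types simplified; the real code reads Jotai atoms), the guard this patch introduces amounts to:

```ts
// A hedged restatement of the patched guard: a model picked explicitly
// (e.g. from the Model Hub) always gets a new thread; otherwise a new
// thread is refused while the most recent thread is still empty.
type ThreadStub = { metadata?: { lastMessage?: string } }

function canCreateThread(threads: ThreadStub[], modelChosen: boolean): boolean {
  if (modelChosen) return true
  const lastMessage = threads[0]?.metadata?.lastMessage
  return !(threads.length > 0 && !lastMessage)
}
```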