diff --git a/README.md b/README.md
index b25a917d7..adebb8ea1 100644
--- a/README.md
+++ b/README.md
@@ -76,31 +76,31 @@ Jan is an open-source ChatGPT alternative that runs 100% offline on your compute
 Experimental (Nightly Build) - + jan.exe - + Intel - + M1/M2 - + jan.deb - + jan.AppImage
diff --git a/docs/docs/guides/providers/README.mdx b/docs/docs/guides/providers/README.mdx
new file mode 100644
index 000000000..aa3bfea1f
--- /dev/null
+++ b/docs/docs/guides/providers/README.mdx
@@ -0,0 +1,8 @@
+---
+title: Inference Providers
+slug: /guides/providers
+---
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
diff --git a/docs/docs/guides/providers/image.png b/docs/docs/guides/providers/image.png
new file mode 100644
index 000000000..5f1f7104e
Binary files /dev/null and b/docs/docs/guides/providers/image.png differ
diff --git a/docs/docs/guides/providers/llama-cpp.md b/docs/docs/guides/providers/llama-cpp.md
new file mode 100644
index 000000000..d2b0daa2a
--- /dev/null
+++ b/docs/docs/guides/providers/llama-cpp.md
@@ -0,0 +1,10 @@
+---
+title: llama.cpp
+slug: /guides/providers/llama-cpp
+---
+
+## Overview
+
+[Nitro](https://github.com/janhq/nitro) is an inference server built on top of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides an OpenAI-compatible API, a request queue, and scaling.
+
+Nitro is the default AI engine downloaded with Jan. No additional setup is needed.
\ No newline at end of file
diff --git a/docs/docs/guides/providers/tensorrt-llm.md b/docs/docs/guides/providers/tensorrt-llm.md
new file mode 100644
index 000000000..52da83b36
--- /dev/null
+++ b/docs/docs/guides/providers/tensorrt-llm.md
@@ -0,0 +1,87 @@
+---
+title: TensorRT-LLM
+slug: /guides/providers/tensorrt-llm
+---
+
+Users with Nvidia GPUs can get **20-40% faster\* token speeds** on their laptops or desktops by using [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). A further benefit is that you are running FP16, which is also more accurate than quantized models.
+
+This guide walks you through installing Jan's official [TensorRT-LLM Extension](https://github.com/janhq/nitro-tensorrt-llm). This extension uses [Nitro-TensorRT-LLM](https://github.com/janhq/nitro-tensorrt-llm) as the AI engine instead of the default [Nitro-Llama-CPP](https://github.com/janhq/nitro). It includes an efficient C++ server that natively executes the [TRT-LLM C++ runtime](https://nvidia.github.io/TensorRT-LLM/gpt_runtime.html). It also comes with additional features and performance improvements such as OpenAI compatibility, tokenizer improvements, and queues.
+
+\*Compared to using the llama.cpp engine.
+
+:::warning
+This feature is currently only available to Windows users. Linux support is coming soon.
+
+Additionally, we have only prebuilt a few demo models. You can always build your desired models directly on your machine. [Read here](#build-your-own-tensorrt-models).
+
+:::
+
+## Requirements
+
+- A Windows PC
+- Nvidia GPU(s): Ada or Ampere series (e.g. RTX 4000 and 3000 series). More will be supported soon.
+- 3GB+ of disk space to download TRT-LLM artifacts and a Nitro binary
+- Jan v0.4.9+ or Jan v0.4.8-321+ (nightly)
+- Nvidia Driver v535+ ([installation guide](https://jan.ai/guides/common-error/not-using-gpu/#1-ensure-gpu-mode-requirements))
+- CUDA Toolkit v12.2+ ([installation guide](https://jan.ai/guides/common-error/not-using-gpu/#1-ensure-gpu-mode-requirements))
+
+## Install TensorRT-Extension
+
+1. Go to Settings > Extensions
+2. Click Install next to the TensorRT-LLM Extension
+3. Check that the files are correctly downloaded:
+
+```sh
+ls ~\jan\extensions\@janhq\tensorrt-llm-extension\dist\bin
+# Your extension folder should now include `nitro.exe`, among other artifacts needed to run TRT-LLM
+```
+
+## Download a Compatible Model
+
+TensorRT-LLM can only run models in `TensorRT` format. These models, a.k.a. "TensorRT Engines", are prebuilt specifically for each target OS and GPU architecture.
+
+We offer a handful of precompiled models for Ampere and Ada cards that you can download and try immediately:
+
+1. Restart the application and go to the Hub.
+2. Look for models with the `TensorRT-LLM` label in the recommended models list. Click Download. This step might take some time. 🙏
+
+![image](https://hackmd.io/_uploads/rJewrEgRp.png)
+
+3. Click Use and start chatting!
+4. You may need to allow Nitro through your network firewall when prompted.
+
+![alt text](image.png)
+
+:::warning
+If you are on our nightly builds, you may have to reinstall the TensorRT-LLM extension each time you update the app. We're working on better extension lifecycles - stay tuned.
+:::
+
+## Configure Settings
+
+You can customize the default parameters for how Jan runs TensorRT-LLM.
+
+:::info
+coming soon
+:::
+
+## Troubleshooting
+
+### Incompatible Extension vs. Engine versions
+
+For now, model versions are pinned to extension versions.
+
+### Uninstall Extension
+
+1. Quit the app.
+2. Go to Settings > Extensions.
+3. Delete the entire Extensions folder.
+4. Reopen the app; only the default extensions should be restored.
+
+### Install Nitro-TensorRT-LLM manually
+
+To manually build the artifacts needed to run the server and TensorRT-LLM, you can reference the source code. [Read here](https://github.com/janhq/nitro-tensorrt-llm?tab=readme-ov-file#quickstart).
+
+### Build your own TensorRT models
+
+:::info
+coming soon
+:::
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 4c45cadbe..b95e4044f 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -199,6 +199,19 @@ const sidebars = {
         "guides/models/integrate-remote",
       ]
     },
+    {
+      type: "category",
+      label: "Inference Providers",
+      className: "head_SubMenu",
+      link: {
+        type: 'doc',
+        id: "guides/providers/README",
+      },
+      items: [
+        "guides/providers/llama-cpp",
+        "guides/providers/tensorrt-llm",
+      ]
+    },
     {
       type: "category",
       label: "Extensions",
diff --git a/extensions/monitoring-extension/src/node/index.ts b/extensions/monitoring-extension/src/node/index.ts
index 1d65704de..25f151112 100644
--- a/extensions/monitoring-extension/src/node/index.ts
+++ b/extensions/monitoring-extension/src/node/index.ts
@@ -2,17 +2,17 @@ import { GpuSetting, GpuSettingInfo, ResourceInfo } from '@janhq/core'
 import { getJanDataFolderPath, log } from '@janhq/core/node'
 import { mem, cpu } from 'node-os-utils'
 import { exec } from 'child_process'
-import { writeFileSync, existsSync, readFileSync } from 'fs'
+import { writeFileSync, existsSync, readFileSync, mkdirSync } from 'fs'
 import path from 'path'

+/**
+ * Path to the settings directory
+ **/
+export const SETTINGS_DIR = path.join(getJanDataFolderPath(), 'settings')
 /**
  * Path to the settings file
  **/
-export const GPU_INFO_FILE = path.join(
-  getJanDataFolderPath(),
-  'settings',
-  'settings.json'
-)
+export const GPU_INFO_FILE = path.join(SETTINGS_DIR, 'settings.json')

 /**
  * Default GPU settings
@@ -136,6 +136,11 @@ export const updateNvidiaInfo = async () => {
   try {
     JSON.parse(readFileSync(GPU_INFO_FILE, 'utf-8'))
   } catch (error) {
+    if (!existsSync(SETTINGS_DIR)) {
+      mkdirSync(SETTINGS_DIR, {
+        recursive: true,
+      })
+    }
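    // Note: fs.writeFileSync does not create missing parent directories, so the
    // settings directory is created above before the default settings are written below.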
writeFileSync(GPU_INFO_FILE, JSON.stringify(DEFAULT_SETTINGS, null, 2)) } diff --git a/extensions/tensorrt-llm-extension/models.json b/extensions/tensorrt-llm-extension/models.json index bc6a78256..30f345f47 100644 --- a/extensions/tensorrt-llm-extension/models.json +++ b/extensions/tensorrt-llm-extension/models.json @@ -33,10 +33,57 @@ "description": "LlamaCorn is a refined version of TinyLlama-1.1B, optimized for conversational quality, running on consumer devices through TensorRT-LLM", "format": "TensorRT-LLM", "settings": { - "ctx_len": 2048 + "ctx_len": 2048, + "text_model": false + }, + "parameters": { + "max_tokens": 4096 + }, + "metadata": { + "author": "LLama", + "tags": ["TensorRT-LLM", "1B", "Finetuned"], + "size": 2151000000 + }, + "engine": "nitro-tensorrt-llm" + }, + { + "sources": [ + { + "filename": "config.json", + "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/config.json" + }, + { + "filename": "rank0.engine", + "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/rank0.engine" + }, + { + "filename": "tokenizer.model", + "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.model" + }, + { + "filename": "special_tokens_map.json", + "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/special_tokens_map.json" + }, + { + "filename": "tokenizer.json", + "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.json" + }, + { + "filename": "tokenizer_config.json", + "url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer_config.json" + } + ], + "id": "tinyjensen-1.1b-chat-fp16", + "object": "model", + "name": "TinyJensen 1.1B Chat FP16", + "version": "1.0", + "description": "Do you want to chat with Jensen Huan? Here you are", + "format": "TensorRT-LLM", + "settings": { + "ctx_len": 2048, + "text_model": false }, "parameters": { - "stream": true, "max_tokens": 4096 }, "metadata": { diff --git a/extensions/tensorrt-llm-extension/package.json b/extensions/tensorrt-llm-extension/package.json index 01ff3e2c6..96ede4a56 100644 --- a/extensions/tensorrt-llm-extension/package.json +++ b/extensions/tensorrt-llm-extension/package.json @@ -1,6 +1,6 @@ { "name": "@janhq/tensorrt-llm-extension", - "version": "0.0.2", + "version": "0.0.3", "description": "Enables accelerated inference leveraging Nvidia's TensorRT-LLM for optimal GPU hardware optimizations. Compatible with models in TensorRT-LLM format. 
Requires Nvidia GPU driver and CUDA Toolkit installation.",
   "main": "dist/index.js",
   "node": "dist/node/index.cjs.js",
@@ -8,7 +8,7 @@
   "license": "AGPL-3.0",
   "config": {
     "host": "127.0.0.1",
-    "port": "3928"
+    "port": "3929"
   },
   "compatibility": {
     "platform": [
diff --git a/extensions/tensorrt-llm-extension/src/index.ts b/extensions/tensorrt-llm-extension/src/index.ts
index 076951c3f..02c676841 100644
--- a/extensions/tensorrt-llm-extension/src/index.ts
+++ b/extensions/tensorrt-llm-extension/src/index.ts
@@ -19,6 +19,8 @@ import {
   systemInformations,
   LocalOAIEngine,
   fs,
+  MessageRequest,
+  ModelEvent,
 } from '@janhq/core'

 import models from '../models.json'

@@ -126,6 +128,21 @@
     events.on(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
   }

+  async onModelInit(model: Model): Promise<void> {
+    if (model.engine !== this.provider) return
+
+    if ((await this.installationState()) === 'Installed')
+      return super.onModelInit(model)
+    else {
+      events.emit(ModelEvent.OnModelFail, {
+        ...model,
+        error: {
+          message: 'EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension',
+        },
+      })
+    }
+  }
+
   override async installationState(): Promise<InstallationState> {
     // For now, we just check for the Nitro x TensorRT-LLM executable
     const isNitroExecutableAvailable = await executeOnMain(
@@ -144,4 +161,11 @@
     )
     return Promise.resolve()
   }
+
+  inference(data: MessageRequest): void {
+    if (!this.isRunning) return
+    // The TensorRT-LLM extension supports streaming responses only
+    if (data.model) data.model.parameters.stream = true
+    super.inference(data)
+  }
 }
diff --git a/web/hooks/useCreateNewThread.ts b/web/hooks/useCreateNewThread.ts
index 247c65c55..55faded37 100644
--- a/web/hooks/useCreateNewThread.ts
+++ b/web/hooks/useCreateNewThread.ts
@@ -74,11 +74,15 @@ export const useCreateNewThread = () => {

     const defaultModel = model ?? recommendedModel ?? downloadedModels[0]

-    // check last thread message, if there empty last message use can not create thread
-    const lastMessage = threads[0]?.metadata?.lastMessage
+    if (!model) {
+      // a model is only passed when the user creates a thread from the Model Hub; in that case skip the empty-thread check and allow it

-    if (!lastMessage && threads.length) {
-      return null
+      // check the last thread's message; if it is empty, the user cannot create another new thread
+      const lastMessage = threads[0]?.metadata?.lastMessage
+
+      if (!lastMessage && threads.length) {
+        return null
+      }
     }

     // modify assistant tools when experimental is on and the retrieval toggle is enabled by default
diff --git a/web/screens/Chat/ChatInput/index.tsx b/web/screens/Chat/ChatInput/index.tsx
index c90a12cd2..8707e8bcd 100644
--- a/web/screens/Chat/ChatInput/index.tsx
+++ b/web/screens/Chat/ChatInput/index.tsx
@@ -244,16 +244,13 @@ const ChatInput: React.FC = () => {
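// Note: the hunk below removes the `vision_model` condition, so this action is now
// gated only on the active model's `text_model !== false` setting.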
  •
  {
    if (
-     !activeThread?.assistants[0].model.settings.vision_model ||
      activeThread?.assistants[0].model.settings.text_model !== false
    ) {
diff --git a/web/screens/Chat/ErrorMessage/index.tsx b/web/screens/Chat/ErrorMessage/index.tsx
index 25cec1cb9..5be87a59d 100644
--- a/web/screens/Chat/ErrorMessage/index.tsx
+++ b/web/screens/Chat/ErrorMessage/index.tsx
@@ -7,11 +7,14 @@ import ModalTroubleShooting, {
   modalTroubleShootingAtom,
 } from '@/containers/ModalTroubleShoot'

+import { MainViewState } from '@/constants/screens'
+
 import { loadModelErrorAtom } from '@/hooks/useActiveModel'
 import useSendChatMessage from '@/hooks/useSendChatMessage'

 import { getErrorTitle } from '@/utils/errorMessage'

+import { mainViewStateAtom } from '@/helpers/atoms/App.atom'
 import { getCurrentChatMessagesAtom } from '@/helpers/atoms/ChatMessage.atom'

 const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
@@ -19,6 +22,7 @@ const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
   const { resendChatMessage } = useSendChatMessage()
   const setModalTroubleShooting = useSetAtom(modalTroubleShootingAtom)
   const loadModelError = useAtomValue(loadModelErrorAtom)
+  const setMainState = useSetAtom(mainViewStateAtom)

   const PORT_NOT_AVAILABLE = 'PORT_NOT_AVAILABLE'

   const regenerateMessage = async () => {
@@ -70,6 +74,23 @@ const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
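// The hunk below adds a branch for the 'EXTENSION_IS_NOT_INSTALLED' load-model error
// emitted by the TensorRT-LLM extension, prompting the user to switch models or install the extension.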
+        ) : loadModelError?.includes('EXTENSION_IS_NOT_INSTALLED') ? (
+
+
    + Model is currently unavailable. Please switch to a different + model or install the{' '} + {' '} + to continue using it. +
+
    ) : (
    = ({ item }) => { {item.description}
    + {(!compatibility || compatibility['platform']?.includes(PLATFORM)) && isGpuSupported ? (
@@ -143,7 +144,8 @@ const TensorRtExtensionItem: React.FC = ({ item }) => {
-        {compatibility ? (
+        {compatibility &&
+        !compatibility['platform']?.includes(PLATFORM) ? (
           Only available on{' '}
           {compatibility?.platform
@@ -185,15 +187,14 @@ const InstallStateIndicator: React.FC = ({
   onInstallClick,
   onCancelClick,
 }) => {
-  // TODO: NamH support dark mode for this
   if (installProgress !== -1) {
     const progress = installProgress * 100
     return (
-
+
-
+
  {progress.toFixed(0)}%
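For reference, the settings-card change above reduces to a small gating rule: the install action is offered only when the extension either declares no compatibility information or explicitly lists the current platform, and a supported Nvidia GPU was detected. The sketch below restates that rule as a standalone helper; the type and function names are illustrative assumptions rather than code from this patch.

```ts
// Minimal sketch of the gating logic used by the TensorRT-LLM settings card.
// `Compatibility` and `canInstallTensorRtExtension` are assumed names for illustration.
type Compatibility = { platform?: string[] } | undefined

export function canInstallTensorRtExtension(
  compatibility: Compatibility,
  currentPlatform: string, // e.g. the value of process.platform, such as 'win32'
  isGpuSupported: boolean // true when a supported (Ada/Ampere) Nvidia GPU is detected
): boolean {
  // No compatibility manifest means "assume installable"; otherwise the manifest
  // must list the current platform. A supported GPU is required in both cases.
  const platformOk =
    !compatibility || compatibility.platform?.includes(currentPlatform) === true
  return platformOk && isGpuSupported
}

// When this returns false and a platform list exists, the card shows
// "Only available on <listed platforms>" instead of the install button.
```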