Merge branch 'dev' into eckartal-patch-1
commit 54425ef10f

README.md (10 changed lines)
@@ -76,31 +76,31 @@ Jan is an open-source ChatGPT alternative that runs 100% offline on your computer
<tr style="text-align:center">
<td style="text-align:center"><b>Experimental (Nightly Build)</b></td>
<td style="text-align:center">
-<a href='https://delta.jan.ai/latest/jan-win-x64-0.4.8-321.exe'>
+<a href='https://delta.jan.ai/latest/jan-win-x64-0.4.8-323.exe'>
<img src='./docs/static/img/windows.png' style="height:14px; width: 14px" />
<b>jan.exe</b>
</a>
</td>
<td style="text-align:center">
-<a href='https://delta.jan.ai/latest/jan-mac-x64-0.4.8-321.dmg'>
+<a href='https://delta.jan.ai/latest/jan-mac-x64-0.4.8-323.dmg'>
<img src='./docs/static/img/mac.png' style="height:15px; width: 15px" />
<b>Intel</b>
</a>
</td>
<td style="text-align:center">
-<a href='https://delta.jan.ai/latest/jan-mac-arm64-0.4.8-321.dmg'>
+<a href='https://delta.jan.ai/latest/jan-mac-arm64-0.4.8-323.dmg'>
<img src='./docs/static/img/mac.png' style="height:15px; width: 15px" />
<b>M1/M2</b>
</a>
</td>
<td style="text-align:center">
-<a href='https://delta.jan.ai/latest/jan-linux-amd64-0.4.8-321.deb'>
+<a href='https://delta.jan.ai/latest/jan-linux-amd64-0.4.8-323.deb'>
<img src='./docs/static/img/linux.png' style="height:14px; width: 14px" />
<b>jan.deb</b>
</a>
</td>
<td style="text-align:center">
-<a href='https://delta.jan.ai/latest/jan-linux-x86_64-0.4.8-321.AppImage'>
+<a href='https://delta.jan.ai/latest/jan-linux-x86_64-0.4.8-323.AppImage'>
<img src='./docs/static/img/linux.png' style="height:14px; width: 14px" />
<b>jan.AppImage</b>
</a>
</td>
docs/docs/guides/providers/README.mdx (new file, 8 lines)
@@ -0,0 +1,8 @@
+---
+title: Inference Providers
+slug: /guides/providers
+---
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
docs/docs/guides/providers/image.png (new binary file, 27 KiB)
Binary file not shown.
docs/docs/guides/providers/llama-cpp.md (new file, 10 lines)
@@ -0,0 +1,10 @@
+---
+title: llama.cpp
+slug: /guides/providers/llama-cpp
+---
+
+## Overview
+
+[Nitro](https://github.com/janhq/nitro) is an inference server on top of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides an OpenAI-compatible API, queue, & scaling.
+
+Nitro is the default AI engine downloaded with Jan. There is no additional setup needed.
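Editor's note: since Nitro exposes an OpenAI-compatible API, a local chat request looks like any other OpenAI-style call. The sketch below is illustrative only; the port 3928, the route `/v1/chat/completions`, and the model id `tinyllama-1.1b` are assumptions about a typical local setup, not values taken from this commit.

```ts
// Minimal sketch: query a local Nitro server through its OpenAI-compatible API.
// Assumes Nitro is listening on 127.0.0.1:3928 and a model with id "tinyllama-1.1b"
// is already loaded; both values are placeholders for whatever your setup uses.
async function askNitro(prompt: string): Promise<string> {
  const res = await fetch('http://127.0.0.1:3928/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'tinyllama-1.1b',
      messages: [{ role: 'user', content: prompt }],
      stream: false,
    }),
  })
  if (!res.ok) throw new Error(`Nitro returned HTTP ${res.status}`)
  const data = await res.json()
  return data.choices[0].message.content
}

askNitro('Hello from Jan!').then(console.log).catch(console.error)
```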
docs/docs/guides/providers/tensorrt-llm.md (new file, 87 lines)
@@ -0,0 +1,87 @@
+---
+title: TensorRT-LLM
+slug: /guides/providers/tensorrt-llm
+---
+
+Users with Nvidia GPUs can get **20-40% faster\* token speeds** on their laptops or desktops by using [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). The greater implication is that you are running FP16, which is also more accurate than quantized models.
+
+This guide walks you through how to install Jan's official [TensorRT-LLM Extension](https://github.com/janhq/nitro-tensorrt-llm). This extension uses [Nitro-TensorRT-LLM](https://github.com/janhq/nitro-tensorrt-llm) as the AI engine, instead of the default [Nitro-Llama-CPP](https://github.com/janhq/nitro). It includes an efficient C++ server to natively execute the [TRT-LLM C++ runtime](https://nvidia.github.io/TensorRT-LLM/gpt_runtime.html). It also comes with additional features and performance improvements like OpenAI compatibility, tokenizer improvements, and queues.
+
+\*Compared to using the LlamaCPP engine.
+
+:::warning
+This feature is only available for Windows users. Linux is coming soon.
+
+Additionally, we only prebuilt a few demo models. You can always build your desired models directly on your machine. [Read here](#build-your-own-tensorrt-models).
+
+:::
+
+## Requirements
+
+- A Windows PC
+- Nvidia GPU(s): Ada or Ampere series (i.e. RTX 4000s & 3000s). More will be supported soon.
+- 3GB+ of disk space to download TRT-LLM artifacts and a Nitro binary
+- Jan v0.4.9+ or Jan v0.4.8-321+ (nightly)
+- Nvidia Driver v535+ ([installation guide](https://jan.ai/guides/common-error/not-using-gpu/#1-ensure-gpu-mode-requirements))
+- CUDA Toolkit v12.2+ ([installation guide](https://jan.ai/guides/common-error/not-using-gpu/#1-ensure-gpu-mode-requirements))
+
+## Install TensorRT-Extension
+
+1. Go to Settings > Extensions
+2. Click install next to the TensorRT-LLM Extension
+3. Check that files are correctly downloaded
+
+```sh
+ls ~\jan\extensions\@janhq\tensorrt-llm-extension\dist\bin
+# Your Extension Folder should now include `nitro.exe`, among other artifacts needed to run TRT-LLM
+```
+
+## Download a Compatible Model
+TensorRT-LLM can only run models in `TensorRT` format. These models, aka "TensorRT Engines", are prebuilt specifically for each target OS+GPU architecture.
+
+We offer a handful of precompiled models for Ampere and Ada cards that you can immediately download and play with:
+
+1. Restart the application and go to the Hub
+2. Look for models with the `TensorRT-LLM` label in the recommended models list. Click download. This step might take some time. 🙏
+
+
+
+3. Click use and start chatting!
+4. You may need to allow Nitro in your network
+
+
+
+:::warning
+If you are on our nightly builds, you may have to reinstall the TensorRT-LLM extension each time you update the app. We're working on better extension lifecycles - stay tuned.
+:::
+
+## Configure Settings
+
+You can customize the default parameters for how Jan runs TensorRT-LLM.
+
+:::info
+coming soon
+:::
+
+## Troubleshooting
+
+### Incompatible Extension vs Engine versions
+
+For now, the model versions are pinned to the extension versions.
+
+### Uninstall Extension
+
+1. Quit the app
+2. Go to Settings > Extensions
+3. Delete the entire Extensions folder.
+4. Reopen the app; only the default extensions should be restored.
+
+### Install Nitro-TensorRT-LLM manually
+
+To manually build the artifacts needed to run the server and TensorRT-LLM, you can reference the source code. [Read here](https://github.com/janhq/nitro-tensorrt-llm?tab=readme-ov-file#quickstart).
+
+### Build your own TensorRT models
+
+:::info
+coming soon
+:::

@@ -199,6 +199,19 @@ const sidebars = {
"guides/models/integrate-remote",
]
},
+{
+type: "category",
+label: "Inference Providers",
+className: "head_SubMenu",
+link: {
+type: 'doc',
+id: "guides/providers/README",
+},
+items: [
+"guides/providers/llama-cpp",
+"guides/providers/tensorrt-llm",
+]
+},
{
type: "category",
label: "Extensions",

@@ -2,17 +2,17 @@ import { GpuSetting, GpuSettingInfo, ResourceInfo } from '@janhq/core'
import { getJanDataFolderPath, log } from '@janhq/core/node'
import { mem, cpu } from 'node-os-utils'
import { exec } from 'child_process'
-import { writeFileSync, existsSync, readFileSync } from 'fs'
+import { writeFileSync, existsSync, readFileSync, mkdirSync } from 'fs'
import path from 'path'

/**
+* Path to the settings directory
+**/
+export const SETTINGS_DIR = path.join(getJanDataFolderPath(), 'settings')
+/**
* Path to the settings file
**/
-export const GPU_INFO_FILE = path.join(
-getJanDataFolderPath(),
-'settings',
-'settings.json'
-)
+export const GPU_INFO_FILE = path.join(SETTINGS_DIR, 'settings.json')

/**
* Default GPU settings

@@ -136,6 +136,11 @@ export const updateNvidiaInfo = async () => {
try {
JSON.parse(readFileSync(GPU_INFO_FILE, 'utf-8'))
} catch (error) {
+if (!existsSync(SETTINGS_DIR)) {
+mkdirSync(SETTINGS_DIR, {
+recursive: true,
+})
+}
writeFileSync(GPU_INFO_FILE, JSON.stringify(DEFAULT_SETTINGS, null, 2))
}
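Editor's note: the hunk above guards against a missing or unreadable settings file by recreating the settings directory and writing defaults. A generalized sketch of that pattern follows; the helper name, paths, and default values are hypothetical, not code from this repository.

```ts
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs'
import path from 'path'

// Hypothetical helper mirroring the pattern above: return parsed JSON settings,
// recreating the parent directory and default contents when the file is missing
// or unreadable. Names here are illustrative, not part of the Jan codebase.
function ensureJsonFile<T>(filePath: string, defaults: T): T {
  try {
    return JSON.parse(readFileSync(filePath, 'utf-8')) as T
  } catch {
    const dir = path.dirname(filePath)
    if (!existsSync(dir)) mkdirSync(dir, { recursive: true })
    writeFileSync(filePath, JSON.stringify(defaults, null, 2))
    return defaults
  }
}

// Example usage with made-up path and defaults:
const gpuSettings = ensureJsonFile(
  path.join('jan-data', 'settings', 'settings.json'),
  { notify: true }
)
console.log(gpuSettings)
```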

@@ -33,10 +33,57 @@
"description": "LlamaCorn is a refined version of TinyLlama-1.1B, optimized for conversational quality, running on consumer devices through TensorRT-LLM",
"format": "TensorRT-LLM",
"settings": {
-"ctx_len": 2048
+"ctx_len": 2048,
+"text_model": false
},
"parameters": {
"max_tokens": 4096
},
"metadata": {
"author": "LLama",
"tags": ["TensorRT-LLM", "1B", "Finetuned"],
"size": 2151000000
},
"engine": "nitro-tensorrt-llm"
},
+{
+"sources": [
+{
+"filename": "config.json",
+"url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/config.json"
+},
+{
+"filename": "rank0.engine",
+"url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/rank0.engine"
+},
+{
+"filename": "tokenizer.model",
+"url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.model"
+},
+{
+"filename": "special_tokens_map.json",
+"url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/special_tokens_map.json"
+},
+{
+"filename": "tokenizer.json",
+"url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer.json"
+},
+{
+"filename": "tokenizer_config.json",
+"url": "https://delta.jan.ai/dist/models/turing/windows/TinyJensen-1.1B-Chat-fp16/tokenizer_config.json"
+}
+],
+"id": "tinyjensen-1.1b-chat-fp16",
+"object": "model",
+"name": "TinyJensen 1.1B Chat FP16",
+"version": "1.0",
+"description": "Do you want to chat with Jensen Huan? Here you are",
+"format": "TensorRT-LLM",
+"settings": {
+"ctx_len": 2048,
+"text_model": false
+},
+"parameters": {
+"stream": true,
+"max_tokens": 4096
+},
+"metadata": {

@@ -1,6 +1,6 @@
{
"name": "@janhq/tensorrt-llm-extension",
-"version": "0.0.2",
+"version": "0.0.3",
"description": "Enables accelerated inference leveraging Nvidia's TensorRT-LLM for optimal GPU hardware optimizations. Compatible with models in TensorRT-LLM format. Requires Nvidia GPU driver and CUDA Toolkit installation.",
"main": "dist/index.js",
"node": "dist/node/index.cjs.js",

@@ -8,7 +8,7 @@
"license": "AGPL-3.0",
"config": {
"host": "127.0.0.1",
-"port": "3928"
+"port": "3929"
},
"compatibility": {
"platform": [

@@ -19,6 +19,8 @@ import {
systemInformations,
LocalOAIEngine,
fs,
+MessageRequest,
+ModelEvent,
} from '@janhq/core'
import models from '../models.json'

@@ -126,6 +128,21 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
events.on(DownloadEvent.onFileDownloadSuccess, onFileDownloadSuccess)
}

+async onModelInit(model: Model): Promise<void> {
+if (model.engine !== this.provider) return
+
+if ((await this.installationState()) === 'Installed')
+return super.onModelInit(model)
+else {
+events.emit(ModelEvent.OnModelFail, {
+...model,
+error: {
+message: 'EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension',
+},
+})
+}
+}
+
override async installationState(): Promise<InstallationState> {
// For now, we just check the executable of nitro x tensor rt
const isNitroExecutableAvailable = await executeOnMain(

@@ -144,4 +161,11 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
)
return Promise.resolve()
}
+
+inference(data: MessageRequest): void {
+if (!this.isRunning) return
+// TensorRT LLM Extension supports streaming only
+if (data.model) data.model.parameters.stream = true
+super.inference(data)
+}
}

@@ -74,12 +74,16 @@ export const useCreateNewThread = () => {

const defaultModel = model ?? recommendedModel ?? downloadedModels[0]

+if (!model) {
+// if we have model, which means user wants to create new thread from Model hub. Allow them.
+
// check last thread message, if there empty last message use can not create thread
const lastMessage = threads[0]?.metadata?.lastMessage

if (!lastMessage && threads.length) {
return null
}
+}

// modify assistant tools when experimental on, retieval toggle enabled in default
const assistantTools: AssistantTool = {

@@ -244,16 +244,13 @@ const ChatInput: React.FC = () => {
<li
className={twMerge(
'flex w-full cursor-pointer items-center space-x-2 px-4 py-2 text-muted-foreground hover:bg-secondary',
-activeThread?.assistants[0].model.settings.vision_model &&
-activeThread?.assistants[0].model.settings
-.text_model === false
+activeThread?.assistants[0].model.settings.text_model ===
+false
? 'cursor-not-allowed opacity-50'
: 'cursor-pointer'
)}
onClick={() => {
if (
!activeThread?.assistants[0].model.settings
.vision_model ||
activeThread?.assistants[0].model.settings
.text_model !== false
) {

@@ -7,11 +7,14 @@ import ModalTroubleShooting, {
modalTroubleShootingAtom,
} from '@/containers/ModalTroubleShoot'

+import { MainViewState } from '@/constants/screens'
+
import { loadModelErrorAtom } from '@/hooks/useActiveModel'
import useSendChatMessage from '@/hooks/useSendChatMessage'

import { getErrorTitle } from '@/utils/errorMessage'

+import { mainViewStateAtom } from '@/helpers/atoms/App.atom'
import { getCurrentChatMessagesAtom } from '@/helpers/atoms/ChatMessage.atom'

const ErrorMessage = ({ message }: { message: ThreadMessage }) => {

@@ -19,6 +22,7 @@ const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
const { resendChatMessage } = useSendChatMessage()
const setModalTroubleShooting = useSetAtom(modalTroubleShootingAtom)
const loadModelError = useAtomValue(loadModelErrorAtom)
+const setMainState = useSetAtom(mainViewStateAtom)
const PORT_NOT_AVAILABLE = 'PORT_NOT_AVAILABLE'

const regenerateMessage = async () => {

@@ -70,6 +74,23 @@ const ErrorMessage = ({ message }: { message: ThreadMessage }) => {
</p>
<ModalTroubleShooting />
</div>
+) : loadModelError?.includes('EXTENSION_IS_NOT_INSTALLED') ? (
+<div
+key={message.id}
+className="flex w-full flex-col items-center text-center text-sm font-medium text-gray-500"
+>
+<p className="w-[90%]">
+Model is currently unavailable. Please switch to a different
+model or install the{' '}
+<button
+className="font-medium text-blue-500"
+onClick={() => setMainState(MainViewState.Settings)}
+>
+{loadModelError.split('::')[1] ?? ''}
+</button>{' '}
+to continue using it.
+</p>
+</div>
) : (
<div
key={message.id}
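Editor's note: the new branch above pairs with the extension hunk earlier in this diff. The extension reports a failed load with a message following a `CODE::label` convention, e.g. `EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension`, and this component recovers the human-readable label with `split('::')[1]`. A standalone sketch of that convention is below; the helper name is illustrative, not an export of the Jan codebase.

```ts
// Illustrative sketch of the "CODE::label" error-message convention used above.
// parseLoadModelError is a hypothetical helper, not part of the Jan codebase.
type LoadModelError = { code: string; label: string }

function parseLoadModelError(message: string): LoadModelError {
  const [code, label = ''] = message.split('::')
  return { code, label }
}

const err = parseLoadModelError('EXTENSION_IS_NOT_INSTALLED::TensorRT-LLM extension')
console.log(err.code)  // "EXTENSION_IS_NOT_INSTALLED"
console.log(err.label) // "TensorRT-LLM extension"
```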

@@ -123,6 +123,7 @@ const TensorRtExtensionItem: React.FC<Props> = ({ item }) => {
{item.description}
</p>
</div>

{(!compatibility || compatibility['platform']?.includes(PLATFORM)) &&
isGpuSupported ? (
<div className="flex min-w-[150px] flex-row justify-end">

@@ -143,7 +144,8 @@ const TensorRtExtensionItem: React.FC<Props> = ({ item }) => {
</TooltipTrigger>
<TooltipPortal>
<TooltipContent side="top">
-{compatibility ? (
+{compatibility &&
+!compatibility['platform']?.includes(PLATFORM) ? (
<span>
Only available on{' '}
{compatibility?.platform

@@ -185,15 +187,14 @@ const InstallStateIndicator: React.FC<InstallStateProps> = ({
onInstallClick,
onCancelClick,
}) => {
-// TODO: NamH support dark mode for this
if (installProgress !== -1) {
const progress = installProgress * 100
return (
-<div className="flex h-10 flex-row items-center justify-center space-x-2 rounded-md bg-[#EFF8FF] px-4 text-primary">
+<div className="flex h-10 flex-row items-center justify-center space-x-2 rounded-lg bg-[#EFF8FF] px-4 text-primary dark:bg-secondary">
<button onClick={onCancelClick} className="font-semibold text-primary">
Cancel
</button>
-<div className="flex w-[113px] flex-row items-center justify-center space-x-2 rounded-md bg-[#D1E9FF] px-2 py-[2px]">
+<div className="flex w-[113px] flex-row items-center justify-center space-x-2 rounded-md bg-[#D1E9FF] px-2 py-[2px] dark:bg-black/50">
<Progress className="h-1 w-[69px]" value={progress} />
<span className="text-xs font-bold text-primary">
{progress.toFixed(0)}%