diff --git a/extensions/tensorrt-llm-extension/models.json b/extensions/tensorrt-llm-extension/models.json
index bc6a78256..31bb11a9e 100644
--- a/extensions/tensorrt-llm-extension/models.json
+++ b/extensions/tensorrt-llm-extension/models.json
@@ -33,10 +33,10 @@
     "description": "LlamaCorn is a refined version of TinyLlama-1.1B, optimized for conversational quality, running on consumer devices through TensorRT-LLM",
     "format": "TensorRT-LLM",
     "settings": {
-      "ctx_len": 2048
+      "ctx_len": 2048,
+      "text_model": false
     },
     "parameters": {
-      "stream": true,
       "max_tokens": 4096
     },
     "metadata": {
diff --git a/extensions/tensorrt-llm-extension/src/index.ts b/extensions/tensorrt-llm-extension/src/index.ts
index 076951c3f..e3014b447 100644
--- a/extensions/tensorrt-llm-extension/src/index.ts
+++ b/extensions/tensorrt-llm-extension/src/index.ts
@@ -19,6 +19,7 @@ import {
   systemInformations,
   LocalOAIEngine,
   fs,
+  MessageRequest,
 } from '@janhq/core'
 import models from '../models.json'
@@ -144,4 +145,10 @@ export default class TensorRTLLMExtension extends LocalOAIEngine {
     )
     return Promise.resolve()
   }
+
+  inference(data: MessageRequest): void {
+    // TensorRT LLM Extension supports streaming only
+    if (data.model) data.model.parameters.stream = true
+    super.inference(data)
+  }
 }
diff --git a/web/screens/Chat/ChatInput/index.tsx b/web/screens/Chat/ChatInput/index.tsx
index c90a12cd2..8707e8bcd 100644
--- a/web/screens/Chat/ChatInput/index.tsx
+++ b/web/screens/Chat/ChatInput/index.tsx
@@ -244,16 +244,13 @@ const ChatInput: React.FC = () => {
             {
               if (
-                !activeThread?.assistants[0].model.settings
-                  .vision_model ||
                 activeThread?.assistants[0].model.settings
                   .text_model !== false
               ) {