From 4293fe7edc86565539bbdcec4a28ccdad5335e14 Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Fri, 12 Sep 2025 10:58:34 +0700 Subject: [PATCH 01/32] fix: avoid error validate nested dom --- .../src/containers/dialogs/DeleteMessageDialog.tsx | 14 ++++++++++++-- .../src/containers/dialogs/EditMessageDialog.tsx | 14 ++++++++++++-- web-app/src/containers/dialogs/ErrorDialog.tsx | 12 ++++++++++-- .../containers/dialogs/LoadModelErrorDialog.tsx | 12 ++++++++++-- .../containers/dialogs/MessageMetadataDialog.tsx | 14 ++++++++++++-- 5 files changed, 56 insertions(+), 10 deletions(-) diff --git a/web-app/src/containers/dialogs/DeleteMessageDialog.tsx b/web-app/src/containers/dialogs/DeleteMessageDialog.tsx index 8df4c7118..c4444b1bd 100644 --- a/web-app/src/containers/dialogs/DeleteMessageDialog.tsx +++ b/web-app/src/containers/dialogs/DeleteMessageDialog.tsx @@ -41,9 +41,19 @@ export function DeleteMessageDialog({ onDelete }: DeleteMessageDialogProps) { const trigger = ( - +

{t('delete')}

diff --git a/web-app/src/containers/dialogs/EditMessageDialog.tsx b/web-app/src/containers/dialogs/EditMessageDialog.tsx index 8faa32128..ade50befe 100644 --- a/web-app/src/containers/dialogs/EditMessageDialog.tsx +++ b/web-app/src/containers/dialogs/EditMessageDialog.tsx @@ -64,9 +64,19 @@ export function EditMessageDialog({ const defaultTrigger = ( - +

{t('edit')}

diff --git a/web-app/src/containers/dialogs/ErrorDialog.tsx b/web-app/src/containers/dialogs/ErrorDialog.tsx index cd6ca879a..9f4784ad0 100644 --- a/web-app/src/containers/dialogs/ErrorDialog.tsx +++ b/web-app/src/containers/dialogs/ErrorDialog.tsx @@ -61,9 +61,17 @@ export default function ErrorDialog() {
- +
{isDetailExpanded && (
- +
{isDetailExpanded && (
- +

{t('metadata')}

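Note on the hunks above: the JSX markup was largely stripped in transit, so the following is only a minimal sketch of the pattern this commit appears to apply, assuming a Radix-style DialogTrigger used with asChild. The component names, class names, and icon below are illustrative assumptions, not code copied from the diff. The idea is that the trigger content is rendered as a plain div with button semantics instead of a nested Button, so the dialog trigger no longer produces a button element inside another interactive element, which is what raised the nested-DOM validation error.

    // Sketch only: imports, icon, and styling are assumptions based on the surrounding code.
    import { DialogTrigger } from '@/components/ui/dialog'
    import { IconTrash } from '@tabler/icons-react'
    import { useTranslation } from 'react-i18next'

    export function DeleteMessageTrigger() {
      const { t } = useTranslation()
      return (
        <DialogTrigger asChild>
          {/* A div with button semantics avoids rendering <button> inside <button>. */}
          <div
            role="button"
            tabIndex={0}
            className="flex items-center gap-1 cursor-pointer"
          >
            <IconTrash size={16} />
            <span>{t('delete')}</span>
          </div>
        </DialogTrigger>
      )
    }

The other dialogs touched by this patch (EditMessageDialog, ErrorDialog, LoadModelErrorDialog, MessageMetadataDialog) appear to receive the same wrapping, judging from their matching hunk shapes.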
From 654e566dcbc228da94fb59b2fcf46aeaa0ea127d Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Fri, 12 Sep 2025 13:43:31 +0530 Subject: [PATCH 02/32] fix: correct context shift flag handling in LlamaCPP extension (#6404) (#6431) * fix: correct context shift flag handling in LlamaCPP extension The previous implementation added the `--no-context-shift` flag when `cfg.ctx_shift` was disabled, which conflicted with the llama.cpp CLI where the presence of `--context-shift` enables the feature. The logic is updated to push `--context-shift` only when `cfg.ctx_shift` is true, ensuring the extension passes the correct argument and behaves as expected. * feat: detect model out of context during generation --------- Co-authored-by: Dinh Long Nguyen --- extensions/llamacpp-extension/src/index.ts | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index a086b74db..1d98d4213 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -42,6 +42,9 @@ import { } from '@janhq/tauri-plugin-llamacpp-api' import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api' +// Error message constant - matches web-app/src/utils/error.ts +const OUT_OF_CONTEXT_SIZE = 'the request exceeds the available context size.' + type LlamacppConfig = { version_backend: string auto_update_engine: boolean @@ -1541,7 +1544,7 @@ export default class llamacpp_extension extends AIEngine { args.push('--main-gpu', String(cfg.main_gpu)) // Boolean flags - if (!cfg.ctx_shift) args.push('--no-context-shift') + if (cfg.ctx_shift) args.push('--context-shift') if (Number(version.replace(/^b/, '')) >= 6325) { if (!cfg.flash_attn) args.push('--flash-attn', 'off') //default: auto = ON when supported } else { @@ -1739,6 +1742,13 @@ export default class llamacpp_extension extends AIEngine { try { const data = JSON.parse(jsonStr) const chunk = data as chatCompletionChunk + + // Check for out-of-context error conditions + if (chunk.choices?.[0]?.finish_reason === 'length') { + // finish_reason 'length' indicates context limit was hit + throw new Error(OUT_OF_CONTEXT_SIZE) + } + yield chunk } catch (e) { logger.error('Error parsing JSON from stream or server error:', e) @@ -1817,7 +1827,15 @@ export default class llamacpp_extension extends AIEngine { ) } - return (await response.json()) as chatCompletion + const completionResponse = (await response.json()) as chatCompletion + + // Check for out-of-context error conditions + if (completionResponse.choices?.[0]?.finish_reason === 'length') { + // finish_reason 'length' indicates context limit was hit + throw new Error(OUT_OF_CONTEXT_SIZE) + } + + return completionResponse } override async delete(modelId: string): Promise { From 6959329fd6d6f3a069b76702d43787b30c5da0c5 Mon Sep 17 00:00:00 2001 From: Minh141120 Date: Mon, 15 Sep 2025 09:32:51 +0700 Subject: [PATCH 03/32] chore: add install-rust-targets step for macOS universal builds --- .../template-tauri-build-macos-external.yml | 1 - .github/workflows/template-tauri-build-macos.yml | 1 - Makefile | 16 ++++++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/template-tauri-build-macos-external.yml b/.github/workflows/template-tauri-build-macos-external.yml index e69e298a6..8f61b86fa 100644 --- a/.github/workflows/template-tauri-build-macos-external.yml +++ b/.github/workflows/template-tauri-build-macos-external.yml @@ 
-89,7 +89,6 @@ jobs: - name: Build app run: | - rustup target add x86_64-apple-darwin make build env: APP_PATH: '.' diff --git a/.github/workflows/template-tauri-build-macos.yml b/.github/workflows/template-tauri-build-macos.yml index 40cf4e839..332ecc42f 100644 --- a/.github/workflows/template-tauri-build-macos.yml +++ b/.github/workflows/template-tauri-build-macos.yml @@ -167,7 +167,6 @@ jobs: - name: Build app run: | - rustup target add x86_64-apple-darwin make build env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/Makefile b/Makefile index 457f314ef..65fc67fc4 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,17 @@ endif yarn build:core yarn build:extensions && yarn build:extensions-web +# Install required Rust targets for macOS universal builds +install-rust-targets: +ifeq ($(shell uname -s),Darwin) + @echo "Detected macOS, installing universal build targets..." + rustup target add x86_64-apple-darwin + rustup target add aarch64-apple-darwin + @echo "Rust targets installed successfully!" +else + @echo "Not macOS; skipping Rust target installation." +endif + dev: install-and-build yarn download:bin yarn download:lib @@ -68,11 +79,12 @@ test: lint cargo test --manifest-path src-tauri/utils/Cargo.toml # Builds and publishes the app -build-and-publish: install-and-build +build-and-publish: install-and-build install-rust-targets yarn build # Build -build: install-and-build +build: install-and-build install-rust-targets + install-rust-targets yarn download:lib yarn build From 4fa78fa8920f08f366759f1d3e721b055d4147aa Mon Sep 17 00:00:00 2001 From: Minh141120 Date: Mon, 15 Sep 2025 09:44:21 +0700 Subject: [PATCH 04/32] fix: make install-rust-targets a dependency --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 65fc67fc4..72aca58c1 100644 --- a/Makefile +++ b/Makefile @@ -84,7 +84,6 @@ build-and-publish: install-and-build install-rust-targets # Build build: install-and-build install-rust-targets - install-rust-targets yarn download:lib yarn build From 44893bc3c3d210e24dbb623599e81bbb4262339e Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Mon, 15 Sep 2025 10:33:05 +0700 Subject: [PATCH 05/32] enhancement: copy MCP permission --- web-app/src/containers/dialogs/ToolApproval.tsx | 7 +++++-- web-app/src/locales/de-DE/mcp-servers.json | 2 +- web-app/src/locales/de-DE/tool-approval.json | 2 +- web-app/src/locales/de-DE/tools.json | 5 +++-- web-app/src/locales/en/mcp-servers.json | 2 +- web-app/src/locales/en/tool-approval.json | 6 +++--- web-app/src/locales/en/tools.json | 5 +++-- web-app/src/locales/id/mcp-servers.json | 2 +- web-app/src/locales/id/tool-approval.json | 2 +- web-app/src/locales/id/tools.json | 5 +++-- web-app/src/locales/pl/mcp-servers.json | 2 +- web-app/src/locales/pl/tool-approval.json | 2 +- web-app/src/locales/pl/tools.json | 5 +++-- web-app/src/locales/vn/tool-approval.json | 2 +- web-app/src/locales/vn/tools.json | 5 +++-- web-app/src/locales/zh-CN/mcp-servers.json | 2 +- web-app/src/locales/zh-CN/tool-approval.json | 2 +- web-app/src/locales/zh-CN/tools.json | 5 +++-- web-app/src/locales/zh-TW/mcp-servers.json | 2 +- web-app/src/locales/zh-TW/tool-approval.json | 2 +- web-app/src/locales/zh-TW/tools.json | 5 +++-- 21 files changed, 41 insertions(+), 31 deletions(-) diff --git a/web-app/src/containers/dialogs/ToolApproval.tsx b/web-app/src/containers/dialogs/ToolApproval.tsx index 3aaa622f7..96bf0590a 100644 --- a/web-app/src/containers/dialogs/ToolApproval.tsx +++ b/web-app/src/containers/dialogs/ToolApproval.tsx @@ -52,7 +52,10 
@@ export default function ToolApproval() { {t('tools:toolApproval.title')} {t('tools:toolApproval.description')}{' '} - {toolName} + {toolName}.  + + {t('tools:toolApproval.permissionScope')} +
@@ -85,7 +88,7 @@ export default function ToolApproval() { > {t('tools:toolApproval.deny')} -
+
-
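From the surviving fragments of the ToolApproval hunk, the description now ends the tool name with a period and appends a separate permission-scope hint driven by the new permissionScope locale key. A rough sketch follows; only the translation keys and the trailing period after the tool name come from the diff, everything else (element structure, styling, component names) is assumed.

    // Sketch only: 'tools:toolApproval.description' and
    // 'tools:toolApproval.permissionScope' are the keys visible in the diff;
    // the surrounding markup is an assumption.
    import { DialogDescription } from '@/components/ui/dialog'
    import { useTranslation } from 'react-i18next'

    function ToolApprovalDescription({ toolName }: { toolName: string }) {
      const { t } = useTranslation()
      return (
        <DialogDescription>
          {t('tools:toolApproval.description')}{' '}
          <span className="font-medium">{toolName}</span>.{' '}
          <span>{t('tools:toolApproval.permissionScope')}</span>
        </DialogDescription>
      )
    }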
From a4483b7eb72eac58c360a47e79facfa947394c8d Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Mon, 15 Sep 2025 10:38:36 +0700 Subject: [PATCH 07/32] Update web-app/src/locales/en/tool-approval.json Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- web-app/src/locales/en/tool-approval.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web-app/src/locales/en/tool-approval.json b/web-app/src/locales/en/tool-approval.json index bdbf08fbe..2d0c8b549 100644 --- a/web-app/src/locales/en/tool-approval.json +++ b/web-app/src/locales/en/tool-approval.json @@ -1,6 +1,6 @@ { "title": "Tool Call Request", - "description": "The assistant wants to use the tool: {{toolName}} hello", + "description": "The assistant wants to use the tool: {{toolName}}", "securityNotice": "Security Notice: Malicious tools or conversation content could potentially trick the assistant into attempting harmful actions. Review each tool call carefully before approving.", "deny": "Deny", "allowOnce": "Allow Once", From 1db67ea9a2d2638d866cd77eb86c32c2e2d9d234 Mon Sep 17 00:00:00 2001 From: Minh141120 Date: Mon, 15 Sep 2025 11:24:11 +0700 Subject: [PATCH 08/32] chore: simplify macos workflow --- .../workflows/template-tauri-build-macos.yml | 25 ------------------- Makefile | 1 + 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/.github/workflows/template-tauri-build-macos.yml b/.github/workflows/template-tauri-build-macos.yml index 332ecc42f..4646041cf 100644 --- a/.github/workflows/template-tauri-build-macos.yml +++ b/.github/workflows/template-tauri-build-macos.yml @@ -92,31 +92,6 @@ jobs: run: | cargo install ctoml - - name: Create bun and uv universal - run: | - mkdir -p ./src-tauri/resources/bin/ - cd ./src-tauri/resources/bin/ - curl -L -o bun-darwin-x64.zip https://github.com/oven-sh/bun/releases/download/bun-v1.2.10/bun-darwin-x64.zip - curl -L -o bun-darwin-aarch64.zip https://github.com/oven-sh/bun/releases/download/bun-v1.2.10/bun-darwin-aarch64.zip - unzip bun-darwin-x64.zip - unzip bun-darwin-aarch64.zip - lipo -create -output bun-universal-apple-darwin bun-darwin-x64/bun bun-darwin-aarch64/bun - cp -f bun-darwin-aarch64/bun bun-aarch64-apple-darwin - cp -f bun-darwin-x64/bun bun-x86_64-apple-darwin - cp -f bun-universal-apple-darwin bun - - curl -L -o uv-x86_64.tar.gz https://github.com/astral-sh/uv/releases/download/0.6.17/uv-x86_64-apple-darwin.tar.gz - curl -L -o uv-arm64.tar.gz https://github.com/astral-sh/uv/releases/download/0.6.17/uv-aarch64-apple-darwin.tar.gz - tar -xzf uv-x86_64.tar.gz - tar -xzf uv-arm64.tar.gz - mv uv-x86_64-apple-darwin uv-x86_64 - mv uv-aarch64-apple-darwin uv-aarch64 - lipo -create -output uv-universal-apple-darwin uv-x86_64/uv uv-aarch64/uv - cp -f uv-x86_64/uv uv-x86_64-apple-darwin - cp -f uv-aarch64/uv uv-aarch64-apple-darwin - cp -f uv-universal-apple-darwin uv - ls -la - - name: Update app version based on latest release tag with build number run: | echo "Version: ${{ inputs.new_version }}" diff --git a/Makefile b/Makefile index 72aca58c1..b960b67aa 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,7 @@ build-and-publish: install-and-build install-rust-targets # Build build: install-and-build install-rust-targets + yarn download:bin yarn download:lib yarn build From 489c5a3d9c3e588db18fb4c3d311b68bbf283fe5 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 15 Sep 2025 10:16:13 +0530 Subject: [PATCH 09/32] fix: KVCache size calculation and refactor (#6438) MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit - Removed the unused `getKVCachePerToken` helper and replaced it with a unified `estimateKVCache` that returns both total size and per‑token size. - Fixed the KV cache size calculation to account for all layers, correcting previous under‑estimation. - Added proper clamping of user‑requested context lengths to the model’s maximum. - Refactored VRAM budgeting: introduced explicit reserves, fixed engine overhead, and separate multipliers for VRAM and system RAM based on memory mode. - Implemented a more robust planning flow with clear GPU, Hybrid, and CPU pathways, including fallback configurations when resources are insufficient. - Updated default context length handling and safety buffers to prevent OOM situations. - Adjusted usable memory percentage to 90 % and refined logging for easier debugging. --- extensions/llamacpp-extension/src/index.ts | 353 +++++++++------------ 1 file changed, 143 insertions(+), 210 deletions(-) diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index 1d98d4213..efccca679 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -1742,13 +1742,13 @@ export default class llamacpp_extension extends AIEngine { try { const data = JSON.parse(jsonStr) const chunk = data as chatCompletionChunk - + // Check for out-of-context error conditions if (chunk.choices?.[0]?.finish_reason === 'length') { // finish_reason 'length' indicates context limit was hit throw new Error(OUT_OF_CONTEXT_SIZE) } - + yield chunk } catch (e) { logger.error('Error parsing JSON from stream or server error:', e) @@ -1828,13 +1828,13 @@ export default class llamacpp_extension extends AIEngine { } const completionResponse = (await response.json()) as chatCompletion - + // Check for out-of-context error conditions if (completionResponse.choices?.[0]?.finish_reason === 'length') { // finish_reason 'length' indicates context limit was hit throw new Error(OUT_OF_CONTEXT_SIZE) } - + return completionResponse } @@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine { totalMemory, } } - private async getKVCachePerToken( - meta: Record - ): Promise { - const arch = meta['general.architecture'] - const nLayer = Number(meta[`${arch}.block_count`]) - const nHead = Number(meta[`${arch}.attention.head_count`]) - - // Get head dimensions - const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead - const embeddingLen = Number(meta[`${arch}.embedding_length`]) - const headDim = embeddingLen / nHead - - // KV cache uses head_count_kv (for GQA models) or head_count - // Each token needs K and V, both are fp16 (2 bytes) - const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers - - return bytesPerToken - } private async getLayerSize( path: string, @@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine { gguf.metadata ) - // Fixed KV cache calculation - const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata) + const kvCachePerToken = (await this.estimateKVCache(gguf.metadata)) + .perTokenSize - // Debug logging logger.info( `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}` ) @@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine { throw new Error(`Invalid layer size: ${layerSize}`) } - // GPU overhead factor (20% reserved for GPU operations, alignment, etc.) 
- const GPU_OVERHEAD_FACTOR = 0.8 - - // VRAM budget with overhead consideration + // Reserve memory for OS, other applications, and fixed engine overhead. const VRAM_RESERVE_GB = 0.5 const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024 - const usableVRAM = Math.max( - 0, - (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR - ) + const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc. // Get model's maximum context length const arch = gguf.metadata['general.architecture'] const modelMaxContextLength = - Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback + Number(gguf.metadata[`${arch}.context_length`]) || 8192 - // Set minimum context length - const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility + const MIN_CONTEXT_LENGTH = 1024 - // System RAM budget + // Memory percentages applied to both VRAM and RAM const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 } logger.info( `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}` ) - // Validate memory info if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) { throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`) } @@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine { ) } - // Calculate actual system RAM - const actualSystemRAM = Math.max( + // Apply memory mode to both VRAM and RAM separately + const memoryModeMultiplier = memoryPercentages[this.memoryMode] + const usableVRAM = Math.max( 0, - memoryInfo.totalMemory - memoryInfo.totalVRAM + memoryInfo.totalVRAM * memoryModeMultiplier - + VRAM_RESERVE_BYTES - + ENGINE_FIXED_OVERHEAD_BYTES ) - const usableSystemMemory = - actualSystemRAM * memoryPercentages[this.memoryMode] + + const actualSystemRAM = Math.max(0, memoryInfo.totalRAM) + const usableSystemMemory = actualSystemRAM * memoryModeMultiplier logger.info( - `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}` + `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}` ) - // --- Priority 1: Allocate mmproj (if exists) --- - let offloadMmproj = false - let remainingVRAM = usableVRAM - - if (mmprojSize > 0) { - if (mmprojSize <= remainingVRAM) { - offloadMmproj = true - remainingVRAM -= mmprojSize - logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`) - } else { - logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`) - } - } - - // --- Priority 2: Calculate optimal layer/context balance --- let gpuLayers = 0 - let maxContextLength = MIN_CONTEXT_LENGTH + let maxContextLength = 0 let noOffloadKVCache = false let mode: ModelPlan['mode'] = 'Unsupported' + let offloadMmproj = false - // Calculate how much VRAM we need for different context sizes - const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072] - const targetContext = requestedCtx || modelMaxContextLength - - // Find the best balance of layers and context - let bestConfig = { - layers: 0, - context: MIN_CONTEXT_LENGTH, - vramUsed: 0, + let remainingVRAM = usableVRAM + if (mmprojSize > 0 && mmprojSize <= remainingVRAM) { + offloadMmproj = true + remainingVRAM -= mmprojSize } + const vramForMinContext = ( + await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH) + ).size - for (const ctxSize of contextSizes) { - if (ctxSize > targetContext) break - - const kvCacheSize = ctxSize * kvCachePerToken - 
const availableForLayers = remainingVRAM - kvCacheSize - - if (availableForLayers <= 0) continue - - const possibleLayers = Math.min( - Math.floor(availableForLayers / layerSize), - totalLayers + const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize) + if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) { + logger.error( + `Model unsupported. Not enough resources for model and min context.` ) - - if (possibleLayers > 0) { - const totalVramNeeded = possibleLayers * layerSize + kvCacheSize - - // Verify this fits with some margin - if (totalVramNeeded <= remainingVRAM * 0.95) { - bestConfig = { - layers: possibleLayers, - context: ctxSize, - vramUsed: totalVramNeeded, - } - } + return { + gpuLayers: 0, + maxContextLength: 0, + noOffloadKVCache: true, + mode: 'Unsupported', + offloadMmproj: false, } } - // Apply the best configuration found - if (bestConfig.layers > 0) { - gpuLayers = bestConfig.layers - maxContextLength = bestConfig.context + const targetContext = Math.min( + requestedCtx || modelMaxContextLength, + modelMaxContextLength + ) + + let targetContextSize = ( + await this.estimateKVCache(gguf.metadata, targetContext) + ).size + + // Use `kvCachePerToken` for all VRAM calculations + if (modelSize + targetContextSize <= remainingVRAM) { + mode = 'GPU' + gpuLayers = totalLayers + maxContextLength = targetContext noOffloadKVCache = false - mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid' + logger.info( + 'Planning: Ideal case fits. All layers and target context in VRAM.' + ) + } else if (modelSize <= remainingVRAM) { + mode = 'GPU' + gpuLayers = totalLayers + noOffloadKVCache = false + const vramLeftForContext = remainingVRAM - modelSize + maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken) + + // Add safety check to prevent OOM + const safetyBuffer = 0.9 // Use 90% of calculated context to be safe + maxContextLength = Math.floor(maxContextLength * safetyBuffer) logger.info( - `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` + - `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes` + `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}` ) } else { - // Fallback: Try minimal GPU layers with KV cache on CPU - gpuLayers = Math.min( - Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers - totalLayers - ) + const vramAvailableForLayers = remainingVRAM - vramForMinContext - if (gpuLayers > 0) { - // Calculate available system RAM for KV cache - const cpuLayers = totalLayers - gpuLayers - const modelCPUSize = cpuLayers * layerSize - const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0 - const systemRAMUsed = modelCPUSize + mmprojCPUSize - const availableSystemRAMForKVCache = Math.max( - 0, - usableSystemMemory - systemRAMUsed + if (vramAvailableForLayers >= layerSize) { + mode = 'Hybrid' + gpuLayers = Math.min( + Math.floor(vramAvailableForLayers / layerSize), + totalLayers ) + noOffloadKVCache = false + const vramUsedByLayers = gpuLayers * layerSize + const vramLeftForContext = remainingVRAM - vramUsedByLayers + maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken) - // Calculate context that fits in system RAM - const systemRAMContext = Math.min( - Math.floor(availableSystemRAMForKVCache / kvCachePerToken), - targetContext + logger.info( + 'Planning: Hybrid mode. Offloading layers to fit context in VRAM.' 
) + } + } - if (systemRAMContext >= MIN_CONTEXT_LENGTH) { - maxContextLength = systemRAMContext - noOffloadKVCache = true + // Fallback logic: try different configurations if no VRAM-based plan worked + if (mode === 'Unsupported') { + logger.info('Planning: Trying fallback configurations...') + + // Try putting some layers on GPU with KV cache in RAM + const possibleGpuLayers = Math.floor(remainingVRAM / layerSize) + if (possibleGpuLayers > 0) { + gpuLayers = Math.min(possibleGpuLayers, totalLayers) + const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize + const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0 + const availableRamForKv = + usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj) + // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific + const contextInRam = Math.floor(availableRamForKv / kvCachePerToken) + + if (contextInRam >= MIN_CONTEXT_LENGTH) { mode = 'Hybrid' - - logger.info( - `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` + - `${maxContextLength} context on CPU RAM` - ) - } else { - // Can't fit reasonable context even with CPU RAM - // Reduce GPU layers further - gpuLayers = Math.floor(gpuLayers / 2) - maxContextLength = MIN_CONTEXT_LENGTH + maxContextLength = contextInRam noOffloadKVCache = true - mode = gpuLayers > 0 ? 'Hybrid' : 'CPU' + logger.info( + `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}` + ) } - } else { - // Pure CPU mode + } + + // If still unsupported, try pure CPU mode + if (mode === 'Unsupported') { gpuLayers = 0 noOffloadKVCache = true - - // Calculate context for pure CPU mode - const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0) - const availableForKVCache = Math.max( - 0, - usableSystemMemory - totalCPUMemoryNeeded - ) - - maxContextLength = Math.min( - Math.max( - MIN_CONTEXT_LENGTH, - Math.floor(availableForKVCache / kvCachePerToken) - ), - targetContext - ) - - mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported' - } - } - - // Safety check: Verify total GPU memory usage - if (gpuLayers > 0 && !noOffloadKVCache) { - const estimatedGPUUsage = - gpuLayers * layerSize + - maxContextLength * kvCachePerToken + - (offloadMmproj ? mmprojSize : 0) - - if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) { - logger.warn( - `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...` - ) - - // Reduce context first - while ( - maxContextLength > MIN_CONTEXT_LENGTH && - estimatedGPUUsage > memoryInfo.totalVRAM * 0.9 - ) { - maxContextLength = Math.floor(maxContextLength / 2) - const newEstimate = - gpuLayers * layerSize + - maxContextLength * kvCachePerToken + - (offloadMmproj ? mmprojSize : 0) - if (newEstimate <= memoryInfo.totalVRAM * 0.9) break - } - - // If still too much, reduce layers - if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) { - gpuLayers = Math.floor(gpuLayers * 0.7) - mode = gpuLayers > 0 ? 
'Hybrid' : 'CPU' - noOffloadKVCache = true // Move KV cache to CPU + offloadMmproj = false + const ramUsedByModel = modelSize + mmprojSize + const availableRamForKv = usableSystemMemory - ramUsedByModel + maxContextLength = Math.floor(availableRamForKv / kvCachePerToken) + if (maxContextLength >= MIN_CONTEXT_LENGTH) { + mode = 'CPU' + logger.info(`Planning: CPU mode - Context: ${maxContextLength}`) } } } - // Apply user-requested context limit if specified + if (mode === 'CPU' || noOffloadKVCache) { + offloadMmproj = false + } + if (requestedCtx && requestedCtx > 0) { maxContextLength = Math.min(maxContextLength, requestedCtx) - logger.info( - `User requested context: ${requestedCtx}, final: ${maxContextLength}` - ) } - // Ensure we never exceed model's maximum context maxContextLength = Math.min(maxContextLength, modelMaxContextLength) - // Final validation - if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) { + if (maxContextLength < MIN_CONTEXT_LENGTH) { mode = 'Unsupported' } - // Ensure maxContextLength is valid - maxContextLength = isNaN(maxContextLength) - ? MIN_CONTEXT_LENGTH - : Math.max(MIN_CONTEXT_LENGTH, maxContextLength) + if (mode === 'Unsupported') { + gpuLayers = 0 + maxContextLength = 0 + } + + maxContextLength = isNaN(maxContextLength) + ? 0 + : Math.floor(maxContextLength) - // Log final plan const mmprojInfo = mmprojPath ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}` : '' @@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine { offloadMmproj, } } - /** * estimate KVCache size from a given metadata */ private async estimateKVCache( meta: Record, ctx_size?: number - ): Promise { + ): Promise<{ size: number; perTokenSize: number }> { const arch = meta['general.architecture'] if (!arch) throw new Error('Invalid metadata: architecture not found') @@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine { ) } - let ctxLen: number - if (!ctx_size) { - ctxLen = Number(meta[`${arch}.context_length`]) - } else { - ctxLen = ctx_size - } + const maxCtx = Number(meta[`${arch}.context_length`]) + if (!maxCtx) throw new Error('Invalid metadata: context_length not found') + + // b) If the user supplied a value, clamp it to the model's max + let ctxLen = ctx_size ? 
Math.min(ctx_size, maxCtx) : maxCtx + + logger.info(`Final context length used for KV size: ${ctxLen}`) + logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`) logger.info(`ctxLen: ${ctxLen}`) logger.info(`nLayer: ${nLayer}`) @@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine { // fp16 = 8 bits * 2 = 16 const bytesPerElement = 2 - // Total KV cache size per token = nHead * headDim * bytesPerElement - const kvPerToken = nHead * headDim * bytesPerElement + // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer + const kvPerToken = nHead * headDim * bytesPerElement * nLayer - return ctxLen * nLayer * kvPerToken + return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken } } private async getModelSize(path: string): Promise { @@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine { const gguf = await readGgufMetadata(path) let kvCacheSize: number if (ctx_size) { - kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size) + kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size } else { - kvCacheSize = await this.estimateKVCache(gguf.metadata) + kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size } // Total memory consumption = model weights + kvcache @@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine { ) // Use 80% of total memory as the usable limit - const USABLE_MEMORY_PERCENTAGE = 0.8 + const USABLE_MEMORY_PERCENTAGE = 0.9 const usableTotalMemory = - memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE + memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE + + memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE // Check if model fits in total memory at all From e80a865def0d4f07414caa347b76fddd136cdad7 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 15 Sep 2025 12:35:24 +0530 Subject: [PATCH 10/32] fix: detect allocation failures as out-of-memory errors (#6459) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Llama.cpp backend can emit the phrase “failed to allocate” when it runs out of memory. Adding this check ensures such messages are correctly classified as out‑of‑memory errors, providing more accurate error handling CPU backends. 
--- src-tauri/plugins/tauri-plugin-llamacpp/src/error.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/error.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/error.rs index 647b2fead..d26e612fb 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/error.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/error.rs @@ -48,6 +48,7 @@ impl LlamacppError { let lower_stderr = stderr.to_lowercase(); // TODO: add others let is_out_of_memory = lower_stderr.contains("out of memory") + || lower_stderr.contains("failed to allocate") || lower_stderr.contains("insufficient memory") || lower_stderr.contains("erroroutofdevicememory") // vulkan specific || lower_stderr.contains("kiogpucommandbuffercallbackerroroutofmemory") // Metal-specific error code From 18114c0a15ebe54de87d854db05f2a8e3958282f Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Mon, 15 Sep 2025 18:05:11 +0700 Subject: [PATCH 11/32] fix: pathname file install BE --- web-app/src/routes/settings/providers/$providerName.tsx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/web-app/src/routes/settings/providers/$providerName.tsx b/web-app/src/routes/settings/providers/$providerName.tsx index b5a30acdf..533536281 100644 --- a/web-app/src/routes/settings/providers/$providerName.tsx +++ b/web-app/src/routes/settings/providers/$providerName.tsx @@ -357,12 +357,9 @@ function ProviderDetail() { if (selectedFile && typeof selectedFile === 'string') { // Process the file path: replace spaces with dashes and convert to lowercase - const processedFilePath = selectedFile - .replace(/\s+/g, '-') - .toLowerCase() // Install the backend using the llamacpp extension - await installBackend(processedFilePath) + await installBackend(selectedFile) // Extract filename from the selected file path and replace spaces with dashes const fileName = ( From 9e3a77a5597e6788fa3cc6a7d6ee7121da7d3938 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 15 Sep 2025 19:00:46 +0530 Subject: [PATCH 12/32] fix: set default memory mode and clean up unused import (#6463) Use fallback value 'high' for memory_util config and remove unused GgufMetadata import. --- extensions/llamacpp-extension/src/index.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index efccca679..2d5d9272b 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -37,7 +37,6 @@ import { invoke } from '@tauri-apps/api/core' import { getProxyConfig } from './util' import { basename } from '@tauri-apps/api/path' import { - GgufMetadata, readGgufMetadata, } from '@janhq/tauri-plugin-llamacpp-api' import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api' @@ -178,7 +177,7 @@ export default class llamacpp_extension extends AIEngine { provider: string = 'llamacpp' autoUnload: boolean = true llamacpp_env: string = '' - memoryMode: string = 'high' + memoryMode: string = '' readonly providerId: string = 'llamacpp' private config: LlamacppConfig @@ -210,7 +209,7 @@ export default class llamacpp_extension extends AIEngine { this.autoUnload = this.config.auto_unload this.llamacpp_env = this.config.llamacpp_env - this.memoryMode = this.config.memory_util + this.memoryMode = this.config.memory_util || 'high' // This sets the base directory where model files for this provider are stored. 
this.providerPath = await joinPath([ From 5736d7b11083e7d3adf43d82371aea07bdd5676e Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Mon, 15 Sep 2025 20:51:27 +0700 Subject: [PATCH 13/32] fix: auto update should not block popup --- web-app/src/containers/dialogs/BackendUpdater.tsx | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/web-app/src/containers/dialogs/BackendUpdater.tsx b/web-app/src/containers/dialogs/BackendUpdater.tsx index 071559c6c..c91a37bc4 100644 --- a/web-app/src/containers/dialogs/BackendUpdater.tsx +++ b/web-app/src/containers/dialogs/BackendUpdater.tsx @@ -35,15 +35,25 @@ const BackendUpdater = () => { }) useEffect(() => { + console.log('BackendUpdater state update:', { + remindMeLater: updateState.remindMeLater, + isUpdateAvailable: updateState.isUpdateAvailable, + autoUpdateEnabled: updateState.autoUpdateEnabled, + updateInfo: updateState.updateInfo, + }) setBackendUpdateState({ remindMeLater: updateState.remindMeLater, isUpdateAvailable: updateState.isUpdateAvailable, }) }, [updateState]) - // Don't show if user clicked remind me later or auto update is enabled - if (backendUpdateState.remindMeLater || updateState.autoUpdateEnabled) + // Don't show if user clicked remind me later + if (backendUpdateState.remindMeLater) { + console.log('BackendUpdater: Not showing notification due to:', { + remindMeLater: backendUpdateState.remindMeLater, + }) return null + } return ( <> From e02be47aae4f5ed7b0e9b3efee4ea69dabef1c65 Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Mon, 15 Sep 2025 21:09:08 +0700 Subject: [PATCH 14/32] fix: remove log --- web-app/src/containers/dialogs/BackendUpdater.tsx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/web-app/src/containers/dialogs/BackendUpdater.tsx b/web-app/src/containers/dialogs/BackendUpdater.tsx index c91a37bc4..d59cfd7c9 100644 --- a/web-app/src/containers/dialogs/BackendUpdater.tsx +++ b/web-app/src/containers/dialogs/BackendUpdater.tsx @@ -35,12 +35,6 @@ const BackendUpdater = () => { }) useEffect(() => { - console.log('BackendUpdater state update:', { - remindMeLater: updateState.remindMeLater, - isUpdateAvailable: updateState.isUpdateAvailable, - autoUpdateEnabled: updateState.autoUpdateEnabled, - updateInfo: updateState.updateInfo, - }) setBackendUpdateState({ remindMeLater: updateState.remindMeLater, isUpdateAvailable: updateState.isUpdateAvailable, From 3b22f0b7c0af01b7b765cc12baa92be9b2307b18 Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Mon, 15 Sep 2025 21:48:01 +0700 Subject: [PATCH 15/32] fix: imporove edit message with attachment image --- web-app/src/containers/ThreadContent.tsx | 7 ++- .../containers/dialogs/EditMessageDialog.tsx | 58 ++++++++++++++++--- web-app/src/routes/threads/$threadId.tsx | 51 +++++++++++----- 3 files changed, 93 insertions(+), 23 deletions(-) diff --git a/web-app/src/containers/ThreadContent.tsx b/web-app/src/containers/ThreadContent.tsx index 0316ee764..ede897624 100644 --- a/web-app/src/containers/ThreadContent.tsx +++ b/web-app/src/containers/ThreadContent.tsx @@ -71,7 +71,7 @@ export const ThreadContent = memo( streamTools?: any contextOverflowModal?: React.ReactNode | null - updateMessage?: (item: ThreadMessage, message: string) => void + updateMessage?: (item: ThreadMessage, message: string, imageUrls?: string[]) => void } ) => { const { t } = useTranslation() @@ -276,9 +276,10 @@ export const ThreadContent = memo( item.content?.find((c) => c.type === 'text')?.text?.value || '' } - onSave={(message) => { + imageUrls={item.content?.filter((c) => 
c.type === 'image_url' && c.image_url?.url).map((c) => c.image_url!.url)} + onSave={(message, imageUrls) => { if (item.updateMessage) { - item.updateMessage(item, message) + item.updateMessage(item, message, imageUrls) } }} /> diff --git a/web-app/src/containers/dialogs/EditMessageDialog.tsx b/web-app/src/containers/dialogs/EditMessageDialog.tsx index ade50befe..2df850023 100644 --- a/web-app/src/containers/dialogs/EditMessageDialog.tsx +++ b/web-app/src/containers/dialogs/EditMessageDialog.tsx @@ -11,7 +11,7 @@ import { } from '@/components/ui/dialog' import { Button } from '@/components/ui/button' import { Textarea } from '@/components/ui/textarea' -import { IconPencil } from '@tabler/icons-react' +import { IconPencil, IconX } from '@tabler/icons-react' import { Tooltip, TooltipContent, @@ -20,23 +20,27 @@ import { interface EditMessageDialogProps { message: string - onSave: (message: string) => void + imageUrls?: string[] + onSave: (message: string, imageUrls?: string[]) => void triggerElement?: React.ReactNode } export function EditMessageDialog({ message, + imageUrls, onSave, triggerElement, }: EditMessageDialogProps) { const { t } = useTranslation() const [isOpen, setIsOpen] = useState(false) const [draft, setDraft] = useState(message) + const [keptImages, setKeptImages] = useState(imageUrls || []) const textareaRef = useRef(null) useEffect(() => { setDraft(message) - }, [message]) + setKeptImages(imageUrls || []) + }, [message, imageUrls]) useEffect(() => { if (isOpen && textareaRef.current) { @@ -48,8 +52,15 @@ export function EditMessageDialog({ }, [isOpen]) const handleSave = () => { - if (draft !== message && draft.trim()) { - onSave(draft) + const hasTextChanged = draft !== message && draft.trim() + const hasImageChanged = + JSON.stringify(imageUrls || []) !== JSON.stringify(keptImages) + + if (hasTextChanged || hasImageChanged) { + onSave( + draft.trim() || message, + keptImages.length > 0 ? keptImages : undefined + ) setIsOpen(false) } } @@ -64,7 +75,7 @@ export function EditMessageDialog({ const defaultTrigger = ( -
{t('common:dialogs.editMessage.title')} + {keptImages.length > 0 && ( +
+
+ {keptImages.map((imageUrl, index) => ( +
+ {`Attached +
+ setKeptImages((prev) => + prev.filter((_, i) => i !== index) + ) + } + > + +
+
+ ))} +
+
+ )}
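The attachment strip added at the end of this hunk arrived with most of its markup stripped. The sketch below reconstructs its apparent intent from the surviving fragments (the keptImages.map loop, the `Attached ...` alt text, the setKeptImages filter, and the IconX import added at the top of the file): each kept image renders as a thumbnail with a small remove control that filters it out of keptImages, so handleSave passes only the remaining URLs to onSave. Exact markup, class names, and sizes are assumptions rather than the literal diff content.

    // Sketch only: grounded in the surviving fragments above; the surrounding
    // markup and styling are assumed, not copied from the diff.
    {keptImages.length > 0 && (
      <div className="flex flex-wrap gap-2">
        {keptImages.map((imageUrl, index) => (
          <div key={index} className="relative">
            <img
              src={imageUrl}
              alt={`Attached image ${index + 1}`}
              className="size-16 rounded object-cover"
            />
            <Button
              variant="ghost"
              size="icon"
              className="absolute -top-2 -right-2"
              onClick={() =>
                setKeptImages((prev) => prev.filter((_, i) => i !== index))
              }
            >
              <IconX size={14} />
            </Button>
          </div>
        ))}
      </div>
    )}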