Merge pull request #6516 from menloresearch/release/v0.6.10

This commit is contained in:
Dinh Long Nguyen 2025-09-18 19:15:54 +07:00 committed by GitHub
commit 645548e931
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
46 changed files with 1018 additions and 701 deletions

View File

@ -89,7 +89,6 @@ jobs:
- name: Build app
run: |
rustup target add x86_64-apple-darwin
make build
env:
APP_PATH: '.'

View File

@ -92,31 +92,6 @@ jobs:
run: |
cargo install ctoml
- name: Create bun and uv universal
run: |
mkdir -p ./src-tauri/resources/bin/
cd ./src-tauri/resources/bin/
curl -L -o bun-darwin-x64.zip https://github.com/oven-sh/bun/releases/download/bun-v1.2.10/bun-darwin-x64.zip
curl -L -o bun-darwin-aarch64.zip https://github.com/oven-sh/bun/releases/download/bun-v1.2.10/bun-darwin-aarch64.zip
unzip bun-darwin-x64.zip
unzip bun-darwin-aarch64.zip
lipo -create -output bun-universal-apple-darwin bun-darwin-x64/bun bun-darwin-aarch64/bun
cp -f bun-darwin-aarch64/bun bun-aarch64-apple-darwin
cp -f bun-darwin-x64/bun bun-x86_64-apple-darwin
cp -f bun-universal-apple-darwin bun
curl -L -o uv-x86_64.tar.gz https://github.com/astral-sh/uv/releases/download/0.6.17/uv-x86_64-apple-darwin.tar.gz
curl -L -o uv-arm64.tar.gz https://github.com/astral-sh/uv/releases/download/0.6.17/uv-aarch64-apple-darwin.tar.gz
tar -xzf uv-x86_64.tar.gz
tar -xzf uv-arm64.tar.gz
mv uv-x86_64-apple-darwin uv-x86_64
mv uv-aarch64-apple-darwin uv-aarch64
lipo -create -output uv-universal-apple-darwin uv-x86_64/uv uv-aarch64/uv
cp -f uv-x86_64/uv uv-x86_64-apple-darwin
cp -f uv-aarch64/uv uv-aarch64-apple-darwin
cp -f uv-universal-apple-darwin uv
ls -la
- name: Update app version based on latest release tag with build number
run: |
echo "Version: ${{ inputs.new_version }}"
@ -167,7 +142,6 @@ jobs:
- name: Build app
run: |
rustup target add x86_64-apple-darwin
make build
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -30,6 +30,17 @@ endif
yarn build:core
yarn build:extensions && yarn build:extensions-web
# Install required Rust targets for macOS universal builds
install-rust-targets:
ifeq ($(shell uname -s),Darwin)
@echo "Detected macOS, installing universal build targets..."
rustup target add x86_64-apple-darwin
rustup target add aarch64-apple-darwin
@echo "Rust targets installed successfully!"
else
@echo "Not macOS; skipping Rust target installation."
endif
dev: install-and-build
yarn download:bin
yarn download:lib
@ -70,11 +81,12 @@ test: lint
cargo test --manifest-path src-tauri/utils/Cargo.toml
# Builds and publishes the app
build-and-publish: install-and-build
build-and-publish: install-and-build install-rust-targets
yarn build
# Build
build: install-and-build
build: install-and-build install-rust-targets
yarn download:bin
yarn download:lib
yarn build

View File

@ -126,16 +126,17 @@ export abstract class BaseExtension implements ExtensionType {
settings.forEach((setting) => {
// Keep setting value
if (setting.controllerProps && Array.isArray(oldSettings))
setting.controllerProps.value = oldSettings.find(
(e: any) => e.key === setting.key
)?.controllerProps?.value
setting.controllerProps.value =
oldSettings.find((e: any) => e.key === setting.key)?.controllerProps?.value ??
setting.controllerProps.value
if ('options' in setting.controllerProps)
setting.controllerProps.options = setting.controllerProps.options?.length
? setting.controllerProps.options
: oldSettings.find((e: any) => e.key === setting.key)?.controllerProps?.options
if ('recommended' in setting.controllerProps) {
const oldRecommended = oldSettings.find((e: any) => e.key === setting.key)?.controllerProps?.recommended
if (oldRecommended !== undefined && oldRecommended !== "") {
const oldRecommended = oldSettings.find((e: any) => e.key === setting.key)
?.controllerProps?.recommended
if (oldRecommended !== undefined && oldRecommended !== '') {
setting.controllerProps.recommended = oldRecommended
}
}

View File

@ -36,12 +36,12 @@ import {
import { invoke } from '@tauri-apps/api/core'
import { getProxyConfig } from './util'
import { basename } from '@tauri-apps/api/path'
import {
GgufMetadata,
readGgufMetadata,
} from '@janhq/tauri-plugin-llamacpp-api'
import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api'
import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api'
// Error message constant - matches web-app/src/utils/error.ts
const OUT_OF_CONTEXT_SIZE = 'the request exceeds the available context size.'
type LlamacppConfig = {
version_backend: string
auto_update_engine: boolean
@ -175,7 +175,7 @@ export default class llamacpp_extension extends AIEngine {
provider: string = 'llamacpp'
autoUnload: boolean = true
llamacpp_env: string = ''
memoryMode: string = 'high'
memoryMode: string = ''
readonly providerId: string = 'llamacpp'
private config: LlamacppConfig
@ -207,7 +207,7 @@ export default class llamacpp_extension extends AIEngine {
this.autoUnload = this.config.auto_unload
this.llamacpp_env = this.config.llamacpp_env
this.memoryMode = this.config.memory_util
this.memoryMode = this.config.memory_util || 'high'
// This sets the base directory where model files for this provider are stored.
this.providerPath = await joinPath([
@ -1541,7 +1541,7 @@ export default class llamacpp_extension extends AIEngine {
args.push('--main-gpu', String(cfg.main_gpu))
// Boolean flags
if (!cfg.ctx_shift) args.push('--no-context-shift')
if (cfg.ctx_shift) args.push('--context-shift')
if (Number(version.replace(/^b/, '')) >= 6325) {
if (!cfg.flash_attn) args.push('--flash-attn', 'off') //default: auto = ON when supported
} else {
@ -1739,6 +1739,13 @@ export default class llamacpp_extension extends AIEngine {
try {
const data = JSON.parse(jsonStr)
const chunk = data as chatCompletionChunk
// Check for out-of-context error conditions
if (chunk.choices?.[0]?.finish_reason === 'length') {
// finish_reason 'length' indicates context limit was hit
throw new Error(OUT_OF_CONTEXT_SIZE)
}
yield chunk
} catch (e) {
logger.error('Error parsing JSON from stream or server error:', e)
@ -1817,7 +1824,15 @@ export default class llamacpp_extension extends AIEngine {
)
}
return (await response.json()) as chatCompletion
const completionResponse = (await response.json()) as chatCompletion
// Check for out-of-context error conditions
if (completionResponse.choices?.[0]?.finish_reason === 'length') {
// finish_reason 'length' indicates context limit was hit
throw new Error(OUT_OF_CONTEXT_SIZE)
}
return completionResponse
}
override async delete(modelId: string): Promise<void> {
@ -2018,24 +2033,6 @@ export default class llamacpp_extension extends AIEngine {
totalMemory,
}
}
private async getKVCachePerToken(
meta: Record<string, string>
): Promise<number> {
const arch = meta['general.architecture']
const nLayer = Number(meta[`${arch}.block_count`])
const nHead = Number(meta[`${arch}.attention.head_count`])
// Get head dimensions
const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
const embeddingLen = Number(meta[`${arch}.embedding_length`])
const headDim = embeddingLen / nHead
// KV cache uses head_count_kv (for GQA models) or head_count
// Each token needs K and V, both are fp16 (2 bytes)
const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
return bytesPerToken
}
private async getLayerSize(
path: string,
@ -2082,10 +2079,9 @@ export default class llamacpp_extension extends AIEngine {
gguf.metadata
)
// Fixed KV cache calculation
const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
.perTokenSize
// Debug logging
logger.info(
`Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
)
@ -2101,33 +2097,25 @@ export default class llamacpp_extension extends AIEngine {
throw new Error(`Invalid layer size: ${layerSize}`)
}
// GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
const GPU_OVERHEAD_FACTOR = 0.8
// VRAM budget with overhead consideration
// Reserve memory for OS, other applications, and fixed engine overhead.
const VRAM_RESERVE_GB = 0.5
const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
const usableVRAM = Math.max(
0,
(memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
)
const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.
// Get model's maximum context length
const arch = gguf.metadata['general.architecture']
const modelMaxContextLength =
Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
Number(gguf.metadata[`${arch}.context_length`]) || 8192
// Set minimum context length
const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
const MIN_CONTEXT_LENGTH = 1024
// System RAM budget
// Memory percentages applied to both VRAM and RAM
const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
logger.info(
`Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
)
// Validate memory info
if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
}
@ -2140,208 +2128,166 @@ export default class llamacpp_extension extends AIEngine {
)
}
// Calculate actual system RAM
const actualSystemRAM = Math.max(
// Apply memory mode to both VRAM and RAM separately
const memoryModeMultiplier = memoryPercentages[this.memoryMode]
const usableVRAM = Math.max(
0,
memoryInfo.totalMemory - memoryInfo.totalVRAM
memoryInfo.totalVRAM * memoryModeMultiplier -
VRAM_RESERVE_BYTES -
ENGINE_FIXED_OVERHEAD_BYTES
)
const usableSystemMemory =
actualSystemRAM * memoryPercentages[this.memoryMode]
const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
const usableSystemMemory = actualSystemRAM * memoryModeMultiplier
logger.info(
`Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
`Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
)
// --- Priority 1: Allocate mmproj (if exists) ---
let offloadMmproj = false
let remainingVRAM = usableVRAM
if (mmprojSize > 0) {
if (mmprojSize <= remainingVRAM) {
offloadMmproj = true
remainingVRAM -= mmprojSize
logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
} else {
logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
}
}
// --- Priority 2: Calculate optimal layer/context balance ---
let gpuLayers = 0
let maxContextLength = MIN_CONTEXT_LENGTH
let maxContextLength = 0
let noOffloadKVCache = false
let mode: ModelPlan['mode'] = 'Unsupported'
let offloadMmproj = false
// Calculate how much VRAM we need for different context sizes
const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
const targetContext = requestedCtx || modelMaxContextLength
// Find the best balance of layers and context
let bestConfig = {
layers: 0,
context: MIN_CONTEXT_LENGTH,
vramUsed: 0,
let remainingVRAM = usableVRAM
if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
offloadMmproj = true
remainingVRAM -= mmprojSize
}
const vramForMinContext = (
await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
).size
for (const ctxSize of contextSizes) {
if (ctxSize > targetContext) break
const kvCacheSize = ctxSize * kvCachePerToken
const availableForLayers = remainingVRAM - kvCacheSize
if (availableForLayers <= 0) continue
const possibleLayers = Math.min(
Math.floor(availableForLayers / layerSize),
totalLayers
const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
if (ramForModel + vramForMinContext > usableSystemMemory + usableVRAM) {
logger.error(
`Model unsupported. Not enough resources for model and min context.`
)
if (possibleLayers > 0) {
const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
// Verify this fits with some margin
if (totalVramNeeded <= remainingVRAM * 0.95) {
bestConfig = {
layers: possibleLayers,
context: ctxSize,
vramUsed: totalVramNeeded,
}
}
return {
gpuLayers: 0,
maxContextLength: 0,
noOffloadKVCache: true,
mode: 'Unsupported',
offloadMmproj: false,
}
}
// Apply the best configuration found
if (bestConfig.layers > 0) {
gpuLayers = bestConfig.layers
maxContextLength = bestConfig.context
const targetContext = Math.min(
requestedCtx || modelMaxContextLength,
modelMaxContextLength
)
let targetContextSize = (
await this.estimateKVCache(gguf.metadata, targetContext)
).size
// Use `kvCachePerToken` for all VRAM calculations
if (modelSize + targetContextSize <= remainingVRAM) {
mode = 'GPU'
gpuLayers = totalLayers
maxContextLength = targetContext
noOffloadKVCache = false
mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
logger.info(
'Planning: Ideal case fits. All layers and target context in VRAM.'
)
} else if (modelSize <= remainingVRAM) {
mode = 'GPU'
gpuLayers = totalLayers
noOffloadKVCache = false
const vramLeftForContext = remainingVRAM - modelSize
maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
// Add safety check to prevent OOM
const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
maxContextLength = Math.floor(maxContextLength * safetyBuffer)
logger.info(
`Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
`VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
`Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
)
} else {
// Fallback: Try minimal GPU layers with KV cache on CPU
gpuLayers = Math.min(
Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
totalLayers
)
const vramAvailableForLayers = remainingVRAM - vramForMinContext
if (gpuLayers > 0) {
// Calculate available system RAM for KV cache
const cpuLayers = totalLayers - gpuLayers
const modelCPUSize = cpuLayers * layerSize
const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
const systemRAMUsed = modelCPUSize + mmprojCPUSize
const availableSystemRAMForKVCache = Math.max(
0,
usableSystemMemory - systemRAMUsed
if (vramAvailableForLayers >= layerSize) {
mode = 'Hybrid'
gpuLayers = Math.min(
Math.floor(vramAvailableForLayers / layerSize),
totalLayers
)
noOffloadKVCache = false
const vramUsedByLayers = gpuLayers * layerSize
const vramLeftForContext = remainingVRAM - vramUsedByLayers
maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
// Calculate context that fits in system RAM
const systemRAMContext = Math.min(
Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
targetContext
logger.info(
'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
)
}
}
if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
maxContextLength = systemRAMContext
noOffloadKVCache = true
// Fallback logic: try different configurations if no VRAM-based plan worked
if (mode === 'Unsupported') {
logger.info('Planning: Trying fallback configurations...')
// Try putting some layers on GPU with KV cache in RAM
const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
if (possibleGpuLayers > 0) {
gpuLayers = Math.min(possibleGpuLayers, totalLayers)
const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
const availableRamForKv =
usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
// Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
if (contextInRam >= MIN_CONTEXT_LENGTH) {
mode = 'Hybrid'
logger.info(
`Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
`${maxContextLength} context on CPU RAM`
)
} else {
// Can't fit reasonable context even with CPU RAM
// Reduce GPU layers further
gpuLayers = Math.floor(gpuLayers / 2)
maxContextLength = MIN_CONTEXT_LENGTH
maxContextLength = contextInRam
noOffloadKVCache = true
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
logger.info(
`Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
)
}
} else {
// Pure CPU mode
}
// If still unsupported, try pure CPU mode
if (mode === 'Unsupported') {
gpuLayers = 0
noOffloadKVCache = true
// Calculate context for pure CPU mode
const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
const availableForKVCache = Math.max(
0,
usableSystemMemory - totalCPUMemoryNeeded
)
maxContextLength = Math.min(
Math.max(
MIN_CONTEXT_LENGTH,
Math.floor(availableForKVCache / kvCachePerToken)
),
targetContext
)
mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
}
}
// Safety check: Verify total GPU memory usage
if (gpuLayers > 0 && !noOffloadKVCache) {
const estimatedGPUUsage =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(offloadMmproj ? mmprojSize : 0)
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
logger.warn(
`GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
)
// Reduce context first
while (
maxContextLength > MIN_CONTEXT_LENGTH &&
estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
) {
maxContextLength = Math.floor(maxContextLength / 2)
const newEstimate =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(offloadMmproj ? mmprojSize : 0)
if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
}
// If still too much, reduce layers
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
gpuLayers = Math.floor(gpuLayers * 0.7)
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
noOffloadKVCache = true // Move KV cache to CPU
offloadMmproj = false
const ramUsedByModel = modelSize + mmprojSize
const availableRamForKv = usableSystemMemory - ramUsedByModel
maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
if (maxContextLength >= MIN_CONTEXT_LENGTH) {
mode = 'CPU'
logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)
}
}
}
// Apply user-requested context limit if specified
if (mode === 'CPU' || noOffloadKVCache) {
offloadMmproj = false
}
if (requestedCtx && requestedCtx > 0) {
maxContextLength = Math.min(maxContextLength, requestedCtx)
logger.info(
`User requested context: ${requestedCtx}, final: ${maxContextLength}`
)
}
// Ensure we never exceed model's maximum context
maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
// Final validation
if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
if (maxContextLength < MIN_CONTEXT_LENGTH) {
mode = 'Unsupported'
}
// Ensure maxContextLength is valid
maxContextLength = isNaN(maxContextLength)
? MIN_CONTEXT_LENGTH
: Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
if (mode === 'Unsupported') {
gpuLayers = 0
maxContextLength = 0
}
maxContextLength = isNaN(maxContextLength)
? 0
: Math.floor(maxContextLength)
// Log final plan
const mmprojInfo = mmprojPath
? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
: ''
@ -2360,14 +2306,13 @@ export default class llamacpp_extension extends AIEngine {
offloadMmproj,
}
}
/**
* estimate KVCache size from a given metadata
*/
private async estimateKVCache(
meta: Record<string, string>,
ctx_size?: number
): Promise<number> {
): Promise<{ size: number; perTokenSize: number }> {
const arch = meta['general.architecture']
if (!arch) throw new Error('Invalid metadata: architecture not found')
@ -2403,12 +2348,14 @@ export default class llamacpp_extension extends AIEngine {
)
}
let ctxLen: number
if (!ctx_size) {
ctxLen = Number(meta[`${arch}.context_length`])
} else {
ctxLen = ctx_size
}
const maxCtx = Number(meta[`${arch}.context_length`])
if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
// b) If the user supplied a value, clamp it to the model's max
let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
logger.info(`Final context length used for KV size: ${ctxLen}`)
logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)
logger.info(`ctxLen: ${ctxLen}`)
logger.info(`nLayer: ${nLayer}`)
@ -2421,10 +2368,10 @@ export default class llamacpp_extension extends AIEngine {
// fp16 = 16 bits = 2 bytes per element
const bytesPerElement = 2
// Total KV cache size per token = nHead * headDim * bytesPerElement
const kvPerToken = nHead * headDim * bytesPerElement
// Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
const kvPerToken = nHead * headDim * bytesPerElement * nLayer
return ctxLen * nLayer * kvPerToken
return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
}
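As a rough sanity check on the per-token figure above, here is a small worked sketch with hypothetical model dimensions (none of these numbers come from a real GGUF; the K+V doubling is written out explicitly rather than folded into `headDim` as the code above does):

```ts
// Illustrative numbers only: a hypothetical 32-layer GQA model, not read from a real GGUF.
const nLayer = 32          // <arch>.block_count
const nKvHead = 8          // <arch>.attention.head_count_kv
const headDim = 128        // embedding_length / head_count
const bytesPerElement = 2  // fp16

// K and V are both cached, hence the explicit factor of 2.
const kvPerToken = nKvHead * headDim * 2 * bytesPerElement * nLayer // 131072 bytes ≈ 128 KiB

// A full 8192-token context would then need roughly 1 GiB of KV cache.
const kvForContext = 8192 * kvPerToken // 1073741824 bytes = 1 GiB
console.log({ kvPerToken, kvForContext })
```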
private async getModelSize(path: string): Promise<number> {
@ -2458,9 +2405,9 @@ export default class llamacpp_extension extends AIEngine {
const gguf = await readGgufMetadata(path)
let kvCacheSize: number
if (ctx_size) {
kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
} else {
kvCacheSize = await this.estimateKVCache(gguf.metadata)
kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
}
// Total memory consumption = model weights + kvcache
@ -2470,14 +2417,15 @@ export default class llamacpp_extension extends AIEngine {
)
// Use 80% of total memory as the usable limit
const USABLE_MEMORY_PERCENTAGE = 0.8
const USABLE_MEMORY_PERCENTAGE = 0.9
const usableTotalMemory =
memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
// Check if model fits in total memory at all
if (modelSize > usableTotalMemory) {
return 'RED'
// Check if model fits in total memory at all (this is the hard limit)
if (totalRequired > usableTotalMemory) {
return 'RED' // Truly impossible to run
}
// Check if everything fits in VRAM (ideal case)
@ -2485,14 +2433,11 @@ export default class llamacpp_extension extends AIEngine {
return 'GREEN'
}
// Check if model fits in VRAM but total requirement exceeds VRAM
// OR if total requirement fits in total memory but not in VRAM
if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) {
return 'YELLOW'
}
// If we get here, nothing fits properly
return 'RED'
// If we get here, it means:
// - Total requirement fits in combined memory
// - But doesn't fit entirely in VRAM
// This is the CPU-GPU hybrid scenario
return 'YELLOW'
} catch (e) {
throw new Error(String(e))
}

View File

@ -1,191 +0,0 @@
# Regression test
**Release Version:** v0.6.0
**Operating System:**
---
## A. Installation, Update, and Uninstallation
### 1. Users install app (New user flow)
- [ ] :rocket: Installation package is not corrupted and passes all security checks.
- [ ] :key: App launches successfully after installation.
### 2. Users update app (Existing user flow)
- [ ] :key: Validate that the update does not corrupt user data or settings.
- [ ] :key: App restarts or prompts the user to restart after an update.
- [ ] When updating the app, check if the `/models` directory has any JSON/YML files that change according to the update.
- [ ] Updating the app also updates extensions correctly, test functionality changes.
### 3. Users uninstall / close app
- [ ] :key: After closing the app, all models are unloaded.
- [ ] :key::warning: Uninstallation process removes the app successfully from the system.
- [ ] Clean the data folder and open the app to check if it creates all the necessary folders, especially models and extensions.
## B. Overview
### 1. Shortcut key
- [ ] :key: Test each shortcut key to confirm it works as described (My models, navigating, opening, closing, etc.).
### 2. Users check the `active model`
- [ ] :key: The app correctly displays the state of the loading model (e.g., loading, ready, error).
- [ ] :key: Confirm that the app allows users to switch between models if multiple are available.
- [ ] Check that the app provides feedback or instructions if the model fails to load.
- [ ] Verify the troubleshooting assistant correctly captures hardware / log info [#1784](https://github.com/menloresearch/jan/issues/1784)
## C. Thread
### 1. Users can chat with Jan, the default assistant
- [ ] :key: Sending a message enables users to receive responses from model.
- [ ] :key: Conversation thread is maintained without any loss of data upon sending multiple messages.
- [ ] Users should be able to edit msg and the assistant will re-generate the answer based on the edited version of the message.
- [ ] Test for the ability to send different types of messages (e.g., text, emojis, code blocks).
- [ ] Check the output format of the AI (code blocks, JSON, markdown, ...).
- [ ] :key: Validate the scroll functionality in the chat window for lengthy conversations.
- [ ] User can copy / delete the response.
- [ ] :key: Check the `clear message` / `delete entire chat` button works.
- [ ] Deleting all the chat retains the model instruction and settings.
- [ ] :key: Appropriate error handling and messaging if the assistant fails to respond.
- [ ] Test assistant's ability to maintain context over multiple exchanges.
- [ ] :key: Check the `create new chat` button, and new conversation will have an automatically generated thread title based on users msg.
- [ ] Changing `models` mid-thread the app can still handle it.
- [ ] Check the `regenerate` button renews the response (single / multiple times).
- [ ] Check the `Instructions` update correctly after the user updates it midway (mid-thread).
### 2. Users can customize chat settings like model parameters via both the GUI & model.yml
- [ ] Adjust model parameters (e.g., Temperature, Top K, Top P) from the GUI and verify they are reflected in the chat behavior.
- [ ] :key: Changes can be saved and persisted between sessions.
- [ ] Users can access and modify the model.yml file.
- [ ] :key: Changes made in model.yml are correctly applied to the chat session upon reload or restart.
- [ ] Check the maximum and minimum limits of the adjustable parameters and how they affect the assistant's responses.
- [ ] :key: Users switch between threads with different models, the app can handle it.
### 3. Model dropdown
- :key: Model list should highlight recommended based on user RAM (this is not really correct, I think it's based on static formula)
- [ ] Model size should display (for both installed and imported models)
### 4. Users can click on a history thread
- [ ] Chat window displays the entire conversation from the selected history thread without any missing messages.
- [ ] Historical threads reflect the exact state of the chat at that time, including settings.
- [ ] :key: Ability to delete or clean old threads.
- [ ] Changing the title of the thread updates correctly.
### 5. Users can config instructions for the assistant.
- [ ] Instructions set by the user are being followed by the assistant in subsequent conversations.
- [ ] :key: Changes to instructions are updated in real time and do not require a restart of the application or session.
- [ ] :key: Ability to reset instructions to default or clear them completely.
- [ ] :key: RAG - Users can import documents and the system should process queries about the uploaded file, providing accurate and appropriate responses in the conversation thread.
- [ ] :key: Jan can see - Users can import image and Model with vision can generate responses (e.g. LLaVa model). [#294](https://github.com/menloresearch/jan/issues/294)
## D. Hub
### 1. Users can discover recommended models
- :key: Each model's recommendations are consistent with the users activity and preferences.
- [ ] Search models and verify results / action on the results
### 2. Users can download models suitable for their devices, e.g. compatible with their RAM
- [ ] Model list should be in order: Featured > Remote > Local
- [ ] :key: Ensure that models are labeled with RAM requirements.
- [ ] :key: Check the download model functionality and validate if the cancel download feature works correctly.
### 3. Users can download models via a HuggingFace URL [#1740](https://github.com/menloresearch/jan/issues/1740)
- [ ] :key: Import via Hugging Face Id / full HuggingFace URL, check the progress bar reflects the download process
- [ ] :key: Test deeplink import [#2876](https://github.com/menloresearch/jan/issues/2876)
- [ ] :key: Users can use / remove the imported model.
### 4. Users can import new models to the Hub
- [ ] :key: Ensure import successfully via drag / drop or upload GGUF.
- [ ] :key: Verify Move model binary file / Keep Original Files & Symlink option are working
- [ ] Users can add more info to the imported model / edit name
- [ ] :key: Ensure the new model updates after restarting the app.
### 5. Users can use the model as they want
- [ ] :key: Check `start` / `stop` / `delete` button response exactly what it does.
- [ ] Check if starting another model stops the other model entirely.
- [ ] :rocket: Navigate to `hub` > Click `Use` button to use model. Expect to jump to thread and see the model in dropdown model selector.
- [ ] :key: Check when deleting a model it will delete all the files on the user's computer.
- [ ] :warning:The recommended tags should present right for the user's hardware.
### 6. Users can Integrate With a Remote Server
- [ ] :key: Import openAI GPT model https://jan.ai/guides/using-models/integrate-with-remote-server/ and the model displayed in Hub / Thread dropdown
- [ ] Users can use the remote model properly (openAI GPT, Groq)
## E. System Monitor
### 1. Users can see disk and RAM utilization
- [ ] :key: Verify that the RAM and VRAM utilization graphs accurately reported in real time.
- [ ] :key: Validate that the utilization percentages reflect the actual usage compared to the system's total available resources.
- [ ] :key: Ensure that the system monitors updates dynamically as the models run and stop.
### 2. Users can start and stop models based on system health
- [ ] :key: Verify the `Start/Stop` action for a model, the system resource usage reflects this change.
- [ ] Confirm that any changes in model status (start/stop) are logged or reported to the user for transparency.
- [ ] :key: Check the functionality of `App log` to ensure it opens the correct folder in the system file explorer.
## F. Settings
### 1. Appearance
- [ ] :key: Test the `Light`, `Dark`, and `System` theme settings to ensure they are functioning as expected.
- [ ] Confirm that the application saves the theme preference and persists it across sessions.
- [ ] Validate that all elements of the UI are compatible with the theme changes and maintain legibility and contrast.
### 2. Extensions [TBU]
- Validate the `Install Extensions` process by selecting and installing a plugin file.
- [ ] Enable / disable extensions and the UI should reflect the change accordingly
### 3. Extension group
- [ ] :key: Users can set valid Endpoint and API Key to use remote models
- [ ] Monitoring extension should allow users to enable / disable log and set log Cleaning Interval
### 4. Advanced settings
- [ ] :key: Test the `Experimental Mode` toggle to confirm it enables or disables experimental features as intended.
- [ ] :key: Check the functionality of `Open App Directory` to ensure it opens the correct folder in the system file explorer.
- [ ] Users can move **Jan data folder**
- [ ] Validate that changes in advanced settings are applied immediately or provide appropriate instructions if a restart is needed.
- [ ] Attempt to test downloading a model from the hub using **HTTP Proxy** [guideline](https://github.com/menloresearch/jan/pull/1562)
- [ ] Logs that are older than 7 days or exceed 1MB in size will be automatically cleared upon starting the application.
- [ ] Users can click on Reset button to **factory reset** app settings to its original state & delete all usage data.
- [ ] Keep the current app data location
- [ ] Reset the current app data location
- [ ] Users can enable the setting and chat using quick ask.
### 5. Engine
- [ ] :key: TensorRT Engine - Users able to chat with the model
- [ ] :key: Onnx Engine - Users able to chat with the model
- [ ] :key: Other remote Engine - Users able to chat with the model
## G. Local API server
### 1. Local Server Usage with Server Options
- [ ] :key: Explore API Reference: Swagger API for sending/receiving requests
- [ ] Use default server option
- [ ] Configure and use custom server options
- [ ] Test starting/stopping the local API server with different Model/Model settings
- [ ] Server logs captured with correct Server Options provided
- [ ] Verify functionality of Open logs/Clear feature
- [ ] Ensure that threads and other functions impacting the model are disabled while the local server is running

View File

@ -48,6 +48,7 @@ impl LlamacppError {
let lower_stderr = stderr.to_lowercase();
// TODO: add others
let is_out_of_memory = lower_stderr.contains("out of memory")
|| lower_stderr.contains("failed to allocate")
|| lower_stderr.contains("insufficient memory")
|| lower_stderr.contains("erroroutofdevicememory") // vulkan specific
|| lower_stderr.contains("kiogpucommandbuffercallbackerroroutofmemory") // Metal-specific error code

View File

@ -4,7 +4,11 @@
"resources": ["resources/pre-install/**/*"],
"externalBin": ["resources/bin/bun", "resources/bin/uv"],
"windows": {
"signCommand": "powershell -ExecutionPolicy Bypass -File ./sign.ps1 %1"
"signCommand": "powershell -ExecutionPolicy Bypass -File ./sign.ps1 %1",
"webviewInstallMode": {
"silent": true,
"type": "downloadBootstrapper"
}
}
}
}

275
tests/checklist.md Normal file
View File

@ -0,0 +1,275 @@
# I. Before release
## A. Initial update / migration Data check
Before testing, set up the following in the old version to make sure that the data is properly migrated:
- [ ] Change the appearance / theme to something that is obviously different from the default set-up
- [ ] Ensure there are a few chat threads
- [ ] Ensure there are a few favourite / starred threads
- [ ] Ensure there are 2 models downloaded
- [ ] Ensure there are 2 imported models on the local provider (llama.cpp)
- [ ] Modify MCP servers list and add some ENV value to MCP servers
- [ ] Modify Local API Server
- [ ] HTTPS proxy config value
- [ ] Add 2 custom assistants to Jan
- [ ] Create a new chat with the custom assistant
- [ ] Change the `App Data` to some other folder
- [ ] Create a Custom Provider
- [ ] Disable some model providers
- [ ] [NEW] Change the llama.cpp settings of 2 models
#### Validate that the update does not corrupt existing user data or settings (before and after the update show the same information):
- [ ] Threads
- [ ] Previously used models and assistants are shown correctly
- [ ] Can resume chat in threads with the previous context
- [ ] Assistants
- Settings:
- [ ] Appearance
- [ ] MCP Servers
- [ ] Local API Server
- [ ] HTTPS Proxy
- [ ] Custom Provider Set-up
#### In `Hub`:
- [ ] Can see models from HF listed properly
- [ ] Downloaded models will show `Use` instead of `Download`
- [ ] Toggling on `Downloaded` in the right corner shows the correct list of downloaded models
#### In `Settings -> General`:
- [ ] Ensure the `App Data` path is the same
- [ ] Click Open Logs, and the App Log will show
#### In `Settings -> Model Providers`:
- [ ] Llama.cpp still lists downloaded models and the user can chat with the models
- [ ] Llama.cpp still lists imported models and the user can chat with the models
- [ ] Remote providers still retain previously set-up API keys and the user can chat with models from the provider without having to re-enter API keys
- [ ] Enabled and disabled Model Providers stay the same as before the update
#### In `Settings -> Extensions`, check that following exists:
- [ ] Conversational
- [ ] Jan Assistant
- [ ] Download Manager
- [ ] llama.cpp Inference Engine
## B. `Settings`
#### In `General`:
- [ ] Ensure `Community` links work and point to the correct website
- [ ] Ensure the `Check for Updates` function detects the correct latest version
- [ ] [ENG] Create a folder with non-standard characters in its name (e.g. Chinese characters) => change the `App data` location to that folder => test that the model is still able to load and run properly.
#### In `Appearance`:
- [ ] Toggle between different `Theme` options to check that they change accordingly and that all elements of the UI are legible with the right contrast:
- [ ] Light
- [ ] Dark
- [ ] System (should follow your OS system settings)
- [ ] Change the following values => close the application => re-open the application => ensure that the change is persisted across sessions:
- [ ] Theme
- [ ] Font Size
- [ ] Window Background
- [ ] App Main View
- [ ] Primary
- [ ] Accent
- [ ] Destructive
- [ ] Chat Width
- [ ] Ensure that when this value is changed, there is no broken UI caused by it
- [ ] Code Block
- [ ] Show Line Numbers
- [ENG] Ensure that clicking `Reset` in the `Appearance` section resets it back to the default values
- [ENG] Ensure that clicking `Reset` in the `Code Block` section resets it back to the default values
#### In `Model Providers`:
In `Llama.cpp`:
- [ ] After downloading a model from the hub, the model is listed with the correct name under `Models`
- [ ] Can import a `gguf` model with no error
- [ ] Imported models are listed with the correct name under `Models`
- [ ] Check that clicking `delete` removes the model from the list
- [ ] Deleted models don't appear in the selectable models section of the chat input (even in old threads that previously used the model)
- [ ] Ensure that the user can re-import deleted imported models
- [ ] Enable `Auto-Unload Old Models`, and ensure that only one model can run / start at a time. If two models are running when it is enabled, both of them will be stopped.
- [ ] Disable `Auto-Unload Old Models`, and ensure that multiple models can run at the same time.
- [ ] Enable `Context Shift` and ensure that the context can run for a long time without encountering a memory error. Use the `banana test`: turn on the fetch MCP => ask a local model to fetch and summarize the history of the banana (bananas have a very long history on the wiki, it turns out). It should run out of context memory fairly quickly if `Context Shift` is not enabled.
- [ ] [0.6.8] Ensure that the user can change the Jinja chat template of an individual model and it doesn't affect the templates of other models
- [ ] [0.6.8] Ensure that there is a recommended `llama.cpp` backend for each system and that it works out of the box for users.
- [ ] [0.6.8] Ensure we can override Tensor Buffer Type in the model settings to offload layers between GPU and CPU => Download any MoE model (e.g., gpt-oss-20b) => Set the tensor buffer type as `blk\\.([0-30]*[02468])\\.ffn_.*_exps\\.=CPU` => check that those tensors are on CPU and run inference (you can check whether app.log contains `--override-tensor", "blk\\\\.([0-30]*[02468])\\\\.ffn_.*_exps\\\\.=CPU`); see the sketch after this list
- [ ] [0.6.9] Take a `gguf` file and delete the `.gguf` extension from the file name, import it into Jan and verify that it works.
- [ ] [0.6.10] Can import VLM models and chat with images
- [ ] [0.6.10] Importing a model in the mmproj field should show a validation error
- [ ] [0.6.10] Importing an mmproj from a different model should not allow chatting with the model
- [ ] [0.6.10] Change to an older version of the llama.cpp backend. Click on `Check for Llamacpp Updates`; it should alert that there is a new version.
- [ ] [0.6.10] Try `Install backend from file` for a backend and it should show up as a backend option
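For reference, a minimal sketch of how such an override could be passed through to llama-server, following the same `args.push` pattern used elsewhere in the extension; the `override_tensor` field name is an assumption for illustration, and only the `--override-tensor` flag itself is confirmed by the app.log snippet above:

```ts
// Hypothetical sketch: forward a tensor-buffer-type override to llama-server.
// `cfg.override_tensor` is an assumed setting key, not the real one.
if (cfg.override_tensor) {
  // e.g. 'blk\\.([0-30]*[02468])\\.ffn_.*_exps\\.=CPU' keeps matching expert tensors on CPU
  args.push('--override-tensor', cfg.override_tensor)
}
```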
In Remote Model Providers:
- [ ] Check that the following providers are present:
- [ ] OpenAI
- [ ] Anthropic
- [ ] Cohere
- [ ] OpenRouter
- [ ] Mistral
- [ ] Groq
- [ ] Gemini
- [ ] Hugging Face
- [ ] Models should appear as available in the selectable dropdown in the chat input once some value is entered in the API key field (it could be the wrong API key)
- [ ] Once a valid API key is used, the user can select a model from that provider and chat without any error.
- [ ] Delete a model and ensure that it doesn't show up in the `Models` list view or in the selectable dropdown in the chat input.
- [ ] Ensure that a deleted model is also not selectable and does not appear in old threads that used it.
- [ ] Manually adding a new model works and the user can chat with the newly added model without error (you can add back the model you just deleted for testing)
- [ ] [0.6.9] Make sure that Ollama set up as a custom provider works with Jan
In Custom Providers:
- [ ] Ensure that the user can create a new custom provider with the right baseURL and API key.
- [ ] Clicking `Refresh` should retrieve a list of available models from the Custom Provider.
- [ ] User can chat with the custom provider
- [ ] Ensure that Custom Providers can be deleted and won't reappear in a new session
In general:
- [ ] Disabled model providers should not show up as selectable in the chat input of new and old threads alike (old threads' chat input should show `Select Model` instead of the disabled model)
#### In `Shortcuts`:
Make sure the following shortcut key combos are visible and work:
- [ ] New chat
- [ ] Toggle Sidebar
- [ ] Zoom In
- [ ] Zoom Out
- [ ] Send Message
- [ ] New Line
- [ ] Navigation
#### In `Hardware`:
Ensure that the following information shows up in the Hardware section:
- [ ] Operating System
- [ ] CPU
- [ ] Memory
- [ ] GPU (If the machine has one)
- [ ] Enable and disable GPUs and ensure that models still run correctly in both modes
- [ ] Enabling or disabling the GPU should not affect the UI of the application
#### In `MCP Servers`:
- [ ] Ensure that a user can create an MCP server successfully when entering the correct information
- [ ] Ensure that `Env` values are masked by `*` in the quick view.
- [ ] If an `Env` value is missing, there should be an error pop-up.
- [ ] Ensure that a deleted MCP server disappears from the `MCP Server` list without any error
- [ ] Ensure that before an MCP server is deleted, it disables itself first and won't appear in the tool list after deletion.
- [ ] Ensure that when the content of an MCP server is edited, it is updated and reflected accordingly in the UI and when running it.
- [ ] Toggling an MCP server between enabled and disabled works properly
- [ ] A disabled MCP should not appear in the available tool list in the chat input
- [ ] A disabled MCP should not be callable even when the model is force-prompted (ensure there is no ghost MCP server)
- [ ] Ensure that enabled MCP servers start automatically upon starting the application
- [ ] An enabled MCP should show functions in the available tool list
- [ ] User can use a model and call different tool from multiple enabled MCP servers in the same thread
- [ ] If `Allow All MCP Tool Permissions` is disabled, in every new thread, before a tool is called, a confirmation dialog should pop up to confirm the action.
- [ ] When the user clicks `Deny`, the tool call will not be executed and a message indicating so is returned in the tool call result.
- [ ] When the user clicks `Allow Once` on the pop-up, a confirmation dialog will appear again the next time the tool is called.
- [ ] When the user clicks `Always Allow` on the pop-up, the tool will retain permission and won't ask for confirmation again. (this applies at the individual tool level, not at the MCP server level)
- [ ] If `Allow All MCP Tool Permissions` is enabled, in every new thread, there should not be any confirmation dialog pop-up when a tool is called.
- [ ] When the pop-up appears, make sure that the `Tool Parameters` are also shown in detail in the pop-up
- [ ] [0.6.9] Go to Enter JSON configuration when creating a new MCP => paste the JSON config inside => click `Save` => the server works
- [ ] [0.6.9] If an individual JSON config is malformed, the MCP server should not be activated
- [ ] [0.6.9] Make sure that an MCP server can be used with streamable-http transport => connect to Smithery and test an MCP server
#### In `Local API Server`:
- [ ] User can `Start Server` and chat with the default endpoint
- [ ] User should see the correct model name at `v1/models`
- [ ] User should be able to chat with it at `v1/chat/completions`
- [ ] `Open Logs` shows the correct query log sent to the server and returned from the server
- [ ] Make sure that changing all the parameters in `Server Configuration` is reflected when clicking `Start Server`
- [ ] [0.6.9] With the startup configuration, the last used model also starts automatically (the user does not have to manually start a model before starting the server)
- [ ] [0.6.9] Make sure that you can send an image to the Local API Server and it also works (you can set up the Local API Server as a Custom Provider in Jan to test)
- [ ] [0.6.10] Make sure you are still able to see the API key while the local server status is running
#### In `HTTPS Proxy`:
- [ ] Model download requests go through the proxy endpoint
## C. Hub
- [ ] User can click `Download` to download a model
- [ ] User can cancel a model in the middle of downloading
- [ ] User can add a Hugging Face model's details to the list by pasting a model name / model URL into the search bar and pressing Enter
- [ ] Clicking on a listing will open up the model card information within Jan and render the HTML properly
- [ ] Clicking download works in the `Show variants` section
- [ ] Clicking download works inside the Model card HTML
- [ ] [0.6.9] Check that the model recommendation based on user hardware works as expected in the Model Hub
- [ ] [0.6.10] Check that models with the same name but different authors can be found in the Hub catalog (test with [https://huggingface.co/unsloth/Qwen3-4B-Thinking-2507-GGUF](https://huggingface.co/unsloth/Qwen3-4B-Thinking-2507-GGUF))
## D. Threads
#### In the left bar:
- [ ] User can delete an old thread, and it won't reappear even after an app restart
- [ ] Changing the title of a thread should update its last modification date and re-organise its position in the correct chronological order on the left bar.
- [ ] The title of a new thread is the first message from the user.
- [ ] Users can star / un-star threads accordingly
- [ ] Starred threads should move to the `Favourite` section and other threads should stay in `Recent`
- [ ] Ensure that the thread search feature returns accurate results based on thread titles and contents (including from both `Favourite` and `Recent`)
- [ ] `Delete All` should delete only threads in the `Recents` section
- [ ] `Unstar All` should un-star all of the `Favourites` threads and return them to `Recent`
#### In a thread:
- [ ] When `New Chat` is clicked, the assistant is set to the last selected assistant, the selected model is set to the last used model, and the user can immediately chat with the model.
- [ ] User can conduct a multi-turn conversation in a single thread without loss of data (given that `Context Shift` is not enabled)
- [ ] User can change to a different model in the middle of a conversation in a thread and the model works.
- [ ] User can click the `Regenerate` button on a returned message from the model to get a new response based on the previous context.
- [ ] User can change `Assistant` in the middle of a conversation in a thread and the new assistant's settings will be applied instead.
- [ ] The chat window can render and show all the content of a selected thread (including scrolling up and down on long threads)
- [ ] Old threads retain their settings as of the last update / usage
- [ ] Assistant option
- [ ] Model option (except if the model / model provider has been deleted or disabled)
- [ ] User can send messages with different types of text content (e.g. text, emoji, ...)
- [ ] When requesting the model to generate a markdown table, the table is correctly formatted as returned from the model.
- [ ] When the model generates code, ensure that the code snippets are properly formatted according to the `Appearance -> Code Block` setting.
- [ ] Users can edit their old messages and regenerate the answer based on the new message
- [ ] User can click `Copy` to copy the model response
- [ ] User can click `Delete` to delete either the user message or the model response.
- [ ] The token speed appears while a response from the model is being generated and the final value is shown under the response.
- [ ] Make sure that when a user uses an IME keyboard to type Chinese or Japanese characters and presses `Enter`, the `Send` button doesn't trigger automatically after each word.
- [ ] [0.6.9] Attach an image to the chat input and see if you can chat with it using a remote model
- [ ] [0.6.9] Attach an image to the chat input and see if you can chat with it using a local model
- [ ] [0.6.9] Check that you can paste an image into the text box from your system clipboard (Copy - Paste)
- [ ] [0.6.9] Make sure that a user can favourite a model in the llama.cpp list and see the favourite model selection in the chat input
- [ ] [0.6.10] User can click the model's settings in chat, enable Auto-Optimize Settings, and continue chatting with the model without interruption.
- [ ] Verify this works with at least two models of different sizes (e.g., 1B and 7B).
- [ ] [0.6.10] User can paste (e.g. Ctrl + V) text into the chat input when using a vision model
- [ ] [0.6.10] When clicking copy on a code block from a model generation, only one code block is copied at a time instead of multiple code blocks at once
## E. Assistants
- [ ] There is always at least one default Assistant which is Jan
- [ ] The default Jan assistant has `stream = True` by default
- [ ] User can create / edit a new assistant with different parameter and instruction choices.
- [ ] When the user deletes the default Assistant, the next Assistant in line becomes the default Assistant and its settings are applied to new chats accordingly.
- [ ] User can create / edit an assistant from within a chat window (at the top left)
## F. After checking everything else
In `Settings -> General`:
- [ ] Change the location of the `App Data` to some other path that is not the default path
- [ ] Click on `Reset` button in `Other` to factory reset the app:
- [ ] All threads deleted
- [ ] All Assistant deleted except for default Jan Assistant
- [ ] `App Data` location is reset back to default path
- [ ] Appearance reset
- [ ] Model Providers information all reset
- [ ] Llama.cpp setting reset
- [ ] API keys cleared
- [ ] All Custom Providers deleted
- [ ] MCP Servers reset
- [ ] Local API Server reset
- [ ] HTTPS Proxy reset
- [ ] After closing the app, all models are unloaded properly
- [ ] Locate the data folder using the `App Data` path information => delete the folder => reopen the app to check that all the folders are re-created with all the necessary data.
- [ ] Ensure that the uninstallation process removes the app successfully from the system.
## G. New App Installation
- [ ] Clean up by deleting all the leftover folders created by Jan
- [ ] On MacOS
- [ ] `~/Library/Application Support/Jan`
- [ ] `~/Library/Caches/jan.ai.app`
- [ ] On Windows
- [ ] `C:\Users\<Username>\AppData\Roaming\Jan\`
- [ ] `C:\Users\<Username>\AppData\Local\jan.ai.app`
- [ ] On Linux
- [ ] `~/.cache/Jan`
- [ ] `~/.cache/jan.ai.app`
- [ ] `~/.local/share/Jan`
- [ ] `~/.local/share/jan.ai.app`
- [ ] Ensure that the fresh install of Jan launches
- [ ] Do some basic checks to see that all functions still behave as expected. To be extra careful, you can go through the whole list again. However, it is more advisable to just check that core functionality like `Thread` and `Model Providers` works as intended.
# II. After release
- [ ] Check that the App Updater works and the user can update to the latest release without any problem
- [ ] App restarts after the user finishes an update
- [ ] Repeat section `A. Initial update / migration Data check` above to verify that the update is done correctly on the live version

View File

@ -1,3 +1,4 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { useEffect, useState, useRef, useMemo, useCallback } from 'react'
import {
Popover,
@ -121,17 +122,20 @@ const DropdownModelProvider = ({
// Add 'vision' capability if not already present AND if user hasn't manually configured capabilities
// Check if model has a custom capabilities config flag
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const hasUserConfiguredCapabilities = (model as any)._userConfiguredCapabilities === true
if (!capabilities.includes('vision') && !hasUserConfiguredCapabilities) {
const hasUserConfiguredCapabilities =
(model as any)._userConfiguredCapabilities === true
if (
!capabilities.includes('vision') &&
!hasUserConfiguredCapabilities
) {
const updatedModels = [...provider.models]
updatedModels[modelIndex] = {
...model,
capabilities: [...capabilities, 'vision'],
// Mark this as auto-detected, not user-configured
_autoDetectedVision: true,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} as any
updateProvider('llamacpp', { models: updatedModels })
@ -385,6 +389,11 @@ const DropdownModelProvider = ({
const handleSelect = useCallback(
async (searchableModel: SearchableModel) => {
// Immediately update display to prevent double-click issues
setDisplayModel(searchableModel.model.id)
setSearchValue('')
setOpen(false)
selectModelProvider(
searchableModel.provider.provider,
searchableModel.model.id
@ -394,19 +403,6 @@ const DropdownModelProvider = ({
provider: searchableModel.provider.provider,
})
// Check mmproj existence for llamacpp models
if (searchableModel.provider.provider === 'llamacpp') {
await serviceHub
.models()
.checkMmprojExistsAndUpdateOffloadMMprojSetting(
searchableModel.model.id,
updateProvider,
getProviderByName
)
// Also check vision capability
await checkAndUpdateModelVisionCapability(searchableModel.model.id)
}
// Store the selected model as last used
if (useLastUsedModel) {
setLastUsedModel(
@ -414,8 +410,35 @@ const DropdownModelProvider = ({
searchableModel.model.id
)
}
setSearchValue('')
setOpen(false)
// Check mmproj existence for llamacpp models (async, don't block UI)
if (searchableModel.provider.provider === 'llamacpp') {
serviceHub
.models()
.checkMmprojExistsAndUpdateOffloadMMprojSetting(
searchableModel.model.id,
updateProvider,
getProviderByName
)
.catch((error) => {
console.debug(
'Error checking mmproj for model:',
searchableModel.model.id,
error
)
})
// Also check vision capability (async, don't block UI)
checkAndUpdateModelVisionCapability(searchableModel.model.id).catch(
(error) => {
console.debug(
'Error checking vision capability for model:',
searchableModel.model.id,
error
)
}
)
}
},
[
selectModelProvider,

View File

@ -71,7 +71,7 @@ export const ThreadContent = memo(
streamTools?: any
contextOverflowModal?: React.ReactNode | null
updateMessage?: (item: ThreadMessage, message: string) => void
updateMessage?: (item: ThreadMessage, message: string, imageUrls?: string[]) => void
}
) => {
const { t } = useTranslation()
@ -276,9 +276,10 @@ export const ThreadContent = memo(
item.content?.find((c) => c.type === 'text')?.text?.value ||
''
}
onSave={(message) => {
imageUrls={item.content?.filter((c) => c.type === 'image_url' && c.image_url?.url).map((c) => c.image_url!.url).filter((url): url is string => url !== undefined) || []}
onSave={(message, imageUrls) => {
if (item.updateMessage) {
item.updateMessage(item, message)
item.updateMessage(item, message, imageUrls)
}
}}
/>

View File

@ -236,7 +236,11 @@ export default function AddEditAssistant({
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent>
<DialogContent
onInteractOutside={(e) => {
e.preventDefault()
}}
>
<DialogHeader>
<DialogTitle>
{editingKey

View File

@ -285,10 +285,39 @@ export default function AddEditMCPServer({
setError(t('mcp-servers:editJson.errorFormat'))
return
}
// For each server in the JSON, call onSave
Object.entries(parsedData).forEach(([serverName, config]) => {
onSave(serverName.trim(), config as MCPServerConfig)
})
// Check if this looks like a server config object instead of the expected format
if (parsedData.command || parsedData.url) {
setError(t('mcp-servers:editJson.errorMissingServerNameKey'))
return
}
// For each server in the JSON, validate serverName and config
for (const [serverName, config] of Object.entries(parsedData)) {
const trimmedServerName = serverName.trim()
if (!trimmedServerName) {
setError(t('mcp-servers:editJson.errorServerName'))
return
}
// Validate the config object
const serverConfig = config as MCPServerConfig
// Validate type field if present
if (
serverConfig.type &&
!['stdio', 'http', 'sse'].includes(serverConfig.type)
) {
setError(
t('mcp-servers:editJson.errorInvalidType', {
serverName: trimmedServerName,
type: serverConfig.type,
})
)
return
}
onSave(trimmedServerName, serverConfig as MCPServerConfig)
}
onOpenChange(false)
resetForm()
setError(null)
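To make the accepted paste format concrete, here is a minimal sketch of a configuration object that would pass the checks above (server names as top-level keys); only `command`, `url`, and `type` are confirmed by this validation, while the server names, `args`, and `env` fields are illustrative assumptions:

```ts
// Hypothetical example of the accepted shape: top-level keys are server names.
const exampleConfig = {
  'my-stdio-server': {
    type: 'stdio',            // optional; must be 'stdio', 'http', or 'sse' when present
    command: 'npx',           // assumed content for illustration
    args: ['-y', 'some-mcp-server'],
    env: { API_KEY: 'xxx' },
  },
  'my-http-server': {
    type: 'http',
    url: 'https://example.com/mcp',
  },
}
// A bare { command: ... } or { url: ... } object pasted at the top level is
// rejected above because the server-name wrapper key is missing.
```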
@ -342,7 +371,12 @@ export default function AddEditMCPServer({
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent showCloseButton={false}>
<DialogContent
showCloseButton={false}
onInteractOutside={(e) => {
e.preventDefault()
}}
>
<DialogHeader>
<DialogTitle className="flex items-center justify-between">
<span>

View File

@ -102,4 +102,4 @@ export function AddProviderDialog({
</DialogContent>
</Dialog>
)
}
}

View File

@ -41,9 +41,13 @@ const BackendUpdater = () => {
})
}, [updateState])
// Don't show if user clicked remind me later or auto update is enabled
if (backendUpdateState.remindMeLater || updateState.autoUpdateEnabled)
// Don't show if user clicked remind me later
if (backendUpdateState.remindMeLater) {
console.log('BackendUpdater: Not showing notification due to:', {
remindMeLater: backendUpdateState.remindMeLater,
})
return null
}
return (
<>

View File

@ -41,9 +41,19 @@ export function DeleteMessageDialog({ onDelete }: DeleteMessageDialogProps) {
const trigger = (
<Tooltip>
<TooltipTrigger asChild>
<button className="flex items-center gap-1 hover:text-accent transition-colors cursor-pointer group relative">
<div
className="flex items-center gap-1 hover:text-accent transition-colors cursor-pointer group relative"
role="button"
tabIndex={0}
onKeyDown={(e) => {
if (e.key === 'Enter' || e.key === ' ') {
e.preventDefault()
setIsOpen(true)
}
}}
>
<IconTrash size={16} />
</button>
</div>
</TooltipTrigger>
<TooltipContent>
<p>{t('delete')}</p>

View File

@ -61,7 +61,11 @@ export default function EditJsonMCPserver({
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent>
<DialogContent
onInteractOutside={(e) => {
e.preventDefault()
}}
>
<DialogHeader>
<DialogTitle>
{serverName

View File

@ -11,7 +11,7 @@ import {
} from '@/components/ui/dialog'
import { Button } from '@/components/ui/button'
import { Textarea } from '@/components/ui/textarea'
import { IconPencil } from '@tabler/icons-react'
import { IconPencil, IconX } from '@tabler/icons-react'
import {
Tooltip,
TooltipContent,
@ -20,23 +20,27 @@ import {
interface EditMessageDialogProps {
message: string
onSave: (message: string) => void
imageUrls?: string[]
onSave: (message: string, imageUrls?: string[]) => void
triggerElement?: React.ReactNode
}
export function EditMessageDialog({
message,
imageUrls,
onSave,
triggerElement,
}: EditMessageDialogProps) {
const { t } = useTranslation()
const [isOpen, setIsOpen] = useState(false)
const [draft, setDraft] = useState(message)
const [keptImages, setKeptImages] = useState<string[]>(imageUrls || [])
const textareaRef = useRef<HTMLTextAreaElement>(null)
useEffect(() => {
setDraft(message)
}, [message])
setKeptImages(imageUrls || [])
}, [message, imageUrls])
useEffect(() => {
if (isOpen && textareaRef.current) {
@ -48,8 +52,15 @@ export function EditMessageDialog({
}, [isOpen])
const handleSave = () => {
if (draft !== message && draft.trim()) {
onSave(draft)
const hasTextChanged = draft !== message && draft.trim()
const hasImageChanged =
JSON.stringify(imageUrls || []) !== JSON.stringify(keptImages)
if (hasTextChanged || hasImageChanged) {
onSave(
draft.trim() || message,
keptImages.length > 0 ? keptImages : undefined
)
setIsOpen(false)
}
}
@ -64,9 +75,19 @@ export function EditMessageDialog({
const defaultTrigger = (
<Tooltip>
<TooltipTrigger asChild>
<button className="flex outline-0 items-center gap-1 hover:text-accent transition-colors cursor-pointer group relative">
<div
className="flex outline-0 items-center gap-1 hover:text-accent transition-colors cursor-pointer group relative"
role="button"
tabIndex={0}
onKeyDown={(e) => {
if (e.key === 'Enter' || e.key === ' ') {
e.preventDefault()
setIsOpen(true)
}
}}
>
<IconPencil size={16} />
</button>
</div>
</TooltipTrigger>
<TooltipContent>
<p>{t('edit')}</p>
@ -80,6 +101,34 @@ export function EditMessageDialog({
<DialogContent>
<DialogHeader>
<DialogTitle>{t('common:dialogs.editMessage.title')}</DialogTitle>
{keptImages.length > 0 && (
<div className="mt-2 space-y-2">
<div className="flex gap-3 flex-wrap">
{keptImages.map((imageUrl, index) => (
<div
key={index}
className="relative border border-main-view-fg/5 rounded-lg size-14"
>
<img
className="object-cover w-full h-full rounded-lg"
src={imageUrl}
alt={`Attached image ${index + 1}`}
/>
<div
className="absolute -top-1 -right-2.5 bg-destructive size-5 flex rounded-full items-center justify-center cursor-pointer"
onClick={() =>
setKeptImages((prev) =>
prev.filter((_, i) => i !== index)
)
}
>
<IconX className="text-destructive-fg" size={16} />
</div>
</div>
))}
</div>
</div>
)}
<Textarea
ref={textareaRef}
value={draft}
@ -96,7 +145,12 @@ export function EditMessageDialog({
</Button>
</DialogClose>
<Button
disabled={draft === message || !draft.trim()}
disabled={
  (draft === message || !draft.trim()) &&
  JSON.stringify(imageUrls || []) ===
    JSON.stringify(keptImages)
}
onClick={handleSave}
size="sm"
className="w-full sm:w-auto"

View File

@ -61,9 +61,17 @@ export default function ErrorDialog() {
<div className="bg-main-view-fg/2 p-2 border border-main-view-fg/5 rounded-lg space-y-2">
<div>
<button
<div
onClick={() => setIsDetailExpanded(!isDetailExpanded)}
className="flex items-center gap-1 text-sm text-main-view-fg/60 hover:text-main-view-fg/80 transition-colors cursor-pointer"
role="button"
tabIndex={0}
onKeyDown={(e) => {
if (e.key === 'Enter' || e.key === ' ') {
e.preventDefault()
setIsDetailExpanded(!isDetailExpanded)
}
}}
>
{isDetailExpanded ? (
<ChevronDown className="size-3" />
@ -71,7 +79,7 @@ export default function ErrorDialog() {
<ChevronRight className="size-3" />
)}
Details
</button>
</div>
{isDetailExpanded && (
<div

View File

@ -10,7 +10,7 @@ import {
import { Button } from '@/components/ui/button'
import { Switch } from '@/components/ui/switch'
import { useServiceHub } from '@/hooks/useServiceHub'
import { useState } from 'react'
import { useState, useEffect, useCallback } from 'react'
import { toast } from 'sonner'
import {
IconLoader2,
@ -44,129 +44,142 @@ export const ImportVisionModelDialog = ({
>(null)
const [isValidatingMmproj, setIsValidatingMmproj] = useState(false)
const validateGgufFile = async (
filePath: string,
fileType: 'model' | 'mmproj'
): Promise<void> => {
if (fileType === 'model') {
setIsValidating(true)
setValidationError(null)
} else {
setIsValidatingMmproj(true)
setMmprojValidationError(null)
}
try {
console.log(`Reading GGUF metadata for ${fileType}:`, filePath)
// Handle validation differently for model files vs mmproj files
const validateGgufFile = useCallback(
async (filePath: string, fileType: 'model' | 'mmproj'): Promise<void> => {
if (fileType === 'model') {
// For model files, use the standard validateGgufFile method
if (typeof serviceHub.models().validateGgufFile === 'function') {
const result = await serviceHub.models().validateGgufFile(filePath)
setIsValidating(true)
setValidationError(null)
} else {
setIsValidatingMmproj(true)
setMmprojValidationError(null)
}
if (result.metadata) {
// Log full metadata for debugging
console.log(
`Full GGUF metadata for ${fileType}:`,
JSON.stringify(result.metadata, null, 2)
)
try {
// Handle validation differently for model files vs mmproj files
if (fileType === 'model') {
// For model files, use the standard validateGgufFile method
if (typeof serviceHub.models().validateGgufFile === 'function') {
const result = await serviceHub.models().validateGgufFile(filePath)
// Check architecture from metadata
const architecture =
result.metadata.metadata?.['general.architecture']
console.log(`${fileType} architecture:`, architecture)
if (result.metadata) {
// Check architecture from metadata
const architecture =
result.metadata.metadata?.['general.architecture']
// Model files should NOT be clip
if (architecture === 'clip') {
const errorMessage =
'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.'
setValidationError(errorMessage)
console.error(
'CLIP architecture detected in model file:',
architecture
)
} else {
console.log(
'Model validation passed. Architecture:',
architecture
)
// Extract baseName and use it as model name if available
const baseName = result.metadata.metadata?.['general.basename']
if (baseName) {
setModelName(baseName)
}
// Model files should NOT be clip
if (architecture === 'clip') {
const errorMessage =
'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.'
setValidationError(errorMessage)
console.error(
'CLIP architecture detected in model file:',
architecture
)
}
}
if (!result.isValid) {
setValidationError(result.error || 'Model validation failed')
console.error('Model validation failed:', result.error)
}
}
} else {
// For mmproj files, we need to manually validate since validateGgufFile rejects CLIP models
try {
// Import the readGgufMetadata function directly from Tauri
const { invoke } = await import('@tauri-apps/api/core')
if (!result.isValid) {
setValidationError(result.error || 'Model validation failed')
console.error('Model validation failed:', result.error)
const metadata = await invoke(
'plugin:llamacpp|read_gguf_metadata',
{
path: filePath,
}
)
// Check if architecture matches expected type
const architecture = (
metadata as { metadata?: Record<string, string> }
).metadata?.['general.architecture']
// Get general.baseName from metadata
const baseName = (metadata as { metadata?: Record<string, string> })
.metadata?.['general.basename']
// MMProj files MUST be clip
if (architecture !== 'clip') {
const errorMessage = `This MMProj file has "${architecture}" architecture but should have "clip" architecture. MMProj files must be CLIP models for vision processing.`
setMmprojValidationError(errorMessage)
console.error(
'Non-CLIP architecture detected in mmproj file:',
architecture
)
} else if (
baseName &&
modelName &&
!modelName.toLowerCase().includes(baseName.toLowerCase()) &&
!baseName.toLowerCase().includes(modelName.toLowerCase())
) {
// Validate that baseName and model name are compatible (one should contain the other)
const errorMessage = `MMProj file baseName "${baseName}" does not match model name "${modelName}". The MMProj file should be compatible with the selected model.`
setMmprojValidationError(errorMessage)
console.error('BaseName mismatch in mmproj file:', {
baseName,
modelName,
})
}
} catch (directError) {
console.error(
'Failed to validate mmproj file directly:',
directError
)
const errorMessage = `Failed to read MMProj metadata: ${
directError instanceof Error
? directError.message
: 'Unknown error'
}`
setMmprojValidationError(errorMessage)
}
}
} else {
// For mmproj files, we need to manually validate since validateGgufFile rejects CLIP models
try {
// Import the readGgufMetadata function directly from Tauri
const { invoke } = await import('@tauri-apps/api/core')
} catch (error) {
console.error(`Failed to validate ${fileType} file:`, error)
const errorMessage = `Failed to read ${fileType} metadata: ${error instanceof Error ? error.message : 'Unknown error'}`
const metadata = await invoke('plugin:llamacpp|read_gguf_metadata', {
path: filePath,
})
console.log(
`Full GGUF metadata for ${fileType}:`,
JSON.stringify(metadata, null, 2)
)
// Check if architecture matches expected type
const architecture = (
metadata as { metadata?: Record<string, string> }
).metadata?.['general.architecture']
console.log(`${fileType} architecture:`, architecture)
// MMProj files MUST be clip
if (architecture !== 'clip') {
const errorMessage = `This MMProj file has "${architecture}" architecture but should have "clip" architecture. MMProj files must be CLIP models for vision processing.`
setMmprojValidationError(errorMessage)
console.error(
'Non-CLIP architecture detected in mmproj file:',
architecture
)
} else {
console.log(
'MMProj validation passed. Architecture:',
architecture
)
}
} catch (directError) {
console.error('Failed to validate mmproj file directly:', directError)
const errorMessage = `Failed to read MMProj metadata: ${
directError instanceof Error ? directError.message : 'Unknown error'
}`
if (fileType === 'model') {
setValidationError(errorMessage)
} else {
setMmprojValidationError(errorMessage)
}
} finally {
if (fileType === 'model') {
setIsValidating(false)
} else {
setIsValidatingMmproj(false)
}
}
} catch (error) {
console.error(`Failed to validate ${fileType} file:`, error)
const errorMessage = `Failed to read ${fileType} metadata: ${error instanceof Error ? error.message : 'Unknown error'}`
},
[modelName, serviceHub]
)
if (fileType === 'model') {
setValidationError(errorMessage)
} else {
setMmprojValidationError(errorMessage)
}
} finally {
if (fileType === 'model') {
setIsValidating(false)
} else {
setIsValidatingMmproj(false)
}
}
}
const validateModelFile = useCallback(
async (filePath: string): Promise<void> => {
await validateGgufFile(filePath, 'model')
},
[validateGgufFile]
)
const validateModelFile = async (filePath: string): Promise<void> => {
await validateGgufFile(filePath, 'model')
}
const validateMmprojFile = async (filePath: string): Promise<void> => {
await validateGgufFile(filePath, 'mmproj')
}
const validateMmprojFile = useCallback(
async (filePath: string): Promise<void> => {
await validateGgufFile(filePath, 'mmproj')
},
[validateGgufFile]
)
const handleFileSelect = async (type: 'model' | 'mmproj') => {
const selectedFile = await serviceHub.dialog().open({
@ -179,14 +192,14 @@ export const ImportVisionModelDialog = ({
if (type === 'model') {
setModelFile(selectedFile)
// Auto-generate model name from GGUF file
// Set temporary model name from filename (will be overridden by baseName from metadata if available)
const sanitizedName = fileName
.replace(/\s/g, '-')
.replace(/\.(gguf|GGUF)$/, '')
.replace(/[^a-zA-Z0-9/_.-]/g, '') // Remove any characters not allowed in model IDs
setModelName(sanitizedName)
// Validate the selected model file
// Validate the selected model file (this will update model name with baseName from metadata)
await validateModelFile(selectedFile)
} else {
setMmProjFile(selectedFile)
@ -272,6 +285,13 @@ export const ImportVisionModelDialog = ({
setIsValidatingMmproj(false)
}
// Re-validate MMProj file when model name changes
useEffect(() => {
if (mmProjFile && modelName && isVisionModel) {
validateMmprojFile(mmProjFile)
}
}, [modelName, mmProjFile, isVisionModel, validateMmprojFile])
const handleOpenChange = (newOpen: boolean) => {
if (!importing) {
setOpen(newOpen)
@ -284,7 +304,11 @@ export const ImportVisionModelDialog = ({
return (
<Dialog open={open} onOpenChange={handleOpenChange}>
<DialogTrigger asChild>{trigger}</DialogTrigger>
<DialogContent>
<DialogContent
onInteractOutside={(e) => {
e.preventDefault()
}}
>
<DialogHeader>
<DialogTitle className="flex items-center gap-2">
Import Model

View File

@ -131,9 +131,17 @@ export default function LoadModelErrorDialog() {
{hasErrorDetail(modelLoadError) && (
<div>
<button
<div
onClick={() => setIsDetailExpanded(!isDetailExpanded)}
className="flex items-center gap-1 text-sm text-main-view-fg/60 hover:text-main-view-fg/80 transition-colors cursor-pointer"
role="button"
tabIndex={0}
onKeyDown={(e) => {
if (e.key === 'Enter' || e.key === ' ') {
e.preventDefault()
setIsDetailExpanded(!isDetailExpanded)
}
}}
>
{isDetailExpanded ? (
<ChevronDown className="size-3" />
@ -141,7 +149,7 @@ export default function LoadModelErrorDialog() {
<ChevronRight className="size-3" />
)}
Details
</button>
</div>
{isDetailExpanded && (
<div

View File

@ -32,9 +32,19 @@ export function MessageMetadataDialog({
const defaultTrigger = (
<Tooltip>
<TooltipTrigger asChild>
<button className="outline-0 focus:outline-0 flex items-center gap-1 hover:text-accent transition-colors cursor-pointer group relative">
<div
className="outline-0 focus:outline-0 flex items-center gap-1 hover:text-accent transition-colors cursor-pointer group relative"
role="button"
tabIndex={0}
onKeyDown={(e) => {
if (e.key === 'Enter' || e.key === ' ') {
e.preventDefault()
setIsOpen(true)
}
}}
>
<IconInfoCircle size={16} />
</button>
</div>
</TooltipTrigger>
<TooltipContent>
<p>{t('metadata')}</p>

View File

@ -52,7 +52,10 @@ export default function ToolApproval() {
<DialogTitle>{t('tools:toolApproval.title')}</DialogTitle>
<DialogDescription className="mt-1 text-main-view-fg/70">
{t('tools:toolApproval.description')}{' '}
<span className="font-semibold">{toolName}</span>
<span className="font-semibold">{toolName}</span>.&nbsp;
<span className="text-sm">
{t('tools:toolApproval.permissionScope')}
</span>
</DialogDescription>
</div>
</div>
@ -85,7 +88,7 @@ export default function ToolApproval() {
>
{t('tools:toolApproval.deny')}
</Button>
<div className="flex flex-col sm:flex-row sm:gap-2 sm:items-center">
<div className="flex flex-col sm:flex-row gap-2 items-center">
<Button
variant="link"
onClick={handleAllowOnce}
@ -93,7 +96,12 @@ export default function ToolApproval() {
>
{t('tools:toolApproval.allowOnce')}
</Button>
<Button variant="default" onClick={handleAllow} autoFocus>
<Button
variant="default"
onClick={handleAllow}
autoFocus
className="capitalize"
>
{t('tools:toolApproval.alwaysAllow')}
</Button>
</div>

View File

@ -26,6 +26,9 @@
"errorParse": "Fehler beim Parsen der initialen Daten",
"errorPaste": "Ungültiges JSON Format in dem eingefügten Inhalt",
"errorFormat": "Ungültiges JSON Format",
"errorServerName": "Servername ist erforderlich und darf nicht leer sein",
"errorMissingServerNameKey": "JSON muss als {\"serverName\": {config}} strukturiert sein - fehlender Servername-Schlüssel",
"errorInvalidType": "Ungültiger Typ '{{type}}' für Server '{{serverName}}'. Typ muss 'stdio', 'http' oder 'sse' sein",
"save": "Speichern"
},
"checkParams": "Bitte überprüfe die Parameter gemäß dem Tutorial.",
@ -34,7 +37,7 @@
"editAllJson": "JSON aller Server bearbeiten",
"findMore": "Finde mehr MCP Server bei",
"allowPermissions": "Erlaube allen MCP Werkzeugen den Zugriff",
"allowPermissionsDesc": "Wenn aktiviert, werden alle MCP-Werkzeug-Aufrufe automatisch genehmigt, ohne dass Berechtigungsdialoge angezeigt werden.",
"allowPermissionsDesc": "Wenn aktiviert, werden alle MCP-Werkzeug-Aufrufe automatisch genehmigt, ohne dass Berechtigungsdialoge angezeigt werden. Diese Einstellung gilt global für alle Gespräche, einschließlich neuer Chats.",
"noServers": "Keine MCP Server gefunden",
"args": "Argumente",
"env": "Umgebung",

View File

@ -4,7 +4,7 @@
"securityNotice": "<strong>Sicherheitshinweis:</strong> Schädliche Werkzeuge oder Konversationsinhalte könnten den Assistenten möglicherweise zu schädlichen Aktionen verleiten. Überprüfe jeden Werkzeug-Aufruf sorgfältig, bevor Du ihn genehmigst.",
"deny": "Verweigern",
"allowOnce": "Einmal erlauben",
"alwaysAllow": "Immer erlauben",
"alwaysAllow": "Im Thread erlauben",
"permissions": "Berechtigungen",
"approve": "Genehmigen",
"reject": "Ablehnen",

View File

@ -5,7 +5,8 @@
"securityNotice": "Dieses Werkzeug möchte eine Aktion ausführen. Bitte überprüfen und genehmigen oder Ablehnen.",
"deny": "Ablehnen",
"allowOnce": "Einmal erlauben",
"alwaysAllow": "Immer erlauben",
"parameters": "Werkzeug-Parameter"
"alwaysAllow": "Im Thread erlauben",
"parameters": "Werkzeug-Parameter",
"permissionScope": "Erteilte Berechtigungen gelten nur für dieses Gespräch."
}
}

View File

@ -26,6 +26,9 @@
"errorParse": "Failed to parse initial data",
"errorPaste": "Invalid JSON format in pasted content",
"errorFormat": "Invalid JSON format",
"errorServerName": "Server name is required and cannot be empty",
"errorMissingServerNameKey": "JSON must be structured as {\"serverName\": {config}} - missing server name key",
"errorInvalidType": "Invalid type '{{type}}' for server '{{serverName}}'. Type must be 'stdio', 'http', or 'sse'",
"save": "Save"
},
"checkParams": "Please check the parameters according to the tutorial.",
@ -34,7 +37,7 @@
"editAllJson": "Edit All Servers JSON",
"findMore": "Find more MCP servers at",
"allowPermissions": "Allow All MCP Tool Permissions",
"allowPermissionsDesc": "When enabled, all MCP tool calls will be automatically approved without showing permission dialogs.",
"allowPermissionsDesc": "When enabled, all MCP tool calls will be automatically approved without showing permission dialogs. This setting applies globally to all conversations, including new chats.",
"noServers": "No MCP servers found",
"args": "Args",
"env": "Env",

View File

@ -4,9 +4,9 @@
"securityNotice": "<strong>Security Notice:</strong> Malicious tools or conversation content could potentially trick the assistant into attempting harmful actions. Review each tool call carefully before approving.",
"deny": "Deny",
"allowOnce": "Allow Once",
"alwaysAllow": "Always Allow",
"alwaysAllow": "Allow in thread",
"permissions": "Permissions",
"approve": "Approve",
"reject": "Reject",
"parameters": "Tool Parameters"
}
}

View File

@ -5,7 +5,8 @@
"securityNotice": "Malicious tools or conversation content could potentially trick the assistant into attempting harmful actions. Review each tool call carefully before approving.",
"deny": "Deny",
"allowOnce": "Allow Once",
"alwaysAllow": "Always Allow",
"parameters": "Tool Parameters"
"alwaysAllow": "Allow in thread",
"parameters": "Tool Parameters",
"permissionScope": "Permissions granted apply only to this conversation."
}
}

View File

@ -26,6 +26,9 @@
"errorParse": "Gagal mengurai data awal",
"errorPaste": "Format JSON tidak valid pada konten yang ditempel",
"errorFormat": "Format JSON tidak valid",
"errorServerName": "Nama server wajib diisi dan tidak boleh kosong",
"errorMissingServerNameKey": "JSON harus berstruktur sebagai {\"serverName\": {config}} - kunci nama server hilang",
"errorInvalidType": "Tipe '{{type}}' untuk server '{{serverName}}' tidak valid. Tipe harus 'stdio', 'http', atau 'sse'",
"save": "Simpan"
},
"checkParams": "Silakan periksa parameter sesuai dengan tutorial.",
@ -34,7 +37,7 @@
"editAllJson": "Edit Semua JSON Server",
"findMore": "Temukan lebih banyak server MCP di",
"allowPermissions": "Izinkan Semua Izin Alat MCP",
"allowPermissionsDesc": "Jika diaktifkan, semua panggilan alat MCP akan disetujui secara otomatis tanpa menampilkan dialog izin.",
"allowPermissionsDesc": "Jika diaktifkan, semua panggilan alat MCP akan disetujui secara otomatis tanpa menampilkan dialog izin. Pengaturan ini berlaku secara global untuk semua percakapan, termasuk chat baru.",
"noServers": "Tidak ada server MCP yang ditemukan",
"args": "Argumen",
"env": "Lingkungan",

View File

@ -4,7 +4,7 @@
"securityNotice": "<strong>Pemberitahuan Keamanan:</strong> Alat berbahaya atau konten percakapan dapat menipu asisten untuk mencoba melakukan tindakan yang merugikan. Tinjau setiap permintaan penggunaan alat dengan cermat sebelum menyetujui.",
"deny": "Tolak",
"allowOnce": "Izinkan Sekali",
"alwaysAllow": "Selalu Izinkan",
"alwaysAllow": "Izinkan di thread",
"permissions": "Izin",
"approve": "Setujui",
"reject": "Tolak",

View File

@ -4,8 +4,9 @@
"securityNotice": "Alat ini ingin melakukan suatu tindakan. Harap tinjau dan setujui.",
"deny": "Tolak",
"allowOnce": "Izinkan Sekali",
"alwaysAllow": "Selalu Izinkan",
"alwaysAllow": "Izinkan di thread",
"description": "Asisten ingin menggunakan <strong>{{toolName}}</strong>",
"parameters": "Parameter Alat"
"parameters": "Parameter Alat",
"permissionScope": "Izin yang diberikan hanya berlaku untuk percakapan ini."
}
}

View File

@ -26,6 +26,9 @@
"errorParse": "Błąd parsowania wstępnych danych",
"errorPaste": "Wprowadzono JSON o niepoprawnym formacie",
"errorFormat": "Niepoprawny format JSON",
"errorServerName": "Nazwa serwera jest wymagana i nie może być pusta",
"errorMissingServerNameKey": "JSON musi być w formacie {\"serverName\": {config}} - brakuje klucza nazwy serwera",
"errorInvalidType": "Niepoprawny typ '{{type}}' dla serwera '{{serverName}}'. Typ musi być 'stdio', 'http' lub 'sse'",
"save": "Zapisz"
},
"checkParams": "Proszę sprawdzić parametry we wprowadzeniu.",
@ -34,7 +37,7 @@
"editAllJson": "Edytuj JSON Wszystkich Serwerów",
"findMore": "Znajdź więcej serwerów MCP na",
"allowPermissions": "Pozwalaj na Wszystkie Użycia Narzędzi MCP",
"allowPermissionsDesc": "Po włączeniu będzie automatycznie zezwalać na wszystkie użycia narzędzi MCP bez wyświetlania próśb o zgodę.",
"allowPermissionsDesc": "Po włączeniu będzie automatycznie zezwalać na wszystkie użycia narzędzi MCP bez wyświetlania próśb o zgodę. To ustawienie obowiązuje globalnie dla wszystkich rozmów, w tym nowych czatów.",
"noServers": "Nie znaleziono serwerów MCP",
"args": "Argumenty",
"env": "Zmienne Środowiskowe",

View File

@ -4,7 +4,7 @@
"securityNotice": "<strong>Ostrzeżenie Bezpieczeństwa:</strong> Złośliwe narzędzia lub treści rozmowy mają potencjał nakłonić agenta do szkodliwych działań. Dokładnie przejrzyj każdą prośbę o użycie narzędzia przed wyrażeniem zgody.",
"deny": "Odmów",
"allowOnce": "Pozwól Raz",
"alwaysAllow": "Zawsze Pozwalaj",
"alwaysAllow": "Pozwól w wątku",
"permissions": "Pozwolenia",
"approve": "Zaakceptuj",
"reject": "Odrzuć",

View File

@ -5,7 +5,8 @@
"securityNotice": "Złośliwe narzędzia lub treści rozmowy mają potencjał nakłonić agenta do szkodliwych działań. Dokładnie przejrzyj każdą prośbę o użycie narzędzia przed wyrażeniem zgody.",
"deny": "Odmów",
"allowOnce": "Pozwól Raz",
"alwaysAllow": "Zawsze Pozwalaj",
"parameters": "Parametry Narzędzia"
"alwaysAllow": "Pozwól w wątku",
"parameters": "Parametry Narzędzia",
"permissionScope": "Udzielone pozwolenia dotyczą tylko tej rozmowy."
}
}

View File

@ -26,6 +26,9 @@
"errorParse": "Không thể phân tích cú pháp dữ liệu ban đầu",
"errorPaste": "Định dạng JSON không hợp lệ trong nội dung đã dán",
"errorFormat": "Định dạng JSON không hợp lệ",
"errorServerName": "Tên máy chủ là bắt buộc và không được để trống",
"errorMissingServerNameKey": "JSON phải có cấu trúc {\"serverName\": {config}} - thiếu khóa tên máy chủ",
"errorInvalidType": "Loại '{{type}}' cho máy chủ '{{serverName}}' không hợp lệ. Loại phải là 'stdio', 'http' hoặc 'sse'",
"save": "Lưu"
},
"checkParams": "Vui lòng kiểm tra các tham số theo hướng dẫn.",

View File

@ -4,7 +4,7 @@
"securityNotice": "<strong>Thông báo bảo mật:</strong> Các công cụ độc hại hoặc nội dung cuộc trò chuyện có khả năng lừa trợ lý thực hiện các hành động có hại. Hãy xem xét kỹ từng lệnh gọi công cụ trước khi phê duyệt.",
"deny": "Từ chối",
"allowOnce": "Cho phép một lần",
"alwaysAllow": "Luôn cho phép",
"alwaysAllow": "Cho phép trong chuỗi",
"permissions": "Quyền",
"approve": "Phê duyệt",
"reject": "Từ chối",

View File

@ -4,8 +4,9 @@
"securityNotice": "Công cụ này muốn thực hiện một hành động. Vui lòng xem xét và phê duyệt.",
"deny": "Từ chối",
"allowOnce": "Cho phép một lần",
"alwaysAllow": "Luôn cho phép",
"alwaysAllow": "Cho phép trong chuỗi",
"description": "Trợ lý muốn sử dụng <strong>{{toolName}}</strong>",
"parameters": "Tham số công cụ"
"parameters": "Tham số công cụ",
"permissionScope": "Quyền được cấp chỉ áp dụng cho cuộc trò chuyện này."
}
}

View File

@ -26,6 +26,9 @@
"errorParse": "解析初始数据失败",
"errorPaste": "粘贴内容中的 JSON 格式无效",
"errorFormat": "JSON 格式无效",
"errorServerName": "服务器名称为必填项,不能为空",
"errorMissingServerNameKey": "JSON 必须按 {\"serverName\": {config}} 格式结构化 - 缺少服务器名称键",
"errorInvalidType": "服务器 '{{serverName}}' 的类型 '{{type}}' 无效。类型必须是 'stdio'、'http' 或 'sse'",
"save": "保存"
},
"checkParams": "请根据教程检查参数。",
@ -34,7 +37,7 @@
"editAllJson": "编辑所有服务器的 JSON",
"findMore": "在以下位置查找更多 MCP 服务器",
"allowPermissions": "允许所有 MCP 工具权限",
"allowPermissionsDesc": "启用后,所有 MCP 工具调用都将自动批准,而不会显示权限对话框。",
"allowPermissionsDesc": "启用后,所有 MCP 工具调用都将自动批准,而不会显示权限对话框。此设置全局适用于所有对话,包括新聊天。",
"noServers": "未找到 MCP 服务器",
"args": "参数",
"env": "环境",

View File

@ -4,7 +4,7 @@
"securityNotice": "<strong>安全警告:</strong>恶意的工具或对话内容可能会诱使助手尝试有害操作。在批准之前,请仔细审查每个工具调用。",
"deny": "拒绝",
"allowOnce": "允许一次",
"alwaysAllow": "始终允许",
"alwaysAllow": "在线程中允许",
"permissions": "权限",
"approve": "批准",
"reject": "拒绝",

View File

@ -4,8 +4,9 @@
"securityNotice": "此工具想要执行一个操作。请审查并批准。",
"deny": "拒绝",
"allowOnce": "允许一次",
"alwaysAllow": "始终允许",
"alwaysAllow": "在线程中允许",
"description": "助手想要使用 <strong>{{toolName}}</strong>",
"parameters": "工具参数"
"parameters": "工具参数",
"permissionScope": "授予的权限仅适用于此对话。"
}
}

View File

@ -26,6 +26,9 @@
"errorParse": "解析初始資料失敗",
"errorPaste": "貼上內容的 JSON 格式無效",
"errorFormat": "JSON 格式無效",
"errorServerName": "伺服器名稱為必填項目,不能為空",
"errorMissingServerNameKey": "JSON 必須依照 {\"serverName\": {config}} 結構 - 缺少伺服器名稱鍵值",
"errorInvalidType": "伺服器 '{{serverName}}' 的類型 '{{type}}' 無效。類型必須是 'stdio'、'http' 或 'sse'",
"save": "儲存"
},
"checkParams": "請根據教學檢查參數。",
@ -34,7 +37,7 @@
"editAllJson": "編輯所有伺服器的 JSON",
"findMore": "在以下位置尋找更多 MCP 伺服器",
"allowPermissions": "允許所有 MCP 工具權限",
"allowPermissionsDesc": "啟用後,所有 MCP 工具呼叫將自動核准,而不會顯示權限對話方塊。",
"allowPermissionsDesc": "啟用後,所有 MCP 工具呼叫將自動核准,而不會顯示權限對話方塊。此設定全域適用於所有對話,包括新聊天。",
"noServers": "找不到 MCP 伺服器",
"args": "參數",
"env": "環境",

View File

@ -4,7 +4,7 @@
"securityNotice": "<strong>安全性通知:</strong>惡意的工具或對話內容可能會誘騙助理嘗試有害的操作。在核准之前,請仔細檢閱每個工具呼叫。",
"deny": "拒絕",
"allowOnce": "允許一次",
"alwaysAllow": "一律允許",
"alwaysAllow": "在討論串中允許",
"permissions": "權限",
"approve": "核准",
"reject": "拒絕",

View File

@ -4,8 +4,9 @@
"securityNotice": "此工具想要執行一個動作。請檢閱並核准。",
"deny": "拒絕",
"allowOnce": "允許一次",
"alwaysAllow": "一律允許",
"alwaysAllow": "在討論串中允許",
"description": "助理想要使用 <strong>{{toolName}}</strong>",
"parameters": "工具參數"
"parameters": "工具參數",
"permissionScope": "授予的權限僅適用於此對話。"
}
}

View File

@ -83,6 +83,7 @@ function ProviderDetail() {
const [refreshingModels, setRefreshingModels] = useState(false)
const [isCheckingBackendUpdate, setIsCheckingBackendUpdate] = useState(false)
const [isInstallingBackend, setIsInstallingBackend] = useState(false)
const [importingModel, setImportingModel] = useState<string | null>(null)
const { checkForUpdate: checkForBackendUpdate, installBackend } =
useBackendUpdater()
const { providerName } = useParams({ from: Route.id })
@ -102,58 +103,66 @@ function ProviderDetail() {
)
const handleModelImportSuccess = async (importedModelName?: string) => {
// Refresh the provider to update the models list
await serviceHub.providers().getProviders().then(setProviders)
if (importedModelName) {
setImportingModel(importedModelName)
}
// If a model was imported and it might have vision capabilities, check and update
if (importedModelName && providerName === 'llamacpp') {
try {
const mmprojExists = await serviceHub
.models()
.checkMmprojExists(importedModelName)
if (mmprojExists) {
// Get the updated provider after refresh
const { getProviderByName, updateProvider: updateProviderState } =
useModelProvider.getState()
const llamacppProvider = getProviderByName('llamacpp')
try {
// Refresh the provider to update the models list
await serviceHub.providers().getProviders().then(setProviders)
if (llamacppProvider) {
const modelIndex = llamacppProvider.models.findIndex(
(m: Model) => m.id === importedModelName
)
if (modelIndex !== -1) {
const model = llamacppProvider.models[modelIndex]
const capabilities = model.capabilities || []
// If a model was imported and it might have vision capabilities, check and update
if (importedModelName && providerName === 'llamacpp') {
try {
const mmprojExists = await serviceHub
.models()
.checkMmprojExists(importedModelName)
if (mmprojExists) {
// Get the updated provider after refresh
const { getProviderByName, updateProvider: updateProviderState } =
useModelProvider.getState()
const llamacppProvider = getProviderByName('llamacpp')
// Add 'vision' capability if not already present AND if user hasn't manually configured capabilities
// Check if model has a custom capabilities config flag
if (llamacppProvider) {
const modelIndex = llamacppProvider.models.findIndex(
(m: Model) => m.id === importedModelName
)
if (modelIndex !== -1) {
const model = llamacppProvider.models[modelIndex]
const capabilities = model.capabilities || []
const hasUserConfiguredCapabilities =
(model as any)._userConfiguredCapabilities === true
// Add 'vision' capability if not already present AND if user hasn't manually configured capabilities
// Check if model has a custom capabilities config flag
if (
!capabilities.includes('vision') &&
!hasUserConfiguredCapabilities
) {
const updatedModels = [...llamacppProvider.models]
updatedModels[modelIndex] = {
...model,
capabilities: [...capabilities, 'vision'],
// Mark this as auto-detected, not user-configured
_autoDetectedVision: true,
} as any
const hasUserConfiguredCapabilities =
(model as any)._userConfiguredCapabilities === true
updateProviderState('llamacpp', { models: updatedModels })
console.log(
`Vision capability added to model after provider refresh: ${importedModelName}`
)
if (
!capabilities.includes('vision') &&
!hasUserConfiguredCapabilities
) {
const updatedModels = [...llamacppProvider.models]
updatedModels[modelIndex] = {
...model,
capabilities: [...capabilities, 'vision'],
// Mark this as auto-detected, not user-configured
_autoDetectedVision: true,
} as any
updateProviderState('llamacpp', { models: updatedModels })
console.log(
`Vision capability added to model after provider refresh: ${importedModelName}`
)
}
}
}
}
} catch (error) {
console.error('Error checking mmproj existence after import:', error)
}
} catch (error) {
console.error('Error checking mmproj existence after import:', error)
}
} finally {
// The importing state will be cleared by the useEffect when model appears in list
}
}
@ -175,6 +184,29 @@ function ProviderDetail() {
return () => clearInterval(intervalId)
}, [serviceHub, setActiveModels])
// Clear importing state when model appears in the provider's model list
useEffect(() => {
if (importingModel && provider?.models) {
const modelExists = provider.models.some(
(model) => model.id === importingModel
)
if (modelExists) {
setImportingModel(null)
}
}
}, [importingModel, provider?.models])
// Fallback: Clear importing state after 10 seconds to prevent infinite loading
useEffect(() => {
if (importingModel) {
const timeoutId = setTimeout(() => {
setImportingModel(null)
}, 10000) // 10 seconds fallback
return () => clearTimeout(timeoutId)
}
}, [importingModel])
// Auto-refresh provider settings to get updated backend configuration
const refreshSettings = useCallback(async () => {
if (!provider) return
@ -357,12 +389,9 @@ function ProviderDetail() {
if (selectedFile && typeof selectedFile === 'string') {
// Process the file path: replace spaces with dashes and convert to lowercase
const processedFilePath = selectedFile
.replace(/\s+/g, '-')
.toLowerCase()
// Install the backend using the llamacpp extension
await installBackend(processedFilePath)
await installBackend(selectedFile)
// Extract filename from the selected file path and replace spaces with dashes
const fileName = (
@ -834,6 +863,28 @@ function ProviderDetail() {
</p>
</div>
)}
{/* Show importing skeleton first if there's one */}
{importingModel && (
<CardItem
key="importing-skeleton"
title={
<div className="flex items-center gap-2">
<div className="flex items-center gap-2 animate-pulse">
<div className="bg-accent/20 flex gap-2 text-accent px-2 py-1 rounded-full text-xs">
<IconLoader
size={16}
className="animate-spin text-accent"
/>
Importing...
</div>
<h1 className="font-medium line-clamp-1">
{importingModel}
</h1>
</div>
</div>
}
/>
)}
</Card>
</div>
</div>

View File

@ -1,3 +1,4 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { useEffect, useMemo, useRef, useState } from 'react'
import { createFileRoute, useParams } from '@tanstack/react-router'
import { UIEventHandler } from 'react'
@ -89,12 +90,15 @@ function ThreadDetail() {
}, [threadId, currentThreadId, assistants])
useEffect(() => {
serviceHub.messages().fetchMessages(threadId).then((fetchedMessages) => {
if (fetchedMessages) {
// Update the messages in the store
setMessages(threadId, fetchedMessages)
}
})
serviceHub
.messages()
.fetchMessages(threadId)
.then((fetchedMessages) => {
if (fetchedMessages) {
// Update the messages in the store
setMessages(threadId, fetchedMessages)
}
})
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [threadId, serviceHub])
@ -139,17 +143,21 @@ function ThreadDetail() {
useEffect(() => {
// Track streaming state changes
const isCurrentlyStreaming = !!streamingContent
const justFinishedStreaming = wasStreamingRef.current && !isCurrentlyStreaming
const justFinishedStreaming =
wasStreamingRef.current && !isCurrentlyStreaming
wasStreamingRef.current = isCurrentlyStreaming
// If streaming just finished and user had an intended position, restore it
if (justFinishedStreaming && userIntendedPositionRef.current !== null) {
// Small delay to ensure DOM has updated
setTimeout(() => {
if (scrollContainerRef.current && userIntendedPositionRef.current !== null) {
if (
scrollContainerRef.current &&
userIntendedPositionRef.current !== null
) {
scrollContainerRef.current.scrollTo({
top: userIntendedPositionRef.current,
behavior: 'smooth'
behavior: 'smooth',
})
userIntendedPositionRef.current = null
setIsUserScrolling(false)
@ -198,7 +206,7 @@ function ThreadDetail() {
// Detect if this is a user-initiated scroll
if (Math.abs(scrollTop - lastScrollTopRef.current) > 10) {
setIsUserScrolling(!isBottom)
// If user scrolls during streaming and moves away from bottom, record their intended position
if (streamingContent && !isBottom) {
userIntendedPositionRef.current = scrollTop
@ -220,7 +228,7 @@ function ThreadDetail() {
// Detect if this is a user-initiated scroll
if (Math.abs(scrollTop - lastScrollTopRef.current) > 10) {
setIsUserScrolling(!isBottom)
// If user scrolls during streaming and moves away from bottom, record their intended position
if (streamingContent && !isBottom) {
userIntendedPositionRef.current = scrollTop
@ -231,11 +239,15 @@ function ThreadDetail() {
lastScrollTopRef.current = scrollTop
}
const updateMessage = (item: ThreadMessage, message: string) => {
const updateMessage = (
item: ThreadMessage,
message: string,
imageUrls?: string[]
) => {
const newMessages: ThreadMessage[] = messages.map((m) => {
if (m.id === item.id) {
const msg: ThreadMessage = cloneDeep(m)
msg.content = [
const newContent = [
{
type: ContentType.Text,
text: {
@ -244,6 +256,20 @@ function ThreadDetail() {
},
},
]
// Add image content if imageUrls are provided
if (imageUrls && imageUrls.length > 0) {
imageUrls.forEach((url) => {
newContent.push({
type: 'image_url' as ContentType,
image_url: {
url: url,
},
} as any)
})
}
msg.content = newContent
return msg
}
return m