feat: Smart model management (#6390)

* feat: Smart model management * **New UI option** – `memory_util` added to `settings.json` with a dropdown (high / medium / low) to let users control how aggressively the engine uses system memory. * **Configuration updates** – `LlamacppConfig` now includes `memory_util`; the extension class stores it in a new `memoryMode` property and handles updates through `updateConfig`. * **System memory handling** * Introduced `SystemMemory` interface and `getTotalSystemMemory()` to report combined VRAM + RAM. * Added helper methods `getKVCachePerToken`, `getLayerSize`, and a new `ModelPlan` type. * **Smart model‑load planner** – `planModelLoad()` computes: * Number of GPU layers that can fit in usable VRAM. * Maximum context length based on KV‑cache size and the selected memory utilization mode (high/medium/low). * Whether KV‑cache must be off‑loaded to CPU and the overall loading mode (GPU, Hybrid, CPU, Unsupported). * Detailed logging of the planning decision. * **Improved support check** – `isModelSupported()` now: * Uses the combined VRAM/RAM totals from `getTotalSystemMemory()`. * Applies an 80% usable‑memory heuristic. * Returns **GREEN** only when both weights and KV‑cache fit in VRAM, **YELLOW** when they fit only in total memory or require CPU off‑load, and **RED** when the model cannot fit at all. * **Cleanup** – Removed unused `GgufMetadata` import; updated imports and type definitions accordingly. * **Documentation/comments** – Added explanatory JSDoc comments for the new methods and clarified the return semantics of `isModelSupported`. * chore: migrate no_kv_offload from llamacpp setting to model setting * chore: add UI auto optimize model setting * feat: improve model loading planner with mmproj support and smarter memory budgeting * Extend `ModelPlan` with optional `noOffloadMmproj` flag to indicate when a multimodal projector can stay in VRAM. * Add `mmprojPath` parameter to `planModelLoad` and calculate its size, attempting to keep it on GPU when possible. 
* Refactor system memory detection: * Use `used_memory` (actual free RAM) instead of total RAM for budgeting. * Introduced `usableRAM` placeholder for future use. * Rewrite KV‑cache size calculation: * Properly handle GQA models via `attention.head_count_kv`. * Compute bytes per token as `nHeadKV * headDim * 2 * 2 * nLayer`. * Replace the old 70 % VRAM heuristic with a more flexible budget: * Reserve a fixed VRAM amount and apply an overhead factor. * Derive usable system RAM from total memory minus VRAM. * Implement a robust allocation algorithm: * Prioritize placing the mmproj in VRAM. * Search for the best balance of GPU layers and context length. * Fallback strategies for hybrid and pure‑CPU modes with detailed safety checks. * Add extensive validation of model size, KV‑cache size, layer size, and memory mode. * Improve logging throughout the planning process for easier debugging. * Adjust final plan return shape to include the new `noOffloadMmproj` field. * remove unused variable --------- Co-authored-by: Faisal Amir <urmauur@gmail.com>
2025-09-11 09:48:03 +05:30 · 2025-09-11 09:48:03 +05:30 · 7a174e621a
commit 7a174e621a
parent 3158722a63
8 changed files with 741 additions and 94 deletions
--- a/extensions/llamacpp-extension/settings.json
+++ b/extensions/llamacpp-extension/settings.json
@ -36,6 +36,21 @@
    "controllerType": "checkbox",
    "controllerProps": { "value": true }
  },
  {
    "key": "memory_util",
    "title": "Smart Memory utilization",
    "description": "Smart memory utilization mode for running local GGUF models",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "high",
      "options": [
        { "value": "high", "name": "High" },
        { "value": "medium", "name": "Medium" },
        { "value": "low", "name": "Low" }
      ],
      "recommended": "high"
    }
  },
  {
    "key": "threads",
    "title": "Threads",
@ -178,15 +193,6 @@
      "value": false
    }
  },
  {
    "key": "no_kv_offload",
    "title": "Disable KV Offload",
    "description": "Disable KV cache offload to GPU (if GPU is used).",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "cache_type_k",
    "title": "KV Cache K Type",
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@ -35,10 +35,7 @@ import {
 import { invoke } from '@tauri-apps/api/core'
 import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
-import {
+import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api'
  GgufMetadata,
  readGgufMetadata,
 } from '@janhq/tauri-plugin-llamacpp-api'
 import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api'
 type LlamacppConfig = {
@ -46,6 +43,7 @@ type LlamacppConfig = {
  auto_update_engine: boolean
  auto_unload: boolean
  llamacpp_env: string
  memory_util: string
  chat_template: string
  n_gpu_layers: number
  offload_mmproj: boolean
@ -74,6 +72,14 @@ type LlamacppConfig = {
  ctx_shift: boolean
 }
 /**
  * Outcome of the model-load planner (`planModelLoad`).
  */
 type ModelPlan = {
  /** Number of model layers the planner decided to place on the GPU. */
  gpuLayers: number
  /** Context length (in tokens) the planner settled on. */
  maxContextLength: number
  /** True when the planner decided the KV cache should stay in system RAM. */
  noOffloadKVCache: boolean
  /** True when the multimodal projector was budgeted into VRAM. */
  noOffloadMmproj?: boolean
  /** Overall loading strategy selected by the planner. */
  mode:
    | 'GPU'
    | 'Hybrid'
    | 'CPU'
    | 'Unsupported'
 }
 interface DownloadItem {
  url: string
  save_path: string
@ -116,6 +122,12 @@ interface DeviceList {
  free: number
 }
 /**
  * Memory figures reported by `getTotalSystemMemory()`; all values are in bytes.
  */
 interface SystemMemory {
  // Sum of `mem` over every device returned by `getDevices()`
  totalVRAM: number
  // System RAM figure; NOTE(review): currently derived from `used_memory` — confirm intent
  totalRAM: number
  // totalVRAM + totalRAM
  totalMemory: number
 }
 /**
 * Override the default app.log function to use Jan's logging system.
 * @param args
@ -159,6 +171,7 @@ export default class llamacpp_extension extends AIEngine {
  provider: string = 'llamacpp'
  autoUnload: boolean = true
  llamacpp_env: string = ''
  memoryMode: string = 'high'
  readonly providerId: string = 'llamacpp'
  private config: LlamacppConfig
@ -190,6 +203,7 @@ export default class llamacpp_extension extends AIEngine {
    this.autoUnload = this.config.auto_unload
    this.llamacpp_env = this.config.llamacpp_env
    this.memoryMode = this.config.memory_util
    // This sets the base directory where model files for this provider are stored.
    this.providerPath = await joinPath([
@ -836,6 +850,8 @@ export default class llamacpp_extension extends AIEngine {
      this.autoUnload = value as boolean
    } else if (key === 'llamacpp_env') {
      this.llamacpp_env = value as string
    } else if (key === 'memory_util') {
      this.memoryMode = value as string
    }
  }
@ -1864,10 +1880,368 @@ export default class llamacpp_extension extends AIEngine {
      'tokenizer.chat_template'
    ]?.includes('tools')
  }
  /**
   * Report the memory figures the load planner budgets against.
   *
   * VRAM is summed across every device returned by `getDevices()`; an empty
   * device list yields 0. The RAM figure comes from `getSystemUsage()`.
   *
   * @returns Byte counts for VRAM, RAM and their sum.
   */
  private async getTotalSystemMemory(): Promise<SystemMemory> {
    const devices = await this.getDevices()

    // `mem` is scaled by 1024 * 1024 to get bytes (presumably reported in MiB).
    const totalVRAM = devices.reduce(
      (sum, device) => sum + device.mem * 1024 * 1024,
      0
    )

    // NOTE(review): `used_memory` is the value the planner was written against,
    // but the name suggests RAM currently in use rather than free or total RAM.
    // Confirm against the hardware plugin before changing the budget basis.
    const sys = await getSystemUsage()
    const totalRAM = sys.used_memory * 1024 * 1024
    const totalMemory = totalVRAM + totalRAM

    // The previous message interpolated an undeclared `usableRAM`, which threw
    // a ReferenceError before the method could return.
    logger.info(
      `Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
    )

    return {
      totalVRAM,
      totalRAM,
      totalMemory,
    }
  }
  /**
   * Estimate how many bytes of KV cache one token occupies across all layers,
   * assuming fp16 (2-byte) cache entries.
   *
   * Uses `attention.head_count_kv` when present (GQA models) and falls back to
   * `attention.head_count`. Per-head K/V widths come from
   * `attention.key_length` / `attention.value_length` when the metadata
   * provides them; otherwise they are derived as
   * `embedding_length / head_count`.
   *
   * @param meta - GGUF metadata key/value pairs.
   * @returns Bytes per token; NaN when required metadata is missing, which the
   *   caller is expected to reject.
   */
  private async getKVCachePerToken(
    meta: Record<string, string>
  ): Promise<number> {
    const arch = meta['general.architecture']
    const nLayer = Number(meta[`${arch}.block_count`])
    const nHead = Number(meta[`${arch}.attention.head_count`])

    // KV cache uses head_count_kv (for GQA models) or head_count
    const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead

    // Fallback head width when the model does not state K/V lengths explicitly
    const embeddingLen = Number(meta[`${arch}.embedding_length`])
    const derivedHeadDim = embeddingLen / nHead

    // Some architectures use a head width that differs from
    // embedding_length / head_count, so explicit lengths take precedence.
    const keyLen =
      Number(meta[`${arch}.attention.key_length`]) || derivedHeadDim
    const valueLen =
      Number(meta[`${arch}.attention.value_length`]) || derivedHeadDim

    // Each token stores one K and one V vector per KV head per layer, in fp16
    const BYTES_PER_ELEMENT = 2
    const bytesPerToken =
      nHeadKV * (keyLen + valueLen) * BYTES_PER_ELEMENT * nLayer

    return bytesPerToken
  }
  /**
   * Approximate the size of a single model layer by dividing the model file
   * size evenly across the layer count from the metadata.
   *
   * @param path - Model file whose size is queried via `getModelSize`.
   * @param meta - GGUF metadata key/value pairs.
   * @returns The per-layer size and the number of layers.
   * @throws Error when `<arch>.block_count` is missing, zero or not numeric.
   */
  private async getLayerSize(
    path: string,
    meta: Record<string, string>
  ): Promise<{ layerSize: number; totalLayers: number }> {
    const fileSize = await this.getModelSize(path)

    const blockCountKey = `${meta['general.architecture']}.block_count`
    const totalLayers = Number(meta[blockCountKey])

    if (!totalLayers) {
      throw new Error('Invalid metadata: block_count not found')
    }

    const layerSize = fileSize / totalLayers
    return { layerSize, totalLayers }
  }
  /**
   * Plan how a GGUF model should be loaded on this machine.
   *
   * The planner budgets VRAM first: the mmproj (if any) is placed on the GPU
   * when it fits, then the largest context from a fixed ladder that still
   * leaves room for at least one GPU layer is chosen. If nothing fits, it
   * falls back to GPU layers with the KV cache in system RAM, and finally to
   * pure CPU.
   *
   * @param path - Model file; passed to `getModelSize` and `readGgufMetadata`.
   * @param requestedCtx - Optional context-length cap. When falsy, the model's
   *   `<arch>.context_length` metadata (or 131072) is the target.
   * @param mmprojPath - Optional multimodal projector file to budget for.
   * @returns The chosen GPU layer count, context length, offload flags and mode.
   * @throws Error when the model size, KV-cache estimate, layer size, memory
   *   totals or the configured memory mode are invalid.
   */
  async planModelLoad(
    path: string,
    requestedCtx?: number,
    mmprojPath?: string
  ): Promise<ModelPlan> {
    const modelSize = await this.getModelSize(path)
    const memoryInfo = await this.getTotalSystemMemory()
    const gguf = await readGgufMetadata(path)

    // Get mmproj size if provided
    let mmprojSize = 0
    if (mmprojPath) {
      mmprojSize = await this.getModelSize(mmprojPath)
    }

    const { layerSize, totalLayers } = await this.getLayerSize(
      path,
      gguf.metadata
    )

    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)

    // Debug logging
    logger.info(
      `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
    )

    // Validate critical values (NaN is falsy, so it is rejected here too)
    if (!modelSize || modelSize <= 0) {
      throw new Error(`Invalid model size: ${modelSize}`)
    }
    if (!kvCachePerToken || kvCachePerToken <= 0) {
      throw new Error(`Invalid KV cache per token: ${kvCachePerToken}`)
    }
    if (!layerSize || layerSize <= 0) {
      throw new Error(`Invalid layer size: ${layerSize}`)
    }

    // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
    const GPU_OVERHEAD_FACTOR = 0.8

    // VRAM budget with overhead consideration
    const VRAM_RESERVE_GB = 0.5
    const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
    const usableVRAM = Math.max(
      0,
      (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
    )

    // Get model's maximum context length
    const arch = gguf.metadata['general.architecture']
    const modelMaxContextLength =
      Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback

    // Set minimum context length
    const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility

    // Fraction of system RAM the planner may use for each memory mode
    const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }

    logger.info(
      `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
    )

    // Validate memory info
    if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
      throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
    }
    // Zero VRAM is legitimate (no GPU): the planner then falls through to the
    // pure-CPU branch. Only NaN or negative values are invalid.
    if (isNaN(memoryInfo.totalVRAM) || memoryInfo.totalVRAM < 0) {
      throw new Error(`Invalid total VRAM: ${memoryInfo.totalVRAM}`)
    }
    // Check own keys only, so prototype names such as 'toString' are rejected
    if (
      !this.memoryMode ||
      !Object.keys(memoryPercentages).includes(this.memoryMode)
    ) {
      throw new Error(
        `Invalid memory mode: ${this.memoryMode}. Must be 'high', 'medium', or 'low'`
      )
    }
    const memoryFraction =
      memoryPercentages[this.memoryMode as keyof typeof memoryPercentages]

    // Calculate actual system RAM
    const actualSystemRAM = Math.max(
      0,
      memoryInfo.totalMemory - memoryInfo.totalVRAM
    )
    const usableSystemMemory = actualSystemRAM * memoryFraction

    logger.info(
      `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
    )

    // --- Priority 1: Allocate mmproj (if exists) ---
    let noOffloadMmproj = false
    let remainingVRAM = usableVRAM

    if (mmprojSize > 0) {
      if (mmprojSize <= remainingVRAM) {
        noOffloadMmproj = true
        remainingVRAM -= mmprojSize
        logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
      } else {
        logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
      }
    }

    // --- Priority 2: Calculate optimal layer/context balance ---
    let gpuLayers = 0
    let maxContextLength = MIN_CONTEXT_LENGTH
    let noOffloadKVCache = false
    let mode: ModelPlan['mode'] = 'Unsupported'

    // Candidate context sizes, smallest first
    const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
    const targetContext = requestedCtx || modelMaxContextLength

    // Keep the last (largest) context that still leaves room for GPU layers
    let bestConfig = {
      layers: 0,
      context: MIN_CONTEXT_LENGTH,
      vramUsed: 0,
    }

    for (const ctxSize of contextSizes) {
      if (ctxSize > targetContext) break

      const kvCacheSize = ctxSize * kvCachePerToken
      const availableForLayers = remainingVRAM - kvCacheSize

      if (availableForLayers <= 0) continue

      const possibleLayers = Math.min(
        Math.floor(availableForLayers / layerSize),
        totalLayers
      )

      if (possibleLayers > 0) {
        const totalVramNeeded = possibleLayers * layerSize + kvCacheSize

        // Verify this fits with some margin
        if (totalVramNeeded <= remainingVRAM * 0.95) {
          bestConfig = {
            layers: possibleLayers,
            context: ctxSize,
            vramUsed: totalVramNeeded,
          }
        }
      }
    }

    // Apply the best configuration found
    if (bestConfig.layers > 0) {
      gpuLayers = bestConfig.layers
      maxContextLength = bestConfig.context
      noOffloadKVCache = false
      mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'

      logger.info(
        `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
          `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
      )
    } else {
      // Fallback: Try minimal GPU layers with KV cache on CPU
      gpuLayers = Math.min(
        Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
        totalLayers
      )

      if (gpuLayers > 0) {
        // Calculate available system RAM for KV cache
        const cpuLayers = totalLayers - gpuLayers
        const modelCPUSize = cpuLayers * layerSize
        const mmprojCPUSize =
          mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
        const systemRAMUsed = modelCPUSize + mmprojCPUSize
        const availableSystemRAMForKVCache = Math.max(
          0,
          usableSystemMemory - systemRAMUsed
        )

        // Calculate context that fits in system RAM
        const systemRAMContext = Math.min(
          Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
          targetContext
        )

        if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
          maxContextLength = systemRAMContext
          noOffloadKVCache = true
          mode = 'Hybrid'

          logger.info(
            `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
              `${maxContextLength} context on CPU RAM`
          )
        } else {
          // Can't fit reasonable context even with CPU RAM
          // Reduce GPU layers further
          gpuLayers = Math.floor(gpuLayers / 2)
          maxContextLength = MIN_CONTEXT_LENGTH
          noOffloadKVCache = true
          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
        }
      } else {
        // Pure CPU mode
        gpuLayers = 0
        noOffloadKVCache = true

        // Calculate context for pure CPU mode
        const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
        const availableForKVCache = Math.max(
          0,
          usableSystemMemory - totalCPUMemoryNeeded
        )

        maxContextLength = Math.min(
          Math.max(
            MIN_CONTEXT_LENGTH,
            Math.floor(availableForKVCache / kvCachePerToken)
          ),
          targetContext
        )

        mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
      }
    }

    // Safety check: Verify total GPU memory usage
    if (gpuLayers > 0 && !noOffloadKVCache) {
      const safeVRAMLimit = memoryInfo.totalVRAM * 0.9
      const estimateGPUUsage = (contextLength: number): number =>
        gpuLayers * layerSize +
        contextLength * kvCachePerToken +
        (noOffloadMmproj ? mmprojSize : 0)

      let estimatedGPUUsage = estimateGPUUsage(maxContextLength)

      if (estimatedGPUUsage > safeVRAMLimit) {
        logger.warn(
          `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
        )

        // Reduce context first, re-estimating after every halving so the loop
        // stops as soon as the plan fits.
        while (
          maxContextLength > MIN_CONTEXT_LENGTH &&
          estimatedGPUUsage > safeVRAMLimit
        ) {
          maxContextLength = Math.floor(maxContextLength / 2)
          estimatedGPUUsage = estimateGPUUsage(maxContextLength)
        }

        // Only if shrinking the context was not enough, reduce layers
        if (estimatedGPUUsage > safeVRAMLimit) {
          gpuLayers = Math.floor(gpuLayers * 0.7)
          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
          noOffloadKVCache = true // Move KV cache to CPU
        }
      }
    }

    // Apply user-requested context limit if specified
    if (requestedCtx && requestedCtx > 0) {
      maxContextLength = Math.min(maxContextLength, requestedCtx)
      logger.info(
        `User requested context: ${requestedCtx}, final: ${maxContextLength}`
      )
    }

    // Ensure we never exceed model's maximum context
    maxContextLength = Math.min(maxContextLength, modelMaxContextLength)

    // Final validation
    if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
      mode = 'Unsupported'
    }

    // Ensure maxContextLength is valid
    maxContextLength = isNaN(maxContextLength)
      ? MIN_CONTEXT_LENGTH
      : Math.max(MIN_CONTEXT_LENGTH, maxContextLength)

    // Log final plan
    const mmprojInfo = mmprojPath
      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}`
      : ''

    logger.info(
      `Final plan for ${path}: gpuLayers=${gpuLayers}/${totalLayers}, ` +
        `maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, ` +
        `mode=${mode}${mmprojInfo}`
    )

    return {
      gpuLayers,
      maxContextLength,
      noOffloadKVCache,
      mode,
      noOffloadMmproj,
    }
  }
  /**
-   *  estimate KVCache size of from a given metadata
+   * estimate KVCache size from a given metadata
   *
   */
  private async estimateKVCache(
    meta: Record<string, string>,
@ -1907,6 +2281,7 @@ export default class llamacpp_extension extends AIEngine {
        `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}`
      )
    }
    let ctxLen: number
    if (!ctx_size) {
      ctxLen = Number(meta[`${arch}.context_length`])
@ -1941,13 +2316,13 @@ export default class llamacpp_extension extends AIEngine {
    }
  }
-  /*
+  /**
-   * check the support status of a model by its path (local/remote)
+   * Check the support status of a model by its path (local/remote)
   *
-   * * Returns:
+   * Returns:
-   * - "RED"    → weights don't fit
+   * - "RED"    → weights don't fit in total memory
-   * - "YELLOW" → weights fit, KV cache doesn't
+   * - "YELLOW" → weights fit in VRAM but need system RAM, or KV cache doesn't fit
-   * - "GREEN"  → both weights + KV cache fit
+   * - "GREEN"  → both weights + KV cache fit in VRAM
   */
  async isModelSupported(
    path: string,
@ -1955,46 +2330,48 @@ export default class llamacpp_extension extends AIEngine {
  ): Promise<'RED' | 'YELLOW' | 'GREEN'> {
    try {
      const modelSize = await this.getModelSize(path)
      const memoryInfo = await this.getTotalSystemMemory()
      logger.info(`modelSize: ${modelSize}`)
-      let gguf: GgufMetadata
+
-      gguf = await readGgufMetadata(path)
+      const gguf = await readGgufMetadata(path)
      let kvCacheSize: number
      if (ctx_size) {
        kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
      } else {
        kvCacheSize = await this.estimateKVCache(gguf.metadata)
      }
-      // total memory consumption = model weights + kvcache + a small buffer for outputs
+
-      // output buffer is small so not considering here
+      // Total memory consumption = model weights + kvcache
      const totalRequired = modelSize + kvCacheSize
      logger.info(
        `isModelSupported: Total memory requirement: ${totalRequired} for ${path}`
      )
      let totalMemBytes: number
      const devices = await this.getDevices()
      if (devices.length > 0) {
        // Sum total memory across all GPUs
        totalMemBytes = devices
          .map((d) => d.mem * 1024 * 1024)
          .reduce((a, b) => a + b, 0)
      } else {
        // CPU fallback
        const sys = await getSystemUsage()
        totalMemBytes = sys.total_memory * 1024 * 1024
      }
      // Use 80% of total memory as the usable limit
      const USABLE_MEMORY_PERCENTAGE = 0.8
-      const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE
+      const usableTotalMemory =
        memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
      const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
-      // check model size wrt 80% of system memory
+      // Check if model fits in total memory at all
-      if (modelSize > usableMemBytes) {
+      if (modelSize > usableTotalMemory) {
        return 'RED'
-      } else if (modelSize + kvCacheSize > usableMemBytes) {
+      }
-        return 'YELLOW'
+
-      } else {
+      // Check if everything fits in VRAM (ideal case)
      if (totalRequired <= usableVRAM) {
        return 'GREEN'
      }
      // Check if model fits in VRAM but total requirement exceeds VRAM
      // OR if total requirement fits in total memory but not in VRAM
      if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) {
        return 'YELLOW'
      }
      // If we get here, nothing fits properly
      return 'RED'
    } catch (e) {
      throw new Error(String(e))
    }
@ -2006,39 +2383,42 @@ export default class llamacpp_extension extends AIEngine {
  async validateGgufFile(filePath: string): Promise<{
    isValid: boolean
    error?: string
-    metadata?: GgufMetadata
+    metadata?: any
  }> {
    try {
      logger.info(`Validating GGUF file: ${filePath}`)
      const metadata = await readGgufMetadata(filePath)
-      
+
      // Log full metadata for debugging
      logger.info('Full GGUF metadata:', JSON.stringify(metadata, null, 2))
-      
+
      // Check if architecture is 'clip' which is not supported for text generation
      const architecture = metadata.metadata?.['general.architecture']
      logger.info(`Model architecture: ${architecture}`)
-      
+
      if (architecture === 'clip') {
-        const errorMessage = 'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.'
+        const errorMessage =
          'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.'
        logger.error('CLIP architecture detected:', architecture)
        return {
          isValid: false,
          error: errorMessage,
-          metadata
+          metadata,
        }
      }
-      
+
      logger.info('Model validation passed. Architecture:', architecture)
      return {
        isValid: true,
-        metadata
+        metadata,
      }
    } catch (error) {
      logger.error('Failed to validate GGUF file:', error)
      return {
        isValid: false,
-        error: `Failed to read model metadata: ${error instanceof Error ? error.message : 'Unknown error'}`
+        error: `Failed to read model metadata: ${
          error instanceof Error ? error.message : 'Unknown error'
        }`,
      }
    }
  }
--- a/web-app/src/containers/ModelSetting.tsx
+++ b/web-app/src/containers/ModelSetting.tsx
@ -1,5 +1,6 @@
-import { IconSettings } from '@tabler/icons-react'
+import { IconSettings, IconLoader } from '@tabler/icons-react'
 import debounce from 'lodash.debounce'
 import { useState } from 'react'
 import {
  Sheet,
@ -9,6 +10,7 @@ import {
  SheetTitle,
  SheetTrigger,
 } from '@/components/ui/sheet'
 import { Button } from '@/components/ui/button'
 import { DynamicControllerSetting } from '@/containers/dynamicControllerSetting'
 import { useModelProvider } from '@/hooks/useModelProvider'
 import { useServiceHub } from '@/hooks/useServiceHub'
@ -30,11 +32,134 @@ export function ModelSetting({
  const { t } = useTranslation()
  const serviceHub = useServiceHub()
  const [isPlanning, setIsPlanning] = useState(false)
  // Create a debounced version of stopModel that waits 500ms after the last call
  // NOTE(review): this is declared in the component body, so presumably a new
  // debounced function is created on each render — confirm whether it should
  // be memoized so that calls from different renders share one timer.
  const debouncedStopModel = debounce((modelId: string) => {
    serviceHub.models().stopModel(modelId)
  }, 500)
  /**
   * Ask the llamacpp engine for a load plan for this model and apply the
   * recommended `ngl`, `ctx_len` and `no_kv_offload` values to the model's
   * settings. Only settings the model already has are updated.
   */
  const handlePlanModelLoad = async () => {
    if (provider.provider !== 'llamacpp') {
      console.warn('planModelLoad is only available for llamacpp provider')
      return
    }

    setIsPlanning(true)
    try {
      // The model config holds the actual path of the model file
      const modelConfig = await serviceHub.app().readYaml<{
        model_path: string
      }>(`llamacpp/models/${model.id}/model.yml`)

      if (!modelConfig || !modelConfig.model_path) {
        console.warn('No model_path found in config for', model.id)
        return
      }

      const plan = await serviceHub
        .models()
        .planModelLoad(modelConfig.model_path)

      // Collect every recommendation first so they are applied in one update
      const pending: Array<{ key: string; value: number | boolean }> = []
      const recommend = (
        key: string,
        value: number | boolean | undefined
      ) => {
        if (model.settings?.[key] && value !== undefined) {
          pending.push({ key, value })
        }
      }

      recommend('ngl', plan.gpuLayers)
      recommend('ctx_len', plan.maxContextLength)
      recommend('no_kv_offload', plan.noOffloadKVCache)

      // Apply all settings in a single update to avoid race conditions
      if (pending.length > 0) {
        handleMultipleSettingsChange(pending)
      }
    } catch (error) {
      console.error('Error calling planModelLoad:', error)
    } finally {
      setIsPlanning(false)
    }
  }
  /**
   * Apply several setting values to the current model in one provider update,
   * then stop the model if it is running and any changed setting needs a
   * restart to take effect.
   *
   * @param settingsToUpdate - Setting keys with the new controller values.
   */
  const handleMultipleSettingsChange = (
    settingsToUpdate: Array<{ key: string; value: number | boolean }>
  ) => {
    if (!provider) return

    // Build one copy of the model carrying ALL updated settings
    let updatedModel = { ...model }
    for (const { key, value } of settingsToUpdate) {
      const existingSetting = updatedModel.settings?.[key] as ProviderSetting
      updatedModel = {
        ...updatedModel,
        settings: {
          ...updatedModel.settings,
          [key]: {
            ...existingSetting,
            controller_props: {
              ...existingSetting?.controller_props,
              value: value,
            },
          } as ProviderSetting,
        },
      }
    }

    // Nothing to do when the model is not part of this provider
    const modelIndex = provider.models.findIndex((m) => m.id === model.id)
    if (modelIndex === -1) return

    // Replace the model inside a copy of the provider's models array
    const updatedModels = [...provider.models]
    updatedModels[modelIndex] = updatedModel as Model

    updateProvider(provider.provider, {
      models: updatedModels,
    })

    // Check if any of the updated settings require a model restart
    const requiresRestart = settingsToUpdate.some(
      ({ key }) =>
        key === 'ctx_len' ||
        key === 'ngl' ||
        key === 'chat_template' ||
        key === 'offload_mmproj'
    )
    if (!requiresRestart) return

    // Only stop the model when it is actually running. The lookup is
    // fire-and-forget, so a rejection is logged rather than left unhandled.
    serviceHub
      .models()
      .getActiveModels()
      .then((activeModels) => {
        if (activeModels.includes(model.id)) {
          debouncedStopModel(model.id)
        }
      })
      .catch((error) => {
        console.error('Error checking active models:', error)
      })
  }
  const handleSettingChange = (
    key: string,
    value: string | boolean | number
@ -72,8 +197,22 @@ export function ModelSetting({
      })
      // Call debounced stopModel only when updating ctx_len, ngl, chat_template, or offload_mmproj
-      if (key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || key === 'offload_mmproj') {
+      // and only if the model is currently running
-        debouncedStopModel(model.id)
+      if (
        key === 'ctx_len' ||
        key === 'ngl' ||
        key === 'chat_template' ||
        key === 'offload_mmproj'
      ) {
        // Check if model is running before stopping it
        serviceHub
          .models()
          .getActiveModels()
          .then((activeModels) => {
            if (activeModels.includes(model.id)) {
              debouncedStopModel(model.id)
            }
          })
      }
    }
  }
@ -98,7 +237,36 @@ export function ModelSetting({
          <SheetDescription>
            {t('common:modelSettings.description')}
          </SheetDescription>
          {/* Model Load Planning Section - Only show for llamacpp provider */}
          {provider.provider === 'llamacpp' && (
            <div className="pb-4 border-b border-main-view-fg/10 my-4">
              <div>
                <h3 className="font-medium mb-1">Optimize Settings</h3>
                <p className="text-main-view-fg/70 text-xs mb-3">
                  Analyze your system and model, then apply optimal loading
                  settings automatically
                </p>
                <Button
                  onClick={handlePlanModelLoad}
                  disabled={isPlanning}
                  variant="default"
                  className="w-full"
                >
                  {isPlanning ? (
                    <>
                      <IconLoader size={16} className="mr-2 animate-spin" />
                      Optimizing...
                    </>
                  ) : (
                    <>Auto-Optimize Settings</>
                  )}
                </Button>
              </div>
            </div>
          )}
        </SheetHeader>
        <div className="px-4 space-y-6">
          {Object.entries(model.settings || {}).map(([key, value]) => {
            const config = value as ProviderSetting
--- a/web-app/src/hooks/useModelProvider.ts
+++ b/web-app/src/hooks/useModelProvider.ts
@ -93,7 +93,11 @@ export const useModelProvider = create<ModelProviderState>()(
                  ? legacyModels
                  : models
                ).find(
-                  (m) => m.id.split(':').slice(0, 2).join(getServiceHub().path().sep()) === model.id
+                  (m) =>
                    m.id
                      .split(':')
                      .slice(0, 2)
                      .join(getServiceHub().path().sep()) === model.id
                )?.settings || model.settings
              const existingModel = models.find((m) => m.id === model.id)
              return {
@ -227,7 +231,7 @@ export const useModelProvider = create<ModelProviderState>()(
          >
        }
-        if (version === 0 && state?.providers) {
+        if (version <= 1 && state?.providers) {
          state.providers.forEach((provider) => {
            // Update cont_batching description for llamacpp provider
            if (provider.provider === 'llamacpp' && provider.settings) {
@ -270,6 +274,15 @@ export const useModelProvider = create<ModelProviderState>()(
                    },
                  }
                }
                if (!model.settings.no_kv_offload) {
                  model.settings.no_kv_offload = {
                    ...modelSettings.no_kv_offload,
                    controller_props: {
                      ...modelSettings.no_kv_offload.controller_props,
                    },
                  }
                }
              })
            }
          })
@ -277,7 +290,7 @@ export const useModelProvider = create<ModelProviderState>()(
        return state
      },
-      version: 1,
+      version: 2,
    }
  )
 )
--- a/web-app/src/lib/predefined.ts
+++ b/web-app/src/lib/predefined.ts
@ -144,4 +144,13 @@ export const modelSettings = {
      type: 'text',
    },
  },
  no_kv_offload: {
    key: 'no_kv_offload',
    title: 'Disable KV Offload',
    description: 'Disable KV cache offload to GPU (if GPU is used).',
    controller_type: 'checkbox',
    controller_props: {
      value: false,
    },
  },
 }
--- a/web-app/src/routes/settings/providers/$providerName.tsx
+++ b/web-app/src/routes/settings/providers/$providerName.tsx
@ -1,3 +1,4 @@
 /* eslint-disable @typescript-eslint/no-explicit-any */
 import { Card, CardItem } from '@/containers/Card'
 import HeaderPage from '@/containers/HeaderPage'
 import SettingsMenu from '@/containers/SettingsMenu'
@ -116,22 +117,25 @@ function ProviderDetail() {
              // Add 'vision' capability if not already present AND if user hasn't manually configured capabilities
              // Check if model has a custom capabilities config flag
-              // eslint-disable-next-line @typescript-eslint/no-explicit-any
+
-              const hasUserConfiguredCapabilities = (model as any)._userConfiguredCapabilities === true
+              const hasUserConfiguredCapabilities =
-              
+                (model as any)._userConfiguredCapabilities === true
-              if (!capabilities.includes('vision') && !hasUserConfiguredCapabilities) {
+
              if (
                !capabilities.includes('vision') &&
                !hasUserConfiguredCapabilities
              ) {
                const updatedModels = [...llamacppProvider.models]
                updatedModels[modelIndex] = {
                  ...model,
                  capabilities: [...capabilities, 'vision'],
                  // Mark this as auto-detected, not user-configured
                  _autoDetectedVision: true,
                  // eslint-disable-next-line @typescript-eslint/no-explicit-any
                } as any
                updateProviderState('llamacpp', { models: updatedModels })
                console.log(
-                  `Vision capability auto-added to model after provider refresh: ${importedModelName}`
+                  `Vision capability added to model after provider refresh: ${importedModelName}`
                )
              }
            }
@ -257,33 +261,36 @@ function ProviderDetail() {
    }
  }
-  const handleStartModel = (modelId: string) => {
+  const handleStartModel = async (modelId: string) => {
    // Add model to loading state
    setLoadingModels((prev) => [...prev, modelId])
-    if (provider)
+    if (provider) {
-      // Original: startModel(provider, modelId).then(() => { setActiveModels((prevModels) => [...prevModels, modelId]) })
+      try {
-      serviceHub
+        // Start the model with plan result
-        .models()
+        await serviceHub.models().startModel(provider, modelId)
-        .startModel(provider, modelId)
+
-        .then(() => {
+        // Refresh active models after starting
-          // Refresh active models after starting
+        serviceHub
-          serviceHub
+          .models()
-            .models()
+          .getActiveModels()
-            .getActiveModels()
+          .then((models) => setActiveModels(models || []))
-            .then((models) => setActiveModels(models || []))
+      } catch (error) {
-        })
+        console.error('Error starting model:', error)
-        .catch((error) => {
+        if (
-          console.error('Error starting model:', error)
+          error &&
-          if (error && typeof error === 'object' && 'message' in error) {
+          typeof error === 'object' &&
-            setModelLoadError(error)
+          'message' in error &&
-          } else {
+          typeof error.message === 'string'
-            setModelLoadError(`${error}`)
+        ) {
-          }
+          setModelLoadError({ message: error.message })
-        })
+        } else {
-        .finally(() => {
+          setModelLoadError(typeof error === 'string' ? error : `${error}`)
-          // Remove model from loading state
+        }
-          setLoadingModels((prev) => prev.filter((id) => id !== modelId))
+      } finally {
-        })
+        // Remove model from loading state
        setLoadingModels((prev) => prev.filter((id) => id !== modelId))
      }
    }
  }
  const handleStopModel = (modelId: string) => {
--- a/web-app/src/services/models/default.ts
+++ b/web-app/src/services/models/default.ts
@ -17,6 +17,7 @@ import type {
  HuggingFaceRepo,
  CatalogModel,
  ModelValidationResult,
  ModelPlan,
 } from './types'
 // TODO: Replace this with the actual provider later
@ -491,4 +492,47 @@ export class DefaultModelsService implements ModelsService {
      }
    }
  }
  /**
   * Requests a load plan for a model from the llamacpp engine.
   *
   * The given `modelPath` is joined onto the Jan data folder path before it is
   * handed to the engine. If the engine does not expose `planModelLoad`, or if
   * any step throws, a fallback plan is returned instead of propagating the
   * failure (0 GPU layers, 2048 context, mode `'Unsupported'`).
   *
   * @param modelPath - Model file path; joined onto the Jan data folder path.
   * @param requestedCtx - Optional context length, forwarded to the engine as-is.
   * @returns The engine's plan, or the fallback plan described above.
   */
  async planModelLoad(
    modelPath: string,
    requestedCtx?: number
  ): Promise<ModelPlan> {
    type PlanFn = (path: string, requestedCtx?: number) => Promise<ModelPlan>

    // Built fresh on each use so no two callers share the same object.
    const fallbackPlan = (): ModelPlan => ({
      gpuLayers: 0,
      maxContextLength: 2048,
      noOffloadKVCache: true,
      mode: 'Unsupported',
    })

    try {
      const llamacpp = this.getEngine('llamacpp') as AIEngine & {
        planModelLoad?: PlanFn
      }

      // Guard: engine missing or it does not implement the planner.
      if (!llamacpp || typeof llamacpp.planModelLoad !== 'function') {
        console.warn('planModelLoad method not available in llamacpp engine')
        return fallbackPlan()
      }

      // Resolve the model file location under the Jan data folder.
      const core = await import('@janhq/core')
      const dataFolder = await core.getJanDataFolderPath()
      const { joinPath } = core
      const absoluteModelPath = await joinPath([dataFolder, modelPath])

      // Called as a member so the engine method keeps its `this` binding.
      return await llamacpp.planModelLoad(absoluteModelPath, requestedCtx)
    } catch (error) {
      console.error(`Error planning model load for path ${modelPath}:`, error)
      return fallbackPlan()
    }
  }
 }
--- a/web-app/src/services/models/types.ts
+++ b/web-app/src/services/models/types.ts
@ -81,10 +81,20 @@ export interface ModelValidationResult {
  metadata?: GgufMetadata
 }
 /**
  * Loading plan for a single model, as resolved by
  * `ModelsService.planModelLoad`.
  */
 export interface ModelPlan {
  /** GPU layer count for the plan; the service's fallback plan uses 0. */
  gpuLayers: number
  /** Maximum context length for the plan; the service's fallback plan uses 2048. */
  maxContextLength: number
  /**
   * NOTE(review): presumably corresponds to the `no_kv_offload` model setting
   * (KV cache not offloaded to the GPU when true) — confirm against the engine.
   * The service's fallback plan sets this to true.
   */
  noOffloadKVCache: boolean
  /**
   * Overall loading mode. `'Unsupported'` is also what the service returns
   * when the engine planner is unavailable or planning fails.
   */
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
 }
 export interface ModelsService {
  fetchModels(): Promise<modelInfo[]>
  fetchModelCatalog(): Promise<ModelCatalog>
-  fetchHuggingFaceRepo(repoId: string, hfToken?: string): Promise<HuggingFaceRepo | null>
+  fetchHuggingFaceRepo(
    repoId: string,
    hfToken?: string
  ): Promise<HuggingFaceRepo | null>
  convertHfRepoToCatalogModel(repo: HuggingFaceRepo): CatalogModel
  updateModel(model: Partial<CoreModel>): Promise<void>
  pullModel(
@ -107,14 +117,24 @@ export interface ModelsService {
  getActiveModels(provider?: string): Promise<string[]>
  stopModel(model: string, provider?: string): Promise<void>
  stopAllModels(): Promise<void>
-  startModel(provider: ProviderObject, model: string): Promise<SessionInfo | undefined>
+  startModel(
    provider: ProviderObject,
    model: string
  ): Promise<SessionInfo | undefined>
  isToolSupported(modelId: string): Promise<boolean>
  checkMmprojExistsAndUpdateOffloadMMprojSetting(
    modelId: string,
-    updateProvider?: (providerName: string, data: Partial<ModelProvider>) => void,
+    updateProvider?: (
      providerName: string,
      data: Partial<ModelProvider>
    ) => void,
    getProviderByName?: (providerName: string) => ModelProvider | undefined
  ): Promise<{ exists: boolean; settingsUpdated: boolean }>
  checkMmprojExists(modelId: string): Promise<boolean>
-  isModelSupported(modelPath: string, ctxSize?: number): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
+  isModelSupported(
    modelPath: string,
    ctxSize?: number
  ): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
  validateGgufFile(filePath: string): Promise<ModelValidationResult>
-}
+  planModelLoad(modelPath: string, requestedCtx?: number): Promise<ModelPlan>
 }