diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json index ddbefa936..2bca12c0f 100644 --- a/extensions/llamacpp-extension/settings.json +++ b/extensions/llamacpp-extension/settings.json @@ -36,6 +36,21 @@ "controllerType": "checkbox", "controllerProps": { "value": true } }, + { + "key": "memory_util", + "title": "Smart Memory utilization", + "description": "Smart memory utilization mode for running local GGUF models", + "controllerType": "dropdown", + "controllerProps": { + "value": "high", + "options": [ + { "value": "high", "name": "High" }, + { "value": "medium", "name": "Medium" }, + { "value": "low", "name": "Low" } + ], + "recommended": "high" + } + }, { "key": "threads", "title": "Threads", @@ -178,15 +193,6 @@ "value": false } }, - { - "key": "no_kv_offload", - "title": "Disable KV Offload", - "description": "Disable KV cache offload to GPU (if GPU is used).", - "controllerType": "checkbox", - "controllerProps": { - "value": false - } - }, { "key": "cache_type_k", "title": "KV Cache K Type", diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index 744eed3c4..e706b58ae 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -35,10 +35,7 @@ import { import { invoke } from '@tauri-apps/api/core' import { getProxyConfig } from './util' import { basename } from '@tauri-apps/api/path' -import { - GgufMetadata, - readGgufMetadata, -} from '@janhq/tauri-plugin-llamacpp-api' +import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api' import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api' type LlamacppConfig = { @@ -46,6 +43,7 @@ type LlamacppConfig = { auto_update_engine: boolean auto_unload: boolean llamacpp_env: string + memory_util: string chat_template: string n_gpu_layers: number offload_mmproj: boolean @@ -74,6 +72,14 @@ type LlamacppConfig = { ctx_shift: boolean } +type ModelPlan = { + gpuLayers: number + maxContextLength: number + noOffloadKVCache: boolean + noOffloadMmproj?: boolean + mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' +} + interface DownloadItem { url: string save_path: string @@ -116,6 +122,12 @@ interface DeviceList { free: number } +interface SystemMemory { + totalVRAM: number + totalRAM: number + totalMemory: number +} + /** * Override the default app.log function to use Jan's logging system. * @param args @@ -159,6 +171,7 @@ export default class llamacpp_extension extends AIEngine { provider: string = 'llamacpp' autoUnload: boolean = true llamacpp_env: string = '' + memoryMode: string = 'high' readonly providerId: string = 'llamacpp' private config: LlamacppConfig @@ -190,6 +203,7 @@ export default class llamacpp_extension extends AIEngine { this.autoUnload = this.config.auto_unload this.llamacpp_env = this.config.llamacpp_env + this.memoryMode = this.config.memory_util // This sets the base directory where model files for this provider are stored. 
    this.providerPath = await joinPath([
@@ -836,6 +850,8 @@ export default class llamacpp_extension extends AIEngine {
       this.autoUnload = value as boolean
     } else if (key === 'llamacpp_env') {
       this.llamacpp_env = value as string
+    } else if (key === 'memory_util') {
+      this.memoryMode = value as string
     }
   }
@@ -1864,10 +1880,368 @@ export default class llamacpp_extension extends AIEngine {
       'tokenizer.chat_template'
     ]?.includes('tools')
   }
+  /**
+   * Get total system memory including both VRAM and RAM
+   */
+  private async getTotalSystemMemory(): Promise<SystemMemory> {
+    const devices = await this.getDevices()
+    let totalVRAM = 0
+
+    if (devices.length > 0) {
+      // Sum total VRAM across all GPUs
+      totalVRAM = devices
+        .map((d) => d.mem * 1024 * 1024)
+        .reduce((a, b) => a + b, 0)
+    }
+
+    // Get system RAM
+    const sys = await getSystemUsage()
+    const totalRAM = sys.total_memory * 1024 * 1024
+
+    const totalMemory = totalVRAM + totalRAM
+
+    logger.info(
+      `Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
+    )
+
+    return {
+      totalVRAM,
+      totalRAM,
+      totalMemory,
+    }
+  }
+
+  private async getKVCachePerToken(
+    meta: Record<string, any>
+  ): Promise<number> {
+    const arch = meta['general.architecture']
+    const nLayer = Number(meta[`${arch}.block_count`])
+    const nHead = Number(meta[`${arch}.attention.head_count`])
+
+    // Get head dimensions
+    const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
+    const embeddingLen = Number(meta[`${arch}.embedding_length`])
+    const headDim = embeddingLen / nHead
+
+    // KV cache uses head_count_kv (for GQA models) or head_count
+    // Each token needs K and V, both are fp16 (2 bytes)
+    const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
+
+    return bytesPerToken
+  }
+
+  private async getLayerSize(
+    path: string,
+    meta: Record<string, any>
+  ): Promise<{ layerSize: number; totalLayers: number }> {
+    const modelSize = await this.getModelSize(path)
+    const arch = meta['general.architecture']
+    const totalLayers = Number(meta[`${arch}.block_count`])
+    if (!totalLayers) throw new Error('Invalid metadata: block_count not found')
+    return { layerSize: modelSize / totalLayers, totalLayers }
+  }
+
+  async planModelLoad(
+    path: string,
+    requestedCtx?: number,
+    mmprojPath?: string
+  ): Promise<ModelPlan> {
+    const modelSize = await this.getModelSize(path)
+    const memoryInfo = await this.getTotalSystemMemory()
+    const gguf = await readGgufMetadata(path)
+
+    // Get mmproj size if provided
+    let mmprojSize = 0
+    if (mmprojPath) {
+      mmprojSize = await this.getModelSize(mmprojPath)
+    }
+
+    const { layerSize, totalLayers } = await this.getLayerSize(
+      path,
+      gguf.metadata
+    )
+
+    // KV cache size per token
+    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+
+    // Debug logging
+    logger.info(
+      `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
+    )
+
+    // Validate critical values
+    if (!modelSize || modelSize <= 0) {
+      throw new Error(`Invalid model size: ${modelSize}`)
+    }
+    if (!kvCachePerToken || kvCachePerToken <= 0) {
+      throw new Error(`Invalid KV cache per token: ${kvCachePerToken}`)
+    }
+    if (!layerSize || layerSize <= 0) {
+      throw new Error(`Invalid layer size: ${layerSize}`)
+    }
+
+    // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
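+    // Worked example with illustrative numbers (not taken from this change):
+    // a GGUF with 32 blocks, 32 heads, 8 KV heads and embedding length 4096
+    // gives headDim = 128, so kvCachePerToken = 8 * 128 * 2 * 2 * 32 =
+    // 131072 bytes, and an 8192-token KV cache needs about 1 GiB. On a GPU
+    // reporting 8 GiB, the budget below comes out to
+    // (8 GiB - 0.5 GiB reserve) * 0.8 = 6 GiB for model layers, the KV cache,
+    // and an offloaded mmproj.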
+ const GPU_OVERHEAD_FACTOR = 0.8 + + // VRAM budget with overhead consideration + const VRAM_RESERVE_GB = 0.5 + const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024 + const usableVRAM = Math.max( + 0, + (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR + ) + + // Get model's maximum context length + const arch = gguf.metadata['general.architecture'] + const modelMaxContextLength = + Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback + + // Set minimum context length + const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility + + // System RAM budget + const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 } + + logger.info( + `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}` + ) + + // Validate memory info + if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) { + throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`) + } + if (!memoryInfo.totalVRAM || isNaN(memoryInfo.totalVRAM)) { + throw new Error(`Invalid total VRAM: ${memoryInfo.totalVRAM}`) + } + if (!this.memoryMode || !(this.memoryMode in memoryPercentages)) { + throw new Error( + `Invalid memory mode: ${this.memoryMode}. Must be 'high', 'medium', or 'low'` + ) + } + + // Calculate actual system RAM + const actualSystemRAM = Math.max( + 0, + memoryInfo.totalMemory - memoryInfo.totalVRAM + ) + const usableSystemMemory = + actualSystemRAM * memoryPercentages[this.memoryMode] + + logger.info( + `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}` + ) + + // --- Priority 1: Allocate mmproj (if exists) --- + let noOffloadMmproj = false + let remainingVRAM = usableVRAM + + if (mmprojSize > 0) { + if (mmprojSize <= remainingVRAM) { + noOffloadMmproj = true + remainingVRAM -= mmprojSize + logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`) + } else { + logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`) + } + } + + // --- Priority 2: Calculate optimal layer/context balance --- + let gpuLayers = 0 + let maxContextLength = MIN_CONTEXT_LENGTH + let noOffloadKVCache = false + let mode: ModelPlan['mode'] = 'Unsupported' + + // Calculate how much VRAM we need for different context sizes + const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072] + const targetContext = requestedCtx || modelMaxContextLength + + // Find the best balance of layers and context + let bestConfig = { + layers: 0, + context: MIN_CONTEXT_LENGTH, + vramUsed: 0, + } + + for (const ctxSize of contextSizes) { + if (ctxSize > targetContext) break + + const kvCacheSize = ctxSize * kvCachePerToken + const availableForLayers = remainingVRAM - kvCacheSize + + if (availableForLayers <= 0) continue + + const possibleLayers = Math.min( + Math.floor(availableForLayers / layerSize), + totalLayers + ) + + if (possibleLayers > 0) { + const totalVramNeeded = possibleLayers * layerSize + kvCacheSize + + // Verify this fits with some margin + if (totalVramNeeded <= remainingVRAM * 0.95) { + bestConfig = { + layers: possibleLayers, + context: ctxSize, + vramUsed: totalVramNeeded, + } + } + } + } + + // Apply the best configuration found + if (bestConfig.layers > 0) { + gpuLayers = bestConfig.layers + maxContextLength = bestConfig.context + noOffloadKVCache = false + mode = gpuLayers === totalLayers ? 
'GPU' : 'Hybrid'
+
+      logger.info(
+        `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
+          `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
+      )
+    } else {
+      // Fallback: Try minimal GPU layers with KV cache on CPU
+      gpuLayers = Math.min(
+        Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
+        totalLayers
+      )
+
+      if (gpuLayers > 0) {
+        // Calculate available system RAM for KV cache
+        const cpuLayers = totalLayers - gpuLayers
+        const modelCPUSize = cpuLayers * layerSize
+        const mmprojCPUSize =
+          mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
+        const systemRAMUsed = modelCPUSize + mmprojCPUSize
+        const availableSystemRAMForKVCache = Math.max(
+          0,
+          usableSystemMemory - systemRAMUsed
+        )
+
+        // Calculate context that fits in system RAM
+        const systemRAMContext = Math.min(
+          Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
+          targetContext
+        )
+
+        if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
+          maxContextLength = systemRAMContext
+          noOffloadKVCache = true
+          mode = 'Hybrid'
+
+          logger.info(
+            `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
+              `${maxContextLength} context on CPU RAM`
+          )
+        } else {
+          // Can't fit a reasonable context even with CPU RAM,
+          // so reduce GPU layers further
+          gpuLayers = Math.floor(gpuLayers / 2)
+          maxContextLength = MIN_CONTEXT_LENGTH
+          noOffloadKVCache = true
+          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+        }
+      } else {
+        // Pure CPU mode
+        gpuLayers = 0
+        noOffloadKVCache = true
+
+        // Calculate context for pure CPU mode
+        const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
+        const availableForKVCache = Math.max(
+          0,
+          usableSystemMemory - totalCPUMemoryNeeded
+        )
+
+        maxContextLength = Math.min(
+          Math.max(
+            MIN_CONTEXT_LENGTH,
+            Math.floor(availableForKVCache / kvCachePerToken)
+          ),
+          targetContext
+        )
+
+        mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
+      }
+    }
+
+    // Safety check: verify total GPU memory usage
+    if (gpuLayers > 0 && !noOffloadKVCache) {
+      let estimatedGPUUsage =
+        gpuLayers * layerSize +
+        maxContextLength * kvCachePerToken +
+        (noOffloadMmproj ? mmprojSize : 0)
+
+      if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
+        logger.warn(
+          `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
+        )
+
+        // Reduce context first, re-estimating usage after each halving
+        while (
+          maxContextLength > MIN_CONTEXT_LENGTH &&
+          estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
+        ) {
+          maxContextLength = Math.floor(maxContextLength / 2)
+          estimatedGPUUsage =
+            gpuLayers * layerSize +
+            maxContextLength * kvCachePerToken +
+            (noOffloadMmproj ? mmprojSize : 0)
+        }
+
+        // If it still doesn't fit, reduce layers and move the KV cache to CPU
+        if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
+          gpuLayers = Math.floor(gpuLayers * 0.7)
+          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+          noOffloadKVCache = true // Move KV cache to CPU
+        }
+      }
+    }
+
+    // Apply user-requested context limit if specified
+    if (requestedCtx && requestedCtx > 0) {
+      maxContextLength = Math.min(maxContextLength, requestedCtx)
+      logger.info(
+        `User requested context: ${requestedCtx}, final: ${maxContextLength}`
+      )
+    }
+
+    // Ensure we never exceed the model's maximum context
+    maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
+
+    // Final validation
+    if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
+      mode = 'Unsupported'
+    }
+
+    // Ensure maxContextLength is valid
+    maxContextLength = isNaN(maxContextLength)
+      ?
MIN_CONTEXT_LENGTH + : Math.max(MIN_CONTEXT_LENGTH, maxContextLength) + + // Log final plan + const mmprojInfo = mmprojPath + ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}` + : '' + + logger.info( + `Final plan for ${path}: gpuLayers=${gpuLayers}/${totalLayers}, ` + + `maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, ` + + `mode=${mode}${mmprojInfo}` + ) + + return { + gpuLayers, + maxContextLength, + noOffloadKVCache, + mode, + noOffloadMmproj, + } + } /** - * estimate KVCache size of from a given metadata - * + * estimate KVCache size from a given metadata */ private async estimateKVCache( meta: Record, @@ -1907,6 +2281,7 @@ export default class llamacpp_extension extends AIEngine { `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}` ) } + let ctxLen: number if (!ctx_size) { ctxLen = Number(meta[`${arch}.context_length`]) @@ -1941,13 +2316,13 @@ export default class llamacpp_extension extends AIEngine { } } - /* - * check the support status of a model by its path (local/remote) + /** + * Check the support status of a model by its path (local/remote) * - * * Returns: - * - "RED" → weights don't fit - * - "YELLOW" → weights fit, KV cache doesn't - * - "GREEN" → both weights + KV cache fit + * Returns: + * - "RED" → weights don't fit in total memory + * - "YELLOW" → weights fit in VRAM but need system RAM, or KV cache doesn't fit + * - "GREEN" → both weights + KV cache fit in VRAM */ async isModelSupported( path: string, @@ -1955,46 +2330,48 @@ export default class llamacpp_extension extends AIEngine { ): Promise<'RED' | 'YELLOW' | 'GREEN'> { try { const modelSize = await this.getModelSize(path) + const memoryInfo = await this.getTotalSystemMemory() + logger.info(`modelSize: ${modelSize}`) - let gguf: GgufMetadata - gguf = await readGgufMetadata(path) + + const gguf = await readGgufMetadata(path) let kvCacheSize: number if (ctx_size) { kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size) } else { kvCacheSize = await this.estimateKVCache(gguf.metadata) } - // total memory consumption = model weights + kvcache + a small buffer for outputs - // output buffer is small so not considering here + + // Total memory consumption = model weights + kvcache const totalRequired = modelSize + kvCacheSize logger.info( `isModelSupported: Total memory requirement: ${totalRequired} for ${path}` ) - let totalMemBytes: number - const devices = await this.getDevices() - if (devices.length > 0) { - // Sum total memory across all GPUs - totalMemBytes = devices - .map((d) => d.mem * 1024 * 1024) - .reduce((a, b) => a + b, 0) - } else { - // CPU fallback - const sys = await getSystemUsage() - totalMemBytes = sys.total_memory * 1024 * 1024 - } // Use 80% of total memory as the usable limit const USABLE_MEMORY_PERCENTAGE = 0.8 - const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE + const usableTotalMemory = + memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE + const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE - // check model size wrt 80% of system memory - if (modelSize > usableMemBytes) { + // Check if model fits in total memory at all + if (modelSize > usableTotalMemory) { return 'RED' - } else if (modelSize + kvCacheSize > usableMemBytes) { - return 'YELLOW' - } else { + } + + // Check if everything fits in VRAM (ideal case) + if (totalRequired <= usableVRAM) { return 'GREEN' } + + // Check if model fits in VRAM but total requirement exceeds VRAM + // OR if total 
requirement fits in total memory but not in VRAM + if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) { + return 'YELLOW' + } + + // If we get here, nothing fits properly + return 'RED' } catch (e) { throw new Error(String(e)) } @@ -2006,39 +2383,42 @@ export default class llamacpp_extension extends AIEngine { async validateGgufFile(filePath: string): Promise<{ isValid: boolean error?: string - metadata?: GgufMetadata + metadata?: any }> { try { logger.info(`Validating GGUF file: ${filePath}`) const metadata = await readGgufMetadata(filePath) - + // Log full metadata for debugging logger.info('Full GGUF metadata:', JSON.stringify(metadata, null, 2)) - + // Check if architecture is 'clip' which is not supported for text generation const architecture = metadata.metadata?.['general.architecture'] logger.info(`Model architecture: ${architecture}`) - + if (architecture === 'clip') { - const errorMessage = 'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.' + const errorMessage = + 'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.' logger.error('CLIP architecture detected:', architecture) return { isValid: false, error: errorMessage, - metadata + metadata, } } - + logger.info('Model validation passed. Architecture:', architecture) return { isValid: true, - metadata + metadata, } } catch (error) { logger.error('Failed to validate GGUF file:', error) return { isValid: false, - error: `Failed to read model metadata: ${error instanceof Error ? error.message : 'Unknown error'}` + error: `Failed to read model metadata: ${ + error instanceof Error ? 
error.message : 'Unknown error' + }`, } } } diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx index a18f5184a..4a1525003 100644 --- a/web-app/src/containers/ModelSetting.tsx +++ b/web-app/src/containers/ModelSetting.tsx @@ -1,5 +1,6 @@ -import { IconSettings } from '@tabler/icons-react' +import { IconSettings, IconLoader } from '@tabler/icons-react' import debounce from 'lodash.debounce' +import { useState } from 'react' import { Sheet, @@ -9,6 +10,7 @@ import { SheetTitle, SheetTrigger, } from '@/components/ui/sheet' +import { Button } from '@/components/ui/button' import { DynamicControllerSetting } from '@/containers/dynamicControllerSetting' import { useModelProvider } from '@/hooks/useModelProvider' import { useServiceHub } from '@/hooks/useServiceHub' @@ -30,11 +32,134 @@ export function ModelSetting({ const { t } = useTranslation() const serviceHub = useServiceHub() + const [isPlanning, setIsPlanning] = useState(false) + // Create a debounced version of stopModel that waits 500ms after the last call const debouncedStopModel = debounce((modelId: string) => { serviceHub.models().stopModel(modelId) }, 500) + const handlePlanModelLoad = async () => { + if (provider.provider !== 'llamacpp') { + console.warn('planModelLoad is only available for llamacpp provider') + return + } + setIsPlanning(true) + try { + // Read the model config to get the actual model path + const modelConfig = await serviceHub.app().readYaml<{ + model_path: string + }>(`llamacpp/models/${model.id}/model.yml`) + + if (modelConfig && modelConfig.model_path) { + const result = await serviceHub + .models() + .planModelLoad(modelConfig.model_path) + + // Apply the recommended settings to the model sequentially to avoid race conditions + const settingsToUpdate: Array<{ + key: string + value: number | boolean + }> = [] + + if (model.settings?.ngl && result.gpuLayers !== undefined) { + settingsToUpdate.push({ key: 'ngl', value: result.gpuLayers }) + } + + if (model.settings?.ctx_len && result.maxContextLength !== undefined) { + settingsToUpdate.push({ + key: 'ctx_len', + value: result.maxContextLength, + }) + } + + if ( + model.settings?.no_kv_offload && + result.noOffloadKVCache !== undefined + ) { + settingsToUpdate.push({ + key: 'no_kv_offload', + value: result.noOffloadKVCache, + }) + } + + // Apply all settings in a single update to avoid race conditions + if (settingsToUpdate.length > 0) { + handleMultipleSettingsChange(settingsToUpdate) + } + } else { + console.warn('No model_path found in config for', model.id) + } + } catch (error) { + console.error('Error calling planModelLoad:', error) + } finally { + setIsPlanning(false) + } + } + + const handleMultipleSettingsChange = ( + settingsToUpdate: Array<{ key: string; value: number | boolean }> + ) => { + if (!provider) return + + // Create a copy of the model with ALL updated settings at once + let updatedModel = { ...model } + + settingsToUpdate.forEach(({ key, value }) => { + const existingSetting = updatedModel.settings?.[key] as ProviderSetting + updatedModel = { + ...updatedModel, + settings: { + ...updatedModel.settings, + [key]: { + ...existingSetting, + controller_props: { + ...existingSetting?.controller_props, + value: value, + }, + } as ProviderSetting, + }, + } + }) + + // Find the model index in the provider's models array + const modelIndex = provider.models.findIndex((m) => m.id === model.id) + + if (modelIndex !== -1) { + // Create a copy of the provider's models array + const updatedModels = 
[...provider.models] + + // Update the specific model in the array + updatedModels[modelIndex] = updatedModel as Model + + // Update the provider with the new models array + updateProvider(provider.provider, { + models: updatedModels, + }) + + // Check if any of the updated settings require a model restart + const requiresRestart = settingsToUpdate.some( + ({ key }) => + key === 'ctx_len' || + key === 'ngl' || + key === 'chat_template' || + key === 'offload_mmproj' + ) + + if (requiresRestart) { + // Check if model is running before stopping it + serviceHub + .models() + .getActiveModels() + .then((activeModels) => { + if (activeModels.includes(model.id)) { + debouncedStopModel(model.id) + } + }) + } + } + } + const handleSettingChange = ( key: string, value: string | boolean | number @@ -72,8 +197,22 @@ export function ModelSetting({ }) // Call debounced stopModel only when updating ctx_len, ngl, chat_template, or offload_mmproj - if (key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || key === 'offload_mmproj') { - debouncedStopModel(model.id) + // and only if the model is currently running + if ( + key === 'ctx_len' || + key === 'ngl' || + key === 'chat_template' || + key === 'offload_mmproj' + ) { + // Check if model is running before stopping it + serviceHub + .models() + .getActiveModels() + .then((activeModels) => { + if (activeModels.includes(model.id)) { + debouncedStopModel(model.id) + } + }) } } } @@ -98,7 +237,36 @@ export function ModelSetting({ {t('common:modelSettings.description')} + + {/* Model Load Planning Section - Only show for llamacpp provider */} + {provider.provider === 'llamacpp' && ( +
+          <div>
+            <div>
+              <div>Optimize Settings</div>
+              <div>
+                Analyze your system and model, then apply optimal loading
+                settings automatically
+              </div>
+            </div>
+            <Button onClick={handlePlanModelLoad} disabled={isPlanning}>
+              {isPlanning ? <IconLoader className="animate-spin" /> : 'Optimize'}
+            </Button>
+          </div>
+        )}
+
{Object.entries(model.settings || {}).map(([key, value]) => { const config = value as ProviderSetting diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts index 86d7f4dba..4d476ae7c 100644 --- a/web-app/src/hooks/useModelProvider.ts +++ b/web-app/src/hooks/useModelProvider.ts @@ -93,7 +93,11 @@ export const useModelProvider = create()( ? legacyModels : models ).find( - (m) => m.id.split(':').slice(0, 2).join(getServiceHub().path().sep()) === model.id + (m) => + m.id + .split(':') + .slice(0, 2) + .join(getServiceHub().path().sep()) === model.id )?.settings || model.settings const existingModel = models.find((m) => m.id === model.id) return { @@ -227,7 +231,7 @@ export const useModelProvider = create()( > } - if (version === 0 && state?.providers) { + if (version <= 1 && state?.providers) { state.providers.forEach((provider) => { // Update cont_batching description for llamacpp provider if (provider.provider === 'llamacpp' && provider.settings) { @@ -270,6 +274,15 @@ export const useModelProvider = create()( }, } } + + if (!model.settings.no_kv_offload) { + model.settings.no_kv_offload = { + ...modelSettings.no_kv_offload, + controller_props: { + ...modelSettings.no_kv_offload.controller_props, + }, + } + } }) } }) @@ -277,7 +290,7 @@ export const useModelProvider = create()( return state }, - version: 1, + version: 2, } ) ) diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts index b4d5164e7..32d05d70c 100644 --- a/web-app/src/lib/predefined.ts +++ b/web-app/src/lib/predefined.ts @@ -144,4 +144,13 @@ export const modelSettings = { type: 'text', }, }, + no_kv_offload: { + key: 'no_kv_offload', + title: 'Disable KV Offload', + description: 'Disable KV cache offload to GPU (if GPU is used).', + controller_type: 'checkbox', + controller_props: { + value: false, + }, + }, } diff --git a/web-app/src/routes/settings/providers/$providerName.tsx b/web-app/src/routes/settings/providers/$providerName.tsx index b24baf5cf..5bcc3de5a 100644 --- a/web-app/src/routes/settings/providers/$providerName.tsx +++ b/web-app/src/routes/settings/providers/$providerName.tsx @@ -1,3 +1,4 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ import { Card, CardItem } from '@/containers/Card' import HeaderPage from '@/containers/HeaderPage' import SettingsMenu from '@/containers/SettingsMenu' @@ -116,22 +117,25 @@ function ProviderDetail() { // Add 'vision' capability if not already present AND if user hasn't manually configured capabilities // Check if model has a custom capabilities config flag - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const hasUserConfiguredCapabilities = (model as any)._userConfiguredCapabilities === true - - if (!capabilities.includes('vision') && !hasUserConfiguredCapabilities) { + + const hasUserConfiguredCapabilities = + (model as any)._userConfiguredCapabilities === true + + if ( + !capabilities.includes('vision') && + !hasUserConfiguredCapabilities + ) { const updatedModels = [...llamacppProvider.models] updatedModels[modelIndex] = { ...model, capabilities: [...capabilities, 'vision'], // Mark this as auto-detected, not user-configured _autoDetectedVision: true, - // eslint-disable-next-line @typescript-eslint/no-explicit-any } as any updateProviderState('llamacpp', { models: updatedModels }) console.log( - `Vision capability auto-added to model after provider refresh: ${importedModelName}` + `Vision capability added to model after provider refresh: ${importedModelName}` ) } } @@ -257,33 +261,36 @@ 
function ProviderDetail() { } } - const handleStartModel = (modelId: string) => { + const handleStartModel = async (modelId: string) => { // Add model to loading state setLoadingModels((prev) => [...prev, modelId]) - if (provider) - // Original: startModel(provider, modelId).then(() => { setActiveModels((prevModels) => [...prevModels, modelId]) }) - serviceHub - .models() - .startModel(provider, modelId) - .then(() => { - // Refresh active models after starting - serviceHub - .models() - .getActiveModels() - .then((models) => setActiveModels(models || [])) - }) - .catch((error) => { - console.error('Error starting model:', error) - if (error && typeof error === 'object' && 'message' in error) { - setModelLoadError(error) - } else { - setModelLoadError(`${error}`) - } - }) - .finally(() => { - // Remove model from loading state - setLoadingModels((prev) => prev.filter((id) => id !== modelId)) - }) + if (provider) { + try { + // Start the model with plan result + await serviceHub.models().startModel(provider, modelId) + + // Refresh active models after starting + serviceHub + .models() + .getActiveModels() + .then((models) => setActiveModels(models || [])) + } catch (error) { + console.error('Error starting model:', error) + if ( + error && + typeof error === 'object' && + 'message' in error && + typeof error.message === 'string' + ) { + setModelLoadError({ message: error.message }) + } else { + setModelLoadError(typeof error === 'string' ? error : `${error}`) + } + } finally { + // Remove model from loading state + setLoadingModels((prev) => prev.filter((id) => id !== modelId)) + } + } } const handleStopModel = (modelId: string) => { diff --git a/web-app/src/services/models/default.ts b/web-app/src/services/models/default.ts index d4322b971..54595d448 100644 --- a/web-app/src/services/models/default.ts +++ b/web-app/src/services/models/default.ts @@ -17,6 +17,7 @@ import type { HuggingFaceRepo, CatalogModel, ModelValidationResult, + ModelPlan, } from './types' // TODO: Replace this with the actual provider later @@ -491,4 +492,47 @@ export class DefaultModelsService implements ModelsService { } } } + + async planModelLoad( + modelPath: string, + requestedCtx?: number + ): Promise { + try { + const engine = this.getEngine('llamacpp') as AIEngine & { + planModelLoad?: ( + path: string, + requestedCtx?: number + ) => Promise + } + + if (engine && typeof engine.planModelLoad === 'function') { + // Get the full absolute path to the model file + const janDataFolderPath = await import('@janhq/core').then((core) => + core.getJanDataFolderPath() + ) + const joinPath = await import('@janhq/core').then( + (core) => core.joinPath + ) + const fullModelPath = await joinPath([janDataFolderPath, modelPath]) + return await engine.planModelLoad(fullModelPath, requestedCtx) + } + + // Fallback if method is not available + console.warn('planModelLoad method not available in llamacpp engine') + return { + gpuLayers: 0, + maxContextLength: 2048, + noOffloadKVCache: true, + mode: 'Unsupported', + } + } catch (error) { + console.error(`Error planning model load for path ${modelPath}:`, error) + return { + gpuLayers: 0, + maxContextLength: 2048, + noOffloadKVCache: true, + mode: 'Unsupported', + } + } + } } diff --git a/web-app/src/services/models/types.ts b/web-app/src/services/models/types.ts index 7d51d8b09..920cbfe81 100644 --- a/web-app/src/services/models/types.ts +++ b/web-app/src/services/models/types.ts @@ -81,10 +81,20 @@ export interface ModelValidationResult { metadata?: GgufMetadata } +export interface 
ModelPlan { + gpuLayers: number + maxContextLength: number + noOffloadKVCache: boolean + mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' +} + export interface ModelsService { fetchModels(): Promise fetchModelCatalog(): Promise - fetchHuggingFaceRepo(repoId: string, hfToken?: string): Promise + fetchHuggingFaceRepo( + repoId: string, + hfToken?: string + ): Promise convertHfRepoToCatalogModel(repo: HuggingFaceRepo): CatalogModel updateModel(model: Partial): Promise pullModel( @@ -107,14 +117,24 @@ export interface ModelsService { getActiveModels(provider?: string): Promise stopModel(model: string, provider?: string): Promise stopAllModels(): Promise - startModel(provider: ProviderObject, model: string): Promise + startModel( + provider: ProviderObject, + model: string + ): Promise isToolSupported(modelId: string): Promise checkMmprojExistsAndUpdateOffloadMMprojSetting( modelId: string, - updateProvider?: (providerName: string, data: Partial) => void, + updateProvider?: ( + providerName: string, + data: Partial + ) => void, getProviderByName?: (providerName: string) => ModelProvider | undefined ): Promise<{ exists: boolean; settingsUpdated: boolean }> checkMmprojExists(modelId: string): Promise - isModelSupported(modelPath: string, ctxSize?: number): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'> + isModelSupported( + modelPath: string, + ctxSize?: number + ): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'> validateGgufFile(filePath: string): Promise -} \ No newline at end of file + planModelLoad(modelPath: string, requestedCtx?: number): Promise +}
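A minimal sketch of how the new ModelPlan can be consumed. It mirrors what ModelSetting.tsx does after calling planModelLoad, but the helper name planToSettings, the import path, and the example model path are assumptions for illustration, not part of this change:

    import type { ModelPlan } from '@/services/models/types'

    // Map a plan onto the model setting keys this PR updates
    // ('ngl', 'ctx_len', 'no_kv_offload').
    function planToSettings(
      plan: ModelPlan
    ): Array<{ key: string; value: number | boolean }> {
      if (plan.mode === 'Unsupported') {
        throw new Error('Model does not fit within the configured memory budget')
      }
      return [
        { key: 'ngl', value: plan.gpuLayers },
        { key: 'ctx_len', value: plan.maxContextLength },
        { key: 'no_kv_offload', value: plan.noOffloadKVCache },
      ]
    }

    // Usage (hypothetical model path):
    // const plan = await serviceHub.models().planModelLoad('llamacpp/models/my-model/model.gguf')
    // planToSettings(plan).forEach(({ key, value }) => handleSettingChange(key, value))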