diff --git a/.github/workflows/template-tauri-build-linux-x64.yml b/.github/workflows/template-tauri-build-linux-x64.yml index 9e30d5627..bd9b38369 100644 --- a/.github/workflows/template-tauri-build-linux-x64.yml +++ b/.github/workflows/template-tauri-build-linux-x64.yml @@ -53,7 +53,7 @@ on: value: ${{ jobs.build-linux-x64.outputs.APPIMAGE_FILE_NAME }} jobs: build-linux-x64: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: DEB_SIG: ${{ steps.packageinfo.outputs.DEB_SIG }} APPIMAGE_SIG: ${{ steps.packageinfo.outputs.APPIMAGE_SIG }} diff --git a/core/src/browser/extensions/engines/AIEngine.ts b/core/src/browser/extensions/engines/AIEngine.ts index 855f6e4dc..af63d9b19 100644 --- a/core/src/browser/extensions/engines/AIEngine.ts +++ b/core/src/browser/extensions/engines/AIEngine.ts @@ -289,11 +289,6 @@ export abstract class AIEngine extends BaseExtension { */ abstract getLoadedModels(): Promise - /** - * Optional method to get the underlying chat client - */ - getChatClient?(sessionId: string): any - /** * Check if a tool is supported by the model * @param modelId diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json index 2bca12c0f..ce5fc62e4 100644 --- a/extensions/llamacpp-extension/settings.json +++ b/extensions/llamacpp-extension/settings.json @@ -96,18 +96,6 @@ "textAlign": "right" } }, - { - "key": "batch_size", - "title": "Batch Size", - "description": "Logical maximum batch size for processing prompts.", - "controllerType": "input", - "controllerProps": { - "value": 2048, - "placeholder": "2048", - "type": "number", - "textAlign": "right" - } - }, { "key": "ubatch_size", "title": "uBatch Size", diff --git a/extensions/llamacpp-extension/src/backend.ts b/extensions/llamacpp-extension/src/backend.ts index 7f5e8a22b..d60ecc138 100644 --- a/extensions/llamacpp-extension/src/backend.ts +++ b/extensions/llamacpp-extension/src/backend.ts @@ -46,7 +46,6 @@ export async function getLocalInstalledBackends(): Promise< } } } - console.debug(local) return local } diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index 8fad4fd87..78e7c04f3 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -37,7 +37,13 @@ import { import { invoke } from '@tauri-apps/api/core' import { getProxyConfig } from './util' import { basename } from '@tauri-apps/api/path' -import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api' +import { + readGgufMetadata, + estimateKVCacheSize, + getModelSize, + isModelSupported, + planModelLoadInternal, +} from '@janhq/tauri-plugin-llamacpp-api' import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api' // Error message constant - matches web-app/src/utils/error.ts @@ -82,6 +88,7 @@ type ModelPlan = { maxContextLength: number noOffloadKVCache: boolean offloadMmproj?: boolean + batchSize: number mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' } @@ -2006,11 +2013,6 @@ export default class llamacpp_extension extends AIEngine { return responseData as EmbeddingResponse } - // Optional method for direct client access - override getChatClient(sessionId: string): any { - throw new Error('method not implemented yet') - } - /** * Check if a tool is supported by the model * Currently read from GGUF chat_template @@ -2073,7 +2075,7 @@ export default class llamacpp_extension extends AIEngine { path: string, meta: Record ): Promise<{ layerSize: number; totalLayers: number }> { - const modelSize = await 
this.getModelSize(path) + const modelSize = await getModelSize(path) const arch = meta['general.architecture'] const totalLayers = Number(meta[`${arch}.block_count`]) + 2 // 1 for lm_head layer and 1 for embedding layer if (!totalLayers) throw new Error('Invalid metadata: block_count not found') @@ -2089,335 +2091,27 @@ export default class llamacpp_extension extends AIEngine { /^\/\/[^/]+/.test(norm) // UNC path //server/share ) } - + /* + * if (!this.isAbsolutePath(path)) + path = await joinPath([await getJanDataFolderPath(), path]) + if (mmprojPath && !this.isAbsolutePath(mmprojPath)) + mmprojPath = await joinPath([await getJanDataFolderPath(), path]) + */ async planModelLoad( path: string, mmprojPath?: string, requestedCtx?: number ): Promise { - if (!this.isAbsolutePath(path)) + if (!this.isAbsolutePath(path)) { path = await joinPath([await getJanDataFolderPath(), path]) + } if (mmprojPath && !this.isAbsolutePath(mmprojPath)) mmprojPath = await joinPath([await getJanDataFolderPath(), path]) - const modelSize = await this.getModelSize(path) - const memoryInfo = await this.getTotalSystemMemory() - const gguf = await readGgufMetadata(path) - - // Get mmproj size if provided - let mmprojSize = 0 - if (mmprojPath) { - mmprojSize = await this.getModelSize(mmprojPath) - } - - const { layerSize, totalLayers } = await this.getLayerSize( - path, - gguf.metadata - ) - - const kvCachePerToken = (await this.estimateKVCache(gguf.metadata)) - .perTokenSize - - logger.info( - `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}` - ) - - // Validate critical values - if (!modelSize || modelSize <= 0) { - throw new Error(`Invalid model size: ${modelSize}`) - } - if (!kvCachePerToken || kvCachePerToken <= 0) { - throw new Error(`Invalid KV cache per token: ${kvCachePerToken}`) - } - if (!layerSize || layerSize <= 0) { - throw new Error(`Invalid layer size: ${layerSize}`) - } - - // Reserve memory for OS, other applications, and fixed engine overhead. - const VRAM_RESERVE_GB = 0.5 - const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024 - const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc. - - // Get model's maximum context length - const arch = gguf.metadata['general.architecture'] - const modelMaxContextLength = - Number(gguf.metadata[`${arch}.context_length`]) || 8192 - - const MIN_CONTEXT_LENGTH = 1024 - - // Memory percentages applied to both VRAM and RAM - const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 } - - logger.info( - `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}` - ) - - if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) { - throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`) - } - if (!memoryInfo.totalVRAM || isNaN(memoryInfo.totalVRAM)) { - throw new Error(`Invalid total VRAM: ${memoryInfo.totalVRAM}`) - } - if (!this.memoryMode || !(this.memoryMode in memoryPercentages)) { - throw new Error( - `Invalid memory mode: ${this.memoryMode}. 
Must be 'high', 'medium', or 'low'` - ) - } - - // Apply memory mode to both VRAM and RAM separately - const memoryModeMultiplier = memoryPercentages[this.memoryMode] - const usableVRAM = Math.max( - 0, - memoryInfo.totalVRAM * memoryModeMultiplier - - VRAM_RESERVE_BYTES - - ENGINE_FIXED_OVERHEAD_BYTES - ) - - const actualSystemRAM = Math.max(0, memoryInfo.totalRAM) - const usableSystemMemory = actualSystemRAM * memoryModeMultiplier - - logger.info( - `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}` - ) - - let gpuLayers = 0 - let maxContextLength = 0 - let noOffloadKVCache = false - let mode: ModelPlan['mode'] = 'Unsupported' - let offloadMmproj = false - - let remainingVRAM = usableVRAM - if (mmprojSize > 0 && mmprojSize <= remainingVRAM) { - offloadMmproj = true - remainingVRAM -= mmprojSize - } - const vramForMinContext = ( - await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH) - ).size - - const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize) - if (ramForModel + vramForMinContext > usableSystemMemory + usableVRAM) { - logger.error( - `Model unsupported. Not enough resources for model and min context.` - ) - return { - gpuLayers: 0, - maxContextLength: 0, - noOffloadKVCache: true, - mode: 'Unsupported', - offloadMmproj: false, - } - } - - const targetContext = Math.min( - requestedCtx || modelMaxContextLength, - modelMaxContextLength - ) - - let targetContextSize = ( - await this.estimateKVCache(gguf.metadata, targetContext) - ).size - - // Use `kvCachePerToken` for all VRAM calculations - if (modelSize + targetContextSize <= remainingVRAM) { - mode = 'GPU' - gpuLayers = totalLayers - maxContextLength = targetContext - noOffloadKVCache = false - logger.info( - 'Planning: Ideal case fits. All layers and target context in VRAM.' - ) - } else if (modelSize <= remainingVRAM) { - mode = 'GPU' - gpuLayers = totalLayers - noOffloadKVCache = false - const vramLeftForContext = remainingVRAM - modelSize - maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken) - - // Add safety check to prevent OOM - const safetyBuffer = 0.9 // Use 90% of calculated context to be safe - maxContextLength = Math.floor(maxContextLength * safetyBuffer) - - logger.info( - `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}` - ) - } else { - const vramAvailableForLayers = remainingVRAM - vramForMinContext - - if (vramAvailableForLayers >= layerSize) { - mode = 'Hybrid' - gpuLayers = Math.min( - Math.floor(vramAvailableForLayers / layerSize), - totalLayers - ) - noOffloadKVCache = false - const vramUsedByLayers = gpuLayers * layerSize - const vramLeftForContext = remainingVRAM - vramUsedByLayers - maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken) - - logger.info( - 'Planning: Hybrid mode. Offloading layers to fit context in VRAM.' - ) - } - } - - // Fallback logic: try different configurations if no VRAM-based plan worked - if (mode === 'Unsupported') { - logger.info('Planning: Trying fallback configurations...') - - // Try putting some layers on GPU with KV cache in RAM - const possibleGpuLayers = Math.floor(remainingVRAM / layerSize) - if (possibleGpuLayers > 0) { - gpuLayers = Math.min(possibleGpuLayers, totalLayers) - const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize - const ramUsedByMmproj = !offloadMmproj ? 
mmprojSize : 0 - const availableRamForKv = - usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj) - // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific - const contextInRam = Math.floor(availableRamForKv / kvCachePerToken) - - if (contextInRam >= MIN_CONTEXT_LENGTH) { - mode = 'Hybrid' - maxContextLength = contextInRam - noOffloadKVCache = true - logger.info( - `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}` - ) - } - } - - // If still unsupported, try pure CPU mode - if (mode === 'Unsupported') { - gpuLayers = 0 - noOffloadKVCache = true - offloadMmproj = false - const ramUsedByModel = modelSize + mmprojSize - const availableRamForKv = usableSystemMemory - ramUsedByModel - maxContextLength = Math.floor(availableRamForKv / kvCachePerToken) - if (maxContextLength >= MIN_CONTEXT_LENGTH) { - mode = 'CPU' - logger.info(`Planning: CPU mode - Context: ${maxContextLength}`) - } - } - } - - if (mode === 'CPU' || noOffloadKVCache) { - offloadMmproj = false - } - - if (requestedCtx && requestedCtx > 0) { - maxContextLength = Math.min(maxContextLength, requestedCtx) - } - - maxContextLength = Math.min(maxContextLength, modelMaxContextLength) - - if (maxContextLength < MIN_CONTEXT_LENGTH) { - mode = 'Unsupported' - } - - if (mode === 'Unsupported') { - gpuLayers = 0 - maxContextLength = 0 - } - - maxContextLength = isNaN(maxContextLength) - ? 0 - : Math.floor(maxContextLength) - - const mmprojInfo = mmprojPath - ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed( - 2 - )}MB, offloadMmproj=${offloadMmproj}` - : '' - - logger.info( - `Final plan for ${path}: gpuLayers=${gpuLayers}/${totalLayers}, ` + - `maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, ` + - `mode=${mode}${mmprojInfo}` - ) - - return { - gpuLayers, - maxContextLength, - noOffloadKVCache, - mode, - offloadMmproj, - } - } - /** - * estimate KVCache size from a given metadata - */ - private async estimateKVCache( - meta: Record, - ctx_size?: number - ): Promise<{ size: number; perTokenSize: number }> { - const arch = meta['general.architecture'] - if (!arch) throw new Error('Invalid metadata: architecture not found') - - const nLayer = Number(meta[`${arch}.block_count`]) - if (!nLayer) throw new Error('Invalid metadata: block_count not found') - - const nHead = Number(meta[`${arch}.attention.head_count`]) - if (!nHead) throw new Error('Invalid metadata: head_count not found') - - // Try to get key/value lengths first (more accurate) - const keyLen = Number(meta[`${arch}.attention.key_length`]) - const valLen = Number(meta[`${arch}.attention.value_length`]) - - let headDim: number - - if (keyLen && valLen) { - // Use explicit key/value lengths if available - logger.info( - `Using explicit key_length: ${keyLen}, value_length: ${valLen}` - ) - headDim = keyLen + valLen - } else { - // Fall back to embedding_length estimation - const embeddingLen = Number(meta[`${arch}.embedding_length`]) - if (!embeddingLen) - throw new Error('Invalid metadata: embedding_length not found') - - // Standard transformer: head_dim = embedding_dim / num_heads - // For KV cache: we need both K and V, so 2 * head_dim per head - headDim = (embeddingLen / nHead) * 2 - logger.info( - `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}` - ) - } - - const maxCtx = Number(meta[`${arch}.context_length`]) - if (!maxCtx) throw new Error('Invalid metadata: context_length not found') - - // b) If the user supplied a value, clamp it 
to the model's max - let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx - - logger.info(`Final context length used for KV size: ${ctxLen}`) - logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`) - - logger.info(`ctxLen: ${ctxLen}`) - logger.info(`nLayer: ${nLayer}`) - logger.info(`nHead: ${nHead}`) - logger.info(`headDim: ${headDim}`) - - // Consider f16 by default - // Can be extended by checking cache-type-v and cache-type-k - // but we are checking overall compatibility with the default settings - // fp16 = 8 bits * 2 = 16 - const bytesPerElement = 2 - - // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer - const kvPerToken = nHead * headDim * bytesPerElement * nLayer - - return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken } - } - - private async getModelSize(path: string): Promise { - if (path.startsWith('https://')) { - const res = await fetch(path, { method: 'HEAD' }) - const len = res.headers.get('content-length') - return len ? parseInt(len, 10) : 0 - } else { - return (await fs.fileStat(path)).size + try { + const result = await planModelLoadInternal(path, this.memoryMode, mmprojPath, requestedCtx) + return result + } catch (e) { + throw new Error(String(e)) } } @@ -2431,50 +2125,11 @@ export default class llamacpp_extension extends AIEngine { */ async isModelSupported( path: string, - ctx_size?: number + ctxSize?: number ): Promise<'RED' | 'YELLOW' | 'GREEN'> { try { - const modelSize = await this.getModelSize(path) - const memoryInfo = await this.getTotalSystemMemory() - - logger.info(`modelSize: ${modelSize}`) - - const gguf = await readGgufMetadata(path) - let kvCacheSize: number - if (ctx_size) { - kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size - } else { - kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size - } - - // Total memory consumption = model weights + kvcache - const totalRequired = modelSize + kvCacheSize - logger.info( - `isModelSupported: Total memory requirement: ${totalRequired} for ${path}` - ) - - // Use 80% of total memory as the usable limit - const USABLE_MEMORY_PERCENTAGE = 0.9 - const usableTotalMemory = - memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE + - memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE - const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE - - // Check if model fits in total memory at all (this is the hard limit) - if (totalRequired > usableTotalMemory) { - return 'RED' // Truly impossible to run - } - - // Check if everything fits in VRAM (ideal case) - if (totalRequired <= usableVRAM) { - return 'GREEN' - } - - // If we get here, it means: - // - Total requirement fits in combined memory - // - But doesn't fit entirely in VRAM - // This is the CPU-GPU hybrid scenario - return 'YELLOW' + const result = await isModelSupported(path, Number(ctxSize)) + return result } catch (e) { throw new Error(String(e)) } diff --git a/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs b/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs index 8f0427a6b..228a3731e 100644 --- a/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs +++ b/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs @@ -15,6 +15,8 @@ use tauri::Runtime; static SYSTEM_INFO: OnceLock = OnceLock::new(); +pub use commands::get_system_info; + /// Initialize the hardware plugin pub fn init() -> tauri::plugin::TauriPlugin { tauri::plugin::Builder::new("hardware") diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml 
b/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml index fd58f6225..e1a57b962 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml +++ b/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml @@ -24,6 +24,7 @@ tauri = { version = "2.5.0", default-features = false, features = [] } thiserror = "2.0.12" tokio = { version = "1", features = ["full"] } reqwest = { version = "0.11", features = ["json", "blocking", "stream"] } +tauri-plugin-hardware = { path = "../tauri-plugin-hardware" } # Unix-specific dependencies [target.'cfg(unix)'.dependencies] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/build.rs b/src-tauri/plugins/tauri-plugin-llamacpp/build.rs index ca32eb4d5..93c0f405b 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/build.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/build.rs @@ -14,6 +14,10 @@ const COMMANDS: &[&str] = &[ "get_session_by_model", // GGUF commands "read_gguf_metadata", + "estimate_kv_cache_size", + "get_model_size", + "is_model_supported", + "plan_model_load" ]; fn main() { diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts b/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts index 0380e4fe7..957839a63 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts +++ b/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts @@ -2,28 +2,28 @@ import { invoke } from '@tauri-apps/api/core' // Types export interface SessionInfo { - pid: number; - port: number; - model_id: string; - model_path: string; - api_key: string; + pid: number + port: number + model_id: string + model_path: string + api_key: string } export interface DeviceInfo { - id: string; - name: string; - memory: number; + id: string + name: string + memory: number } export interface GgufMetadata { - version: number; - tensor_count: number; - metadata: Record; + version: number + tensor_count: number + metadata: Record } // Cleanup commands export async function cleanupLlamaProcesses(): Promise { - return await invoke('plugin:llamacpp|cleanup_llama_processes'); + return await invoke('plugin:llamacpp|cleanup_llama_processes') } // LlamaCpp server commands @@ -35,12 +35,12 @@ export async function loadLlamaModel( return await invoke('plugin:llamacpp|load_llama_model', { backendPath, libraryPath, - args - }); + args, + }) } export async function unloadLlamaModel(pid: number): Promise { - return await invoke('plugin:llamacpp|unload_llama_model', { pid }); + return await invoke('plugin:llamacpp|unload_llama_model', { pid }) } export async function getDevices( @@ -49,8 +49,8 @@ export async function getDevices( ): Promise { return await invoke('plugin:llamacpp|get_devices', { backendPath, - libraryPath - }); + libraryPath, + }) } export async function generateApiKey( @@ -59,35 +59,84 @@ export async function generateApiKey( ): Promise { return await invoke('plugin:llamacpp|generate_api_key', { modelId, - apiSecret - }); + apiSecret, + }) } export async function isProcessRunning(pid: number): Promise { - return await invoke('plugin:llamacpp|is_process_running', { pid }); + return await invoke('plugin:llamacpp|is_process_running', { pid }) } export async function getRandomPort(): Promise { - return await invoke('plugin:llamacpp|get_random_port'); + return await invoke('plugin:llamacpp|get_random_port') } -export async function findSessionByModel(modelId: string): Promise { - return await invoke('plugin:llamacpp|find_session_by_model', { modelId }); +export async function findSessionByModel( + modelId: string +): Promise { + return await 
invoke('plugin:llamacpp|find_session_by_model', { modelId }) } export async function getLoadedModels(): Promise { - return await invoke('plugin:llamacpp|get_loaded_models'); + return await invoke('plugin:llamacpp|get_loaded_models') } export async function getAllSessions(): Promise { - return await invoke('plugin:llamacpp|get_all_sessions'); + return await invoke('plugin:llamacpp|get_all_sessions') } -export async function getSessionByModel(modelId: string): Promise { - return await invoke('plugin:llamacpp|get_session_by_model', { modelId }); +export async function getSessionByModel( + modelId: string +): Promise { + return await invoke('plugin:llamacpp|get_session_by_model', { modelId }) } // GGUF commands export async function readGgufMetadata(path: string): Promise { - return await invoke('plugin:llamacpp|read_gguf_metadata', { path }); + return await invoke('plugin:llamacpp|read_gguf_metadata', { path }) +} + +export async function estimateKVCacheSize( + meta: Record, + ctxSize?: number +): Promise<{ size: number; per_token_size: number }> { + return await invoke('plugin:llamacpp|estimate_kv_cache_size', { + meta, + ctxSize, + }) +} + +export async function getModelSize(path: string): Promise { + return await invoke('plugin:llamacpp|get_model_size', { path }) +} + +export async function isModelSupported( + path: string, + ctxSize?: number +): Promise<'RED' | 'YELLOW' | 'GREEN'> { + return await invoke('plugin:llamacpp|is_model_supported', { + path, + ctxSize, + }) +} + +export async function planModelLoadInternal( + path: string, + memoryMode: string, + mmprojPath?: string, + requestedContext?: number +): Promise<{ + gpuLayers: number + maxContextLength: number + noOffloadKVCache: boolean + offloadMmproj?: boolean + batchSize: number + mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' +}> { + return await invoke('plugin:llamacpp|plan_model_load', { + path, + memoryMode, + mmprojPath, + requestedContext, + }) } diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/estimate_kv_cache_size.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/estimate_kv_cache_size.toml new file mode 100644 index 000000000..39d419214 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/estimate_kv_cache_size.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-estimate-kv-cache-size" +description = "Enables the estimate_kv_cache_size command without any pre-configured scope." +commands.allow = ["estimate_kv_cache_size"] + +[[permission]] +identifier = "deny-estimate-kv-cache-size" +description = "Denies the estimate_kv_cache_size command without any pre-configured scope." +commands.deny = ["estimate_kv_cache_size"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/get_model_size.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/get_model_size.toml new file mode 100644 index 000000000..7ba81c38e --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/get_model_size.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-get-model-size" +description = "Enables the get_model_size command without any pre-configured scope." 
+commands.allow = ["get_model_size"] + +[[permission]] +identifier = "deny-get-model-size" +description = "Denies the get_model_size command without any pre-configured scope." +commands.deny = ["get_model_size"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/is_model_supported.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/is_model_supported.toml new file mode 100644 index 000000000..e9c5ad23f --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/is_model_supported.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-is-model-supported" +description = "Enables the is_model_supported command without any pre-configured scope." +commands.allow = ["is_model_supported"] + +[[permission]] +identifier = "deny-is-model-supported" +description = "Denies the is_model_supported command without any pre-configured scope." +commands.deny = ["is_model_supported"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/plan_model_load.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/plan_model_load.toml new file mode 100644 index 000000000..a1a315967 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/plan_model_load.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-plan-model-load" +description = "Enables the plan_model_load command without any pre-configured scope." +commands.allow = ["plan_model_load"] + +[[permission]] +identifier = "deny-plan-model-load" +description = "Denies the plan_model_load command without any pre-configured scope." +commands.deny = ["plan_model_load"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md index 898cfe530..faab24f25 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md @@ -16,6 +16,10 @@ Default permissions for the llamacpp plugin - `allow-get-all-sessions` - `allow-get-session-by-model` - `allow-read-gguf-metadata` +- `allow-estimate-kv-cache-size` +- `allow-get-model-size` +- `allow-is-model-supported` +- `allow-plan-model-load` ## Permission Table @@ -55,6 +59,32 @@ Denies the cleanup_llama_processes command without any pre-configured scope. +`llamacpp:allow-estimate-kv-cache-size` + + + + +Enables the estimate_kv_cache_size command without any pre-configured scope. + + + + + + + +`llamacpp:deny-estimate-kv-cache-size` + + + + +Denies the estimate_kv_cache_size command without any pre-configured scope. + + + + + + + `llamacpp:allow-find-session-by-model` @@ -185,6 +215,32 @@ Denies the get_loaded_models command without any pre-configured scope. +`llamacpp:allow-get-model-size` + + + + +Enables the get_model_size command without any pre-configured scope. + + + + + + + +`llamacpp:deny-get-model-size` + + + + +Denies the get_model_size command without any pre-configured scope. + + + + + + + `llamacpp:allow-get-random-port` @@ -237,6 +293,32 @@ Denies the get_session_by_model command without any pre-configured scope. 
+`llamacpp:allow-is-model-supported` + + + + +Enables the is_model_supported command without any pre-configured scope. + + + + + + + +`llamacpp:deny-is-model-supported` + + + + +Denies the is_model_supported command without any pre-configured scope. + + + + + + + `llamacpp:allow-is-process-running` @@ -289,6 +371,32 @@ Denies the load_llama_model command without any pre-configured scope. +`llamacpp:allow-plan-model-load` + + + + +Enables the plan_model_load command without any pre-configured scope. + + + + + + + +`llamacpp:deny-plan-model-load` + + + + +Denies the plan_model_load command without any pre-configured scope. + + + + + + + `llamacpp:allow-read-gguf-metadata` diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml index 08339b766..91938e047 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml @@ -3,10 +3,10 @@ description = "Default permissions for the llamacpp plugin" permissions = [ # Cleanup commands "allow-cleanup-llama-processes", - + # LlamaCpp server commands "allow-load-llama-model", - "allow-unload-llama-model", + "allow-unload-llama-model", "allow-get-devices", "allow-generate-api-key", "allow-is-process-running", @@ -15,7 +15,11 @@ permissions = [ "allow-get-loaded-models", "allow-get-all-sessions", "allow-get-session-by-model", - + # GGUF commands - "allow-read-gguf-metadata" + "allow-read-gguf-metadata", + "allow-estimate-kv-cache-size", + "allow-get-model-size", + "allow-is-model-supported", + "allow-plan-model-load" ] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json index 70ccaf6f7..0dd243ff0 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json @@ -306,6 +306,18 @@ "const": "deny-cleanup-llama-processes", "markdownDescription": "Denies the cleanup_llama_processes command without any pre-configured scope." }, + { + "description": "Enables the estimate_kv_cache_size command without any pre-configured scope.", + "type": "string", + "const": "allow-estimate-kv-cache-size", + "markdownDescription": "Enables the estimate_kv_cache_size command without any pre-configured scope." + }, + { + "description": "Denies the estimate_kv_cache_size command without any pre-configured scope.", + "type": "string", + "const": "deny-estimate-kv-cache-size", + "markdownDescription": "Denies the estimate_kv_cache_size command without any pre-configured scope." + }, { "description": "Enables the find_session_by_model command without any pre-configured scope.", "type": "string", @@ -366,6 +378,18 @@ "const": "deny-get-loaded-models", "markdownDescription": "Denies the get_loaded_models command without any pre-configured scope." }, + { + "description": "Enables the get_model_size command without any pre-configured scope.", + "type": "string", + "const": "allow-get-model-size", + "markdownDescription": "Enables the get_model_size command without any pre-configured scope." + }, + { + "description": "Denies the get_model_size command without any pre-configured scope.", + "type": "string", + "const": "deny-get-model-size", + "markdownDescription": "Denies the get_model_size command without any pre-configured scope." 
+ }, { "description": "Enables the get_random_port command without any pre-configured scope.", "type": "string", @@ -390,6 +414,18 @@ "const": "deny-get-session-by-model", "markdownDescription": "Denies the get_session_by_model command without any pre-configured scope." }, + { + "description": "Enables the is_model_supported command without any pre-configured scope.", + "type": "string", + "const": "allow-is-model-supported", + "markdownDescription": "Enables the is_model_supported command without any pre-configured scope." + }, + { + "description": "Denies the is_model_supported command without any pre-configured scope.", + "type": "string", + "const": "deny-is-model-supported", + "markdownDescription": "Denies the is_model_supported command without any pre-configured scope." + }, { "description": "Enables the is_process_running command without any pre-configured scope.", "type": "string", @@ -414,6 +450,18 @@ "const": "deny-load-llama-model", "markdownDescription": "Denies the load_llama_model command without any pre-configured scope." }, + { + "description": "Enables the plan_model_load command without any pre-configured scope.", + "type": "string", + "const": "allow-plan-model-load", + "markdownDescription": "Enables the plan_model_load command without any pre-configured scope." + }, + { + "description": "Denies the plan_model_load command without any pre-configured scope.", + "type": "string", + "const": "deny-plan-model-load", + "markdownDescription": "Denies the plan_model_load command without any pre-configured scope." + }, { "description": "Enables the read_gguf_metadata command without any pre-configured scope.", "type": "string", @@ -439,10 +487,10 @@ "markdownDescription": "Denies the unload_llama_model command without any pre-configured scope." 
}, { - "description": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`", + "description": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`\n- `allow-estimate-kv-cache-size`\n- `allow-get-model-size`\n- `allow-is-model-supported`\n- `allow-plan-model-load`", "type": "string", "const": "default", - "markdownDescription": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`" + "markdownDescription": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`\n- `allow-estimate-kv-cache-size`\n- `allow-get-model-size`\n- `allow-is-model-supported`\n- `allow-plan-model-load`" } ] } diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs index ae38f56f3..c636fa8bd 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs @@ -1,58 +1,141 @@ -use super::helpers; use super::types::GgufMetadata; -use reqwest; -use std::fs::File; -use std::io::BufReader; - +use super::utils::{estimate_kv_cache_internal, read_gguf_metadata_internal}; +use crate::gguf::types::{KVCacheError, KVCacheEstimate, ModelSupportStatus}; +use std::collections::HashMap; +use std::fs; +use tauri::Runtime; +use tauri_plugin_hardware::get_system_info; /// Read GGUF metadata from a model file #[tauri::command] pub async fn read_gguf_metadata(path: String) -> Result { - if path.starts_with("http://") || path.starts_with("https://") { - // Remote: read in 2MB chunks until successful + return read_gguf_metadata_internal(path).await; +} + +#[tauri::command] +pub async fn estimate_kv_cache_size( + meta: HashMap, + ctx_size: Option, +) -> Result { + estimate_kv_cache_internal(meta, ctx_size).await +} + +#[tauri::command] +pub async fn get_model_size(path: String) -> Result { + if path.starts_with("https://") { + // Handle remote URL let client = reqwest::Client::new(); - let chunk_size = 2 * 1024 * 1024; // Fixed 2MB chunks - let max_total_size 
= 120 * 1024 * 1024; // Don't exceed 120MB total - let mut total_downloaded = 0; - let mut accumulated_data = Vec::new(); + let response = client + .head(&path) + .send() + .await + .map_err(|e| format!("Failed to fetch HEAD request: {}", e))?; - while total_downloaded < max_total_size { - let start = total_downloaded; - let end = std::cmp::min(start + chunk_size - 1, max_total_size - 1); - - let resp = client - .get(&path) - .header("Range", format!("bytes={}-{}", start, end)) - .send() - .await - .map_err(|e| format!("Failed to fetch chunk {}-{}: {}", start, end, e))?; - - let chunk_data = resp - .bytes() - .await - .map_err(|e| format!("Failed to read chunk response: {}", e))?; - - accumulated_data.extend_from_slice(&chunk_data); - total_downloaded += chunk_data.len(); - - // Try parsing after each chunk - let cursor = std::io::Cursor::new(&accumulated_data); - if let Ok(metadata) = helpers::read_gguf_metadata(cursor) { - return Ok(metadata); - } - - // If we got less data than expected, we've reached EOF - if chunk_data.len() < chunk_size { - break; - } + if let Some(content_length) = response.headers().get("content-length") { + let content_length_str = content_length + .to_str() + .map_err(|e| format!("Invalid content-length header: {}", e))?; + content_length_str + .parse::() + .map_err(|e| format!("Failed to parse content-length: {}", e)) + } else { + Ok(0) } - Err("Could not parse GGUF metadata from downloaded data".to_string()) } else { - // Local: use streaming file reader - let file = - File::open(&path).map_err(|e| format!("Failed to open local file {}: {}", path, e))?; - let reader = BufReader::new(file); - - helpers::read_gguf_metadata(reader) - .map_err(|e| format!("Failed to parse GGUF metadata: {}", e)) + // Handle local file using standard fs + let metadata = + fs::metadata(&path).map_err(|e| format!("Failed to get file metadata: {}", e))?; + Ok(metadata.len()) } } + +#[tauri::command] +pub async fn is_model_supported( + path: String, + ctx_size: Option, + app_handle: tauri::AppHandle, +) -> Result { + // Get model size + let model_size = get_model_size(path.clone()).await?; + + // Get system info + let system_info = get_system_info(app_handle.clone()); + + log::info!("modelSize: {}", model_size); + + // Read GGUF metadata + let gguf = read_gguf_metadata(path.clone()).await?; + + // Calculate KV cache size + let kv_cache_size = if let Some(ctx_size) = ctx_size { + log::info!("Using ctx_size: {}", ctx_size); + estimate_kv_cache_internal(gguf.metadata, Some(ctx_size as u64)) + .await + .map_err(|e| e.to_string())? + .size + } else { + estimate_kv_cache_internal(gguf.metadata, None) + .await + .map_err(|e| e.to_string())? 
+ .size + }; + + // Total memory consumption = model weights + kvcache + let total_required = model_size + kv_cache_size; + log::info!( + "isModelSupported: Total memory requirement: {} for {}; Got kvCacheSize: {} from BE", + total_required, + path, + kv_cache_size + ); + + const RESERVE_BYTES: u64 = 2288490189; + let total_system_memory = system_info.total_memory * 1024 * 1024; + // Calculate total VRAM from all GPUs + let total_vram: u64 = if system_info.gpus.is_empty() { + // On macOS with unified memory, GPU info may be empty + // Use total RAM as VRAM since memory is shared + log::info!("No GPUs detected (likely unified memory system), using total RAM as VRAM"); + total_system_memory + } else { + system_info + .gpus + .iter() + .map(|g| g.total_memory * 1024 * 1024) + .sum::() + }; + + log::info!("Total VRAM reported/calculated (in bytes): {}", &total_vram); + + let usable_vram = if total_vram > RESERVE_BYTES { + total_vram - RESERVE_BYTES + } else { + 0 + }; + + let usable_total_memory = if total_system_memory > RESERVE_BYTES { + (total_system_memory - RESERVE_BYTES) + usable_vram + } else { + 0 + }; + log::info!("System RAM: {} bytes", &total_system_memory); + log::info!("Total VRAM: {} bytes", &total_vram); + log::info!("Usable total memory: {} bytes", &usable_total_memory); + log::info!("Usable VRAM: {} bytes", &usable_vram); + log::info!("Required: {} bytes", &total_required); + + // Check if model fits in total memory at all (this is the hard limit) + if total_required > usable_total_memory { + return Ok(ModelSupportStatus::Red); // Truly impossible to run + } + + // Check if everything fits in VRAM (ideal case) + if total_required <= usable_vram { + return Ok(ModelSupportStatus::Green); + } + + // If we get here, it means: + // - Total requirement fits in combined memory + // - But doesn't fit entirely in VRAM + // This is the CPU-GPU hybrid scenario + Ok(ModelSupportStatus::Yellow) +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs index 44fa1911f..935b17b83 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs @@ -1,3 +1,5 @@ pub mod commands; pub mod helpers; pub mod types; +pub mod utils; +pub mod model_planner; diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/model_planner.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/model_planner.rs new file mode 100644 index 000000000..118894871 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/model_planner.rs @@ -0,0 +1,318 @@ +use crate::gguf::commands::get_model_size; +use crate::gguf::utils::estimate_kv_cache_internal; +use crate::gguf::utils::read_gguf_metadata_internal; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use tauri::Runtime; +use tauri_plugin_hardware::get_system_info; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct ModelPlan { + pub gpu_layers: u64, + pub max_context_length: u64, + pub no_offload_kv_cache: bool, + pub offload_mmproj: bool, + pub batch_size: u64, + pub mode: ModelMode, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "UPPERCASE")] +pub enum ModelMode { + GPU, + Hybrid, + CPU, + Unsupported, +} + +#[tauri::command] +pub async fn plan_model_load( + path: String, + memory_mode: String, + mmproj_path: Option, + requested_ctx: Option, + app: tauri::AppHandle, +) -> Result { + let model_size = 
get_model_size(path.clone()).await?; + let sys_info = get_system_info(app.clone()); + let gguf = read_gguf_metadata_internal(path.clone()).await?; + + let mut mmproj_size: u64 = 0; + if let Some(ref mmproj) = mmproj_path { + mmproj_size = get_model_size(mmproj.clone()).await?; + } + + let arch = gguf + .metadata + .get("general.architecture") + .ok_or("Missing architecture")?; + let repeating_layers: u64 = gguf + .metadata + .get(&format!("{arch}.block_count")) + .ok_or("Missing block_count")? + .parse() + .map_err(|_| "Invalid block_count")?; + let total_layers = repeating_layers + 1; + let layer_size = model_size / total_layers; + + let kv_cache = estimate_kv_cache_internal(gguf.metadata.clone(), None) + .await + .map_err(|e| e.to_string())?; + let kv_cache_per_token = kv_cache.per_token_size; + + if model_size == 0 || layer_size == 0 || kv_cache_per_token == 0 { + return Err("Invalid model/layer/cache sizes".into()); + } + + const RESERVE_BYTES: u64 = 2288490189; + const MIN_CONTEXT_LENGTH: u64 = 2048; + + let model_max_ctx: u64 = gguf + .metadata + .get(&format!("{arch}.context_length")) + .and_then(|s| s.parse().ok()) + .unwrap_or(8192); + + let memory_percentages = HashMap::from([("high", 0.7), ("medium", 0.5), ("low", 0.4)]); + + let multiplier = *memory_percentages + .get(memory_mode.as_str()) + .ok_or("Invalid memory mode")?; + + log::info!("Got GPUs:\n{:?}", &sys_info.gpus); + + let total_ram: u64 = sys_info.total_memory * 1024 * 1024; + log::info!( + "Total system memory reported from tauri_plugin_hardware(in bytes): {}", + &total_ram + ); + + let total_vram: u64 = if sys_info.gpus.is_empty() { + // On macOS with unified memory, GPU info may be empty + // Use total RAM as VRAM since memory is shared + log::info!("No GPUs detected (likely unified memory system), using total RAM as VRAM"); + total_ram + } else { + sys_info + .gpus + .iter() + .map(|g| g.total_memory * 1024 * 1024) + .sum::() + }; + + log::info!("Total VRAM reported/calculated (in bytes): {}", &total_vram); + let usable_vram: u64 = if total_vram > RESERVE_BYTES { + (((total_vram - RESERVE_BYTES) as f64) * multiplier) as u64 + } else { + 0 + }; + log::info!("Usable vram calculated: {}", &usable_vram); + + let usable_ram: u64 = if total_ram > RESERVE_BYTES { + (((total_ram - RESERVE_BYTES) as f64) * multiplier).max(0.0) as u64 + } else { + 0 + }; + log::info!("Usable ram calculated (in bytes): {}", &usable_ram); + + let mut gpu_layers = 0; + let mut max_ctx_len = 0; + let mut no_offload_kv_cache = false; + let mut mode = ModelMode::Unsupported; + let mut offload_mmproj = false; + let mut batch_size = 2048; + + let total_available_mem = usable_vram.saturating_add(usable_ram); + if model_size + mmproj_size > total_available_mem { + log::info!("Model not supported in this system!"); + return Ok(ModelPlan { + gpu_layers: 0, + max_context_length: 0, + no_offload_kv_cache: true, + batch_size: 64, + mode: ModelMode::Unsupported, + offload_mmproj: false, + }); + } + if mmproj_size > 0 { + offload_mmproj = true; + } + + let kv_min_size = estimate_kv_cache_internal(gguf.metadata.clone(), Some(MIN_CONTEXT_LENGTH)) + .await + .map_err(|e| e.to_string())? 
+ .size; + + if model_size + kv_min_size + mmproj_size <= usable_vram { + log::info!("Planning mode: Full GPU offload is possible."); + mode = ModelMode::GPU; + gpu_layers = total_layers; + let vram_left_for_ctx = usable_vram.saturating_sub(model_size); + let max_ctx_by_vram = (vram_left_for_ctx / kv_cache_per_token) as u64; + let requested_target = requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx); + max_ctx_len = requested_target.min(max_ctx_by_vram); + no_offload_kv_cache = false; + offload_mmproj = true; + } else { + let mut found_plan = false; + + log::info!("Attempting VRAM-Maximized Hybrid plan (KV cache in VRAM only)."); + for candidate_gpu_layers in (0..=total_layers).rev() { + let vram_used_by_layers = candidate_gpu_layers.saturating_mul(layer_size); + if vram_used_by_layers > usable_vram { + continue; + } + + let ram_used_by_cpu_layers = + (total_layers.saturating_sub(candidate_gpu_layers)).saturating_mul(layer_size); + let ram_used_by_mmproj = if offload_mmproj { 0 } else { mmproj_size }; + let required_ram_for_model = ram_used_by_cpu_layers.saturating_add(ram_used_by_mmproj); + + if required_ram_for_model > usable_ram { + continue; + } + + let vram_left_for_kv = usable_vram.saturating_sub(vram_used_by_layers); + let ctx_in_vram_only = (vram_left_for_kv / kv_cache_per_token) as u64; + + if ctx_in_vram_only >= MIN_CONTEXT_LENGTH { + log::info!( + "Found VRAM-Maximized Hybrid plan with {} GPU layers.", + candidate_gpu_layers + ); + mode = ModelMode::Hybrid; + gpu_layers = candidate_gpu_layers; + let requested_target = requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx); + max_ctx_len = requested_target.min(ctx_in_vram_only); + no_offload_kv_cache = false; + found_plan = true; + break; + } + } + + if !found_plan { + log::info!("VRAM-Maximized plan not feasible. Falling back to Standard Hybrid (KV cache in VRAM+RAM)."); + for candidate_gpu_layers in (0..=total_layers).rev() { + let vram_used_by_layers = candidate_gpu_layers.saturating_mul(layer_size); + if vram_used_by_layers > usable_vram { + continue; + } + let vram_left_for_kv = usable_vram.saturating_sub(vram_used_by_layers); + let kv_in_vram = (vram_left_for_kv / kv_cache_per_token) as u64; + + let ram_used_by_cpu_layers = + (total_layers.saturating_sub(candidate_gpu_layers)).saturating_mul(layer_size); + let ram_used_by_mmproj = if offload_mmproj { 0 } else { mmproj_size }; + let required_ram_for_model = + ram_used_by_cpu_layers.saturating_add(ram_used_by_mmproj); + + if required_ram_for_model > usable_ram { + continue; + } + + let available_ram_for_kv = usable_ram.saturating_sub(required_ram_for_model); + let kv_in_ram = (available_ram_for_kv / kv_cache_per_token) as u64; + + let total_kv_tokens = kv_in_vram.saturating_add(kv_in_ram); + + if total_kv_tokens >= MIN_CONTEXT_LENGTH { + log::info!( + "Found Standard Hybrid plan with {} GPU layers.", + candidate_gpu_layers + ); + mode = if candidate_gpu_layers > 0 { + ModelMode::Hybrid + } else { + ModelMode::CPU + }; + gpu_layers = candidate_gpu_layers; + let requested_target = + requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx); + let max_possible_ctx = total_kv_tokens.min(model_max_ctx); + max_ctx_len = requested_target.min(max_possible_ctx); + no_offload_kv_cache = kv_in_ram > 0 && kv_in_vram == 0; + found_plan = true; + break; + } + } + } + + if !found_plan { + log::info!("No hybrid plan found. 
Attempting CPU-only plan."); + if model_size + mmproj_size <= usable_ram { + let available_ram_for_kv = usable_ram.saturating_sub(model_size + mmproj_size); + let kv_tokens = (available_ram_for_kv / kv_cache_per_token) as u64; + if kv_tokens >= MIN_CONTEXT_LENGTH { + mode = ModelMode::CPU; + gpu_layers = 0; + max_ctx_len = kv_tokens + .min(requested_ctx.unwrap_or(model_max_ctx)) + .min(model_max_ctx); + no_offload_kv_cache = true; + offload_mmproj = false; + } + } + } + } + + if let Some(req) = requested_ctx { + if req > 0 { + max_ctx_len = max_ctx_len.min(req); + } + } + max_ctx_len = max_ctx_len.min(model_max_ctx); + + if max_ctx_len > 0 { + log::info!("Max context before power-of-2 adjustment: {}", max_ctx_len); + max_ctx_len = 1u64 << (63 - max_ctx_len.leading_zeros()); + log::info!("Adjusted max context to power of 2: {}", max_ctx_len); + } + + if mode == ModelMode::Unsupported { + if max_ctx_len >= MIN_CONTEXT_LENGTH { + // do nothing, plan is viable but wasn't assigned a mode + } else { + gpu_layers = 0; + max_ctx_len = 0; + offload_mmproj = false; + } + } else if max_ctx_len < MIN_CONTEXT_LENGTH { + log::info!( + "Final context length {} is less than minimum required {}. Marking as unsupported.", + max_ctx_len, + MIN_CONTEXT_LENGTH + ); + mode = ModelMode::Unsupported; + gpu_layers = 0; + max_ctx_len = 0; + offload_mmproj = false; + } + + if mode == ModelMode::Hybrid { + batch_size = 256; + } else if mode == ModelMode::CPU || no_offload_kv_cache || mode == ModelMode::Unsupported { + batch_size = 64; + } + + if max_ctx_len > 0 { + batch_size = batch_size.min(max_ctx_len); + } else { + batch_size = 64; + } + + if mode == ModelMode::CPU || no_offload_kv_cache { + offload_mmproj = false; + } + + log::info!("Planned model load params: GPU Layers: {}, max_ctx_len: {}, kv_cache offload: {}, offload mmproj: {}, batch_size: {}", + gpu_layers, max_ctx_len, !no_offload_kv_cache, offload_mmproj, batch_size); + Ok(ModelPlan { + gpu_layers, + max_context_length: max_ctx_len, + no_offload_kv_cache, + offload_mmproj, + batch_size, + mode, + }) +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs index a2bc73c59..49a497cf1 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs @@ -1,4 +1,4 @@ -use serde::Serialize; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::convert::TryFrom; use std::io; @@ -52,3 +52,42 @@ pub struct GgufMetadata { pub tensor_count: u64, pub metadata: HashMap, } + +#[derive(Debug, Serialize, Deserialize)] +pub struct KVCacheEstimate { + pub size: u64, + pub per_token_size: u64, +} +#[derive(Debug, thiserror::Error)] +pub enum KVCacheError { + #[error("Invalid metadata: architecture not found")] + ArchitectureNotFound, + #[error("Invalid metadata: block_count not found or invalid")] + BlockCountInvalid, + #[error("Invalid metadata: head_count not found or invalid")] + HeadCountInvalid, + #[error("Invalid metadata: embedding_length not found or invalid")] + EmbeddingLengthInvalid, + #[error("Invalid metadata: context_length not found or invalid")] + ContextLengthInvalid, +} + +impl serde::Serialize for KVCacheError { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + + +#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize)] +pub enum ModelSupportStatus { + #[serde(rename = "RED")] + Red, + 
#[serde(rename = "YELLOW")] + Yellow, + #[serde(rename = "GREEN")] + Green, +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs new file mode 100644 index 000000000..50e3f4a14 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs @@ -0,0 +1,164 @@ +use crate::gguf::helpers; +use crate::gguf::types::{GgufMetadata, KVCacheError, KVCacheEstimate}; +use std::collections::HashMap; +use std::fs::File; +use std::io::BufReader; + +// read gguf metadata +pub async fn read_gguf_metadata_internal(path: String) -> Result { + if path.starts_with("http://") || path.starts_with("https://") { + // Remote: read in 2MB chunks until successful + let client = reqwest::Client::new(); + let chunk_size = 2 * 1024 * 1024; // Fixed 2MB chunks + let max_total_size = 120 * 1024 * 1024; // Don't exceed 120MB total + let mut total_downloaded = 0; + let mut accumulated_data = Vec::new(); + + while total_downloaded < max_total_size { + let start = total_downloaded; + let end = std::cmp::min(start + chunk_size - 1, max_total_size - 1); + + let resp = client + .get(&path) + .header("Range", format!("bytes={}-{}", start, end)) + .send() + .await + .map_err(|e| format!("Failed to fetch chunk {}-{}: {}", start, end, e))?; + + let chunk_data = resp + .bytes() + .await + .map_err(|e| format!("Failed to read chunk response: {}", e))?; + + accumulated_data.extend_from_slice(&chunk_data); + total_downloaded += chunk_data.len(); + + // Try parsing after each chunk + let cursor = std::io::Cursor::new(&accumulated_data); + if let Ok(metadata) = helpers::read_gguf_metadata(cursor) { + return Ok(metadata); + } + + // If we got less data than expected, we've reached EOF + if chunk_data.len() < chunk_size { + break; + } + } + Err("Could not parse GGUF metadata from downloaded data".to_string()) + } else { + // Local: use streaming file reader + let file = + File::open(&path).map_err(|e| format!("Failed to open local file {}: {}", path, e))?; + let reader = BufReader::new(file); + + helpers::read_gguf_metadata(reader) + .map_err(|e| format!("Failed to parse GGUF metadata: {}", e)) + } +} + +/// Estimate KVCache size from a given metadata +pub async fn estimate_kv_cache_internal( + meta: HashMap, + ctx_size: Option, +) -> Result { + log::info!("Received ctx_size parameter: {:?}", ctx_size); + let arch = meta + .get("general.architecture") + .ok_or(KVCacheError::ArchitectureNotFound)?; + + // Number of layers + let n_layer_key = format!("{}.block_count", arch); + let n_layer = meta + .get(&n_layer_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0) + .ok_or(KVCacheError::BlockCountInvalid)?; + + // Attention heads (use kv heads if present, else full heads) + let n_head_key = format!("{}.attention.head_count", arch); + let n_head_kv_key = format!("{}.attention.head_count_kv", arch); + let n_head = meta + .get(&n_head_kv_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0) + .unwrap_or_else(|| { + meta.get(&n_head_key) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0) + }); + if n_head == 0 { + return Err(KVCacheError::HeadCountInvalid); + } + + // Key/value dimensions + let key_len_key = format!("{}.attention.key_length", arch); + let val_len_key = format!("{}.attention.value_length", arch); + + let key_len = meta + .get(&key_len_key) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + let val_len = meta + .get(&val_len_key) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + + if key_len == 0 || val_len == 0 { + 
return Err(KVCacheError::EmbeddingLengthInvalid); + } + + // Context length + let max_ctx_key = format!("{}.context_length", arch); + let max_ctx = meta + .get(&max_ctx_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0) + .ok_or(KVCacheError::ContextLengthInvalid)?; + let ctx_len = ctx_size.map(|size| size.min(max_ctx)).unwrap_or(max_ctx); + + // Sliding window if present + let sliding_key = format!("{}.attention.sliding_window", arch); + let sliding_window = meta + .get(&sliding_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0); + + // Assume fp16 + const BYTES_PER_ELEMENT: u64 = 2; + + // Per-token KV size + let kv_per_token = n_layer * n_head * (key_len + val_len) * BYTES_PER_ELEMENT; + + // Pure full-attention cost + let full_cost = ctx_len * kv_per_token; + + // Pure sliding-window cost (tiny, only keeps last W tokens) + let sliding_cost = sliding_window.map(|w| w * kv_per_token); + + // Middle estimate: average of sliding + full if sliding_window is present + let chosen_size = if let Some(slide) = sliding_cost { + let middle = (full_cost + slide) / 2; + log::info!( + "KV estimates -> sliding: {} bytes (~{:.2} MB), full: {} bytes (~{:.2} MB), middle: {} bytes (~{:.2} MB)", + slide, + slide as f64 / (1024.0 * 1024.0), + full_cost, + full_cost as f64 / (1024.0 * 1024.0), + middle, + middle as f64 / (1024.0 * 1024.0) + ); + middle + } else { + log::info!( + "KV estimate (no SWA detected) -> full: {} bytes (~{:.2} MB)", + full_cost, + full_cost as f64 / (1024.0 * 1024.0) + ); + full_cost + }; + + Ok(KVCacheEstimate { + size: chosen_size, + per_token_size: kv_per_token, + }) +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs index d35cb24cf..0cd09cb2b 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs @@ -33,6 +33,10 @@ pub fn init() -> TauriPlugin { commands::get_session_by_model, // GGUF commands gguf::commands::read_gguf_metadata, + gguf::commands::estimate_kv_cache_size, + gguf::commands::get_model_size, + gguf::commands::is_model_supported, + gguf::model_planner::plan_model_load ]) .setup(|app, _api| { // Initialize and manage the plugin state diff --git a/src-tauri/src/core/filesystem/commands.rs b/src-tauri/src/core/filesystem/commands.rs index 7cf7803c6..6bb3f534a 100644 --- a/src-tauri/src/core/filesystem/commands.rs +++ b/src-tauri/src/core/filesystem/commands.rs @@ -193,7 +193,7 @@ pub fn decompress(app: tauri::AppHandle, path: &str, output_dir: &str) -> Result fs::File::open(&path_buf).map_err(|e| e.to_string())? 
} }; - + #[cfg(not(windows))] let file = fs::File::open(&path_buf).map_err(|e| e.to_string())?; if path.ends_with(".tar.gz") { @@ -222,7 +222,10 @@ pub fn decompress(app: tauri::AppHandle, path: &str, output_dir: &str) -> Result { use std::os::unix::fs::PermissionsExt; if let Some(mode) = entry.unix_mode() { - let _ = std::fs::set_permissions(&outpath, std::fs::Permissions::from_mode(mode)); + let _ = std::fs::set_permissions( + &outpath, + std::fs::Permissions::from_mode(mode), + ); } } } diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx index 39a587cbc..9a3bfd814 100644 --- a/web-app/src/containers/ModelSetting.tsx +++ b/web-app/src/containers/ModelSetting.tsx @@ -103,6 +103,13 @@ export function ModelSetting({ }) } + if (model.settings?.batch_size && result.batchSize !== undefined) { + settingsToUpdate.push({ + key: 'batch_size', + value: result.batchSize, + }) + } + // Apply all settings in a single update to avoid race conditions if (settingsToUpdate.length > 0) { handleMultipleSettingsChange(settingsToUpdate) @@ -163,7 +170,8 @@ export function ModelSetting({ key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || - key === 'offload_mmproj' + key === 'offload_mmproj' || + key === 'batch_size' ) if (requiresRestart) { @@ -222,7 +230,8 @@ export function ModelSetting({ key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || - key === 'offload_mmproj' + key === 'offload_mmproj' || + key === 'batch_size' ) { // Check if model is running before stopping it serviceHub diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts index 4d476ae7c..bd3dbc49b 100644 --- a/web-app/src/hooks/useModelProvider.ts +++ b/web-app/src/hooks/useModelProvider.ts @@ -288,9 +288,40 @@ export const useModelProvider = create()( }) } + if (version <= 2 && state?.providers) { + state.providers.forEach((provider) => { + // Update cont_batching description for llamacpp provider + if (provider.provider === 'llamacpp' && provider.settings) { + const contBatchingSetting = provider.settings.find( + (s) => s.key === 'cont_batching' + ) + if (contBatchingSetting) { + contBatchingSetting.description = + 'Enable continuous batching (a.k.a dynamic batching) for concurrent requests.' 
+ } + } + + // Migrate model settings + if (provider.models && provider.provider === 'llamacpp') { + provider.models.forEach((model) => { + if (!model.settings) model.settings = {} + + if (!model.settings.batch_size) { + model.settings.batch_size = { + ...modelSettings.batch_size, + controller_props: { + ...modelSettings.batch_size.controller_props, + }, + } + } + }) + } + }) + } + return state }, - version: 2, + version: 3, } ) ) diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts index 32d05d70c..1b90ee732 100644 --- a/web-app/src/lib/predefined.ts +++ b/web-app/src/lib/predefined.ts @@ -153,4 +153,16 @@ export const modelSettings = { value: false, }, }, + batch_size: { + key: 'batch_size', + title: 'Batch Size', + description: 'Logical maximum batch size for processing prompts.', + controller_type: 'input', + controller_props: { + value: 2048, + placeholder: '2048', + type: 'number', + textAlign: 'right', + }, + }, } diff --git a/web-app/src/services/models/default.ts b/web-app/src/services/models/default.ts index 186706334..5d18e2985 100644 --- a/web-app/src/services/models/default.ts +++ b/web-app/src/services/models/default.ts @@ -533,19 +533,21 @@ export class DefaultModelsService implements ModelsService { // Fallback if method is not available console.warn('planModelLoad method not available in llamacpp engine') return { - gpuLayers: 0, + gpuLayers: 100, maxContextLength: 2048, - noOffloadKVCache: true, + noOffloadKVCache: false, offloadMmproj: false, + batchSize: 2048, mode: 'Unsupported', } } catch (error) { console.error(`Error planning model load for path ${modelPath}:`, error) return { - gpuLayers: 0, + gpuLayers: 100, maxContextLength: 2048, - noOffloadKVCache: true, + noOffloadKVCache: false, offloadMmproj: false, + batchSize: 2048, mode: 'Unsupported', } } diff --git a/web-app/src/services/models/types.ts b/web-app/src/services/models/types.ts index d92dae38a..6248e82ac 100644 --- a/web-app/src/services/models/types.ts +++ b/web-app/src/services/models/types.ts @@ -86,6 +86,7 @@ export interface ModelPlan { maxContextLength: number noOffloadKVCache: boolean offloadMmproj: boolean + batchSize: number mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' }
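
Usage sketch (illustration only, not part of the patch): the helper below shows how the new guest-js bindings might be consumed end to end, combining isModelSupported with planModelLoadInternal and mapping the returned plan, including the new batchSize field, onto llama-server style arguments. Only the two imported functions, their signatures, the 'high' | 'medium' | 'low' memory modes, and the ModelPlan shape come from this diff; the buildLoadArgs helper name and the flag names (-ngl, -c, -b, --no-kv-offload) are assumptions.

import {
  isModelSupported,
  planModelLoadInternal,
} from '@janhq/tauri-plugin-llamacpp-api'

// Sketch: assumes modelPath is already an absolute path to a local GGUF file.
async function buildLoadArgs(
  modelPath: string,
  requestedCtx?: number
): Promise<string[]> {
  // Cheap RED/YELLOW/GREEN pre-check backed by the new is_model_supported command.
  const support = await isModelSupported(modelPath, requestedCtx)
  if (support === 'RED') throw new Error('Model does not fit in available memory')

  // Memory mode mirrors the extension setting ('high' | 'medium' | 'low').
  const plan = await planModelLoadInternal(
    modelPath,
    'high',
    undefined,
    requestedCtx
  )
  if (plan.mode === 'Unsupported') throw new Error('No viable load plan for this model')

  // Map the plan onto llama-server style arguments; the flag names here are
  // assumed for illustration and are not taken from this diff.
  const args = [
    '-ngl', String(plan.gpuLayers),
    '-c', String(plan.maxContextLength),
    '-b', String(plan.batchSize),
  ]
  if (plan.noOffloadKVCache) args.push('--no-kv-offload')
  return args
}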