diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 1d98d4213..efccca679 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -1742,13 +1742,13 @@ export default class llamacpp_extension extends AIEngine {
           try {
             const data = JSON.parse(jsonStr)
             const chunk = data as chatCompletionChunk
-
+
             // Check for out-of-context error conditions
             if (chunk.choices?.[0]?.finish_reason === 'length') {
               // finish_reason 'length' indicates context limit was hit
               throw new Error(OUT_OF_CONTEXT_SIZE)
             }
-
+
             yield chunk
           } catch (e) {
             logger.error('Error parsing JSON from stream or server error:', e)
@@ -1828,13 +1828,13 @@ export default class llamacpp_extension extends AIEngine {
     }
 
     const completionResponse = (await response.json()) as chatCompletion
-
+
     // Check for out-of-context error conditions
     if (completionResponse.choices?.[0]?.finish_reason === 'length') {
       // finish_reason 'length' indicates context limit was hit
       throw new Error(OUT_OF_CONTEXT_SIZE)
     }
-
+
     return completionResponse
   }
 
@@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine {
       totalMemory,
     }
  }
-  private async getKVCachePerToken(
-    meta: Record<string, any>
-  ): Promise<number> {
-    const arch = meta['general.architecture']
-    const nLayer = Number(meta[`${arch}.block_count`])
-    const nHead = Number(meta[`${arch}.attention.head_count`])
-
-    // Get head dimensions
-    const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
-    const embeddingLen = Number(meta[`${arch}.embedding_length`])
-    const headDim = embeddingLen / nHead
-
-    // KV cache uses head_count_kv (for GQA models) or head_count
-    // Each token needs K and V, both are fp16 (2 bytes)
-    const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
-
-    return bytesPerToken
-  }
 
   private async getLayerSize(
     path: string,
@@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine {
       gguf.metadata
     )
 
-    // Fixed KV cache calculation
-    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+    const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
+      .perTokenSize
 
-    // Debug logging
     logger.info(
       `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
    )
@@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine {
       throw new Error(`Invalid layer size: ${layerSize}`)
     }
 
-    // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
-    const GPU_OVERHEAD_FACTOR = 0.8
-
-    // VRAM budget with overhead consideration
+    // Reserve memory for OS, other applications, and fixed engine overhead.
     const VRAM_RESERVE_GB = 0.5
     const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
-    const usableVRAM = Math.max(
-      0,
-      (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
-    )
+    const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.
 
     // Get model's maximum context length
     const arch = gguf.metadata['general.architecture']
     const modelMaxContextLength =
-      Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
+      Number(gguf.metadata[`${arch}.context_length`]) || 8192
 
-    // Set minimum context length
-    const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
+    const MIN_CONTEXT_LENGTH = 1024
 
-    // System RAM budget
+    // Memory percentages applied to both VRAM and RAM
     const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
 
     logger.info(
       `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
     )
 
-    // Validate memory info
     if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
       throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
     }
@@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine {
       )
     }
 
-    // Calculate actual system RAM
-    const actualSystemRAM = Math.max(
+    // Apply memory mode to both VRAM and RAM separately
+    const memoryModeMultiplier = memoryPercentages[this.memoryMode]
+    const usableVRAM = Math.max(
       0,
-      memoryInfo.totalMemory - memoryInfo.totalVRAM
+      memoryInfo.totalVRAM * memoryModeMultiplier -
+        VRAM_RESERVE_BYTES -
+        ENGINE_FIXED_OVERHEAD_BYTES
     )
-    const usableSystemMemory =
-      actualSystemRAM * memoryPercentages[this.memoryMode]
+
+    const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
+    const usableSystemMemory = actualSystemRAM * memoryModeMultiplier
 
     logger.info(
-      `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
+      `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
     )
 
-    // --- Priority 1: Allocate mmproj (if exists) ---
-    let offloadMmproj = false
-    let remainingVRAM = usableVRAM
-
-    if (mmprojSize > 0) {
-      if (mmprojSize <= remainingVRAM) {
-        offloadMmproj = true
-        remainingVRAM -= mmprojSize
-        logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
-      } else {
-        logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
-      }
-    }
-
-    // --- Priority 2: Calculate optimal layer/context balance ---
     let gpuLayers = 0
-    let maxContextLength = MIN_CONTEXT_LENGTH
+    let maxContextLength = 0
     let noOffloadKVCache = false
     let mode: ModelPlan['mode'] = 'Unsupported'
+    let offloadMmproj = false
 
-    // Calculate how much VRAM we need for different context sizes
-    const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
-    const targetContext = requestedCtx || modelMaxContextLength
-
-    // Find the best balance of layers and context
-    let bestConfig = {
-      layers: 0,
-      context: MIN_CONTEXT_LENGTH,
-      vramUsed: 0,
+    let remainingVRAM = usableVRAM
+    if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
+      offloadMmproj = true
+      remainingVRAM -= mmprojSize
     }
+    const vramForMinContext = (
+      await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
+    ).size
 
-    for (const ctxSize of contextSizes) {
-      if (ctxSize > targetContext) break
-
-      const kvCacheSize = ctxSize * kvCachePerToken
-      const availableForLayers = remainingVRAM - kvCacheSize
-
-      if (availableForLayers <= 0) continue
-
-      const possibleLayers = Math.min(
-        Math.floor(availableForLayers / layerSize),
-        totalLayers
+    const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
+    if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) {
+      logger.error(
+        `Model unsupported. Not enough resources for model and min context.`
      )
-
-      if (possibleLayers > 0) {
-        const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
-
-        // Verify this fits with some margin
-        if (totalVramNeeded <= remainingVRAM * 0.95) {
-          bestConfig = {
-            layers: possibleLayers,
-            context: ctxSize,
-            vramUsed: totalVramNeeded,
-          }
-        }
+      return {
+        gpuLayers: 0,
+        maxContextLength: 0,
+        noOffloadKVCache: true,
+        mode: 'Unsupported',
+        offloadMmproj: false,
      }
     }
 
-    // Apply the best configuration found
-    if (bestConfig.layers > 0) {
-      gpuLayers = bestConfig.layers
-      maxContextLength = bestConfig.context
+    const targetContext = Math.min(
+      requestedCtx || modelMaxContextLength,
+      modelMaxContextLength
+    )
+
+    let targetContextSize = (
+      await this.estimateKVCache(gguf.metadata, targetContext)
+    ).size
+
+    // Use `kvCachePerToken` for all VRAM calculations
+    if (modelSize + targetContextSize <= remainingVRAM) {
+      mode = 'GPU'
+      gpuLayers = totalLayers
+      maxContextLength = targetContext
      noOffloadKVCache = false
-      mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
+      logger.info(
+        'Planning: Ideal case fits. All layers and target context in VRAM.'
+      )
+    } else if (modelSize <= remainingVRAM) {
+      mode = 'GPU'
+      gpuLayers = totalLayers
+      noOffloadKVCache = false
+      const vramLeftForContext = remainingVRAM - modelSize
+      maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
+
+      // Add safety check to prevent OOM
+      const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
+      maxContextLength = Math.floor(maxContextLength * safetyBuffer)
 
       logger.info(
-        `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
-          `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
+        `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
       )
     } else {
-      // Fallback: Try minimal GPU layers with KV cache on CPU
-      gpuLayers = Math.min(
-        Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
-        totalLayers
-      )
+      const vramAvailableForLayers = remainingVRAM - vramForMinContext
 
-      if (gpuLayers > 0) {
-        // Calculate available system RAM for KV cache
-        const cpuLayers = totalLayers - gpuLayers
-        const modelCPUSize = cpuLayers * layerSize
-        const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
-        const systemRAMUsed = modelCPUSize + mmprojCPUSize
-        const availableSystemRAMForKVCache = Math.max(
-          0,
-          usableSystemMemory - systemRAMUsed
+      if (vramAvailableForLayers >= layerSize) {
+        mode = 'Hybrid'
+        gpuLayers = Math.min(
+          Math.floor(vramAvailableForLayers / layerSize),
+          totalLayers
        )
+        noOffloadKVCache = false
+        const vramUsedByLayers = gpuLayers * layerSize
+        const vramLeftForContext = remainingVRAM - vramUsedByLayers
+        maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
 
-        // Calculate context that fits in system RAM
-        const systemRAMContext = Math.min(
-          Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
-          targetContext
+        logger.info(
+          'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
        )
+      }
+    }
 
-        if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
-          maxContextLength = systemRAMContext
-          noOffloadKVCache = true
+    // Fallback logic: try different configurations if no VRAM-based plan worked
+    if (mode === 'Unsupported') {
+      logger.info('Planning: Trying fallback configurations...')
+
+      // Try putting some layers on GPU with KV cache in RAM
+      const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
+      if (possibleGpuLayers > 0) {
+        gpuLayers = Math.min(possibleGpuLayers, totalLayers)
+        const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
+        const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
+        const availableRamForKv =
+          usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
+        // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
+        const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
+
+        if (contextInRam >= MIN_CONTEXT_LENGTH) {
          mode = 'Hybrid'
-
-          logger.info(
-            `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
-              `${maxContextLength} context on CPU RAM`
-          )
-        } else {
-          // Can't fit reasonable context even with CPU RAM
-          // Reduce GPU layers further
-          gpuLayers = Math.floor(gpuLayers / 2)
-          maxContextLength = MIN_CONTEXT_LENGTH
+          maxContextLength = contextInRam
          noOffloadKVCache = true
-          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+          logger.info(
+            `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
+          )
        }
-      } else {
-        // Pure CPU mode
+      }
+
+      // If still unsupported, try pure CPU mode
+      if (mode === 'Unsupported') {
        gpuLayers = 0
        noOffloadKVCache = true
-
-        // Calculate context for pure CPU mode
-        const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
-        const availableForKVCache = Math.max(
-          0,
-          usableSystemMemory - totalCPUMemoryNeeded
-        )
-
-        maxContextLength = Math.min(
-          Math.max(
-            MIN_CONTEXT_LENGTH,
-            Math.floor(availableForKVCache / kvCachePerToken)
-          ),
-          targetContext
-        )
-
-        mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
-      }
-    }
-
-    // Safety check: Verify total GPU memory usage
-    if (gpuLayers > 0 && !noOffloadKVCache) {
-      const estimatedGPUUsage =
-        gpuLayers * layerSize +
-        maxContextLength * kvCachePerToken +
-        (offloadMmproj ? mmprojSize : 0)
-
-      if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-        logger.warn(
-          `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
-        )
-
-        // Reduce context first
-        while (
-          maxContextLength > MIN_CONTEXT_LENGTH &&
-          estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
-        ) {
-          maxContextLength = Math.floor(maxContextLength / 2)
-          const newEstimate =
-            gpuLayers * layerSize +
-            maxContextLength * kvCachePerToken +
-            (offloadMmproj ? mmprojSize : 0)
-          if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
-        }
-
-        // If still too much, reduce layers
-        if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-          gpuLayers = Math.floor(gpuLayers * 0.7)
-          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
-          noOffloadKVCache = true // Move KV cache to CPU
+        offloadMmproj = false
+        const ramUsedByModel = modelSize + mmprojSize
+        const availableRamForKv = usableSystemMemory - ramUsedByModel
+        maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
+        if (maxContextLength >= MIN_CONTEXT_LENGTH) {
+          mode = 'CPU'
+          logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)
        }
      }
    }
 
-    // Apply user-requested context limit if specified
+    if (mode === 'CPU' || noOffloadKVCache) {
+      offloadMmproj = false
+    }
+
     if (requestedCtx && requestedCtx > 0) {
      maxContextLength = Math.min(maxContextLength, requestedCtx)
-      logger.info(
-        `User requested context: ${requestedCtx}, final: ${maxContextLength}`
-      )
    }
 
-    // Ensure we never exceed model's maximum context
     maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
 
-    // Final validation
-    if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
+    if (maxContextLength < MIN_CONTEXT_LENGTH) {
      mode = 'Unsupported'
    }
 
-    // Ensure maxContextLength is valid
-    maxContextLength = isNaN(maxContextLength)
-      ? MIN_CONTEXT_LENGTH
-      : Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
+    if (mode === 'Unsupported') {
+      gpuLayers = 0
+      maxContextLength = 0
+    }
+
+    maxContextLength = isNaN(maxContextLength)
+      ? 0
+      : Math.floor(maxContextLength)
 
-    // Log final plan
     const mmprojInfo = mmprojPath
      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
      : ''
@@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine {
       offloadMmproj,
     }
  }
-
  /**
   * estimate KVCache size from a given metadata
   */
  private async estimateKVCache(
    meta: Record<string, any>,
    ctx_size?: number
-  ): Promise<number> {
+  ): Promise<{ size: number; perTokenSize: number }> {
    const arch = meta['general.architecture']
    if (!arch) throw new Error('Invalid metadata: architecture not found')
 
@@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine {
       )
     }
 
-    let ctxLen: number
-    if (!ctx_size) {
-      ctxLen = Number(meta[`${arch}.context_length`])
-    } else {
-      ctxLen = ctx_size
-    }
+    const maxCtx = Number(meta[`${arch}.context_length`])
+    if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
+
+    // b) If the user supplied a value, clamp it to the model's max
+    let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
+
+    logger.info(`Final context length used for KV size: ${ctxLen}`)
+    logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)
 
     logger.info(`ctxLen: ${ctxLen}`)
     logger.info(`nLayer: ${nLayer}`)
@@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine {
     // fp16 = 8 bits * 2 = 16
     const bytesPerElement = 2
 
-    // Total KV cache size per token = nHead * headDim * bytesPerElement
-    const kvPerToken = nHead * headDim * bytesPerElement
+    // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
+    const kvPerToken = nHead * headDim * bytesPerElement * nLayer
 
-    return ctxLen * nLayer * kvPerToken
+    return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
   }
 
   private async getModelSize(path: string): Promise<number> {
@@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine {
     const gguf = await readGgufMetadata(path)
     let kvCacheSize: number
     if (ctx_size) {
-      kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
+      kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
     } else {
-      kvCacheSize = await this.estimateKVCache(gguf.metadata)
+      kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
     }
 
     // Total memory consumption = model weights + kvcache
@@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine {
     )
 
     // Use 80% of total memory as the usable limit
-    const USABLE_MEMORY_PERCENTAGE = 0.8
+    const USABLE_MEMORY_PERCENTAGE = 0.9
     const usableTotalMemory =
-      memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+      memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
+      memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
     const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
 
     // Check if model fits in total memory at all
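
Reviewer note: to make the new return shape of `estimateKVCache()` easier to check, here is a small self-contained sketch of the same arithmetic. `estimateKVCacheSketch`, `KVCacheEstimate`, and the example numbers are hypothetical and only mirror what the diff computes; the sketch assumes fp16 KV entries (2 bytes per element) and that `headDim` already covers both K and V, as the added `headDim (K+V)` log line suggests.

```ts
// Hypothetical standalone sketch; not part of the PR.
interface KVCacheEstimate {
  size: number // total KV-cache bytes for the planned context
  perTokenSize: number // KV-cache bytes per token across all layers
}

function estimateKVCacheSketch(
  nLayer: number, // `${arch}.block_count`
  nHead: number, // KV head count used by the formula in the diff
  headDim: number, // per-head K+V dimension, as logged by the diff
  maxCtx: number, // `${arch}.context_length`
  ctxSize?: number // optional requested context
): KVCacheEstimate {
  const bytesPerElement = 2 // fp16, matching the diff's comment
  // Mirrors the new code: kvPerToken = nHead * headDim * bytesPerElement * nLayer
  const perTokenSize = nHead * headDim * bytesPerElement * nLayer
  // Clamp a user-supplied context to the model's maximum, as the diff now does
  const ctxLen = ctxSize ? Math.min(ctxSize, maxCtx) : maxCtx
  return { size: ctxLen * perTokenSize, perTokenSize }
}

// Example with made-up numbers: 32 layers, 8 KV heads, headDim 256 (K+V),
// 32k max context, planned at 8192 tokens:
// perTokenSize = 8 * 256 * 2 * 32 = 131072 bytes (~128 KiB per token)
// size         = 8192 * 131072   ≈ 1 GiB of KV cache
```

Returning `perTokenSize` alongside `size` is what lets the planning code divide leftover VRAM or RAM by a per-token cost instead of re-deriving it, which is why the separate `getKVCachePerToken()` helper could be deleted.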
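Reviewer note: the rewritten planning branch reads as a decision ladder. The sketch below is a condensed, illustrative restatement only; `pickMode` and `PlanInputs` are hypothetical names, and the mmproj and per-layer RAM bookkeeping from the diff is omitted for brevity.

```ts
// Hypothetical condensation of the branch order in the new planning logic.
type PlanMode = 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'

interface PlanInputs {
  usableVRAM: number // totalVRAM * mode multiplier - reserve - fixed overhead
  usableRAM: number // totalRAM * mode multiplier
  modelSize: number // all weights
  layerSize: number // approximate bytes per layer
  minCtxKV: number // KV bytes for MIN_CONTEXT_LENGTH
  targetCtxKV: number // KV bytes for the requested/target context
}

function pickMode(p: PlanInputs): PlanMode {
  // 0. Hard stop: weights plus even a minimum-context KV cache exceed VRAM + RAM.
  if (p.modelSize + p.minCtxKV > p.usableVRAM + p.usableRAM) return 'Unsupported'
  // 1. Weights and the target-context KV cache both fit in VRAM: full GPU.
  if (p.modelSize + p.targetCtxKV <= p.usableVRAM) return 'GPU'
  // 2. Weights fit in VRAM; the context is shrunk to the KV cache that still
  //    fits (the diff applies a 0.9 safety buffer here).
  if (p.modelSize <= p.usableVRAM) return 'GPU'
  // 3. At least one layer fits next to a minimum-context KV cache: Hybrid with
  //    the KV cache kept in VRAM.
  if (p.usableVRAM - p.minCtxKV >= p.layerSize) return 'Hybrid'
  // 4. Fallback: some layers in VRAM, KV cache in system RAM (noOffloadKVCache).
  if (p.usableVRAM >= p.layerSize && p.usableRAM >= p.minCtxKV) return 'Hybrid'
  // 5. Pure CPU if RAM can hold the weights plus a minimum-context KV cache.
  if (p.usableRAM >= p.modelSize + p.minCtxKV) return 'CPU'
  return 'Unsupported'
}
```

In the diff itself the same ladder additionally demotes the plan to 'Unsupported' when the final context falls below MIN_CONTEXT_LENGTH, and forces the KV cache and mmproj off the GPU in the CPU and fallback branches.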