fix: KVCache size calculation and refactor (#6438)
- Removed the `getKVCachePerToken` helper in favor of a unified `estimateKVCache` that returns both the total size and the per-token size.
- Fixed the KV cache size calculation to account for all layers, correcting the previous under-estimation (see the sketch below).
- Added proper clamping of user-requested context lengths to the model's maximum.
- Refactored VRAM budgeting: introduced explicit reserves, a fixed engine overhead, and separate multipliers for VRAM and system RAM based on the memory mode.
- Implemented a more robust planning flow with clear GPU, Hybrid, and CPU pathways, including fallback configurations when resources are insufficient.
- Updated default context length handling and safety buffers to prevent OOM situations.
- Raised the usable memory percentage to 90% and refined logging for easier debugging.
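The core of the change is easiest to see in isolation. The following is a minimal sketch, not the extension's actual code: the helper names (`estimateKvCacheBytes`, `usableVramBytes`) are made up for illustration, and it assumes fp16 KV entries and GQA-aware head counts. It mirrors how this commit counts every layer in the per-token KV footprint, clamps a requested context to the model's maximum, and derives the usable VRAM budget from the memory-mode multiplier minus an explicit reserve and a fixed engine overhead.

```typescript
// Hypothetical sketch of the two calculations described above.
type KvEstimate = { size: number; perTokenSize: number }

function estimateKvCacheBytes(
  meta: Record<string, string>,
  requestedCtx?: number
): KvEstimate {
  const arch = meta['general.architecture']
  const nLayer = Number(meta[`${arch}.block_count`])
  const nHead = Number(meta[`${arch}.attention.head_count`])
  // GQA models store fewer KV heads than attention heads.
  const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
  const headDim = Number(meta[`${arch}.embedding_length`]) / nHead
  const maxCtx = Number(meta[`${arch}.context_length`])

  // Clamp a user-requested context length to the model's maximum.
  const ctxLen = requestedCtx ? Math.min(requestedCtx, maxCtx) : maxCtx

  // K and V per token, fp16 (2 bytes each), summed over *all* layers.
  const perTokenSize = nHeadKV * headDim * 2 * 2 * nLayer
  return { size: ctxLen * perTokenSize, perTokenSize }
}

const GiB = 1024 * 1024 * 1024

function usableVramBytes(
  totalVRAM: number,
  memoryMode: 'high' | 'medium' | 'low'
): number {
  const multiplier = { high: 0.7, medium: 0.5, low: 0.4 }[memoryMode]
  const VRAM_RESERVE_BYTES = 0.5 * GiB // headroom for the OS and other apps
  const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * GiB // scratch buffers etc.
  return Math.max(
    0,
    totalVRAM * multiplier - VRAM_RESERVE_BYTES - ENGINE_FIXED_OVERHEAD_BYTES
  )
}
```

The planner then compares model weights plus the estimated KV size against this budget to choose the GPU, Hybrid, or CPU pathway, falling back to fewer offloaded layers or a CPU-resident KV cache when the budget is exceeded.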
parent
7a2782e6fd
commit
489c5a3d9c
@@ -1742,13 +1742,13 @@ export default class llamacpp_extension extends AIEngine {
    try {
      const data = JSON.parse(jsonStr)
      const chunk = data as chatCompletionChunk

      // Check for out-of-context error conditions
      if (chunk.choices?.[0]?.finish_reason === 'length') {
        // finish_reason 'length' indicates context limit was hit
        throw new Error(OUT_OF_CONTEXT_SIZE)
      }

      yield chunk
    } catch (e) {
      logger.error('Error parsing JSON from stream or server error:', e)
@@ -1828,13 +1828,13 @@ export default class llamacpp_extension extends AIEngine {
    }

    const completionResponse = (await response.json()) as chatCompletion

    // Check for out-of-context error conditions
    if (completionResponse.choices?.[0]?.finish_reason === 'length') {
      // finish_reason 'length' indicates context limit was hit
      throw new Error(OUT_OF_CONTEXT_SIZE)
    }

    return completionResponse
  }

@@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine {
      totalMemory,
    }
  }
- private async getKVCachePerToken(
-   meta: Record<string, string>
- ): Promise<number> {
-   const arch = meta['general.architecture']
-   const nLayer = Number(meta[`${arch}.block_count`])
-   const nHead = Number(meta[`${arch}.attention.head_count`])
-
-   // Get head dimensions
-   const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
-   const embeddingLen = Number(meta[`${arch}.embedding_length`])
-   const headDim = embeddingLen / nHead
-
-   // KV cache uses head_count_kv (for GQA models) or head_count
-   // Each token needs K and V, both are fp16 (2 bytes)
-   const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
-
-   return bytesPerToken
- }

  private async getLayerSize(
    path: string,
@@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine {
      gguf.metadata
    )

-   // Fixed KV cache calculation
-   const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+   const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
+     .perTokenSize

-   // Debug logging
    logger.info(
      `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
    )
@@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine {
      throw new Error(`Invalid layer size: ${layerSize}`)
    }

-   // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
-   const GPU_OVERHEAD_FACTOR = 0.8
-
-   // VRAM budget with overhead consideration
+   // Reserve memory for OS, other applications, and fixed engine overhead.
    const VRAM_RESERVE_GB = 0.5
    const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
-   const usableVRAM = Math.max(
-     0,
-     (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
-   )
+   const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.

    // Get model's maximum context length
    const arch = gguf.metadata['general.architecture']
    const modelMaxContextLength =
-     Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
+     Number(gguf.metadata[`${arch}.context_length`]) || 8192

-   // Set minimum context length
-   const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
+   const MIN_CONTEXT_LENGTH = 1024

-   // System RAM budget
+   // Memory percentages applied to both VRAM and RAM
    const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }

    logger.info(
      `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
    )

-   // Validate memory info
    if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
      throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
    }
@@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine {
      )
    }

-   // Calculate actual system RAM
-   const actualSystemRAM = Math.max(
+   // Apply memory mode to both VRAM and RAM separately
+   const memoryModeMultiplier = memoryPercentages[this.memoryMode]
+   const usableVRAM = Math.max(
      0,
-     memoryInfo.totalMemory - memoryInfo.totalVRAM
+     memoryInfo.totalVRAM * memoryModeMultiplier -
+       VRAM_RESERVE_BYTES -
+       ENGINE_FIXED_OVERHEAD_BYTES
    )
-   const usableSystemMemory =
-     actualSystemRAM * memoryPercentages[this.memoryMode]
+   const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
+   const usableSystemMemory = actualSystemRAM * memoryModeMultiplier

    logger.info(
-     `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
+     `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
    )

-   // --- Priority 1: Allocate mmproj (if exists) ---
-   let offloadMmproj = false
-   let remainingVRAM = usableVRAM
-
-   if (mmprojSize > 0) {
-     if (mmprojSize <= remainingVRAM) {
-       offloadMmproj = true
-       remainingVRAM -= mmprojSize
-       logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
-     } else {
-       logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
-     }
-   }
-
-   // --- Priority 2: Calculate optimal layer/context balance ---
    let gpuLayers = 0
-   let maxContextLength = MIN_CONTEXT_LENGTH
+   let maxContextLength = 0
    let noOffloadKVCache = false
    let mode: ModelPlan['mode'] = 'Unsupported'
+   let offloadMmproj = false

-   // Calculate how much VRAM we need for different context sizes
-   const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
-   const targetContext = requestedCtx || modelMaxContextLength
-   // Find the best balance of layers and context
-   let bestConfig = {
-     layers: 0,
-     context: MIN_CONTEXT_LENGTH,
-     vramUsed: 0,
+   let remainingVRAM = usableVRAM
+   if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
+     offloadMmproj = true
+     remainingVRAM -= mmprojSize
    }
+   const vramForMinContext = (
+     await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
+   ).size

-   for (const ctxSize of contextSizes) {
-     if (ctxSize > targetContext) break
-     const kvCacheSize = ctxSize * kvCachePerToken
-     const availableForLayers = remainingVRAM - kvCacheSize
-
-     if (availableForLayers <= 0) continue
-
-     const possibleLayers = Math.min(
-       Math.floor(availableForLayers / layerSize),
-       totalLayers
+   const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
+   if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) {
+     logger.error(
+       `Model unsupported. Not enough resources for model and min context.`
      )
-     if (possibleLayers > 0) {
-       const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
-       // Verify this fits with some margin
-       if (totalVramNeeded <= remainingVRAM * 0.95) {
-         bestConfig = {
-           layers: possibleLayers,
-           context: ctxSize,
-           vramUsed: totalVramNeeded,
-         }
-       }
+     return {
+       gpuLayers: 0,
+       maxContextLength: 0,
+       noOffloadKVCache: true,
+       mode: 'Unsupported',
+       offloadMmproj: false,
      }
    }

-   // Apply the best configuration found
-   if (bestConfig.layers > 0) {
-     gpuLayers = bestConfig.layers
-     maxContextLength = bestConfig.context
+   const targetContext = Math.min(
+     requestedCtx || modelMaxContextLength,
+     modelMaxContextLength
+   )

+   let targetContextSize = (
+     await this.estimateKVCache(gguf.metadata, targetContext)
+   ).size
+
+   // Use `kvCachePerToken` for all VRAM calculations
+   if (modelSize + targetContextSize <= remainingVRAM) {
+     mode = 'GPU'
+     gpuLayers = totalLayers
+     maxContextLength = targetContext
      noOffloadKVCache = false
-     mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
+     logger.info(
+       'Planning: Ideal case fits. All layers and target context in VRAM.'
+     )
+   } else if (modelSize <= remainingVRAM) {
+     mode = 'GPU'
+     gpuLayers = totalLayers
+     noOffloadKVCache = false
+     const vramLeftForContext = remainingVRAM - modelSize
+     maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
+
+     // Add safety check to prevent OOM
+     const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
+     maxContextLength = Math.floor(maxContextLength * safetyBuffer)

      logger.info(
-       `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
-         `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
+       `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
      )
    } else {
-     // Fallback: Try minimal GPU layers with KV cache on CPU
-     gpuLayers = Math.min(
-       Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
-       totalLayers
-     )
+     const vramAvailableForLayers = remainingVRAM - vramForMinContext

-     if (gpuLayers > 0) {
-       // Calculate available system RAM for KV cache
-       const cpuLayers = totalLayers - gpuLayers
-       const modelCPUSize = cpuLayers * layerSize
-       const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
-       const systemRAMUsed = modelCPUSize + mmprojCPUSize
-       const availableSystemRAMForKVCache = Math.max(
-         0,
-         usableSystemMemory - systemRAMUsed
+     if (vramAvailableForLayers >= layerSize) {
+       mode = 'Hybrid'
+       gpuLayers = Math.min(
+         Math.floor(vramAvailableForLayers / layerSize),
+         totalLayers
        )
+       noOffloadKVCache = false
+       const vramUsedByLayers = gpuLayers * layerSize
+       const vramLeftForContext = remainingVRAM - vramUsedByLayers
+       maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)

-       // Calculate context that fits in system RAM
-       const systemRAMContext = Math.min(
-         Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
-         targetContext
+       logger.info(
+         'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
        )
+     }
+   }

-       if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
-         maxContextLength = systemRAMContext
-         noOffloadKVCache = true
+   // Fallback logic: try different configurations if no VRAM-based plan worked
+   if (mode === 'Unsupported') {
+     logger.info('Planning: Trying fallback configurations...')

+     // Try putting some layers on GPU with KV cache in RAM
+     const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
+     if (possibleGpuLayers > 0) {
+       gpuLayers = Math.min(possibleGpuLayers, totalLayers)
+       const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
+       const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
+       const availableRamForKv =
+         usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
+       // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
+       const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
+
+       if (contextInRam >= MIN_CONTEXT_LENGTH) {
          mode = 'Hybrid'
-         logger.info(
-           `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
-             `${maxContextLength} context on CPU RAM`
-         )
-       } else {
-         // Can't fit reasonable context even with CPU RAM
-         // Reduce GPU layers further
-         gpuLayers = Math.floor(gpuLayers / 2)
-         maxContextLength = MIN_CONTEXT_LENGTH
+         maxContextLength = contextInRam
          noOffloadKVCache = true
-         mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+         logger.info(
+           `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
+         )
        }
-     } else {
-       // Pure CPU mode
+     }
+     // If still unsupported, try pure CPU mode
+     if (mode === 'Unsupported') {
        gpuLayers = 0
        noOffloadKVCache = true
-       // Calculate context for pure CPU mode
-       const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
-       const availableForKVCache = Math.max(
-         0,
-         usableSystemMemory - totalCPUMemoryNeeded
-       )
+       offloadMmproj = false
+       const ramUsedByModel = modelSize + mmprojSize
+       const availableRamForKv = usableSystemMemory - ramUsedByModel
+       maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
+       if (maxContextLength >= MIN_CONTEXT_LENGTH) {
+         mode = 'CPU'
+         logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)

-       maxContextLength = Math.min(
-         Math.max(
-           MIN_CONTEXT_LENGTH,
-           Math.floor(availableForKVCache / kvCachePerToken)
-         ),
-         targetContext
-       )
-
-       mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
-     }
-   }
-
-   // Safety check: Verify total GPU memory usage
-   if (gpuLayers > 0 && !noOffloadKVCache) {
-     const estimatedGPUUsage =
-       gpuLayers * layerSize +
-       maxContextLength * kvCachePerToken +
-       (offloadMmproj ? mmprojSize : 0)
-
-     if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-       logger.warn(
-         `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
-       )
-
-       // Reduce context first
-       while (
-         maxContextLength > MIN_CONTEXT_LENGTH &&
-         estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
-       ) {
-         maxContextLength = Math.floor(maxContextLength / 2)
-         const newEstimate =
-           gpuLayers * layerSize +
-           maxContextLength * kvCachePerToken +
-           (offloadMmproj ? mmprojSize : 0)
-         if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
-       }
-
-       // If still too much, reduce layers
-       if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-         gpuLayers = Math.floor(gpuLayers * 0.7)
-         mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
-         noOffloadKVCache = true // Move KV cache to CPU
        }
      }
    }

-   // Apply user-requested context limit if specified
+   if (mode === 'CPU' || noOffloadKVCache) {
+     offloadMmproj = false
+   }

    if (requestedCtx && requestedCtx > 0) {
      maxContextLength = Math.min(maxContextLength, requestedCtx)
-     logger.info(
-       `User requested context: ${requestedCtx}, final: ${maxContextLength}`
-     )
    }

-   // Ensure we never exceed model's maximum context
    maxContextLength = Math.min(maxContextLength, modelMaxContextLength)

-   // Final validation
-   if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
+   if (maxContextLength < MIN_CONTEXT_LENGTH) {
      mode = 'Unsupported'
    }

-   // Ensure maxContextLength is valid
-   maxContextLength = isNaN(maxContextLength)
-     ? MIN_CONTEXT_LENGTH
-     : Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
+   if (mode === 'Unsupported') {
+     gpuLayers = 0
+     maxContextLength = 0
+   }

+   maxContextLength = isNaN(maxContextLength)
+     ? 0
+     : Math.floor(maxContextLength)

-   // Log final plan
    const mmprojInfo = mmprojPath
      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
      : ''
@@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine {
      offloadMmproj,
    }
  }

  /**
   * estimate KVCache size from a given metadata
   */
  private async estimateKVCache(
    meta: Record<string, string>,
    ctx_size?: number
- ): Promise<number> {
+ ): Promise<{ size: number; perTokenSize: number }> {
    const arch = meta['general.architecture']
    if (!arch) throw new Error('Invalid metadata: architecture not found')

@@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine {
      )
    }

-   let ctxLen: number
-   if (!ctx_size) {
-     ctxLen = Number(meta[`${arch}.context_length`])
-   } else {
-     ctxLen = ctx_size
-   }
+   const maxCtx = Number(meta[`${arch}.context_length`])
+   if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
+   // b) If the user supplied a value, clamp it to the model's max
+   let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
+   logger.info(`Final context length used for KV size: ${ctxLen}`)
+   logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)

    logger.info(`ctxLen: ${ctxLen}`)
    logger.info(`nLayer: ${nLayer}`)
@@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine {
    // fp16 = 8 bits * 2 = 16
    const bytesPerElement = 2

-   // Total KV cache size per token = nHead * headDim * bytesPerElement
-   const kvPerToken = nHead * headDim * bytesPerElement
+   // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
+   const kvPerToken = nHead * headDim * bytesPerElement * nLayer

-   return ctxLen * nLayer * kvPerToken
+   return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
  }

  private async getModelSize(path: string): Promise<number> {
@@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine {
    const gguf = await readGgufMetadata(path)
    let kvCacheSize: number
    if (ctx_size) {
-     kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
+     kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
    } else {
-     kvCacheSize = await this.estimateKVCache(gguf.metadata)
+     kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
    }

    // Total memory consumption = model weights + kvcache
@@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine {
    )

    // Use 80% of total memory as the usable limit
-   const USABLE_MEMORY_PERCENTAGE = 0.8
+   const USABLE_MEMORY_PERCENTAGE = 0.9
    const usableTotalMemory =
-     memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+     memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
+     memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
    const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE

    // Check if model fits in total memory at all