fix: KVCache size calculation and refactor (#6438)

- Replaced the `getKVCachePerToken` helper with a unified `estimateKVCache` that returns both the total size and the per‑token size (see the first sketch after this list).
- Fixed the KV cache size calculation to account for all layers, correcting previous under‑estimation.
- Added proper clamping of user‑requested context lengths to the model’s maximum.
- Refactored VRAM budgeting: introduced explicit reserves, a fixed engine overhead, and separate memory‑mode multipliers applied to VRAM and system RAM (second sketch below).
- Implemented a more robust planning flow with clear GPU, Hybrid, and CPU pathways, including fallback configurations when resources are insufficient (third sketch below).
- Updated default context length handling and safety buffers to prevent OOM situations.
- Adjusted the usable memory percentage to 90% and refined logging for easier debugging.
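
The per‑token formula behind the unified estimate, including the context‑length clamp, can be summarized as below. This is a minimal sketch assuming an fp16 KV cache and the standard GGUF metadata keys; the names `KVCacheEstimate` and `estimateKVCacheSketch` are illustrative, not the extension's actual API.

```ts
// Minimal sketch of the unified estimate, assuming an fp16 KV cache and the
// standard GGUF metadata keys; names here are illustrative only.
interface KVCacheEstimate {
  size: number // total KV cache bytes for the chosen context length
  perTokenSize: number // KV cache bytes per token, summed over all layers
}

function estimateKVCacheSketch(
  meta: Record<string, string>,
  requestedCtx?: number
): KVCacheEstimate {
  const arch = meta['general.architecture']
  const nLayer = Number(meta[`${arch}.block_count`])
  const nHead = Number(meta[`${arch}.attention.head_count`])
  // GQA models store fewer KV heads than attention heads.
  const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
  const headDim = Number(meta[`${arch}.embedding_length`]) / nHead
  const maxCtx = Number(meta[`${arch}.context_length`])

  // Clamp a user-requested context length to the model's maximum.
  const ctxLen = requestedCtx ? Math.min(requestedCtx, maxCtx) : maxCtx

  // K and V, 2 bytes each (fp16), for every layer.
  const perTokenSize = nHeadKV * headDim * 2 * 2 * nLayer
  return { size: ctxLen * perTokenSize, perTokenSize }
}
```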
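The budgeting change applies the memory‑mode multiplier to VRAM and system RAM separately, after subtracting the fixed reserves. A hedged sketch of that logic follows; the constants mirror the diff below, while the `MemoryInfo` shape and the `usableBudgets` helper are assumed for illustration.

```ts
// Hedged sketch of the budgeting logic; constants mirror the diff below,
// while `MemoryInfo` and `usableBudgets` are assumed for illustration.
type MemoryMode = 'high' | 'medium' | 'low'
interface MemoryInfo {
  totalVRAM: number
  totalRAM: number
}

const GiB = 1024 * 1024 * 1024
const VRAM_RESERVE_BYTES = 0.5 * GiB // reserved for the OS and other applications
const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * GiB // scratch buffers etc.
const MEMORY_PERCENTAGES: Record<MemoryMode, number> = {
  high: 0.7,
  medium: 0.5,
  low: 0.4,
}

function usableBudgets(info: MemoryInfo, mode: MemoryMode) {
  const multiplier = MEMORY_PERCENTAGES[mode]
  // The mode multiplier is applied to VRAM and system RAM separately.
  const usableVRAM = Math.max(
    0,
    info.totalVRAM * multiplier -
      VRAM_RESERVE_BYTES -
      ENGINE_FIXED_OVERHEAD_BYTES
  )
  const usableSystemMemory = Math.max(0, info.totalRAM) * multiplier
  return { usableVRAM, usableSystemMemory }
}
```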
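The planning flow reduces to four pathways: full GPU offload, full offload with a reduced context, a hybrid split with the KV cache in system RAM, and a pure CPU fallback. The simplified sketch below captures the ordering; the `ModelPlan` fields follow the diff, while `planSketch` and its argument list are illustrative only.

```ts
// Simplified decision flow for the planner; illustrative, not the exact code.
interface ModelPlan {
  gpuLayers: number
  maxContextLength: number
  noOffloadKVCache: boolean
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}

function planSketch(
  modelSize: number,
  layerSize: number,
  totalLayers: number,
  kvCachePerToken: number,
  targetContext: number,
  usableVRAM: number,
  usableSystemMemory: number
): ModelPlan {
  const MIN_CONTEXT_LENGTH = 1024

  // 1) Everything fits in VRAM: full offload, full target context.
  if (modelSize + targetContext * kvCachePerToken <= usableVRAM) {
    return {
      gpuLayers: totalLayers,
      maxContextLength: targetContext,
      noOffloadKVCache: false,
      mode: 'GPU',
    }
  }

  // 2) Weights fit in VRAM: keep all layers on GPU, shrink the context
  //    (with a 90% safety buffer against OOM).
  if (modelSize <= usableVRAM) {
    const ctx = Math.min(
      targetContext,
      Math.floor(((usableVRAM - modelSize) / kvCachePerToken) * 0.9)
    )
    return {
      gpuLayers: totalLayers,
      maxContextLength: ctx,
      noOffloadKVCache: false,
      mode: 'GPU',
    }
  }

  // 3) Hybrid fallback: offload as many layers as fit, keep the KV cache in RAM.
  const gpuLayers = Math.min(Math.floor(usableVRAM / layerSize), totalLayers)
  const ramLeft = usableSystemMemory - (totalLayers - gpuLayers) * layerSize
  const ctxInRam = Math.min(targetContext, Math.floor(ramLeft / kvCachePerToken))
  if (gpuLayers > 0 && ctxInRam >= MIN_CONTEXT_LENGTH) {
    return {
      gpuLayers,
      maxContextLength: ctxInRam,
      noOffloadKVCache: true,
      mode: 'Hybrid',
    }
  }

  // 4) Pure CPU fallback, or Unsupported if even a minimal context cannot fit.
  const cpuCtx = Math.min(
    targetContext,
    Math.floor((usableSystemMemory - modelSize) / kvCachePerToken)
  )
  return cpuCtx >= MIN_CONTEXT_LENGTH
    ? { gpuLayers: 0, maxContextLength: cpuCtx, noOffloadKVCache: true, mode: 'CPU' }
    : { gpuLayers: 0, maxContextLength: 0, noOffloadKVCache: true, mode: 'Unsupported' }
}
```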
Akarshan Biswas 2025-09-15 10:16:13 +05:30 committed by GitHub
parent 7a2782e6fd
commit 489c5a3d9c

@@ -1742,13 +1742,13 @@ export default class llamacpp_extension extends AIEngine {
try {
const data = JSON.parse(jsonStr)
const chunk = data as chatCompletionChunk
// Check for out-of-context error conditions
if (chunk.choices?.[0]?.finish_reason === 'length') {
// finish_reason 'length' indicates context limit was hit
throw new Error(OUT_OF_CONTEXT_SIZE)
}
yield chunk
} catch (e) {
logger.error('Error parsing JSON from stream or server error:', e)
@@ -1828,13 +1828,13 @@ export default class llamacpp_extension extends AIEngine {
}
const completionResponse = (await response.json()) as chatCompletion
// Check for out-of-context error conditions
if (completionResponse.choices?.[0]?.finish_reason === 'length') {
// finish_reason 'length' indicates context limit was hit
throw new Error(OUT_OF_CONTEXT_SIZE)
}
return completionResponse
}
@@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine {
totalMemory,
}
}
private async getKVCachePerToken(
meta: Record<string, string>
): Promise<number> {
const arch = meta['general.architecture']
const nLayer = Number(meta[`${arch}.block_count`])
const nHead = Number(meta[`${arch}.attention.head_count`])
// Get head dimensions
const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
const embeddingLen = Number(meta[`${arch}.embedding_length`])
const headDim = embeddingLen / nHead
// KV cache uses head_count_kv (for GQA models) or head_count
// Each token needs K and V, both are fp16 (2 bytes)
const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
return bytesPerToken
}
private async getLayerSize(
path: string,
@@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine {
gguf.metadata
)
// Fixed KV cache calculation
const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
.perTokenSize
// Debug logging
logger.info(
`Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
)
@@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine {
throw new Error(`Invalid layer size: ${layerSize}`)
}
// GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
const GPU_OVERHEAD_FACTOR = 0.8
// VRAM budget with overhead consideration
// Reserve memory for OS, other applications, and fixed engine overhead.
const VRAM_RESERVE_GB = 0.5
const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
const usableVRAM = Math.max(
0,
(memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
)
const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.
// Get model's maximum context length
const arch = gguf.metadata['general.architecture']
const modelMaxContextLength =
Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
Number(gguf.metadata[`${arch}.context_length`]) || 8192
// Set minimum context length
const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
const MIN_CONTEXT_LENGTH = 1024
// System RAM budget
// Memory percentages applied to both VRAM and RAM
const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
logger.info(
`Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
)
// Validate memory info
if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
}
@@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine {
)
}
// Calculate actual system RAM
const actualSystemRAM = Math.max(
// Apply memory mode to both VRAM and RAM separately
const memoryModeMultiplier = memoryPercentages[this.memoryMode]
const usableVRAM = Math.max(
0,
memoryInfo.totalMemory - memoryInfo.totalVRAM
memoryInfo.totalVRAM * memoryModeMultiplier -
VRAM_RESERVE_BYTES -
ENGINE_FIXED_OVERHEAD_BYTES
)
const usableSystemMemory =
actualSystemRAM * memoryPercentages[this.memoryMode]
const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
const usableSystemMemory = actualSystemRAM * memoryModeMultiplier
logger.info(
`Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
`Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
)
// --- Priority 1: Allocate mmproj (if exists) ---
let offloadMmproj = false
let remainingVRAM = usableVRAM
if (mmprojSize > 0) {
if (mmprojSize <= remainingVRAM) {
offloadMmproj = true
remainingVRAM -= mmprojSize
logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
} else {
logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
}
}
// --- Priority 2: Calculate optimal layer/context balance ---
let gpuLayers = 0
let maxContextLength = MIN_CONTEXT_LENGTH
let maxContextLength = 0
let noOffloadKVCache = false
let mode: ModelPlan['mode'] = 'Unsupported'
let offloadMmproj = false
// Calculate how much VRAM we need for different context sizes
const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
const targetContext = requestedCtx || modelMaxContextLength
// Find the best balance of layers and context
let bestConfig = {
layers: 0,
context: MIN_CONTEXT_LENGTH,
vramUsed: 0,
let remainingVRAM = usableVRAM
if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
offloadMmproj = true
remainingVRAM -= mmprojSize
}
const vramForMinContext = (
await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
).size
for (const ctxSize of contextSizes) {
if (ctxSize > targetContext) break
const kvCacheSize = ctxSize * kvCachePerToken
const availableForLayers = remainingVRAM - kvCacheSize
if (availableForLayers <= 0) continue
const possibleLayers = Math.min(
Math.floor(availableForLayers / layerSize),
totalLayers
const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) {
logger.error(
`Model unsupported. Not enough resources for model and min context.`
)
if (possibleLayers > 0) {
const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
// Verify this fits with some margin
if (totalVramNeeded <= remainingVRAM * 0.95) {
bestConfig = {
layers: possibleLayers,
context: ctxSize,
vramUsed: totalVramNeeded,
}
}
return {
gpuLayers: 0,
maxContextLength: 0,
noOffloadKVCache: true,
mode: 'Unsupported',
offloadMmproj: false,
}
}
// Apply the best configuration found
if (bestConfig.layers > 0) {
gpuLayers = bestConfig.layers
maxContextLength = bestConfig.context
const targetContext = Math.min(
requestedCtx || modelMaxContextLength,
modelMaxContextLength
)
let targetContextSize = (
await this.estimateKVCache(gguf.metadata, targetContext)
).size
// Use `kvCachePerToken` for all VRAM calculations
if (modelSize + targetContextSize <= remainingVRAM) {
mode = 'GPU'
gpuLayers = totalLayers
maxContextLength = targetContext
noOffloadKVCache = false
mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
logger.info(
'Planning: Ideal case fits. All layers and target context in VRAM.'
)
} else if (modelSize <= remainingVRAM) {
mode = 'GPU'
gpuLayers = totalLayers
noOffloadKVCache = false
const vramLeftForContext = remainingVRAM - modelSize
maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
// Add safety check to prevent OOM
const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
maxContextLength = Math.floor(maxContextLength * safetyBuffer)
logger.info(
`Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
`VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
`Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
)
} else {
// Fallback: Try minimal GPU layers with KV cache on CPU
gpuLayers = Math.min(
Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
totalLayers
)
const vramAvailableForLayers = remainingVRAM - vramForMinContext
if (gpuLayers > 0) {
// Calculate available system RAM for KV cache
const cpuLayers = totalLayers - gpuLayers
const modelCPUSize = cpuLayers * layerSize
const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
const systemRAMUsed = modelCPUSize + mmprojCPUSize
const availableSystemRAMForKVCache = Math.max(
0,
usableSystemMemory - systemRAMUsed
if (vramAvailableForLayers >= layerSize) {
mode = 'Hybrid'
gpuLayers = Math.min(
Math.floor(vramAvailableForLayers / layerSize),
totalLayers
)
noOffloadKVCache = false
const vramUsedByLayers = gpuLayers * layerSize
const vramLeftForContext = remainingVRAM - vramUsedByLayers
maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
// Calculate context that fits in system RAM
const systemRAMContext = Math.min(
Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
targetContext
logger.info(
'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
)
}
}
if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
maxContextLength = systemRAMContext
noOffloadKVCache = true
// Fallback logic: try different configurations if no VRAM-based plan worked
if (mode === 'Unsupported') {
logger.info('Planning: Trying fallback configurations...')
// Try putting some layers on GPU with KV cache in RAM
const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
if (possibleGpuLayers > 0) {
gpuLayers = Math.min(possibleGpuLayers, totalLayers)
const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
const availableRamForKv =
usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
// Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
if (contextInRam >= MIN_CONTEXT_LENGTH) {
mode = 'Hybrid'
logger.info(
`Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
`${maxContextLength} context on CPU RAM`
)
} else {
// Can't fit reasonable context even with CPU RAM
// Reduce GPU layers further
gpuLayers = Math.floor(gpuLayers / 2)
maxContextLength = MIN_CONTEXT_LENGTH
maxContextLength = contextInRam
noOffloadKVCache = true
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
logger.info(
`Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
)
}
} else {
// Pure CPU mode
}
// If still unsupported, try pure CPU mode
if (mode === 'Unsupported') {
gpuLayers = 0
noOffloadKVCache = true
// Calculate context for pure CPU mode
const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
const availableForKVCache = Math.max(
0,
usableSystemMemory - totalCPUMemoryNeeded
)
maxContextLength = Math.min(
Math.max(
MIN_CONTEXT_LENGTH,
Math.floor(availableForKVCache / kvCachePerToken)
),
targetContext
)
mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
}
}
// Safety check: Verify total GPU memory usage
if (gpuLayers > 0 && !noOffloadKVCache) {
const estimatedGPUUsage =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(offloadMmproj ? mmprojSize : 0)
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
logger.warn(
`GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
)
// Reduce context first
while (
maxContextLength > MIN_CONTEXT_LENGTH &&
estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
) {
maxContextLength = Math.floor(maxContextLength / 2)
const newEstimate =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(offloadMmproj ? mmprojSize : 0)
if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
}
// If still too much, reduce layers
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
gpuLayers = Math.floor(gpuLayers * 0.7)
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
noOffloadKVCache = true // Move KV cache to CPU
offloadMmproj = false
const ramUsedByModel = modelSize + mmprojSize
const availableRamForKv = usableSystemMemory - ramUsedByModel
maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
if (maxContextLength >= MIN_CONTEXT_LENGTH) {
mode = 'CPU'
logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)
}
}
}
// Apply user-requested context limit if specified
if (mode === 'CPU' || noOffloadKVCache) {
offloadMmproj = false
}
if (requestedCtx && requestedCtx > 0) {
maxContextLength = Math.min(maxContextLength, requestedCtx)
logger.info(
`User requested context: ${requestedCtx}, final: ${maxContextLength}`
)
}
// Ensure we never exceed model's maximum context
maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
// Final validation
if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
if (maxContextLength < MIN_CONTEXT_LENGTH) {
mode = 'Unsupported'
}
// Ensure maxContextLength is valid
maxContextLength = isNaN(maxContextLength)
? MIN_CONTEXT_LENGTH
: Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
if (mode === 'Unsupported') {
gpuLayers = 0
maxContextLength = 0
}
maxContextLength = isNaN(maxContextLength)
? 0
: Math.floor(maxContextLength)
// Log final plan
const mmprojInfo = mmprojPath
? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
: ''
@@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine {
offloadMmproj,
}
}
/**
* estimate KVCache size from a given metadata
*/
private async estimateKVCache(
meta: Record<string, string>,
ctx_size?: number
): Promise<number> {
): Promise<{ size: number; perTokenSize: number }> {
const arch = meta['general.architecture']
if (!arch) throw new Error('Invalid metadata: architecture not found')
@@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine {
)
}
let ctxLen: number
if (!ctx_size) {
ctxLen = Number(meta[`${arch}.context_length`])
} else {
ctxLen = ctx_size
}
const maxCtx = Number(meta[`${arch}.context_length`])
if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
// b) If the user supplied a value, clamp it to the model's max
let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
logger.info(`Final context length used for KV size: ${ctxLen}`)
logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)
logger.info(`ctxLen: ${ctxLen}`)
logger.info(`nLayer: ${nLayer}`)
@@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine {
// fp16 = 16 bits = 2 bytes per element
const bytesPerElement = 2
// Total KV cache size per token = nHead * headDim * bytesPerElement
const kvPerToken = nHead * headDim * bytesPerElement
// Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
const kvPerToken = nHead * headDim * bytesPerElement * nLayer
return ctxLen * nLayer * kvPerToken
return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
}
private async getModelSize(path: string): Promise<number> {
@@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine {
const gguf = await readGgufMetadata(path)
let kvCacheSize: number
if (ctx_size) {
kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
} else {
kvCacheSize = await this.estimateKVCache(gguf.metadata)
kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
}
// Total memory consumption = model weights + kvcache
@@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine {
)
// Use 80% of total memory as the usable limit
const USABLE_MEMORY_PERCENTAGE = 0.8
const USABLE_MEMORY_PERCENTAGE = 0.9
const usableTotalMemory =
memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
// Check if model fits in total memory at all