fix: KVCache size calculation and refactor (#6438)

- Removed the redundant `getKVCachePerToken` helper and folded its per‑token math into a unified `estimateKVCache` that returns both the total size and the per‑token size (see the sketch after this list).
- Fixed the KV cache size calculation to account for all layers, correcting previous under‑estimation.
- Added proper clamping of user‑requested context lengths to the model’s maximum.
- Refactored VRAM budgeting: introduced an explicit VRAM reserve, a fixed engine‑overhead allowance, and separate memory‑mode multipliers for VRAM and system RAM.
- Implemented a more robust planning flow with clear GPU, Hybrid, and CPU pathways, including fallback configurations when resources are insufficient.
- Updated default context length handling and safety buffers to prevent OOM situations.
- Adjusted the usable memory percentage to 90% and refined logging for easier debugging.
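
For reference, here is a small standalone sketch (not the extension's actual code) of the corrected per‑token KV‑cache estimate and the context‑length clamping described above. It mirrors the GQA‑aware math of the removed helper combined with the new clamp‑to‑model‑max behaviour; the function name `estimateKVCacheSketch`, the example metadata values, and the assumption of fp16 K/V tensors are illustrative only.

```ts
// Standalone sketch: estimates KV-cache size from GGUF-style metadata and
// clamps a requested context length to the model's maximum.
// Assumes fp16 K and V (2 bytes per element); keys follow llama.cpp GGUF naming.

interface KVCacheEstimate {
  size: number // total bytes for ctxLen tokens
  perTokenSize: number // bytes per token across all layers
}

function estimateKVCacheSketch(
  meta: Record<string, string>,
  requestedCtx?: number
): KVCacheEstimate {
  const arch = meta['general.architecture']
  if (!arch) throw new Error('Invalid metadata: architecture not found')

  const nLayer = Number(meta[`${arch}.block_count`])
  const nHead = Number(meta[`${arch}.attention.head_count`])
  // GQA models store a smaller KV head count; fall back to head_count otherwise
  const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
  const headDim = Number(meta[`${arch}.embedding_length`]) / nHead

  const maxCtx = Number(meta[`${arch}.context_length`])
  if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
  // Clamp a user-supplied context length to the model's maximum
  const ctxLen = requestedCtx ? Math.min(requestedCtx, maxCtx) : maxCtx

  const bytesPerElement = 2 // fp16
  // K and V per token, per layer: nHeadKV * headDim elements each, over all layers
  const perTokenSize = nHeadKV * headDim * 2 * bytesPerElement * nLayer

  return { size: ctxLen * perTokenSize, perTokenSize }
}

// Illustrative Llama-style metadata (values are made up for the example)
const example = estimateKVCacheSketch(
  {
    'general.architecture': 'llama',
    'llama.block_count': '32',
    'llama.attention.head_count': '32',
    'llama.attention.head_count_kv': '8',
    'llama.embedding_length': '4096',
    'llama.context_length': '131072',
  },
  8192
)
console.log(example) // { size: 1073741824, perTokenSize: 131072 }
```

With the illustrative metadata above (32 layers, 8 KV heads, head dim 128), the per‑token cost is 128 KiB, so an 8192‑token context needs exactly 1 GiB of KV cache.
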
Akarshan Biswas 2025-09-15 10:16:13 +05:30 committed by GitHub
parent 7a2782e6fd
commit 489c5a3d9c


@@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine {
       totalMemory,
     }
   }

-  private async getKVCachePerToken(
-    meta: Record<string, string>
-  ): Promise<number> {
-    const arch = meta['general.architecture']
-    const nLayer = Number(meta[`${arch}.block_count`])
-    const nHead = Number(meta[`${arch}.attention.head_count`])
-
-    // Get head dimensions
-    const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
-    const embeddingLen = Number(meta[`${arch}.embedding_length`])
-    const headDim = embeddingLen / nHead
-
-    // KV cache uses head_count_kv (for GQA models) or head_count
-    // Each token needs K and V, both are fp16 (2 bytes)
-    const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
-
-    return bytesPerToken
-  }
-
   private async getLayerSize(
     path: string,
@@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine {
       gguf.metadata
     )

-    // Fixed KV cache calculation
-    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+    const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
+      .perTokenSize

-    // Debug logging
     logger.info(
       `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
     )
@@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine {
       throw new Error(`Invalid layer size: ${layerSize}`)
     }

-    // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
-    const GPU_OVERHEAD_FACTOR = 0.8
-
-    // VRAM budget with overhead consideration
+    // Reserve memory for OS, other applications, and fixed engine overhead.
     const VRAM_RESERVE_GB = 0.5
     const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
-    const usableVRAM = Math.max(
-      0,
-      (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
-    )
+    const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.

     // Get model's maximum context length
     const arch = gguf.metadata['general.architecture']
     const modelMaxContextLength =
-      Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
+      Number(gguf.metadata[`${arch}.context_length`]) || 8192

-    // Set minimum context length
-    const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
+    const MIN_CONTEXT_LENGTH = 1024

-    // System RAM budget
+    // Memory percentages applied to both VRAM and RAM
     const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }

     logger.info(
       `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
     )

-    // Validate memory info
     if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
       throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
     }
@@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine {
       )
     }

-    // Calculate actual system RAM
-    const actualSystemRAM = Math.max(
-      0,
-      memoryInfo.totalMemory - memoryInfo.totalVRAM
-    )
-    const usableSystemMemory =
-      actualSystemRAM * memoryPercentages[this.memoryMode]
+    // Apply memory mode to both VRAM and RAM separately
+    const memoryModeMultiplier = memoryPercentages[this.memoryMode]
+    const usableVRAM = Math.max(
+      0,
+      memoryInfo.totalVRAM * memoryModeMultiplier -
+        VRAM_RESERVE_BYTES -
+        ENGINE_FIXED_OVERHEAD_BYTES
+    )
+
+    const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
+    const usableSystemMemory = actualSystemRAM * memoryModeMultiplier

     logger.info(
-      `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
+      `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
     )

-    // --- Priority 1: Allocate mmproj (if exists) ---
-    let offloadMmproj = false
-    let remainingVRAM = usableVRAM
-
-    if (mmprojSize > 0) {
-      if (mmprojSize <= remainingVRAM) {
-        offloadMmproj = true
-        remainingVRAM -= mmprojSize
-        logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
-      } else {
-        logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
-      }
-    }
-
-    // --- Priority 2: Calculate optimal layer/context balance ---
     let gpuLayers = 0
-    let maxContextLength = MIN_CONTEXT_LENGTH
+    let maxContextLength = 0
     let noOffloadKVCache = false
     let mode: ModelPlan['mode'] = 'Unsupported'
+    let offloadMmproj = false

-    // Calculate how much VRAM we need for different context sizes
-    const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
-    const targetContext = requestedCtx || modelMaxContextLength
-
-    // Find the best balance of layers and context
-    let bestConfig = {
-      layers: 0,
-      context: MIN_CONTEXT_LENGTH,
-      vramUsed: 0,
-    }
-
-    for (const ctxSize of contextSizes) {
-      if (ctxSize > targetContext) break
-
-      const kvCacheSize = ctxSize * kvCachePerToken
-      const availableForLayers = remainingVRAM - kvCacheSize
-      if (availableForLayers <= 0) continue
-
-      const possibleLayers = Math.min(
-        Math.floor(availableForLayers / layerSize),
-        totalLayers
-      )
-
-      if (possibleLayers > 0) {
-        const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
-
-        // Verify this fits with some margin
-        if (totalVramNeeded <= remainingVRAM * 0.95) {
-          bestConfig = {
-            layers: possibleLayers,
-            context: ctxSize,
-            vramUsed: totalVramNeeded,
-          }
-        }
-      }
-    }
-
-    // Apply the best configuration found
-    if (bestConfig.layers > 0) {
-      gpuLayers = bestConfig.layers
-      maxContextLength = bestConfig.context
-      noOffloadKVCache = false
-      mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
-
-      // Add safety check to prevent OOM
-      const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
-      maxContextLength = Math.floor(maxContextLength * safetyBuffer)
-
-      logger.info(
-        `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
-          `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
-      )
-    } else {
-      // Fallback: Try minimal GPU layers with KV cache on CPU
-      gpuLayers = Math.min(
-        Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
-        totalLayers
-      )
-
-      if (gpuLayers > 0) {
-        // Calculate available system RAM for KV cache
-        const cpuLayers = totalLayers - gpuLayers
-        const modelCPUSize = cpuLayers * layerSize
-        const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
-        const systemRAMUsed = modelCPUSize + mmprojCPUSize
-        const availableSystemRAMForKVCache = Math.max(
-          0,
-          usableSystemMemory - systemRAMUsed
-        )
-
-        // Calculate context that fits in system RAM
-        const systemRAMContext = Math.min(
-          Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
-          targetContext
-        )
-
-        if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
-          maxContextLength = systemRAMContext
-          noOffloadKVCache = true
-          mode = 'Hybrid'
-          logger.info(
-            `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
-              `${maxContextLength} context on CPU RAM`
-          )
-        } else {
-          // Can't fit reasonable context even with CPU RAM
-          // Reduce GPU layers further
-          gpuLayers = Math.floor(gpuLayers / 2)
-          maxContextLength = MIN_CONTEXT_LENGTH
-          noOffloadKVCache = true
-          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
-        }
-      } else {
-        // Pure CPU mode
-        gpuLayers = 0
-        noOffloadKVCache = true
-
-        // Calculate context for pure CPU mode
-        const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
-        const availableForKVCache = Math.max(
-          0,
-          usableSystemMemory - totalCPUMemoryNeeded
-        )
-        maxContextLength = Math.min(
-          Math.max(
-            MIN_CONTEXT_LENGTH,
-            Math.floor(availableForKVCache / kvCachePerToken)
-          ),
-          targetContext
-        )
-        mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
-      }
-    }
-
-    // Safety check: Verify total GPU memory usage
-    if (gpuLayers > 0 && !noOffloadKVCache) {
-      const estimatedGPUUsage =
-        gpuLayers * layerSize +
-        maxContextLength * kvCachePerToken +
-        (offloadMmproj ? mmprojSize : 0)
-
-      if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-        logger.warn(
-          `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
-        )
-        // Reduce context first
-        while (
-          maxContextLength > MIN_CONTEXT_LENGTH &&
-          estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
-        ) {
-          maxContextLength = Math.floor(maxContextLength / 2)
-          const newEstimate =
-            gpuLayers * layerSize +
-            maxContextLength * kvCachePerToken +
-            (offloadMmproj ? mmprojSize : 0)
-          if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
-        }
-
-        // If still too much, reduce layers
-        if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-          gpuLayers = Math.floor(gpuLayers * 0.7)
-          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
-          noOffloadKVCache = true // Move KV cache to CPU
-        }
-      }
-    }
-
-    // Apply user-requested context limit if specified
+    let remainingVRAM = usableVRAM
+    if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
+      offloadMmproj = true
+      remainingVRAM -= mmprojSize
+    }
+
+    const vramForMinContext = (
+      await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
+    ).size
+
+    const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
+    if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) {
+      logger.error(
+        `Model unsupported. Not enough resources for model and min context.`
+      )
+      return {
+        gpuLayers: 0,
+        maxContextLength: 0,
+        noOffloadKVCache: true,
+        mode: 'Unsupported',
+        offloadMmproj: false,
+      }
+    }
+
+    const targetContext = Math.min(
+      requestedCtx || modelMaxContextLength,
+      modelMaxContextLength
+    )
+    let targetContextSize = (
+      await this.estimateKVCache(gguf.metadata, targetContext)
+    ).size
+
+    // Use `kvCachePerToken` for all VRAM calculations
+    if (modelSize + targetContextSize <= remainingVRAM) {
+      mode = 'GPU'
+      gpuLayers = totalLayers
+      maxContextLength = targetContext
+      noOffloadKVCache = false
+      logger.info(
+        'Planning: Ideal case fits. All layers and target context in VRAM.'
+      )
+    } else if (modelSize <= remainingVRAM) {
+      mode = 'GPU'
+      gpuLayers = totalLayers
+      noOffloadKVCache = false
+      const vramLeftForContext = remainingVRAM - modelSize
+      maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
+      logger.info(
+        `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
+      )
+    } else {
+      const vramAvailableForLayers = remainingVRAM - vramForMinContext
+      if (vramAvailableForLayers >= layerSize) {
+        mode = 'Hybrid'
+        gpuLayers = Math.min(
+          Math.floor(vramAvailableForLayers / layerSize),
+          totalLayers
+        )
+        noOffloadKVCache = false
+        const vramUsedByLayers = gpuLayers * layerSize
+        const vramLeftForContext = remainingVRAM - vramUsedByLayers
+        maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
+        logger.info(
+          'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
+        )
+      }
+    }
+
+    // Fallback logic: try different configurations if no VRAM-based plan worked
+    if (mode === 'Unsupported') {
+      logger.info('Planning: Trying fallback configurations...')
+
+      // Try putting some layers on GPU with KV cache in RAM
+      const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
+      if (possibleGpuLayers > 0) {
+        gpuLayers = Math.min(possibleGpuLayers, totalLayers)
+        const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
+        const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
+        const availableRamForKv =
+          usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
+        // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
+        const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
+        if (contextInRam >= MIN_CONTEXT_LENGTH) {
+          mode = 'Hybrid'
+          maxContextLength = contextInRam
+          noOffloadKVCache = true
+          logger.info(
+            `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
+          )
+        }
+      }
+
+      // If still unsupported, try pure CPU mode
+      if (mode === 'Unsupported') {
+        gpuLayers = 0
+        noOffloadKVCache = true
+        offloadMmproj = false
+        const ramUsedByModel = modelSize + mmprojSize
+        const availableRamForKv = usableSystemMemory - ramUsedByModel
+        maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
+        if (maxContextLength >= MIN_CONTEXT_LENGTH) {
+          mode = 'CPU'
+          logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)
+        }
+      }
+    }
+
+    if (mode === 'CPU' || noOffloadKVCache) {
+      offloadMmproj = false
+    }
+
     if (requestedCtx && requestedCtx > 0) {
       maxContextLength = Math.min(maxContextLength, requestedCtx)
-      logger.info(
-        `User requested context: ${requestedCtx}, final: ${maxContextLength}`
-      )
     }

-    // Ensure we never exceed model's maximum context
     maxContextLength = Math.min(maxContextLength, modelMaxContextLength)

-    // Final validation
-    if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
+    if (maxContextLength < MIN_CONTEXT_LENGTH) {
       mode = 'Unsupported'
     }

-    // Ensure maxContextLength is valid
-    maxContextLength = isNaN(maxContextLength)
-      ? MIN_CONTEXT_LENGTH
-      : Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
+    if (mode === 'Unsupported') {
+      gpuLayers = 0
+      maxContextLength = 0
+    }
+    maxContextLength = isNaN(maxContextLength)
+      ? 0
+      : Math.floor(maxContextLength)

-    // Log final plan
     const mmprojInfo = mmprojPath
       ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
       : ''
@@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine {
       offloadMmproj,
     }
   }

   /**
    * estimate KVCache size from a given metadata
    */
   private async estimateKVCache(
     meta: Record<string, string>,
     ctx_size?: number
-  ): Promise<number> {
+  ): Promise<{ size: number; perTokenSize: number }> {
     const arch = meta['general.architecture']
     if (!arch) throw new Error('Invalid metadata: architecture not found')
@@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine {
       )
     }

-    let ctxLen: number
-    if (!ctx_size) {
-      ctxLen = Number(meta[`${arch}.context_length`])
-    } else {
-      ctxLen = ctx_size
-    }
+    const maxCtx = Number(meta[`${arch}.context_length`])
+    if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
+
+    // b) If the user supplied a value, clamp it to the model's max
+    let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
+
+    logger.info(`Final context length used for KV size: ${ctxLen}`)
+    logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)

     logger.info(`ctxLen: ${ctxLen}`)
     logger.info(`nLayer: ${nLayer}`)
@@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine {
     // fp16 = 8 bits * 2 = 16
     const bytesPerElement = 2

-    // Total KV cache size per token = nHead * headDim * bytesPerElement
-    const kvPerToken = nHead * headDim * bytesPerElement
-    return ctxLen * nLayer * kvPerToken
+    // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
+    const kvPerToken = nHead * headDim * bytesPerElement * nLayer
+    return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
   }

   private async getModelSize(path: string): Promise<number> {
@@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine {
     const gguf = await readGgufMetadata(path)
     let kvCacheSize: number
     if (ctx_size) {
-      kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
+      kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
     } else {
-      kvCacheSize = await this.estimateKVCache(gguf.metadata)
+      kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
     }

     // Total memory consumption = model weights + kvcache
@@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine {
     )

     // Use 80% of total memory as the usable limit
-    const USABLE_MEMORY_PERCENTAGE = 0.8
+    const USABLE_MEMORY_PERCENTAGE = 0.9
     const usableTotalMemory =
-      memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+      memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
+      memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
     const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE

     // Check if model fits in total memory at all