fix: KVCache size calculation and refactor (#6438)

- Replaced the `getKVCachePerToken` helper with a unified `estimateKVCache` that returns both the total size and the per‑token size (see the first sketch after this list).
- Fixed the KV cache size calculation to account for all layers, correcting previous under‑estimation.
- Added proper clamping of user‑requested context lengths to the model’s maximum.
- Refactored VRAM budgeting: introduced explicit reserves, a fixed engine overhead, and separate memory‑mode multipliers applied to VRAM and system RAM (second sketch below).
- Implemented a more robust planning flow with clear GPU, Hybrid, and CPU pathways, including fallback configurations when resources are insufficient (third sketch below).
- Updated default context length handling and safety buffers to prevent OOM situations.
- Adjusted the usable memory percentage to 90% and refined logging for easier debugging.
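
The per‑token formula behind the unified estimate, including the context‑length clamp, can be summarized as below. This is a minimal sketch assuming an fp16 KV cache and the standard GGUF metadata keys; the names `KVCacheEstimate` and `estimateKVCacheSketch` are illustrative, not the extension's actual API.

```ts
// Minimal sketch of the unified estimate, assuming an fp16 KV cache and the
// standard GGUF metadata keys; names here are illustrative only.
interface KVCacheEstimate {
  size: number // total KV cache bytes for the chosen context length
  perTokenSize: number // KV cache bytes per token, summed over all layers
}

function estimateKVCacheSketch(
  meta: Record<string, string>,
  requestedCtx?: number
): KVCacheEstimate {
  const arch = meta['general.architecture']
  const nLayer = Number(meta[`${arch}.block_count`])
  const nHead = Number(meta[`${arch}.attention.head_count`])
  // GQA models store fewer KV heads than attention heads.
  const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
  const headDim = Number(meta[`${arch}.embedding_length`]) / nHead
  const maxCtx = Number(meta[`${arch}.context_length`])

  // Clamp a user-requested context length to the model's maximum.
  const ctxLen = requestedCtx ? Math.min(requestedCtx, maxCtx) : maxCtx

  // K and V, 2 bytes each (fp16), for every layer.
  const perTokenSize = nHeadKV * headDim * 2 * 2 * nLayer
  return { size: ctxLen * perTokenSize, perTokenSize }
}
```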
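The budgeting change applies the memory‑mode multiplier to VRAM and system RAM separately, after subtracting the fixed reserves. A hedged sketch of that logic follows; the constants mirror the diff below, while the `MemoryInfo` shape and the `usableBudgets` helper are assumed for illustration.

```ts
// Hedged sketch of the budgeting logic; constants mirror the diff below,
// while `MemoryInfo` and `usableBudgets` are assumed for illustration.
type MemoryMode = 'high' | 'medium' | 'low'
interface MemoryInfo {
  totalVRAM: number
  totalRAM: number
}

const GiB = 1024 * 1024 * 1024
const VRAM_RESERVE_BYTES = 0.5 * GiB // reserved for the OS and other applications
const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * GiB // scratch buffers etc.
const MEMORY_PERCENTAGES: Record<MemoryMode, number> = {
  high: 0.7,
  medium: 0.5,
  low: 0.4,
}

function usableBudgets(info: MemoryInfo, mode: MemoryMode) {
  const multiplier = MEMORY_PERCENTAGES[mode]
  // The mode multiplier is applied to VRAM and system RAM separately.
  const usableVRAM = Math.max(
    0,
    info.totalVRAM * multiplier -
      VRAM_RESERVE_BYTES -
      ENGINE_FIXED_OVERHEAD_BYTES
  )
  const usableSystemMemory = Math.max(0, info.totalRAM) * multiplier
  return { usableVRAM, usableSystemMemory }
}
```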
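The planning flow reduces to four pathways: full GPU offload, full offload with a reduced context, a hybrid split with the KV cache in system RAM, and a pure CPU fallback. The simplified sketch below captures the ordering; the `ModelPlan` fields follow the diff, while `planSketch` and its argument list are illustrative only.

```ts
// Simplified decision flow for the planner; illustrative, not the exact code.
interface ModelPlan {
  gpuLayers: number
  maxContextLength: number
  noOffloadKVCache: boolean
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}

function planSketch(
  modelSize: number,
  layerSize: number,
  totalLayers: number,
  kvCachePerToken: number,
  targetContext: number,
  usableVRAM: number,
  usableSystemMemory: number
): ModelPlan {
  const MIN_CONTEXT_LENGTH = 1024

  // 1) Everything fits in VRAM: full offload, full target context.
  if (modelSize + targetContext * kvCachePerToken <= usableVRAM) {
    return {
      gpuLayers: totalLayers,
      maxContextLength: targetContext,
      noOffloadKVCache: false,
      mode: 'GPU',
    }
  }

  // 2) Weights fit in VRAM: keep all layers on GPU, shrink the context
  //    (with a 90% safety buffer against OOM).
  if (modelSize <= usableVRAM) {
    const ctx = Math.min(
      targetContext,
      Math.floor(((usableVRAM - modelSize) / kvCachePerToken) * 0.9)
    )
    return {
      gpuLayers: totalLayers,
      maxContextLength: ctx,
      noOffloadKVCache: false,
      mode: 'GPU',
    }
  }

  // 3) Hybrid fallback: offload as many layers as fit, keep the KV cache in RAM.
  const gpuLayers = Math.min(Math.floor(usableVRAM / layerSize), totalLayers)
  const ramLeft = usableSystemMemory - (totalLayers - gpuLayers) * layerSize
  const ctxInRam = Math.min(targetContext, Math.floor(ramLeft / kvCachePerToken))
  if (gpuLayers > 0 && ctxInRam >= MIN_CONTEXT_LENGTH) {
    return {
      gpuLayers,
      maxContextLength: ctxInRam,
      noOffloadKVCache: true,
      mode: 'Hybrid',
    }
  }

  // 4) Pure CPU fallback, or Unsupported if even a minimal context cannot fit.
  const cpuCtx = Math.min(
    targetContext,
    Math.floor((usableSystemMemory - modelSize) / kvCachePerToken)
  )
  return cpuCtx >= MIN_CONTEXT_LENGTH
    ? { gpuLayers: 0, maxContextLength: cpuCtx, noOffloadKVCache: true, mode: 'CPU' }
    : { gpuLayers: 0, maxContextLength: 0, noOffloadKVCache: true, mode: 'Unsupported' }
}
```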
Akarshan Biswas 2025-09-15 10:16:13 +05:30 committed by GitHub
parent 7a2782e6fd
commit 489c5a3d9c

@@ -1742,13 +1742,13 @@ export default class llamacpp_extension extends AIEngine {
try {
const data = JSON.parse(jsonStr)
const chunk = data as chatCompletionChunk
// Check for out-of-context error conditions
if (chunk.choices?.[0]?.finish_reason === 'length') {
// finish_reason 'length' indicates context limit was hit
throw new Error(OUT_OF_CONTEXT_SIZE)
}
yield chunk
} catch (e) {
logger.error('Error parsing JSON from stream or server error:', e)
@@ -1828,13 +1828,13 @@ export default class llamacpp_extension extends AIEngine {
}
const completionResponse = (await response.json()) as chatCompletion
// Check for out-of-context error conditions
if (completionResponse.choices?.[0]?.finish_reason === 'length') {
// finish_reason 'length' indicates context limit was hit
throw new Error(OUT_OF_CONTEXT_SIZE)
}
return completionResponse
}
@@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine {
totalMemory,
}
}
private async getKVCachePerToken(
meta: Record<string, string>
): Promise<number> {
const arch = meta['general.architecture']
const nLayer = Number(meta[`${arch}.block_count`])
const nHead = Number(meta[`${arch}.attention.head_count`])
// Get head dimensions
const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
const embeddingLen = Number(meta[`${arch}.embedding_length`])
const headDim = embeddingLen / nHead
// KV cache uses head_count_kv (for GQA models) or head_count
// Each token needs K and V, both are fp16 (2 bytes)
const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
return bytesPerToken
}
private async getLayerSize(
path: string,
@@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine {
gguf.metadata
)
// Fixed KV cache calculation
const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
.perTokenSize
// Debug logging
logger.info(
`Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
)
@@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine {
throw new Error(`Invalid layer size: ${layerSize}`)
}
// GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
const GPU_OVERHEAD_FACTOR = 0.8
// VRAM budget with overhead consideration
// Reserve memory for OS, other applications, and fixed engine overhead.
const VRAM_RESERVE_GB = 0.5
const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
const usableVRAM = Math.max(
0,
(memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
)
const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.
// Get model's maximum context length
const arch = gguf.metadata['general.architecture']
const modelMaxContextLength =
Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
Number(gguf.metadata[`${arch}.context_length`]) || 8192
// Set minimum context length
const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
const MIN_CONTEXT_LENGTH = 1024
// System RAM budget
// Memory percentages applied to both VRAM and RAM
const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
logger.info(
`Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
)
// Validate memory info
if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
}
@@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine {
)
}
// Calculate actual system RAM
const actualSystemRAM = Math.max(
// Apply memory mode to both VRAM and RAM separately
const memoryModeMultiplier = memoryPercentages[this.memoryMode]
const usableVRAM = Math.max(
0,
memoryInfo.totalMemory - memoryInfo.totalVRAM
memoryInfo.totalVRAM * memoryModeMultiplier -
VRAM_RESERVE_BYTES -
ENGINE_FIXED_OVERHEAD_BYTES
)
const usableSystemMemory =
actualSystemRAM * memoryPercentages[this.memoryMode]
const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
const usableSystemMemory = actualSystemRAM * memoryModeMultiplier
logger.info(
`Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
`Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
)
// --- Priority 1: Allocate mmproj (if exists) ---
let offloadMmproj = false
let remainingVRAM = usableVRAM
if (mmprojSize > 0) {
if (mmprojSize <= remainingVRAM) {
offloadMmproj = true
remainingVRAM -= mmprojSize
logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
} else {
logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
}
}
// --- Priority 2: Calculate optimal layer/context balance ---
let gpuLayers = 0
let maxContextLength = MIN_CONTEXT_LENGTH
let maxContextLength = 0
let noOffloadKVCache = false
let mode: ModelPlan['mode'] = 'Unsupported'
let offloadMmproj = false
// Calculate how much VRAM we need for different context sizes
const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
const targetContext = requestedCtx || modelMaxContextLength
// Find the best balance of layers and context
let bestConfig = {
layers: 0,
context: MIN_CONTEXT_LENGTH,
vramUsed: 0,
let remainingVRAM = usableVRAM
if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
offloadMmproj = true
remainingVRAM -= mmprojSize
}
const vramForMinContext = (
await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
).size
for (const ctxSize of contextSizes) {
if (ctxSize > targetContext) break
const kvCacheSize = ctxSize * kvCachePerToken
const availableForLayers = remainingVRAM - kvCacheSize
if (availableForLayers <= 0) continue
const possibleLayers = Math.min(
Math.floor(availableForLayers / layerSize),
totalLayers
const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) {
logger.error(
`Model unsupported. Not enough resources for model and min context.`
)
if (possibleLayers > 0) {
const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
// Verify this fits with some margin
if (totalVramNeeded <= remainingVRAM * 0.95) {
bestConfig = {
layers: possibleLayers,
context: ctxSize,
vramUsed: totalVramNeeded,
}
}
return {
gpuLayers: 0,
maxContextLength: 0,
noOffloadKVCache: true,
mode: 'Unsupported',
offloadMmproj: false,
}
}
// Apply the best configuration found
if (bestConfig.layers > 0) {
gpuLayers = bestConfig.layers
maxContextLength = bestConfig.context
const targetContext = Math.min(
requestedCtx || modelMaxContextLength,
modelMaxContextLength
)
let targetContextSize = (
await this.estimateKVCache(gguf.metadata, targetContext)
).size
// Use `kvCachePerToken` for all VRAM calculations
if (modelSize + targetContextSize <= remainingVRAM) {
mode = 'GPU'
gpuLayers = totalLayers
maxContextLength = targetContext
noOffloadKVCache = false
mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
logger.info(
'Planning: Ideal case fits. All layers and target context in VRAM.'
)
} else if (modelSize <= remainingVRAM) {
mode = 'GPU'
gpuLayers = totalLayers
noOffloadKVCache = false
const vramLeftForContext = remainingVRAM - modelSize
maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
// Add safety check to prevent OOM
const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
maxContextLength = Math.floor(maxContextLength * safetyBuffer)
logger.info(
`Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
`VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
`Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
)
} else {
// Fallback: Try minimal GPU layers with KV cache on CPU
gpuLayers = Math.min(
Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
totalLayers
)
const vramAvailableForLayers = remainingVRAM - vramForMinContext
if (gpuLayers > 0) {
// Calculate available system RAM for KV cache
const cpuLayers = totalLayers - gpuLayers
const modelCPUSize = cpuLayers * layerSize
const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
const systemRAMUsed = modelCPUSize + mmprojCPUSize
const availableSystemRAMForKVCache = Math.max(
0,
usableSystemMemory - systemRAMUsed
if (vramAvailableForLayers >= layerSize) {
mode = 'Hybrid'
gpuLayers = Math.min(
Math.floor(vramAvailableForLayers / layerSize),
totalLayers
)
noOffloadKVCache = false
const vramUsedByLayers = gpuLayers * layerSize
const vramLeftForContext = remainingVRAM - vramUsedByLayers
maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
// Calculate context that fits in system RAM
const systemRAMContext = Math.min(
Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
targetContext
logger.info(
'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
)
}
}
if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
maxContextLength = systemRAMContext
noOffloadKVCache = true
// Fallback logic: try different configurations if no VRAM-based plan worked
if (mode === 'Unsupported') {
logger.info('Planning: Trying fallback configurations...')
// Try putting some layers on GPU with KV cache in RAM
const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
if (possibleGpuLayers > 0) {
gpuLayers = Math.min(possibleGpuLayers, totalLayers)
const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
const availableRamForKv =
usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
// Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
if (contextInRam >= MIN_CONTEXT_LENGTH) {
mode = 'Hybrid'
logger.info(
`Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
`${maxContextLength} context on CPU RAM`
)
} else {
// Can't fit reasonable context even with CPU RAM
// Reduce GPU layers further
gpuLayers = Math.floor(gpuLayers / 2)
maxContextLength = MIN_CONTEXT_LENGTH
maxContextLength = contextInRam
noOffloadKVCache = true
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
logger.info(
`Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
)
}
} else {
// Pure CPU mode
}
// If still unsupported, try pure CPU mode
if (mode === 'Unsupported') {
gpuLayers = 0
noOffloadKVCache = true
// Calculate context for pure CPU mode
const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
const availableForKVCache = Math.max(
0,
usableSystemMemory - totalCPUMemoryNeeded
)
maxContextLength = Math.min(
Math.max(
MIN_CONTEXT_LENGTH,
Math.floor(availableForKVCache / kvCachePerToken)
),
targetContext
)
mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
}
}
// Safety check: Verify total GPU memory usage
if (gpuLayers > 0 && !noOffloadKVCache) {
const estimatedGPUUsage =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(offloadMmproj ? mmprojSize : 0)
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
logger.warn(
`GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
)
// Reduce context first
while (
maxContextLength > MIN_CONTEXT_LENGTH &&
estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
) {
maxContextLength = Math.floor(maxContextLength / 2)
const newEstimate =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(offloadMmproj ? mmprojSize : 0)
if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
}
// If still too much, reduce layers
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
gpuLayers = Math.floor(gpuLayers * 0.7)
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
noOffloadKVCache = true // Move KV cache to CPU
offloadMmproj = false
const ramUsedByModel = modelSize + mmprojSize
const availableRamForKv = usableSystemMemory - ramUsedByModel
maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
if (maxContextLength >= MIN_CONTEXT_LENGTH) {
mode = 'CPU'
logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)
}
}
}
// Apply user-requested context limit if specified
if (mode === 'CPU' || noOffloadKVCache) {
offloadMmproj = false
}
if (requestedCtx && requestedCtx > 0) {
maxContextLength = Math.min(maxContextLength, requestedCtx)
logger.info(
`User requested context: ${requestedCtx}, final: ${maxContextLength}`
)
}
// Ensure we never exceed model's maximum context
maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
// Final validation
if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
if (maxContextLength < MIN_CONTEXT_LENGTH) {
mode = 'Unsupported'
}
// Ensure maxContextLength is valid
maxContextLength = isNaN(maxContextLength)
? MIN_CONTEXT_LENGTH
: Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
if (mode === 'Unsupported') {
gpuLayers = 0
maxContextLength = 0
}
maxContextLength = isNaN(maxContextLength)
? 0
: Math.floor(maxContextLength)
// Log final plan
const mmprojInfo = mmprojPath
? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
: ''
@@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine {
offloadMmproj,
}
}
/**
* estimate KVCache size from a given metadata
*/
private async estimateKVCache(
meta: Record<string, string>,
ctx_size?: number
): Promise<number> {
): Promise<{ size: number; perTokenSize: number }> {
const arch = meta['general.architecture']
if (!arch) throw new Error('Invalid metadata: architecture not found')
@@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine {
)
}
let ctxLen: number
if (!ctx_size) {
ctxLen = Number(meta[`${arch}.context_length`])
} else {
ctxLen = ctx_size
}
const maxCtx = Number(meta[`${arch}.context_length`])
if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
// b) If the user supplied a value, clamp it to the model's max
let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
logger.info(`Final context length used for KV size: ${ctxLen}`)
logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)
logger.info(`ctxLen: ${ctxLen}`)
logger.info(`nLayer: ${nLayer}`)
@@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine {
// fp16 = 16 bits = 2 bytes per element
const bytesPerElement = 2
// Total KV cache size per token = nHead * headDim * bytesPerElement
const kvPerToken = nHead * headDim * bytesPerElement
// Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
const kvPerToken = nHead * headDim * bytesPerElement * nLayer
return ctxLen * nLayer * kvPerToken
return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
}
private async getModelSize(path: string): Promise<number> {
@@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine {
const gguf = await readGgufMetadata(path)
let kvCacheSize: number
if (ctx_size) {
kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
} else {
kvCacheSize = await this.estimateKVCache(gguf.metadata)
kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
}
// Total memory consumption = model weights + kvcache
@@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine {
)
// Use 80% of total memory as the usable limit
const USABLE_MEMORY_PERCENTAGE = 0.8
const USABLE_MEMORY_PERCENTAGE = 0.9
const usableTotalMemory =
memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
// Check if model fits in total memory at all