diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 1d98d4213..efccca679 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -1742,13 +1742,13 @@ export default class llamacpp_extension extends AIEngine {
           try {
             const data = JSON.parse(jsonStr)
             const chunk = data as chatCompletionChunk
-
+
             // Check for out-of-context error conditions
             if (chunk.choices?.[0]?.finish_reason === 'length') {
               // finish_reason 'length' indicates context limit was hit
               throw new Error(OUT_OF_CONTEXT_SIZE)
             }
-
+
             yield chunk
           } catch (e) {
             logger.error('Error parsing JSON from stream or server error:', e)
@@ -1828,13 +1828,13 @@ export default class llamacpp_extension extends AIEngine {
     }
 
     const completionResponse = (await response.json()) as chatCompletion
-
+
     // Check for out-of-context error conditions
     if (completionResponse.choices?.[0]?.finish_reason === 'length') {
       // finish_reason 'length' indicates context limit was hit
       throw new Error(OUT_OF_CONTEXT_SIZE)
     }
-
+
     return completionResponse
   }
 
@@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine {
       totalMemory,
     }
  }
-  private async getKVCachePerToken(
-    meta: Record<string, any>
-  ): Promise<number> {
-    const arch = meta['general.architecture']
-    const nLayer = Number(meta[`${arch}.block_count`])
-    const nHead = Number(meta[`${arch}.attention.head_count`])
-
-    // Get head dimensions
-    const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
-    const embeddingLen = Number(meta[`${arch}.embedding_length`])
-    const headDim = embeddingLen / nHead
-
-    // KV cache uses head_count_kv (for GQA models) or head_count
-    // Each token needs K and V, both are fp16 (2 bytes)
-    const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
-
-    return bytesPerToken
-  }
 
   private async getLayerSize(
     path: string,
@@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine {
       gguf.metadata
     )
 
-    // Fixed KV cache calculation
-    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+    const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
+      .perTokenSize
 
-    // Debug logging
     logger.info(
       `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
    )
@@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine {
       throw new Error(`Invalid layer size: ${layerSize}`)
     }
 
-    // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
-    const GPU_OVERHEAD_FACTOR = 0.8
-
-    // VRAM budget with overhead consideration
+    // Reserve memory for OS, other applications, and fixed engine overhead.
     const VRAM_RESERVE_GB = 0.5
     const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
-    const usableVRAM = Math.max(
-      0,
-      (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
-    )
+    const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.
 
     // Get model's maximum context length
     const arch = gguf.metadata['general.architecture']
     const modelMaxContextLength =
-      Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
+      Number(gguf.metadata[`${arch}.context_length`]) || 8192
 
-    // Set minimum context length
-    const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
+    const MIN_CONTEXT_LENGTH = 1024
 
-    // System RAM budget
+    // Memory percentages applied to both VRAM and RAM
     const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
 
     logger.info(
       `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
     )
 
-    // Validate memory info
     if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
       throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
     }
@@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine {
       )
     }
 
-    // Calculate actual system RAM
-    const actualSystemRAM = Math.max(
+    // Apply memory mode to both VRAM and RAM separately
+    const memoryModeMultiplier = memoryPercentages[this.memoryMode]
+    const usableVRAM = Math.max(
       0,
-      memoryInfo.totalMemory - memoryInfo.totalVRAM
+      memoryInfo.totalVRAM * memoryModeMultiplier -
+        VRAM_RESERVE_BYTES -
+        ENGINE_FIXED_OVERHEAD_BYTES
     )
-    const usableSystemMemory =
-      actualSystemRAM * memoryPercentages[this.memoryMode]
+
+    const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
+    const usableSystemMemory = actualSystemRAM * memoryModeMultiplier
 
     logger.info(
-      `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
+      `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
     )
 
-    // --- Priority 1: Allocate mmproj (if exists) ---
-    let offloadMmproj = false
-    let remainingVRAM = usableVRAM
-
-    if (mmprojSize > 0) {
-      if (mmprojSize <= remainingVRAM) {
-        offloadMmproj = true
-        remainingVRAM -= mmprojSize
-        logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
-      } else {
-        logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
-      }
-    }
-
-    // --- Priority 2: Calculate optimal layer/context balance ---
     let gpuLayers = 0
-    let maxContextLength = MIN_CONTEXT_LENGTH
+    let maxContextLength = 0
     let noOffloadKVCache = false
     let mode: ModelPlan['mode'] = 'Unsupported'
+    let offloadMmproj = false
 
-    // Calculate how much VRAM we need for different context sizes
-    const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
-    const targetContext = requestedCtx || modelMaxContextLength
-
-    // Find the best balance of layers and context
-    let bestConfig = {
-      layers: 0,
-      context: MIN_CONTEXT_LENGTH,
-      vramUsed: 0,
+    let remainingVRAM = usableVRAM
+    if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
+      offloadMmproj = true
+      remainingVRAM -= mmprojSize
     }
+    const vramForMinContext = (
+      await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
+    ).size
 
-    for (const ctxSize of contextSizes) {
-      if (ctxSize > targetContext) break
-
-      const kvCacheSize = ctxSize * kvCachePerToken
-      const availableForLayers = remainingVRAM - kvCacheSize
-
-      if (availableForLayers <= 0) continue
-
-      const possibleLayers = Math.min(
-        Math.floor(availableForLayers / layerSize),
-        totalLayers
+    const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
+    if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) {
+      logger.error(
+        `Model unsupported. Not enough resources for model and min context.`
      )
-
-      if (possibleLayers > 0) {
-        const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
-
-        // Verify this fits with some margin
-        if (totalVramNeeded <= remainingVRAM * 0.95) {
-          bestConfig = {
-            layers: possibleLayers,
-            context: ctxSize,
-            vramUsed: totalVramNeeded,
-          }
-        }
+      return {
+        gpuLayers: 0,
+        maxContextLength: 0,
+        noOffloadKVCache: true,
+        mode: 'Unsupported',
+        offloadMmproj: false,
      }
     }
 
-    // Apply the best configuration found
-    if (bestConfig.layers > 0) {
-      gpuLayers = bestConfig.layers
-      maxContextLength = bestConfig.context
+    const targetContext = Math.min(
+      requestedCtx || modelMaxContextLength,
+      modelMaxContextLength
+    )
+
+    let targetContextSize = (
+      await this.estimateKVCache(gguf.metadata, targetContext)
+    ).size
+
+    // Use `kvCachePerToken` for all VRAM calculations
+    if (modelSize + targetContextSize <= remainingVRAM) {
+      mode = 'GPU'
+      gpuLayers = totalLayers
+      maxContextLength = targetContext
      noOffloadKVCache = false
-      mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
+      logger.info(
+        'Planning: Ideal case fits. All layers and target context in VRAM.'
+      )
+    } else if (modelSize <= remainingVRAM) {
+      mode = 'GPU'
+      gpuLayers = totalLayers
+      noOffloadKVCache = false
+      const vramLeftForContext = remainingVRAM - modelSize
+      maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
+
+      // Add safety check to prevent OOM
+      const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
+      maxContextLength = Math.floor(maxContextLength * safetyBuffer)
 
       logger.info(
-        `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
-          `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
+        `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
       )
     } else {
-      // Fallback: Try minimal GPU layers with KV cache on CPU
-      gpuLayers = Math.min(
-        Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
-        totalLayers
-      )
+      const vramAvailableForLayers = remainingVRAM - vramForMinContext
 
-      if (gpuLayers > 0) {
-        // Calculate available system RAM for KV cache
-        const cpuLayers = totalLayers - gpuLayers
-        const modelCPUSize = cpuLayers * layerSize
-        const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
-        const systemRAMUsed = modelCPUSize + mmprojCPUSize
-        const availableSystemRAMForKVCache = Math.max(
-          0,
-          usableSystemMemory - systemRAMUsed
+      if (vramAvailableForLayers >= layerSize) {
+        mode = 'Hybrid'
+        gpuLayers = Math.min(
+          Math.floor(vramAvailableForLayers / layerSize),
+          totalLayers
        )
+        noOffloadKVCache = false
+        const vramUsedByLayers = gpuLayers * layerSize
+        const vramLeftForContext = remainingVRAM - vramUsedByLayers
+        maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
 
-        // Calculate context that fits in system RAM
-        const systemRAMContext = Math.min(
-          Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
-          targetContext
+        logger.info(
+          'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
        )
+      }
+    }
 
-        if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
-          maxContextLength = systemRAMContext
-          noOffloadKVCache = true
+    // Fallback logic: try different configurations if no VRAM-based plan worked
+    if (mode === 'Unsupported') {
+      logger.info('Planning: Trying fallback configurations...')
+
+      // Try putting some layers on GPU with KV cache in RAM
+      const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
+      if (possibleGpuLayers > 0) {
+        gpuLayers = Math.min(possibleGpuLayers, totalLayers)
+        const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
+        const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
+        const availableRamForKv =
+          usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
+        // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
+        const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
+
+        if (contextInRam >= MIN_CONTEXT_LENGTH) {
          mode = 'Hybrid'
-
-          logger.info(
-            `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
-              `${maxContextLength} context on CPU RAM`
-          )
-        } else {
-          // Can't fit reasonable context even with CPU RAM
-          // Reduce GPU layers further
-          gpuLayers = Math.floor(gpuLayers / 2)
-          maxContextLength = MIN_CONTEXT_LENGTH
+          maxContextLength = contextInRam
          noOffloadKVCache = true
-          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+          logger.info(
+            `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
+          )
        }
-      } else {
-        // Pure CPU mode
+      }
+
+      // If still unsupported, try pure CPU mode
+      if (mode === 'Unsupported') {
        gpuLayers = 0
        noOffloadKVCache = true
-
-        // Calculate context for pure CPU mode
-        const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
-        const availableForKVCache = Math.max(
-          0,
-          usableSystemMemory - totalCPUMemoryNeeded
-        )
-
-        maxContextLength = Math.min(
-          Math.max(
-            MIN_CONTEXT_LENGTH,
-            Math.floor(availableForKVCache / kvCachePerToken)
-          ),
-          targetContext
-        )
-
-        mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
-      }
-    }
-
-    // Safety check: Verify total GPU memory usage
-    if (gpuLayers > 0 && !noOffloadKVCache) {
-      const estimatedGPUUsage =
-        gpuLayers * layerSize +
-        maxContextLength * kvCachePerToken +
-        (offloadMmproj ? mmprojSize : 0)
-
-      if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-        logger.warn(
-          `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
-        )
-
-        // Reduce context first
-        while (
-          maxContextLength > MIN_CONTEXT_LENGTH &&
-          estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
-        ) {
-          maxContextLength = Math.floor(maxContextLength / 2)
-          const newEstimate =
-            gpuLayers * layerSize +
-            maxContextLength * kvCachePerToken +
-            (offloadMmproj ? mmprojSize : 0)
-          if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
-        }
-
-        // If still too much, reduce layers
-        if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-          gpuLayers = Math.floor(gpuLayers * 0.7)
-          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
-          noOffloadKVCache = true // Move KV cache to CPU
+        offloadMmproj = false
+        const ramUsedByModel = modelSize + mmprojSize
+        const availableRamForKv = usableSystemMemory - ramUsedByModel
+        maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
+        if (maxContextLength >= MIN_CONTEXT_LENGTH) {
+          mode = 'CPU'
+          logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)
        }
      }
    }
 
-    // Apply user-requested context limit if specified
+    if (mode === 'CPU' || noOffloadKVCache) {
+      offloadMmproj = false
+    }
+
     if (requestedCtx && requestedCtx > 0) {
      maxContextLength = Math.min(maxContextLength, requestedCtx)
-      logger.info(
-        `User requested context: ${requestedCtx}, final: ${maxContextLength}`
-      )
    }
 
-    // Ensure we never exceed model's maximum context
     maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
 
-    // Final validation
-    if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
+    if (maxContextLength < MIN_CONTEXT_LENGTH) {
      mode = 'Unsupported'
    }
 
-    // Ensure maxContextLength is valid
-    maxContextLength = isNaN(maxContextLength)
-      ? MIN_CONTEXT_LENGTH
-      : Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
+    if (mode === 'Unsupported') {
+      gpuLayers = 0
+      maxContextLength = 0
+    }
+
+    maxContextLength = isNaN(maxContextLength)
+      ? 0
+      : Math.floor(maxContextLength)
 
-    // Log final plan
     const mmprojInfo = mmprojPath
      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
      : ''
@@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine {
       offloadMmproj,
     }
  }
-
  /**
   * estimate KVCache size from a given metadata
   */
  private async estimateKVCache(
    meta: Record<string, any>,
    ctx_size?: number
-  ): Promise<number> {
+  ): Promise<{ size: number; perTokenSize: number }> {
    const arch = meta['general.architecture']
    if (!arch) throw new Error('Invalid metadata: architecture not found')
 
@@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine {
       )
     }
 
-    let ctxLen: number
-    if (!ctx_size) {
-      ctxLen = Number(meta[`${arch}.context_length`])
-    } else {
-      ctxLen = ctx_size
-    }
+    const maxCtx = Number(meta[`${arch}.context_length`])
+    if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
+
+    // b) If the user supplied a value, clamp it to the model's max
+    let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
+
+    logger.info(`Final context length used for KV size: ${ctxLen}`)
+    logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)
 
     logger.info(`ctxLen: ${ctxLen}`)
     logger.info(`nLayer: ${nLayer}`)
@@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine {
     // fp16 = 8 bits * 2 = 16
     const bytesPerElement = 2
 
-    // Total KV cache size per token = nHead * headDim * bytesPerElement
-    const kvPerToken = nHead * headDim * bytesPerElement
+    // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
+    const kvPerToken = nHead * headDim * bytesPerElement * nLayer
 
-    return ctxLen * nLayer * kvPerToken
+    return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
   }
 
   private async getModelSize(path: string): Promise<number> {
@@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine {
     const gguf = await readGgufMetadata(path)
     let kvCacheSize: number
     if (ctx_size) {
-      kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
+      kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
     } else {
-      kvCacheSize = await this.estimateKVCache(gguf.metadata)
+      kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
     }
 
     // Total memory consumption = model weights + kvcache
@@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine {
     )
 
     // Use 80% of total memory as the usable limit
-    const USABLE_MEMORY_PERCENTAGE = 0.8
+    const USABLE_MEMORY_PERCENTAGE = 0.9
     const usableTotalMemory =
-      memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+      memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
+      memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
     const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
 
     // Check if model fits in total memory at all
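
Reviewer note: to make the new return shape of `estimateKVCache()` easier to check, here is a small self-contained sketch of the same arithmetic. `estimateKVCacheSketch`, `KVCacheEstimate`, and the example numbers are hypothetical and only mirror what the diff computes; the sketch assumes fp16 KV entries (2 bytes per element) and that `headDim` already covers both K and V, as the added `headDim (K+V)` log line suggests.

```ts
// Hypothetical standalone sketch; not part of the PR.
interface KVCacheEstimate {
  size: number // total KV-cache bytes for the planned context
  perTokenSize: number // KV-cache bytes per token across all layers
}

function estimateKVCacheSketch(
  nLayer: number, // `${arch}.block_count`
  nHead: number, // KV head count used by the formula in the diff
  headDim: number, // per-head K+V dimension, as logged by the diff
  maxCtx: number, // `${arch}.context_length`
  ctxSize?: number // optional requested context
): KVCacheEstimate {
  const bytesPerElement = 2 // fp16, matching the diff's comment
  // Mirrors the new code: kvPerToken = nHead * headDim * bytesPerElement * nLayer
  const perTokenSize = nHead * headDim * bytesPerElement * nLayer
  // Clamp a user-supplied context to the model's maximum, as the diff now does
  const ctxLen = ctxSize ? Math.min(ctxSize, maxCtx) : maxCtx
  return { size: ctxLen * perTokenSize, perTokenSize }
}

// Example with made-up numbers: 32 layers, 8 KV heads, headDim 256 (K+V),
// 32k max context, planned at 8192 tokens:
// perTokenSize = 8 * 256 * 2 * 32 = 131072 bytes (~128 KiB per token)
// size         = 8192 * 131072   ≈ 1 GiB of KV cache
```

Returning `perTokenSize` alongside `size` is what lets the planning code divide leftover VRAM or RAM by a per-token cost instead of re-deriving it, which is why the separate `getKVCachePerToken()` helper could be deleted.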
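Reviewer note: the rewritten planning branch reads as a decision ladder. The sketch below is a condensed, illustrative restatement only; `pickMode` and `PlanInputs` are hypothetical names, and the mmproj and per-layer RAM bookkeeping from the diff is omitted for brevity.

```ts
// Hypothetical condensation of the branch order in the new planning logic.
type PlanMode = 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'

interface PlanInputs {
  usableVRAM: number // totalVRAM * mode multiplier - reserve - fixed overhead
  usableRAM: number // totalRAM * mode multiplier
  modelSize: number // all weights
  layerSize: number // approximate bytes per layer
  minCtxKV: number // KV bytes for MIN_CONTEXT_LENGTH
  targetCtxKV: number // KV bytes for the requested/target context
}

function pickMode(p: PlanInputs): PlanMode {
  // 0. Hard stop: weights plus even a minimum-context KV cache exceed VRAM + RAM.
  if (p.modelSize + p.minCtxKV > p.usableVRAM + p.usableRAM) return 'Unsupported'
  // 1. Weights and the target-context KV cache both fit in VRAM: full GPU.
  if (p.modelSize + p.targetCtxKV <= p.usableVRAM) return 'GPU'
  // 2. Weights fit in VRAM; the context is shrunk to the KV cache that still
  //    fits (the diff applies a 0.9 safety buffer here).
  if (p.modelSize <= p.usableVRAM) return 'GPU'
  // 3. At least one layer fits next to a minimum-context KV cache: Hybrid with
  //    the KV cache kept in VRAM.
  if (p.usableVRAM - p.minCtxKV >= p.layerSize) return 'Hybrid'
  // 4. Fallback: some layers in VRAM, KV cache in system RAM (noOffloadKVCache).
  if (p.usableVRAM >= p.layerSize && p.usableRAM >= p.minCtxKV) return 'Hybrid'
  // 5. Pure CPU if RAM can hold the weights plus a minimum-context KV cache.
  if (p.usableRAM >= p.modelSize + p.minCtxKV) return 'CPU'
  return 'Unsupported'
}
```

In the diff itself the same ladder additionally demotes the plan to 'Unsupported' when the final context falls below MIN_CONTEXT_LENGTH, and forces the KV cache and mmproj off the GPU in the CPU and fallback branches.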