fix: KVCache size calculation and refactor (#6438)
- Removed the `getKVCachePerToken` helper in favor of a unified `estimateKVCache` that returns both the total size and the per-token size.
- Fixed the KV cache size calculation to account for all layers, correcting the previous under-estimation (see the sketch below).
- Added proper clamping of user-requested context lengths to the model's maximum.
- Refactored VRAM budgeting: introduced explicit reserves, a fixed engine overhead, and separate multipliers for VRAM and system RAM based on the memory mode.
- Implemented a more robust planning flow with clear GPU, Hybrid, and CPU pathways, including fallback configurations when resources are insufficient.
- Updated default context length handling and safety buffers to prevent OOM situations.
- Raised the usable memory percentage to 90% and refined logging for easier debugging.
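The core of the change is easiest to see in isolation. The following is a minimal sketch, not the extension's actual code: the helper names (`estimateKvCacheBytes`, `usableVramBytes`) are made up for illustration, and it assumes fp16 KV entries and GQA-aware head counts. It mirrors how this commit counts every layer in the per-token KV footprint, clamps a requested context to the model's maximum, and derives the usable VRAM budget from the memory-mode multiplier minus an explicit reserve and a fixed engine overhead.

```typescript
// Hypothetical sketch of the two calculations described above.
type KvEstimate = { size: number; perTokenSize: number }

function estimateKvCacheBytes(
  meta: Record<string, string>,
  requestedCtx?: number
): KvEstimate {
  const arch = meta['general.architecture']
  const nLayer = Number(meta[`${arch}.block_count`])
  const nHead = Number(meta[`${arch}.attention.head_count`])
  // GQA models store fewer KV heads than attention heads.
  const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
  const headDim = Number(meta[`${arch}.embedding_length`]) / nHead
  const maxCtx = Number(meta[`${arch}.context_length`])

  // Clamp a user-requested context length to the model's maximum.
  const ctxLen = requestedCtx ? Math.min(requestedCtx, maxCtx) : maxCtx

  // K and V per token, fp16 (2 bytes each), summed over *all* layers.
  const perTokenSize = nHeadKV * headDim * 2 * 2 * nLayer
  return { size: ctxLen * perTokenSize, perTokenSize }
}

const GiB = 1024 * 1024 * 1024

function usableVramBytes(
  totalVRAM: number,
  memoryMode: 'high' | 'medium' | 'low'
): number {
  const multiplier = { high: 0.7, medium: 0.5, low: 0.4 }[memoryMode]
  const VRAM_RESERVE_BYTES = 0.5 * GiB // headroom for the OS and other apps
  const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * GiB // scratch buffers etc.
  return Math.max(
    0,
    totalVRAM * multiplier - VRAM_RESERVE_BYTES - ENGINE_FIXED_OVERHEAD_BYTES
  )
}
```

The planner then compares model weights plus the estimated KV size against this budget to choose the GPU, Hybrid, or CPU pathway, falling back to fewer offloaded layers or a CPU-resident KV cache when the budget is exceeded.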
parent
7a2782e6fd
commit
489c5a3d9c
@@ -1742,13 +1742,13 @@ export default class llamacpp_extension extends AIEngine {
    try {
      const data = JSON.parse(jsonStr)
      const chunk = data as chatCompletionChunk

      // Check for out-of-context error conditions
      if (chunk.choices?.[0]?.finish_reason === 'length') {
        // finish_reason 'length' indicates context limit was hit
        throw new Error(OUT_OF_CONTEXT_SIZE)
      }

      yield chunk
    } catch (e) {
      logger.error('Error parsing JSON from stream or server error:', e)
@@ -1828,13 +1828,13 @@ export default class llamacpp_extension extends AIEngine {
    }

    const completionResponse = (await response.json()) as chatCompletion

    // Check for out-of-context error conditions
    if (completionResponse.choices?.[0]?.finish_reason === 'length') {
      // finish_reason 'length' indicates context limit was hit
      throw new Error(OUT_OF_CONTEXT_SIZE)
    }

    return completionResponse
  }

@@ -2036,24 +2036,6 @@ export default class llamacpp_extension extends AIEngine {
      totalMemory,
    }
  }
- private async getKVCachePerToken(
-   meta: Record<string, string>
- ): Promise<number> {
-   const arch = meta['general.architecture']
-   const nLayer = Number(meta[`${arch}.block_count`])
-   const nHead = Number(meta[`${arch}.attention.head_count`])
-
-   // Get head dimensions
-   const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
-   const embeddingLen = Number(meta[`${arch}.embedding_length`])
-   const headDim = embeddingLen / nHead
-
-   // KV cache uses head_count_kv (for GQA models) or head_count
-   // Each token needs K and V, both are fp16 (2 bytes)
-   const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
-
-   return bytesPerToken
- }

  private async getLayerSize(
    path: string,
@@ -2100,10 +2082,9 @@ export default class llamacpp_extension extends AIEngine {
      gguf.metadata
    )

-   // Fixed KV cache calculation
-   const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+   const kvCachePerToken = (await this.estimateKVCache(gguf.metadata))
+     .perTokenSize

-   // Debug logging
    logger.info(
      `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
    )
@@ -2119,33 +2100,25 @@ export default class llamacpp_extension extends AIEngine {
      throw new Error(`Invalid layer size: ${layerSize}`)
    }

-   // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
-   const GPU_OVERHEAD_FACTOR = 0.8
-
-   // VRAM budget with overhead consideration
+   // Reserve memory for OS, other applications, and fixed engine overhead.
    const VRAM_RESERVE_GB = 0.5
    const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
-   const usableVRAM = Math.max(
-     0,
-     (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
-   )
+   const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc.

    // Get model's maximum context length
    const arch = gguf.metadata['general.architecture']
    const modelMaxContextLength =
-     Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
+     Number(gguf.metadata[`${arch}.context_length`]) || 8192

-   // Set minimum context length
-   const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
+   const MIN_CONTEXT_LENGTH = 1024

-   // System RAM budget
+   // Memory percentages applied to both VRAM and RAM
    const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }

    logger.info(
      `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
    )

-   // Validate memory info
    if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
      throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
    }
@@ -2158,208 +2131,166 @@ export default class llamacpp_extension extends AIEngine {
      )
    }

-   // Calculate actual system RAM
-   const actualSystemRAM = Math.max(
+   // Apply memory mode to both VRAM and RAM separately
+   const memoryModeMultiplier = memoryPercentages[this.memoryMode]
+   const usableVRAM = Math.max(
      0,
-     memoryInfo.totalMemory - memoryInfo.totalVRAM
+     memoryInfo.totalVRAM * memoryModeMultiplier -
+       VRAM_RESERVE_BYTES -
+       ENGINE_FIXED_OVERHEAD_BYTES
    )
-   const usableSystemMemory =
-     actualSystemRAM * memoryPercentages[this.memoryMode]
+   const actualSystemRAM = Math.max(0, memoryInfo.totalRAM)
+   const usableSystemMemory = actualSystemRAM * memoryModeMultiplier

    logger.info(
-     `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
+     `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
    )

-   // --- Priority 1: Allocate mmproj (if exists) ---
-   let offloadMmproj = false
-   let remainingVRAM = usableVRAM
-
-   if (mmprojSize > 0) {
-     if (mmprojSize <= remainingVRAM) {
-       offloadMmproj = true
-       remainingVRAM -= mmprojSize
-       logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
-     } else {
-       logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
-     }
-   }
-
-   // --- Priority 2: Calculate optimal layer/context balance ---
    let gpuLayers = 0
-   let maxContextLength = MIN_CONTEXT_LENGTH
+   let maxContextLength = 0
    let noOffloadKVCache = false
    let mode: ModelPlan['mode'] = 'Unsupported'
+   let offloadMmproj = false

-   // Calculate how much VRAM we need for different context sizes
-   const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
-   const targetContext = requestedCtx || modelMaxContextLength
-   // Find the best balance of layers and context
-   let bestConfig = {
-     layers: 0,
-     context: MIN_CONTEXT_LENGTH,
-     vramUsed: 0,
+   let remainingVRAM = usableVRAM
+   if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
+     offloadMmproj = true
+     remainingVRAM -= mmprojSize
    }
+   const vramForMinContext = (
+     await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH)
+   ).size

-   for (const ctxSize of contextSizes) {
-     if (ctxSize > targetContext) break
-     const kvCacheSize = ctxSize * kvCachePerToken
-     const availableForLayers = remainingVRAM - kvCacheSize
-
-     if (availableForLayers <= 0) continue
-
-     const possibleLayers = Math.min(
-       Math.floor(availableForLayers / layerSize),
-       totalLayers
+   const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize)
+   if (ramForModel + vramForMinContext > (usableSystemMemory + usableVRAM)) {
+     logger.error(
+       `Model unsupported. Not enough resources for model and min context.`
      )
-     if (possibleLayers > 0) {
-       const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
-       // Verify this fits with some margin
-       if (totalVramNeeded <= remainingVRAM * 0.95) {
-         bestConfig = {
-           layers: possibleLayers,
-           context: ctxSize,
-           vramUsed: totalVramNeeded,
-         }
-       }
+     return {
+       gpuLayers: 0,
+       maxContextLength: 0,
+       noOffloadKVCache: true,
+       mode: 'Unsupported',
+       offloadMmproj: false,
      }
    }

-   // Apply the best configuration found
-   if (bestConfig.layers > 0) {
-     gpuLayers = bestConfig.layers
-     maxContextLength = bestConfig.context
+   const targetContext = Math.min(
+     requestedCtx || modelMaxContextLength,
+     modelMaxContextLength
+   )

+   let targetContextSize = (
+     await this.estimateKVCache(gguf.metadata, targetContext)
+   ).size
+
+   // Use `kvCachePerToken` for all VRAM calculations
+   if (modelSize + targetContextSize <= remainingVRAM) {
+     mode = 'GPU'
+     gpuLayers = totalLayers
+     maxContextLength = targetContext
      noOffloadKVCache = false
-     mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
+     logger.info(
+       'Planning: Ideal case fits. All layers and target context in VRAM.'
+     )
+   } else if (modelSize <= remainingVRAM) {
+     mode = 'GPU'
+     gpuLayers = totalLayers
+     noOffloadKVCache = false
+     const vramLeftForContext = remainingVRAM - modelSize
+     maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)
+
+     // Add safety check to prevent OOM
+     const safetyBuffer = 0.9 // Use 90% of calculated context to be safe
+     maxContextLength = Math.floor(maxContextLength * safetyBuffer)

      logger.info(
-       `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
-         `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
+       `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}`
      )
    } else {
-     // Fallback: Try minimal GPU layers with KV cache on CPU
-     gpuLayers = Math.min(
-       Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
-       totalLayers
-     )
+     const vramAvailableForLayers = remainingVRAM - vramForMinContext

-     if (gpuLayers > 0) {
-       // Calculate available system RAM for KV cache
-       const cpuLayers = totalLayers - gpuLayers
-       const modelCPUSize = cpuLayers * layerSize
-       const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
-       const systemRAMUsed = modelCPUSize + mmprojCPUSize
-       const availableSystemRAMForKVCache = Math.max(
-         0,
-         usableSystemMemory - systemRAMUsed
+     if (vramAvailableForLayers >= layerSize) {
+       mode = 'Hybrid'
+       gpuLayers = Math.min(
+         Math.floor(vramAvailableForLayers / layerSize),
+         totalLayers
        )
+       noOffloadKVCache = false
+       const vramUsedByLayers = gpuLayers * layerSize
+       const vramLeftForContext = remainingVRAM - vramUsedByLayers
+       maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken)

-       // Calculate context that fits in system RAM
-       const systemRAMContext = Math.min(
-         Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
-         targetContext
+       logger.info(
+         'Planning: Hybrid mode. Offloading layers to fit context in VRAM.'
        )
+     }
+   }

-       if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
-         maxContextLength = systemRAMContext
-         noOffloadKVCache = true
+   // Fallback logic: try different configurations if no VRAM-based plan worked
+   if (mode === 'Unsupported') {
+     logger.info('Planning: Trying fallback configurations...')

+     // Try putting some layers on GPU with KV cache in RAM
+     const possibleGpuLayers = Math.floor(remainingVRAM / layerSize)
+     if (possibleGpuLayers > 0) {
+       gpuLayers = Math.min(possibleGpuLayers, totalLayers)
+       const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize
+       const ramUsedByMmproj = !offloadMmproj ? mmprojSize : 0
+       const availableRamForKv =
+         usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj)
+       // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific
+       const contextInRam = Math.floor(availableRamForKv / kvCachePerToken)
+
+       if (contextInRam >= MIN_CONTEXT_LENGTH) {
          mode = 'Hybrid'
-         logger.info(
-           `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
-             `${maxContextLength} context on CPU RAM`
-         )
-       } else {
-         // Can't fit reasonable context even with CPU RAM
-         // Reduce GPU layers further
-         gpuLayers = Math.floor(gpuLayers / 2)
-         maxContextLength = MIN_CONTEXT_LENGTH
+         maxContextLength = contextInRam
          noOffloadKVCache = true
-         mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+         logger.info(
+           `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}`
+         )
        }
-     } else {
-       // Pure CPU mode
+     }
+     // If still unsupported, try pure CPU mode
+     if (mode === 'Unsupported') {
        gpuLayers = 0
        noOffloadKVCache = true
-       // Calculate context for pure CPU mode
-       const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
-       const availableForKVCache = Math.max(
-         0,
-         usableSystemMemory - totalCPUMemoryNeeded
-       )
+       offloadMmproj = false
+       const ramUsedByModel = modelSize + mmprojSize
+       const availableRamForKv = usableSystemMemory - ramUsedByModel
+       maxContextLength = Math.floor(availableRamForKv / kvCachePerToken)
+       if (maxContextLength >= MIN_CONTEXT_LENGTH) {
+         mode = 'CPU'
+         logger.info(`Planning: CPU mode - Context: ${maxContextLength}`)

-       maxContextLength = Math.min(
-         Math.max(
-           MIN_CONTEXT_LENGTH,
-           Math.floor(availableForKVCache / kvCachePerToken)
-         ),
-         targetContext
-       )
-
-       mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
-     }
-   }
-
-   // Safety check: Verify total GPU memory usage
-   if (gpuLayers > 0 && !noOffloadKVCache) {
-     const estimatedGPUUsage =
-       gpuLayers * layerSize +
-       maxContextLength * kvCachePerToken +
-       (offloadMmproj ? mmprojSize : 0)
-
-     if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-       logger.warn(
-         `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
-       )
-
-       // Reduce context first
-       while (
-         maxContextLength > MIN_CONTEXT_LENGTH &&
-         estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
-       ) {
-         maxContextLength = Math.floor(maxContextLength / 2)
-         const newEstimate =
-           gpuLayers * layerSize +
-           maxContextLength * kvCachePerToken +
-           (offloadMmproj ? mmprojSize : 0)
-         if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
-       }
-
-       // If still too much, reduce layers
-       if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
-         gpuLayers = Math.floor(gpuLayers * 0.7)
-         mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
-         noOffloadKVCache = true // Move KV cache to CPU
        }
      }
    }

-   // Apply user-requested context limit if specified
+   if (mode === 'CPU' || noOffloadKVCache) {
+     offloadMmproj = false
+   }

    if (requestedCtx && requestedCtx > 0) {
      maxContextLength = Math.min(maxContextLength, requestedCtx)
-     logger.info(
-       `User requested context: ${requestedCtx}, final: ${maxContextLength}`
-     )
    }

-   // Ensure we never exceed model's maximum context
    maxContextLength = Math.min(maxContextLength, modelMaxContextLength)

-   // Final validation
-   if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
+   if (maxContextLength < MIN_CONTEXT_LENGTH) {
      mode = 'Unsupported'
    }

-   // Ensure maxContextLength is valid
-   maxContextLength = isNaN(maxContextLength)
-     ? MIN_CONTEXT_LENGTH
-     : Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
+   if (mode === 'Unsupported') {
+     gpuLayers = 0
+     maxContextLength = 0
+   }

+   maxContextLength = isNaN(maxContextLength)
+     ? 0
+     : Math.floor(maxContextLength)

-   // Log final plan
    const mmprojInfo = mmprojPath
      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
      : ''
@@ -2378,14 +2309,13 @@ export default class llamacpp_extension extends AIEngine {
      offloadMmproj,
    }
  }

  /**
   * estimate KVCache size from a given metadata
   */
  private async estimateKVCache(
    meta: Record<string, string>,
    ctx_size?: number
- ): Promise<number> {
+ ): Promise<{ size: number; perTokenSize: number }> {
    const arch = meta['general.architecture']
    if (!arch) throw new Error('Invalid metadata: architecture not found')

@@ -2421,12 +2351,14 @@ export default class llamacpp_extension extends AIEngine {
      )
    }

-   let ctxLen: number
-   if (!ctx_size) {
-     ctxLen = Number(meta[`${arch}.context_length`])
-   } else {
-     ctxLen = ctx_size
-   }
+   const maxCtx = Number(meta[`${arch}.context_length`])
+   if (!maxCtx) throw new Error('Invalid metadata: context_length not found')
+   // b) If the user supplied a value, clamp it to the model's max
+   let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx
+   logger.info(`Final context length used for KV size: ${ctxLen}`)
+   logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`)

    logger.info(`ctxLen: ${ctxLen}`)
    logger.info(`nLayer: ${nLayer}`)
@@ -2439,10 +2371,10 @@ export default class llamacpp_extension extends AIEngine {
    // fp16 = 8 bits * 2 = 16
    const bytesPerElement = 2

-   // Total KV cache size per token = nHead * headDim * bytesPerElement
-   const kvPerToken = nHead * headDim * bytesPerElement
+   // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer
+   const kvPerToken = nHead * headDim * bytesPerElement * nLayer

-   return ctxLen * nLayer * kvPerToken
+   return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken }
  }

  private async getModelSize(path: string): Promise<number> {
@@ -2476,9 +2408,9 @@ export default class llamacpp_extension extends AIEngine {
    const gguf = await readGgufMetadata(path)
    let kvCacheSize: number
    if (ctx_size) {
-     kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
+     kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size
    } else {
-     kvCacheSize = await this.estimateKVCache(gguf.metadata)
+     kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size
    }

    // Total memory consumption = model weights + kvcache
@@ -2488,9 +2420,10 @@ export default class llamacpp_extension extends AIEngine {
    )

    // Use 80% of total memory as the usable limit
-   const USABLE_MEMORY_PERCENTAGE = 0.8
+   const USABLE_MEMORY_PERCENTAGE = 0.9
    const usableTotalMemory =
-     memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
+     memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE +
+     memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
    const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE

    // Check if model fits in total memory at all