diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 58ba666dd..a086b74db 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -80,7 +80,7 @@ type ModelPlan = {
   gpuLayers: number
   maxContextLength: number
   noOffloadKVCache: boolean
-  noOffloadMmproj?: boolean
+  offloadMmproj?: boolean
   mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
 }

@@ -328,7 +328,8 @@ export default class llamacpp_extension extends AIEngine {
         await this.determineBestBackend(version_backends)
       }
     } else {
-      bestAvailableBackendString = await this.determineBestBackend(version_backends)
+      bestAvailableBackendString =
+        await this.determineBestBackend(version_backends)
     }

     let settings = structuredClone(SETTINGS)
@@ -2047,11 +2048,25 @@ export default class llamacpp_extension extends AIEngine {
     return { layerSize: modelSize / totalLayers, totalLayers }
   }

+  private isAbsolutePath(p: string): boolean {
+    // Normalize back-slashes to forward-slashes first.
+    const norm = p.replace(/\\/g, '/')
+    return (
+      norm.startsWith('/') || // POSIX absolute
+      /^[a-zA-Z]:/.test(norm) || // Drive-letter Windows (C: or D:)
+      /^\/\/[^/]+/.test(norm) // UNC path //server/share
+    )
+  }
+
   async planModelLoad(
     path: string,
-    requestedCtx?: number,
-    mmprojPath?: string
+    mmprojPath?: string,
+    requestedCtx?: number
   ): Promise<ModelPlan> {
+    if (!this.isAbsolutePath(path))
+      path = await joinPath([await getJanDataFolderPath(), path])
+    if (mmprojPath && !this.isAbsolutePath(mmprojPath))
+      mmprojPath = await joinPath([await getJanDataFolderPath(), mmprojPath])
     const modelSize = await this.getModelSize(path)
     const memoryInfo = await this.getTotalSystemMemory()
     const gguf = await readGgufMetadata(path)
@@ -2138,12 +2153,12 @@ export default class llamacpp_extension extends AIEngine {
     )

     // --- Priority 1: Allocate mmproj (if exists) ---
-    let noOffloadMmproj = false
+    let offloadMmproj = false
     let remainingVRAM = usableVRAM

     if (mmprojSize > 0) {
       if (mmprojSize <= remainingVRAM) {
-        noOffloadMmproj = true
+        offloadMmproj = true
         remainingVRAM -= mmprojSize
         logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
       } else {
@@ -2217,8 +2232,7 @@ export default class llamacpp_extension extends AIEngine {
       // Calculate available system RAM for KV cache
       const cpuLayers = totalLayers - gpuLayers
       const modelCPUSize = cpuLayers * layerSize
-      const mmprojCPUSize =
-        mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
+      const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
       const systemRAMUsed = modelCPUSize + mmprojCPUSize
       const availableSystemRAMForKVCache = Math.max(
         0,
@@ -2277,7 +2291,7 @@ export default class llamacpp_extension extends AIEngine {
     const estimatedGPUUsage =
       gpuLayers * layerSize +
       maxContextLength * kvCachePerToken +
-      (noOffloadMmproj ? mmprojSize : 0)
+      (offloadMmproj ? mmprojSize : 0)

     if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
       logger.warn(
@@ -2293,7 +2307,7 @@ export default class llamacpp_extension extends AIEngine {
       const newEstimate =
         gpuLayers * layerSize +
         maxContextLength * kvCachePerToken +
-        (noOffloadMmproj ? mmprojSize : 0)
+        (offloadMmproj ? mmprojSize : 0)

       if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
     }
@@ -2329,7 +2343,7 @@ export default class llamacpp_extension extends AIEngine {

     // Log final plan
     const mmprojInfo = mmprojPath
-      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}`
+      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
       : ''

     logger.info(
@@ -2343,7 +2357,7 @@ export default class llamacpp_extension extends AIEngine {
       maxContextLength,
       noOffloadKVCache,
       mode,
-      noOffloadMmproj,
+      offloadMmproj,
     }
   }

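Note for reviewers: this file reorders the optional parameters of planModelLoad (mmprojPath now comes before requestedCtx) and resolves relative model and mmproj paths against the Jan data folder, so any call site that passed a context length as the second argument needs updating. The TypeScript sketch below is not part of the diff; previewPlan, the structural engine type, and the literal paths are hypothetical stand-ins used only to illustrate the new calling convention and the renamed offloadMmproj field.

// A minimal sketch (not part of the diff) of the new calling convention.
type ModelPlan = {
  gpuLayers: number
  maxContextLength: number
  noOffloadKVCache: boolean
  offloadMmproj?: boolean
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}

async function previewPlan(engine: {
  planModelLoad(
    path: string,
    mmprojPath?: string,
    requestedCtx?: number
  ): Promise<ModelPlan>
}): Promise<ModelPlan> {
  // Relative paths are now resolved against the Jan data folder inside
  // planModelLoad, so callers may pass the paths stored in model.yml directly.
  const plan = await engine.planModelLoad(
    'llamacpp/models/example/model.gguf', // hypothetical relative model path
    'llamacpp/models/example/mmproj.gguf', // new second argument: optional mmproj path
    8192 // requestedCtx is now the third argument
  )
  if (plan.offloadMmproj) {
    // The multimodal projector fits in VRAM alongside the planned GPU layers.
    console.log(`mode=${plan.mode}, gpuLayers=${plan.gpuLayers}`)
  }
  return plan
}
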
diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx
index 4a1525003..39a587cbc 100644
--- a/web-app/src/containers/ModelSetting.tsx
+++ b/web-app/src/containers/ModelSetting.tsx
@@ -46,15 +46,16 @@ export function ModelSetting({
     }
     setIsPlanning(true)
     try {
-      // Read the model config to get the actual model path
+      // Read the model config to get the actual model path and mmproj path
      const modelConfig = await serviceHub.app().readYaml<{
        model_path: string
+        mmproj_path?: string
      }>(`llamacpp/models/${model.id}/model.yml`)

      if (modelConfig && modelConfig.model_path) {
        const result = await serviceHub
          .models()
-          .planModelLoad(modelConfig.model_path)
+          .planModelLoad(modelConfig.model_path, modelConfig.mmproj_path)

        // Apply the recommended settings to the model sequentially to avoid race conditions
        const settingsToUpdate: Array<{
@@ -82,6 +83,25 @@ export function ModelSetting({
            value: result.noOffloadKVCache,
          })
        }
+        if (
+          model.settings?.no_kv_offload &&
+          result.noOffloadKVCache !== undefined
+        ) {
+          settingsToUpdate.push({
+            key: 'no_kv_offload',
+            value: result.noOffloadKVCache,
+          })
+        }
+
+        if (
+          model.settings?.mmproj_offload &&
+          result.offloadMmproj !== undefined
+        ) {
+          settingsToUpdate.push({
+            key: 'mmproj_offload',
+            value: result.offloadMmproj,
+          })
+        }

        // Apply all settings in a single update to avoid race conditions
        if (settingsToUpdate.length > 0) {
@@ -242,11 +262,18 @@ export function ModelSetting({
          {provider.provider === 'llamacpp' && (
          [JSX hunk body unrecoverable: the element markup was stripped during extraction. What survives: the removed markup rendered an "Optimize Settings" label above the description "Analyze your system and model, then apply optimal loading settings automatically"; the added markup keeps both and places an experimental badge, {t('mcp-servers:experimental')}, next to the label.]
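
Note for reviewers: the ModelSetting change forwards mmproj_path from model.yml into planModelLoad and then batches the planner's recommendations into a single settings update. The sketch below mirrors only the two setting keys visible in the diff (no_kv_offload and mmproj_offload); collectPlanSettings, PlanResult, and SettingUpdate are hypothetical names, and the real component also guards on the model exposing each setting before pushing a value.

// Hypothetical helper mirroring the settingsToUpdate logic in the diff above.
type PlanResult = {
  noOffloadKVCache?: boolean
  offloadMmproj?: boolean
}

type SettingUpdate = { key: string; value: boolean }

function collectPlanSettings(
  result: PlanResult,
  modelSettings: Record<string, unknown>
): SettingUpdate[] {
  const updates: SettingUpdate[] = []
  // Only recommend a value when the model exposes the setting and the planner
  // produced one, matching the guards added in the diff.
  if (modelSettings['no_kv_offload'] && result.noOffloadKVCache !== undefined) {
    updates.push({ key: 'no_kv_offload', value: result.noOffloadKVCache })
  }
  if (modelSettings['mmproj_offload'] && result.offloadMmproj !== undefined) {
    updates.push({ key: 'mmproj_offload', value: result.offloadMmproj })
  }
  return updates
}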