Merge pull request #6416 from menloresearch/enhancement/experimental-label

enhancement: add label experimental for optimize setting
Faisal Amir 2025-09-11 16:12:35 +07:00 committed by GitHub
commit e709d200aa
GPG Key ID: B5690EEEBB952194
4 changed files with 76 additions and 21 deletions

View File

@@ -80,7 +80,7 @@ type ModelPlan = {
   gpuLayers: number
   maxContextLength: number
   noOffloadKVCache: boolean
-  noOffloadMmproj?: boolean
+  offloadMmproj?: boolean
   mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
 }
@@ -328,7 +328,8 @@ export default class llamacpp_extension extends AIEngine {
         await this.determineBestBackend(version_backends)
       }
     } else {
-      bestAvailableBackendString = await this.determineBestBackend(version_backends)
+      bestAvailableBackendString =
+        await this.determineBestBackend(version_backends)
     }
 
     let settings = structuredClone(SETTINGS)
@@ -2047,11 +2048,25 @@ export default class llamacpp_extension extends AIEngine {
     return { layerSize: modelSize / totalLayers, totalLayers }
   }
 
+  private isAbsolutePath(p: string): boolean {
+    // Normalize backslashes to forward slashes first.
+    const norm = p.replace(/\\/g, '/')
+    return (
+      norm.startsWith('/') || // POSIX absolute
+      /^[a-zA-Z]:/.test(norm) || // Drive-letter Windows (C: or D:)
+      /^\/\/[^/]+/.test(norm) // UNC path //server/share
+    )
+  }
+
   async planModelLoad(
     path: string,
-    requestedCtx?: number,
-    mmprojPath?: string
+    mmprojPath?: string,
+    requestedCtx?: number
   ): Promise<ModelPlan> {
+    if (!this.isAbsolutePath(path))
+      path = await joinPath([await getJanDataFolderPath(), path])
+    if (mmprojPath && !this.isAbsolutePath(mmprojPath))
+      mmprojPath = await joinPath([await getJanDataFolderPath(), path])
     const modelSize = await this.getModelSize(path)
     const memoryInfo = await this.getTotalSystemMemory()
     const gguf = await readGgufMetadata(path)
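Note: a path is treated as absolute when it starts with "/", a Windows drive letter, or a UNC prefix; anything else is resolved against the Jan data folder before planning. A minimal sketch of that check in isolation (the sample paths are illustrative, not taken from the repository):

// Standalone copy of the check above, for illustration only.
function isAbsolutePath(p: string): boolean {
  const norm = p.replace(/\\/g, '/') // normalize backslashes
  return (
    norm.startsWith('/') || // POSIX absolute, e.g. /opt/models/model.gguf
    /^[a-zA-Z]:/.test(norm) || // Windows drive letter, e.g. C:\models\model.gguf
    /^\/\/[^/]+/.test(norm) // UNC path, e.g. \\server\share\model.gguf
  )
}

isAbsolutePath('/opt/models/qwen.gguf') // true: used as-is
isAbsolutePath('llamacpp/models/qwen/model.gguf') // false: joined with the Jan data folder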
@@ -2138,12 +2153,12 @@ export default class llamacpp_extension extends AIEngine {
     )
 
     // --- Priority 1: Allocate mmproj (if exists) ---
-    let noOffloadMmproj = false
+    let offloadMmproj = false
     let remainingVRAM = usableVRAM
 
     if (mmprojSize > 0) {
       if (mmprojSize <= remainingVRAM) {
-        noOffloadMmproj = true
+        offloadMmproj = true
         remainingVRAM -= mmprojSize
         logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
       } else {
@@ -2217,8 +2232,7 @@ export default class llamacpp_extension extends AIEngine {
     // Calculate available system RAM for KV cache
     const cpuLayers = totalLayers - gpuLayers
     const modelCPUSize = cpuLayers * layerSize
-    const mmprojCPUSize =
-      mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
+    const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
     const systemRAMUsed = modelCPUSize + mmprojCPUSize
     const availableSystemRAMForKVCache = Math.max(
       0,
@@ -2277,7 +2291,7 @@ export default class llamacpp_extension extends AIEngine {
       const estimatedGPUUsage =
         gpuLayers * layerSize +
         maxContextLength * kvCachePerToken +
-        (noOffloadMmproj ? mmprojSize : 0)
+        (offloadMmproj ? mmprojSize : 0)
 
       if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
         logger.warn(
@@ -2293,7 +2307,7 @@ export default class llamacpp_extension extends AIEngine {
         const newEstimate =
           gpuLayers * layerSize +
           maxContextLength * kvCachePerToken +
-          (noOffloadMmproj ? mmprojSize : 0)
+          (offloadMmproj ? mmprojSize : 0)
 
         if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
       }
@@ -2329,7 +2343,7 @@ export default class llamacpp_extension extends AIEngine {
 
     // Log final plan
     const mmprojInfo = mmprojPath
-      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}`
+      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
       : ''
     logger.info(
@@ -2343,7 +2357,7 @@ export default class llamacpp_extension extends AIEngine {
       maxContextLength,
       noOffloadKVCache,
       mode,
-      noOffloadMmproj,
+      offloadMmproj,
     }
   }
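Taken together, the rename flips the flag's polarity: offloadMmproj === true now means the multimodal projector was placed in VRAM and is counted toward the GPU estimate, while false keeps it in system RAM. A simplified sketch of the accounting shown in the hunks above (the function and result names are illustrative; the arithmetic mirrors the diff):

// Sketch only: mirrors the VRAM/RAM split above, not the full planner.
function estimateMemorySplit(
  gpuLayers: number,
  layerSize: number,
  maxContextLength: number,
  kvCachePerToken: number,
  mmprojSize: number,
  offloadMmproj: boolean
): { gpuBytes: number; mmprojCPUBytes: number } {
  const gpuBytes =
    gpuLayers * layerSize +
    maxContextLength * kvCachePerToken +
    (offloadMmproj ? mmprojSize : 0) // projector counts against VRAM only when offloaded
  const mmprojCPUBytes = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0 // otherwise it stays in system RAM
  return { gpuBytes, mmprojCPUBytes }
}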

View File

@@ -46,15 +46,16 @@ export function ModelSetting({
     }
     setIsPlanning(true)
     try {
-      // Read the model config to get the actual model path
+      // Read the model config to get the actual model path and mmproj path
       const modelConfig = await serviceHub.app().readYaml<{
         model_path: string
+        mmproj_path?: string
       }>(`llamacpp/models/${model.id}/model.yml`)
 
       if (modelConfig && modelConfig.model_path) {
         const result = await serviceHub
           .models()
-          .planModelLoad(modelConfig.model_path)
+          .planModelLoad(modelConfig.model_path, modelConfig.mmproj_path)
 
         // Apply the recommended settings to the model sequentially to avoid race conditions
         const settingsToUpdate: Array<{
@@ -82,6 +83,25 @@ export function ModelSetting({
             value: result.noOffloadKVCache,
           })
         }
+        if (
+          model.settings?.no_kv_offload &&
+          result.noOffloadKVCache !== undefined
+        ) {
+          settingsToUpdate.push({
+            key: 'no_kv_offload',
+            value: result.noOffloadKVCache,
+          })
+        }
+
+        if (
+          model.settings?.mmproj_offload &&
+          result.offloadMmproj !== undefined
+        ) {
+          settingsToUpdate.push({
+            key: 'mmproj_offload',
+            value: result.offloadMmproj,
+          })
+        }
 
         // Apply all settings in a single update to avoid race conditions
         if (settingsToUpdate.length > 0) {
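For clarity, the new block maps plan fields onto llamacpp setting keys only when the model actually exposes those settings. A condensed sketch of that mapping (mapPlanToSettings is a hypothetical helper written for illustration, not a function in the codebase):

// Hypothetical helper, shown only to summarize the key/value mapping above.
function mapPlanToSettings(
  plan: { noOffloadKVCache?: boolean; offloadMmproj?: boolean },
  modelSettings: { no_kv_offload?: object; mmproj_offload?: object }
): Array<{ key: string; value: boolean }> {
  const updates: Array<{ key: string; value: boolean }> = []
  if (modelSettings.no_kv_offload && plan.noOffloadKVCache !== undefined)
    updates.push({ key: 'no_kv_offload', value: plan.noOffloadKVCache })
  if (modelSettings.mmproj_offload && plan.offloadMmproj !== undefined)
    updates.push({ key: 'mmproj_offload', value: plan.offloadMmproj })
  return updates
}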
@@ -242,11 +262,18 @@ export function ModelSetting({
       {provider.provider === 'llamacpp' && (
         <div className="pb-4 border-b border-main-view-fg/10 my-4">
           <div>
-            <h3 className="font-medium mb-1">Optimize Settings</h3>
-            <p className="text-main-view-fg/70 text-xs mb-3">
-              Analyze your system and model, then apply optimal loading
-              settings automatically
-            </p>
+            <div>
+              <div className="flex items-center gap-2 mb-1">
+                <h3 className="font-medium">Optimize Settings</h3>
+                <div className="text-xs bg-main-view-fg/10 border border-main-view-fg/20 text-main-view-fg/70 rounded-full py-0.5 px-2">
+                  <span>{t('mcp-servers:experimental')}</span>
+                </div>
+              </div>
+              <p className="text-main-view-fg/70 text-xs mb-3">
+                Analyze your system and model, then apply optimal loading
+                settings automatically
+              </p>
+            </div>
             <Button
               onClick={handlePlanModelLoad}
               disabled={isPlanning}

View File

@@ -495,12 +495,14 @@ export class DefaultModelsService implements ModelsService {
   async planModelLoad(
     modelPath: string,
+    mmprojPath?: string,
     requestedCtx?: number
   ): Promise<ModelPlan> {
     try {
       const engine = this.getEngine('llamacpp') as AIEngine & {
         planModelLoad?: (
           path: string,
+          mmprojPath?: string,
           requestedCtx?: number
         ) => Promise<ModelPlan>
       }
@ -514,7 +516,12 @@ export class DefaultModelsService implements ModelsService {
(core) => core.joinPath (core) => core.joinPath
) )
const fullModelPath = await joinPath([janDataFolderPath, modelPath]) const fullModelPath = await joinPath([janDataFolderPath, modelPath])
return await engine.planModelLoad(fullModelPath, requestedCtx) // mmprojPath is currently unused, but included for compatibility
return await engine.planModelLoad(
fullModelPath,
mmprojPath,
requestedCtx
)
} }
// Fallback if method is not available // Fallback if method is not available
@@ -523,6 +530,7 @@ export class DefaultModelsService implements ModelsService {
         gpuLayers: 0,
         maxContextLength: 2048,
         noOffloadKVCache: true,
+        offloadMmproj: false,
         mode: 'Unsupported',
       }
     } catch (error) {
@@ -531,6 +539,7 @@ export class DefaultModelsService implements ModelsService {
         gpuLayers: 0,
         maxContextLength: 2048,
         noOffloadKVCache: true,
+        offloadMmproj: false,
         mode: 'Unsupported',
       }
     }

View File

@@ -85,6 +85,7 @@ export interface ModelPlan {
   gpuLayers: number
   maxContextLength: number
   noOffloadKVCache: boolean
+  offloadMmproj: boolean
   mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
 }
@@ -136,5 +137,9 @@ export interface ModelsService {
     ctxSize?: number
   ): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
   validateGgufFile(filePath: string): Promise<ModelValidationResult>
-  planModelLoad(modelPath: string, requestedCtx?: number): Promise<ModelPlan>
+  planModelLoad(
+    modelPath: string,
+    mmprojPath?: string,
+    requestedCtx?: number
+  ): Promise<ModelPlan>
 }
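With the widened signature, callers pass the optional mmproj path before the requested context size. A hedged usage sketch (modelsService, both paths, and the context value are illustrative; per the extension change above, relative paths are resolved against the Jan data folder):

// Illustrative call site; not code from the repository.
const plan = await modelsService.planModelLoad(
  'llamacpp/models/my-vision-model/model.gguf', // hypothetical model path
  'llamacpp/models/my-vision-model/mmproj.gguf', // hypothetical mmproj path
  8192 // requestedCtx remains optional
)

if (plan.offloadMmproj) {
  // the projector fits in VRAM alongside plan.gpuLayers and plan.maxContextLength
}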