Merge pull request #6416 from menloresearch/enhancement/experimental-label
enhancement: add label experimental for optimize setting
Commit e709d200aa
@@ -80,7 +80,7 @@ type ModelPlan = {
   gpuLayers: number
   maxContextLength: number
   noOffloadKVCache: boolean
-  noOffloadMmproj?: boolean
+  offloadMmproj?: boolean
   mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
 }
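Note on the rename: `noOffloadMmproj` was set to true precisely when the projector was placed in VRAM, i.e. when it *was* offloaded, so the old name inverted its own meaning. The hunks below flip every read and write site together, so only the name changes, not the behavior.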
@@ -328,7 +328,8 @@ export default class llamacpp_extension extends AIEngine {
       await this.determineBestBackend(version_backends)
     }
   } else {
-    bestAvailableBackendString = await this.determineBestBackend(version_backends)
+    bestAvailableBackendString =
+      await this.determineBestBackend(version_backends)
   }

   let settings = structuredClone(SETTINGS)
@@ -2047,11 +2048,25 @@ export default class llamacpp_extension extends AIEngine {
     return { layerSize: modelSize / totalLayers, totalLayers }
   }

+  private isAbsolutePath(p: string): boolean {
+    // Normalize back-slashes to forward-slashes first.
+    const norm = p.replace(/\\/g, '/')
+    return (
+      norm.startsWith('/') || // POSIX absolute
+      /^[a-zA-Z]:/.test(norm) || // Drive-letter Windows (C: or D:)
+      /^\/\/[^/]+/.test(norm) // UNC path //server/share
+    )
+  }
+
   async planModelLoad(
     path: string,
-    requestedCtx?: number,
-    mmprojPath?: string
+    mmprojPath?: string,
+    requestedCtx?: number
   ): Promise<ModelPlan> {
+    if (!this.isAbsolutePath(path))
+      path = await joinPath([await getJanDataFolderPath(), path])
+    if (mmprojPath && !this.isAbsolutePath(mmprojPath))
+      mmprojPath = await joinPath([await getJanDataFolderPath(), mmprojPath])
     const modelSize = await this.getModelSize(path)
     const memoryInfo = await this.getTotalSystemMemory()
     const gguf = await readGgufMetadata(path)
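A minimal standalone sketch of how the new helper classifies paths. The sample paths are hypothetical; in the extension itself, relative paths are then prefixed with the Jan data folder via joinPath/getJanDataFolderPath from Jan's core API.

// Same logic as the private helper above, extracted for illustration.
function isAbsolutePath(p: string): boolean {
  const norm = p.replace(/\\/g, '/') // normalize back-slashes first
  return (
    norm.startsWith('/') || // POSIX absolute: /home/user/...
    /^[a-zA-Z]:/.test(norm) || // Windows drive letter: C:\... or D:/...
    /^\/\/[^/]+/.test(norm) // UNC: \\server\share -> //server/share
  )
}

console.log(isAbsolutePath('/opt/models/a.gguf')) // true (POSIX)
console.log(isAbsolutePath('C:\\models\\a.gguf')) // true (drive letter)
console.log(isAbsolutePath('\\\\nas\\share\\a.gguf')) // true (UNC)
console.log(isAbsolutePath('llamacpp/models/a.gguf')) // false: planModelLoad
// resolves it against the Jan data folder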
@@ -2138,12 +2153,12 @@ export default class llamacpp_extension extends AIEngine {
     )

     // --- Priority 1: Allocate mmproj (if exists) ---
-    let noOffloadMmproj = false
+    let offloadMmproj = false
     let remainingVRAM = usableVRAM

     if (mmprojSize > 0) {
       if (mmprojSize <= remainingVRAM) {
-        noOffloadMmproj = true
+        offloadMmproj = true
         remainingVRAM -= mmprojSize
         logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
       } else {
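Priority 1 gives the multimodal projector first claim on VRAM before any model layers are placed; a sketch of the greedy step, with hypothetical byte counts:

// Sketch of the priority-1 allocation; usableVRAM and mmprojSize are made up.
const usableVRAM = 8 * 1024 ** 3 // assume 8 GiB usable after headroom
const mmprojSize = 600 * 1024 ** 2 // assume a 600 MiB projector file

let offloadMmproj = false
let remainingVRAM = usableVRAM

if (mmprojSize > 0 && mmprojSize <= remainingVRAM) {
  offloadMmproj = true // projector fits: keep it on the GPU
  remainingVRAM -= mmprojSize // model layers compete for what is left
}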
@@ -2217,8 +2232,7 @@ export default class llamacpp_extension extends AIEngine {
     // Calculate available system RAM for KV cache
     const cpuLayers = totalLayers - gpuLayers
     const modelCPUSize = cpuLayers * layerSize
-    const mmprojCPUSize =
-      mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
+    const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
     const systemRAMUsed = modelCPUSize + mmprojCPUSize
     const availableSystemRAMForKVCache = Math.max(
       0,
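The CPU-side accounting only charges the projector against system RAM when it was not offloaded to VRAM. A sketch with hypothetical sizes; the second argument to Math.max is cut off in the hunk and assumed to be total RAM minus what the CPU-resident weights use:

// Hypothetical inputs, all in bytes.
const totalSystemRAM = 16 * 1024 ** 3
const totalLayers = 32
const gpuLayers = 24
const layerSize = 200 * 1024 ** 2
const mmprojSize = 600 * 1024 ** 2
const offloadMmproj = false // projector stayed in system RAM

const cpuLayers = totalLayers - gpuLayers
const modelCPUSize = cpuLayers * layerSize
const mmprojCPUSize = mmprojSize > 0 && !offloadMmproj ? mmprojSize : 0
const systemRAMUsed = modelCPUSize + mmprojCPUSize
const availableSystemRAMForKVCache = Math.max(0, totalSystemRAM - systemRAMUsed)
// 16 GiB - (8 * 200 MiB + 600 MiB) leaves roughly 13.8 GiB for the KV cache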
@@ -2277,7 +2291,7 @@ export default class llamacpp_extension extends AIEngine {
     const estimatedGPUUsage =
       gpuLayers * layerSize +
       maxContextLength * kvCachePerToken +
-      (noOffloadMmproj ? mmprojSize : 0)
+      (offloadMmproj ? mmprojSize : 0)

     if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
       logger.warn(
@@ -2293,7 +2307,7 @@ export default class llamacpp_extension extends AIEngine {
       const newEstimate =
         gpuLayers * layerSize +
         maxContextLength * kvCachePerToken +
-        (noOffloadMmproj ? mmprojSize : 0)
+        (offloadMmproj ? mmprojSize : 0)
       if (newEstimate <= memoryInfo.totalVRAM * 0.9) break
     }
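Both estimates cap GPU usage at 90% of total VRAM. The second hunk sits inside a loop that shrinks the plan until the estimate fits; the reduction step itself is outside the hunk, so the halving below is an assumption:

// Sketch of the back-off loop implied by the hunk; all inputs hypothetical.
const totalVRAM = 8 * 1024 ** 3
const layerSize = 200 * 1024 ** 2
const kvCachePerToken = 128 * 1024
const mmprojSize = 600 * 1024 ** 2
const offloadMmproj = true
const gpuLayers = 24
let maxContextLength = 131072

while (maxContextLength > 2048) {
  const newEstimate =
    gpuLayers * layerSize +
    maxContextLength * kvCachePerToken +
    (offloadMmproj ? mmprojSize : 0)
  if (newEstimate <= totalVRAM * 0.9) break // fits under the safety cap
  maxContextLength = Math.floor(maxContextLength / 2) // assumed step
}
// With these numbers the loop settles at maxContextLength = 8192.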
@@ -2329,7 +2343,7 @@ export default class llamacpp_extension extends AIEngine {

     // Log final plan
     const mmprojInfo = mmprojPath
-      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}`
+      ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}`
       : ''

     logger.info(
@@ -2343,7 +2357,7 @@ export default class llamacpp_extension extends AIEngine {
       maxContextLength,
       noOffloadKVCache,
       mode,
-      noOffloadMmproj,
+      offloadMmproj,
     }
   }

@@ -46,15 +46,16 @@ export function ModelSetting({
     }
     setIsPlanning(true)
     try {
-      // Read the model config to get the actual model path
+      // Read the model config to get the actual model path and mmproj path
       const modelConfig = await serviceHub.app().readYaml<{
         model_path: string
+        mmproj_path?: string
       }>(`llamacpp/models/${model.id}/model.yml`)

       if (modelConfig && modelConfig.model_path) {
         const result = await serviceHub
           .models()
-          .planModelLoad(modelConfig.model_path)
+          .planModelLoad(modelConfig.model_path, modelConfig.mmproj_path)

         // Apply the recommended settings to the model sequentially to avoid race conditions
         const settingsToUpdate: Array<{
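The readYaml generic above implies a model.yml of roughly this shape; the paths here are invented for illustration:

// TypeScript view of the YAML the component reads (values hypothetical).
const modelConfig: { model_path: string; mmproj_path?: string } = {
  model_path: 'llamacpp/models/my-vlm/model.gguf',
  mmproj_path: 'llamacpp/models/my-vlm/mmproj.gguf', // only for vision models
}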
@@ -82,6 +83,25 @@ export function ModelSetting({
             value: result.noOffloadKVCache,
           })
         }
+        if (
+          model.settings?.no_kv_offload &&
+          result.noOffloadKVCache !== undefined
+        ) {
+          settingsToUpdate.push({
+            key: 'no_kv_offload',
+            value: result.noOffloadKVCache,
+          })
+        }
+
+        if (
+          model.settings?.mmproj_offload &&
+          result.offloadMmproj !== undefined
+        ) {
+          settingsToUpdate.push({
+            key: 'mmproj_offload',
+            value: result.offloadMmproj,
+          })
+        }
+
         // Apply all settings in a single update to avoid race conditions
         if (settingsToUpdate.length > 0) {
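Accumulating into settingsToUpdate and flushing once is what keeps concurrent writes from clobbering each other; a minimal sketch of the pattern, with the flush helper left hypothetical:

type SettingUpdate = { key: string; value: boolean | number }

const settingsToUpdate: SettingUpdate[] = []
// Only push keys the model actually exposes and the plan actually computed.
settingsToUpdate.push({ key: 'no_kv_offload', value: true })
settingsToUpdate.push({ key: 'mmproj_offload', value: false })

if (settingsToUpdate.length > 0) {
  // await applySettings(model.id, settingsToUpdate) // hypothetical single write
}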
@@ -242,11 +262,18 @@ export function ModelSetting({
       {provider.provider === 'llamacpp' && (
         <div className="pb-4 border-b border-main-view-fg/10 my-4">
           <div>
-            <h3 className="font-medium mb-1">Optimize Settings</h3>
+            <div>
+              <div className="flex items-center gap-2 mb-1">
+                <h3 className="font-medium">Optimize Settings</h3>
+                <div className="text-xs bg-main-view-fg/10 border border-main-view-fg/20 text-main-view-fg/70 rounded-full py-0.5 px-2">
+                  <span>{t('mcp-servers:experimental')}</span>
+                </div>
+              </div>
             <p className="text-main-view-fg/70 text-xs mb-3">
               Analyze your system and model, then apply optimal loading
               settings automatically
             </p>
+            </div>
           <Button
             onClick={handlePlanModelLoad}
             disabled={isPlanning}
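The badge reuses the existing mcp-servers:experimental translation key rather than introducing a new string, which is presumably why the label lives under the MCP servers namespace.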
@@ -495,12 +495,14 @@ export class DefaultModelsService implements ModelsService {

   async planModelLoad(
     modelPath: string,
+    mmprojPath?: string,
     requestedCtx?: number
   ): Promise<ModelPlan> {
     try {
       const engine = this.getEngine('llamacpp') as AIEngine & {
         planModelLoad?: (
           path: string,
+          mmprojPath?: string,
           requestedCtx?: number
         ) => Promise<ModelPlan>
       }
@@ -514,7 +516,12 @@ export class DefaultModelsService implements ModelsService {
         (core) => core.joinPath
       )
       const fullModelPath = await joinPath([janDataFolderPath, modelPath])
-      return await engine.planModelLoad(fullModelPath, requestedCtx)
+      // mmprojPath is forwarded as-is; the engine resolves relative paths itself
+      return await engine.planModelLoad(
+        fullModelPath,
+        mmprojPath,
+        requestedCtx
+      )
     }

     // Fallback if method is not available
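The service probes for planModelLoad on the engine before calling it, and otherwise returns a conservative plan; a self-contained sketch of that feature-detection pattern, with types trimmed to what the hunks show:

interface ModelPlan {
  gpuLayers: number
  maxContextLength: number
  noOffloadKVCache: boolean
  offloadMmproj: boolean
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}

type MaybePlanner = {
  planModelLoad?: (
    path: string,
    mmprojPath?: string,
    requestedCtx?: number
  ) => Promise<ModelPlan>
}

async function planOrFallback(
  engine: MaybePlanner,
  path: string,
  mmprojPath?: string,
  requestedCtx?: number
): Promise<ModelPlan> {
  if (engine.planModelLoad) {
    return engine.planModelLoad(path, mmprojPath, requestedCtx)
  }
  // Same conservative fallback the service returns below.
  return {
    gpuLayers: 0,
    maxContextLength: 2048,
    noOffloadKVCache: true,
    offloadMmproj: false,
    mode: 'Unsupported',
  }
}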
@@ -523,6 +530,7 @@ export class DefaultModelsService implements ModelsService {
         gpuLayers: 0,
         maxContextLength: 2048,
         noOffloadKVCache: true,
+        offloadMmproj: false,
         mode: 'Unsupported',
       }
     } catch (error) {
@@ -531,6 +539,7 @@ export class DefaultModelsService implements ModelsService {
         gpuLayers: 0,
         maxContextLength: 2048,
         noOffloadKVCache: true,
+        offloadMmproj: false,
         mode: 'Unsupported',
       }
     }
@@ -85,6 +85,7 @@ export interface ModelPlan {
   gpuLayers: number
   maxContextLength: number
   noOffloadKVCache: boolean
+  offloadMmproj: boolean
   mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
 }
@@ -136,5 +137,9 @@ export interface ModelsService {
     ctxSize?: number
   ): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
   validateGgufFile(filePath: string): Promise<ModelValidationResult>
-  planModelLoad(modelPath: string, requestedCtx?: number): Promise<ModelPlan>
+  planModelLoad(
+    modelPath: string,
+    mmprojPath?: string,
+    requestedCtx?: number
+  ): Promise<ModelPlan>
 }
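Because mmprojPath is inserted before requestedCtx rather than appended, any positional caller that previously passed a context size as the second argument must now thread undefined through the new slot; for example (service handle hypothetical):

// Before this PR: planModelLoad('llamacpp/models/a/model.gguf', 4096)
// After: the second slot is the optional mmproj path.
await modelsService.planModelLoad('llamacpp/models/a/model.gguf', undefined, 4096)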