diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json index ddbefa936..2bca12c0f 100644 --- a/extensions/llamacpp-extension/settings.json +++ b/extensions/llamacpp-extension/settings.json @@ -36,6 +36,21 @@ "controllerType": "checkbox", "controllerProps": { "value": true } }, + { + "key": "memory_util", + "title": "Smart Memory utilization", + "description": "Smart memory utilization mode for running local GGUF models", + "controllerType": "dropdown", + "controllerProps": { + "value": "high", + "options": [ + { "value": "high", "name": "High" }, + { "value": "medium", "name": "Medium" }, + { "value": "low", "name": "Low" } + ], + "recommended": "high" + } + }, { "key": "threads", "title": "Threads", @@ -178,15 +193,6 @@ "value": false } }, - { - "key": "no_kv_offload", - "title": "Disable KV Offload", - "description": "Disable KV cache offload to GPU (if GPU is used).", - "controllerType": "checkbox", - "controllerProps": { - "value": false - } - }, { "key": "cache_type_k", "title": "KV Cache K Type", diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index 744eed3c4..e706b58ae 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -35,10 +35,7 @@ import { import { invoke } from '@tauri-apps/api/core' import { getProxyConfig } from './util' import { basename } from '@tauri-apps/api/path' -import { - GgufMetadata, - readGgufMetadata, -} from '@janhq/tauri-plugin-llamacpp-api' +import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api' import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api' type LlamacppConfig = { @@ -46,6 +43,7 @@ type LlamacppConfig = { auto_update_engine: boolean auto_unload: boolean llamacpp_env: string + memory_util: string chat_template: string n_gpu_layers: number offload_mmproj: boolean @@ -74,6 +72,14 @@ type LlamacppConfig = { ctx_shift: boolean } +type ModelPlan = { + gpuLayers: number + maxContextLength: number + noOffloadKVCache: boolean + noOffloadMmproj?: boolean + mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' +} + interface DownloadItem { url: string save_path: string @@ -116,6 +122,12 @@ interface DeviceList { free: number } +interface SystemMemory { + totalVRAM: number + totalRAM: number + totalMemory: number +} + /** * Override the default app.log function to use Jan's logging system. * @param args @@ -159,6 +171,7 @@ export default class llamacpp_extension extends AIEngine { provider: string = 'llamacpp' autoUnload: boolean = true llamacpp_env: string = '' + memoryMode: string = 'high' readonly providerId: string = 'llamacpp' private config: LlamacppConfig @@ -190,6 +203,7 @@ export default class llamacpp_extension extends AIEngine { this.autoUnload = this.config.auto_unload this.llamacpp_env = this.config.llamacpp_env + this.memoryMode = this.config.memory_util // This sets the base directory where model files for this provider are stored. 
    this.providerPath = await joinPath([
@@ -836,6 +850,8 @@ export default class llamacpp_extension extends AIEngine {
       this.autoUnload = value as boolean
     } else if (key === 'llamacpp_env') {
       this.llamacpp_env = value as string
+    } else if (key === 'memory_util') {
+      this.memoryMode = value as string
     }
   }
@@ -1864,10 +1880,368 @@ export default class llamacpp_extension extends AIEngine {
       'tokenizer.chat_template'
     ]?.includes('tools')
   }
+  /**
+   * Get total system memory including both VRAM and RAM
+   */
+  private async getTotalSystemMemory(): Promise<SystemMemory> {
+    const devices = await this.getDevices()
+    let totalVRAM = 0
+
+    if (devices.length > 0) {
+      // Sum total VRAM across all GPUs
+      totalVRAM = devices
+        .map((d) => d.mem * 1024 * 1024)
+        .reduce((a, b) => a + b, 0)
+    }
+
+    // Get system RAM
+    const sys = await getSystemUsage()
+    const totalRAM = sys.total_memory * 1024 * 1024
+
+    const totalMemory = totalVRAM + totalRAM
+
+    logger.info(
+      `Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
+    )
+
+    return {
+      totalVRAM,
+      totalRAM,
+      totalMemory,
+    }
+  }
+
+  private async getKVCachePerToken(
+    meta: Record<string, any>
+  ): Promise<number> {
+    const arch = meta['general.architecture']
+    const nLayer = Number(meta[`${arch}.block_count`])
+    const nHead = Number(meta[`${arch}.attention.head_count`])
+
+    // Get head dimensions
+    const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
+    const embeddingLen = Number(meta[`${arch}.embedding_length`])
+    const headDim = embeddingLen / nHead
+
+    // KV cache uses head_count_kv (for GQA models) or head_count
+    // Each token needs K and V, both are fp16 (2 bytes)
+    const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
+
+    return bytesPerToken
+  }
+
+  private async getLayerSize(
+    path: string,
+    meta: Record<string, any>
+  ): Promise<{ layerSize: number; totalLayers: number }> {
+    const modelSize = await this.getModelSize(path)
+    const arch = meta['general.architecture']
+    const totalLayers = Number(meta[`${arch}.block_count`])
+    if (!totalLayers) throw new Error('Invalid metadata: block_count not found')
+    return { layerSize: modelSize / totalLayers, totalLayers }
+  }
+
+  async planModelLoad(
+    path: string,
+    requestedCtx?: number,
+    mmprojPath?: string
+  ): Promise<ModelPlan> {
+    const modelSize = await this.getModelSize(path)
+    const memoryInfo = await this.getTotalSystemMemory()
+    const gguf = await readGgufMetadata(path)
+
+    // Get mmproj size if provided
+    let mmprojSize = 0
+    if (mmprojPath) {
+      mmprojSize = await this.getModelSize(mmprojPath)
+    }
+
+    const { layerSize, totalLayers } = await this.getLayerSize(
+      path,
+      gguf.metadata
+    )
+
+    // KV cache size per token
+    const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
+
+    // Debug logging
+    logger.info(
+      `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
+    )
+
+    // Validate critical values
+    if (!modelSize || modelSize <= 0) {
+      throw new Error(`Invalid model size: ${modelSize}`)
+    }
+    if (!kvCachePerToken || kvCachePerToken <= 0) {
+      throw new Error(`Invalid KV cache per token: ${kvCachePerToken}`)
+    }
+    if (!layerSize || layerSize <= 0) {
+      throw new Error(`Invalid layer size: ${layerSize}`)
+    }
+
+    // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
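+    // Worked example with illustrative numbers (not taken from this change):
+    // a GGUF with 32 blocks, 32 heads, 8 KV heads and embedding length 4096
+    // gives headDim = 128, so kvCachePerToken = 8 * 128 * 2 * 2 * 32 =
+    // 131072 bytes, and an 8192-token KV cache needs about 1 GiB. On a GPU
+    // reporting 8 GiB, the budget below comes out to
+    // (8 GiB - 0.5 GiB reserve) * 0.8 = 6 GiB for model layers, the KV cache,
+    // and an offloaded mmproj.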
+ const GPU_OVERHEAD_FACTOR = 0.8 + + // VRAM budget with overhead consideration + const VRAM_RESERVE_GB = 0.5 + const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024 + const usableVRAM = Math.max( + 0, + (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR + ) + + // Get model's maximum context length + const arch = gguf.metadata['general.architecture'] + const modelMaxContextLength = + Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback + + // Set minimum context length + const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility + + // System RAM budget + const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 } + + logger.info( + `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}` + ) + + // Validate memory info + if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) { + throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`) + } + if (!memoryInfo.totalVRAM || isNaN(memoryInfo.totalVRAM)) { + throw new Error(`Invalid total VRAM: ${memoryInfo.totalVRAM}`) + } + if (!this.memoryMode || !(this.memoryMode in memoryPercentages)) { + throw new Error( + `Invalid memory mode: ${this.memoryMode}. Must be 'high', 'medium', or 'low'` + ) + } + + // Calculate actual system RAM + const actualSystemRAM = Math.max( + 0, + memoryInfo.totalMemory - memoryInfo.totalVRAM + ) + const usableSystemMemory = + actualSystemRAM * memoryPercentages[this.memoryMode] + + logger.info( + `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}` + ) + + // --- Priority 1: Allocate mmproj (if exists) --- + let noOffloadMmproj = false + let remainingVRAM = usableVRAM + + if (mmprojSize > 0) { + if (mmprojSize <= remainingVRAM) { + noOffloadMmproj = true + remainingVRAM -= mmprojSize + logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`) + } else { + logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`) + } + } + + // --- Priority 2: Calculate optimal layer/context balance --- + let gpuLayers = 0 + let maxContextLength = MIN_CONTEXT_LENGTH + let noOffloadKVCache = false + let mode: ModelPlan['mode'] = 'Unsupported' + + // Calculate how much VRAM we need for different context sizes + const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072] + const targetContext = requestedCtx || modelMaxContextLength + + // Find the best balance of layers and context + let bestConfig = { + layers: 0, + context: MIN_CONTEXT_LENGTH, + vramUsed: 0, + } + + for (const ctxSize of contextSizes) { + if (ctxSize > targetContext) break + + const kvCacheSize = ctxSize * kvCachePerToken + const availableForLayers = remainingVRAM - kvCacheSize + + if (availableForLayers <= 0) continue + + const possibleLayers = Math.min( + Math.floor(availableForLayers / layerSize), + totalLayers + ) + + if (possibleLayers > 0) { + const totalVramNeeded = possibleLayers * layerSize + kvCacheSize + + // Verify this fits with some margin + if (totalVramNeeded <= remainingVRAM * 0.95) { + bestConfig = { + layers: possibleLayers, + context: ctxSize, + vramUsed: totalVramNeeded, + } + } + } + } + + // Apply the best configuration found + if (bestConfig.layers > 0) { + gpuLayers = bestConfig.layers + maxContextLength = bestConfig.context + noOffloadKVCache = false + mode = gpuLayers === totalLayers ? 
'GPU' : 'Hybrid'
+
+      logger.info(
+        `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
+          `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
+      )
+    } else {
+      // Fallback: Try minimal GPU layers with KV cache on CPU
+      gpuLayers = Math.min(
+        Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
+        totalLayers
+      )
+
+      if (gpuLayers > 0) {
+        // Calculate available system RAM for KV cache
+        const cpuLayers = totalLayers - gpuLayers
+        const modelCPUSize = cpuLayers * layerSize
+        const mmprojCPUSize =
+          mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
+        const systemRAMUsed = modelCPUSize + mmprojCPUSize
+        const availableSystemRAMForKVCache = Math.max(
+          0,
+          usableSystemMemory - systemRAMUsed
+        )
+
+        // Calculate context that fits in system RAM
+        const systemRAMContext = Math.min(
+          Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
+          targetContext
+        )
+
+        if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
+          maxContextLength = systemRAMContext
+          noOffloadKVCache = true
+          mode = 'Hybrid'
+
+          logger.info(
+            `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
+              `${maxContextLength} context on CPU RAM`
+          )
+        } else {
+          // Can't fit a reasonable context even with CPU RAM,
+          // so reduce GPU layers further
+          gpuLayers = Math.floor(gpuLayers / 2)
+          maxContextLength = MIN_CONTEXT_LENGTH
+          noOffloadKVCache = true
+          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+        }
+      } else {
+        // Pure CPU mode
+        gpuLayers = 0
+        noOffloadKVCache = true
+
+        // Calculate context for pure CPU mode
+        const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
+        const availableForKVCache = Math.max(
+          0,
+          usableSystemMemory - totalCPUMemoryNeeded
+        )
+
+        maxContextLength = Math.min(
+          Math.max(
+            MIN_CONTEXT_LENGTH,
+            Math.floor(availableForKVCache / kvCachePerToken)
+          ),
+          targetContext
+        )
+
+        mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
+      }
+    }
+
+    // Safety check: verify total GPU memory usage
+    if (gpuLayers > 0 && !noOffloadKVCache) {
+      let estimatedGPUUsage =
+        gpuLayers * layerSize +
+        maxContextLength * kvCachePerToken +
+        (noOffloadMmproj ? mmprojSize : 0)
+
+      if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
+        logger.warn(
+          `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
+        )
+
+        // Reduce context first, re-estimating usage after each halving
+        while (
+          maxContextLength > MIN_CONTEXT_LENGTH &&
+          estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
+        ) {
+          maxContextLength = Math.floor(maxContextLength / 2)
+          estimatedGPUUsage =
+            gpuLayers * layerSize +
+            maxContextLength * kvCachePerToken +
+            (noOffloadMmproj ? mmprojSize : 0)
+        }
+
+        // If it still doesn't fit, reduce layers and move the KV cache to CPU
+        if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
+          gpuLayers = Math.floor(gpuLayers * 0.7)
+          mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
+          noOffloadKVCache = true // Move KV cache to CPU
+        }
+      }
+    }
+
+    // Apply user-requested context limit if specified
+    if (requestedCtx && requestedCtx > 0) {
+      maxContextLength = Math.min(maxContextLength, requestedCtx)
+      logger.info(
+        `User requested context: ${requestedCtx}, final: ${maxContextLength}`
+      )
+    }
+
+    // Ensure we never exceed the model's maximum context
+    maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
+
+    // Final validation
+    if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
+      mode = 'Unsupported'
+    }
+
+    // Ensure maxContextLength is valid
+    maxContextLength = isNaN(maxContextLength)
+      ?
MIN_CONTEXT_LENGTH + : Math.max(MIN_CONTEXT_LENGTH, maxContextLength) + + // Log final plan + const mmprojInfo = mmprojPath + ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}` + : '' + + logger.info( + `Final plan for ${path}: gpuLayers=${gpuLayers}/${totalLayers}, ` + + `maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, ` + + `mode=${mode}${mmprojInfo}` + ) + + return { + gpuLayers, + maxContextLength, + noOffloadKVCache, + mode, + noOffloadMmproj, + } + } /** - * estimate KVCache size of from a given metadata - * + * estimate KVCache size from a given metadata */ private async estimateKVCache( meta: Record, @@ -1907,6 +2281,7 @@ export default class llamacpp_extension extends AIEngine { `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}` ) } + let ctxLen: number if (!ctx_size) { ctxLen = Number(meta[`${arch}.context_length`]) @@ -1941,13 +2316,13 @@ export default class llamacpp_extension extends AIEngine { } } - /* - * check the support status of a model by its path (local/remote) + /** + * Check the support status of a model by its path (local/remote) * - * * Returns: - * - "RED" → weights don't fit - * - "YELLOW" → weights fit, KV cache doesn't - * - "GREEN" → both weights + KV cache fit + * Returns: + * - "RED" → weights don't fit in total memory + * - "YELLOW" → weights fit in VRAM but need system RAM, or KV cache doesn't fit + * - "GREEN" → both weights + KV cache fit in VRAM */ async isModelSupported( path: string, @@ -1955,46 +2330,48 @@ export default class llamacpp_extension extends AIEngine { ): Promise<'RED' | 'YELLOW' | 'GREEN'> { try { const modelSize = await this.getModelSize(path) + const memoryInfo = await this.getTotalSystemMemory() + logger.info(`modelSize: ${modelSize}`) - let gguf: GgufMetadata - gguf = await readGgufMetadata(path) + + const gguf = await readGgufMetadata(path) let kvCacheSize: number if (ctx_size) { kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size) } else { kvCacheSize = await this.estimateKVCache(gguf.metadata) } - // total memory consumption = model weights + kvcache + a small buffer for outputs - // output buffer is small so not considering here + + // Total memory consumption = model weights + kvcache const totalRequired = modelSize + kvCacheSize logger.info( `isModelSupported: Total memory requirement: ${totalRequired} for ${path}` ) - let totalMemBytes: number - const devices = await this.getDevices() - if (devices.length > 0) { - // Sum total memory across all GPUs - totalMemBytes = devices - .map((d) => d.mem * 1024 * 1024) - .reduce((a, b) => a + b, 0) - } else { - // CPU fallback - const sys = await getSystemUsage() - totalMemBytes = sys.total_memory * 1024 * 1024 - } // Use 80% of total memory as the usable limit const USABLE_MEMORY_PERCENTAGE = 0.8 - const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE + const usableTotalMemory = + memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE + const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE - // check model size wrt 80% of system memory - if (modelSize > usableMemBytes) { + // Check if model fits in total memory at all + if (modelSize > usableTotalMemory) { return 'RED' - } else if (modelSize + kvCacheSize > usableMemBytes) { - return 'YELLOW' - } else { + } + + // Check if everything fits in VRAM (ideal case) + if (totalRequired <= usableVRAM) { return 'GREEN' } + + // Check if model fits in VRAM but total requirement exceeds VRAM + // OR if total 
requirement fits in total memory but not in VRAM + if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) { + return 'YELLOW' + } + + // If we get here, nothing fits properly + return 'RED' } catch (e) { throw new Error(String(e)) } @@ -2006,39 +2383,42 @@ export default class llamacpp_extension extends AIEngine { async validateGgufFile(filePath: string): Promise<{ isValid: boolean error?: string - metadata?: GgufMetadata + metadata?: any }> { try { logger.info(`Validating GGUF file: ${filePath}`) const metadata = await readGgufMetadata(filePath) - + // Log full metadata for debugging logger.info('Full GGUF metadata:', JSON.stringify(metadata, null, 2)) - + // Check if architecture is 'clip' which is not supported for text generation const architecture = metadata.metadata?.['general.architecture'] logger.info(`Model architecture: ${architecture}`) - + if (architecture === 'clip') { - const errorMessage = 'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.' + const errorMessage = + 'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.' logger.error('CLIP architecture detected:', architecture) return { isValid: false, error: errorMessage, - metadata + metadata, } } - + logger.info('Model validation passed. Architecture:', architecture) return { isValid: true, - metadata + metadata, } } catch (error) { logger.error('Failed to validate GGUF file:', error) return { isValid: false, - error: `Failed to read model metadata: ${error instanceof Error ? error.message : 'Unknown error'}` + error: `Failed to read model metadata: ${ + error instanceof Error ? 
error.message : 'Unknown error' + }`, } } } diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx index a18f5184a..4a1525003 100644 --- a/web-app/src/containers/ModelSetting.tsx +++ b/web-app/src/containers/ModelSetting.tsx @@ -1,5 +1,6 @@ -import { IconSettings } from '@tabler/icons-react' +import { IconSettings, IconLoader } from '@tabler/icons-react' import debounce from 'lodash.debounce' +import { useState } from 'react' import { Sheet, @@ -9,6 +10,7 @@ import { SheetTitle, SheetTrigger, } from '@/components/ui/sheet' +import { Button } from '@/components/ui/button' import { DynamicControllerSetting } from '@/containers/dynamicControllerSetting' import { useModelProvider } from '@/hooks/useModelProvider' import { useServiceHub } from '@/hooks/useServiceHub' @@ -30,11 +32,134 @@ export function ModelSetting({ const { t } = useTranslation() const serviceHub = useServiceHub() + const [isPlanning, setIsPlanning] = useState(false) + // Create a debounced version of stopModel that waits 500ms after the last call const debouncedStopModel = debounce((modelId: string) => { serviceHub.models().stopModel(modelId) }, 500) + const handlePlanModelLoad = async () => { + if (provider.provider !== 'llamacpp') { + console.warn('planModelLoad is only available for llamacpp provider') + return + } + setIsPlanning(true) + try { + // Read the model config to get the actual model path + const modelConfig = await serviceHub.app().readYaml<{ + model_path: string + }>(`llamacpp/models/${model.id}/model.yml`) + + if (modelConfig && modelConfig.model_path) { + const result = await serviceHub + .models() + .planModelLoad(modelConfig.model_path) + + // Apply the recommended settings to the model sequentially to avoid race conditions + const settingsToUpdate: Array<{ + key: string + value: number | boolean + }> = [] + + if (model.settings?.ngl && result.gpuLayers !== undefined) { + settingsToUpdate.push({ key: 'ngl', value: result.gpuLayers }) + } + + if (model.settings?.ctx_len && result.maxContextLength !== undefined) { + settingsToUpdate.push({ + key: 'ctx_len', + value: result.maxContextLength, + }) + } + + if ( + model.settings?.no_kv_offload && + result.noOffloadKVCache !== undefined + ) { + settingsToUpdate.push({ + key: 'no_kv_offload', + value: result.noOffloadKVCache, + }) + } + + // Apply all settings in a single update to avoid race conditions + if (settingsToUpdate.length > 0) { + handleMultipleSettingsChange(settingsToUpdate) + } + } else { + console.warn('No model_path found in config for', model.id) + } + } catch (error) { + console.error('Error calling planModelLoad:', error) + } finally { + setIsPlanning(false) + } + } + + const handleMultipleSettingsChange = ( + settingsToUpdate: Array<{ key: string; value: number | boolean }> + ) => { + if (!provider) return + + // Create a copy of the model with ALL updated settings at once + let updatedModel = { ...model } + + settingsToUpdate.forEach(({ key, value }) => { + const existingSetting = updatedModel.settings?.[key] as ProviderSetting + updatedModel = { + ...updatedModel, + settings: { + ...updatedModel.settings, + [key]: { + ...existingSetting, + controller_props: { + ...existingSetting?.controller_props, + value: value, + }, + } as ProviderSetting, + }, + } + }) + + // Find the model index in the provider's models array + const modelIndex = provider.models.findIndex((m) => m.id === model.id) + + if (modelIndex !== -1) { + // Create a copy of the provider's models array + const updatedModels = 
[...provider.models] + + // Update the specific model in the array + updatedModels[modelIndex] = updatedModel as Model + + // Update the provider with the new models array + updateProvider(provider.provider, { + models: updatedModels, + }) + + // Check if any of the updated settings require a model restart + const requiresRestart = settingsToUpdate.some( + ({ key }) => + key === 'ctx_len' || + key === 'ngl' || + key === 'chat_template' || + key === 'offload_mmproj' + ) + + if (requiresRestart) { + // Check if model is running before stopping it + serviceHub + .models() + .getActiveModels() + .then((activeModels) => { + if (activeModels.includes(model.id)) { + debouncedStopModel(model.id) + } + }) + } + } + } + const handleSettingChange = ( key: string, value: string | boolean | number @@ -72,8 +197,22 @@ export function ModelSetting({ }) // Call debounced stopModel only when updating ctx_len, ngl, chat_template, or offload_mmproj - if (key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || key === 'offload_mmproj') { - debouncedStopModel(model.id) + // and only if the model is currently running + if ( + key === 'ctx_len' || + key === 'ngl' || + key === 'chat_template' || + key === 'offload_mmproj' + ) { + // Check if model is running before stopping it + serviceHub + .models() + .getActiveModels() + .then((activeModels) => { + if (activeModels.includes(model.id)) { + debouncedStopModel(model.id) + } + }) } } } @@ -98,7 +237,36 @@ export function ModelSetting({ {t('common:modelSettings.description')} + + {/* Model Load Planning Section - Only show for llamacpp provider */} + {provider.provider === 'llamacpp' && ( +
+          <div>
+            <div>
+              <div>Optimize Settings</div>
+              <div>
+                Analyze your system and model, then apply optimal loading
+                settings automatically
+              </div>
+            </div>
+            <Button onClick={handlePlanModelLoad} disabled={isPlanning}>
+              {isPlanning ? <IconLoader className="animate-spin" /> : 'Optimize'}
+            </Button>
+          </div>
+        )}
+
{Object.entries(model.settings || {}).map(([key, value]) => { const config = value as ProviderSetting diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts index 86d7f4dba..4d476ae7c 100644 --- a/web-app/src/hooks/useModelProvider.ts +++ b/web-app/src/hooks/useModelProvider.ts @@ -93,7 +93,11 @@ export const useModelProvider = create()( ? legacyModels : models ).find( - (m) => m.id.split(':').slice(0, 2).join(getServiceHub().path().sep()) === model.id + (m) => + m.id + .split(':') + .slice(0, 2) + .join(getServiceHub().path().sep()) === model.id )?.settings || model.settings const existingModel = models.find((m) => m.id === model.id) return { @@ -227,7 +231,7 @@ export const useModelProvider = create()( > } - if (version === 0 && state?.providers) { + if (version <= 1 && state?.providers) { state.providers.forEach((provider) => { // Update cont_batching description for llamacpp provider if (provider.provider === 'llamacpp' && provider.settings) { @@ -270,6 +274,15 @@ export const useModelProvider = create()( }, } } + + if (!model.settings.no_kv_offload) { + model.settings.no_kv_offload = { + ...modelSettings.no_kv_offload, + controller_props: { + ...modelSettings.no_kv_offload.controller_props, + }, + } + } }) } }) @@ -277,7 +290,7 @@ export const useModelProvider = create()( return state }, - version: 1, + version: 2, } ) ) diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts index b4d5164e7..32d05d70c 100644 --- a/web-app/src/lib/predefined.ts +++ b/web-app/src/lib/predefined.ts @@ -144,4 +144,13 @@ export const modelSettings = { type: 'text', }, }, + no_kv_offload: { + key: 'no_kv_offload', + title: 'Disable KV Offload', + description: 'Disable KV cache offload to GPU (if GPU is used).', + controller_type: 'checkbox', + controller_props: { + value: false, + }, + }, } diff --git a/web-app/src/routes/settings/providers/$providerName.tsx b/web-app/src/routes/settings/providers/$providerName.tsx index b24baf5cf..5bcc3de5a 100644 --- a/web-app/src/routes/settings/providers/$providerName.tsx +++ b/web-app/src/routes/settings/providers/$providerName.tsx @@ -1,3 +1,4 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ import { Card, CardItem } from '@/containers/Card' import HeaderPage from '@/containers/HeaderPage' import SettingsMenu from '@/containers/SettingsMenu' @@ -116,22 +117,25 @@ function ProviderDetail() { // Add 'vision' capability if not already present AND if user hasn't manually configured capabilities // Check if model has a custom capabilities config flag - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const hasUserConfiguredCapabilities = (model as any)._userConfiguredCapabilities === true - - if (!capabilities.includes('vision') && !hasUserConfiguredCapabilities) { + + const hasUserConfiguredCapabilities = + (model as any)._userConfiguredCapabilities === true + + if ( + !capabilities.includes('vision') && + !hasUserConfiguredCapabilities + ) { const updatedModels = [...llamacppProvider.models] updatedModels[modelIndex] = { ...model, capabilities: [...capabilities, 'vision'], // Mark this as auto-detected, not user-configured _autoDetectedVision: true, - // eslint-disable-next-line @typescript-eslint/no-explicit-any } as any updateProviderState('llamacpp', { models: updatedModels }) console.log( - `Vision capability auto-added to model after provider refresh: ${importedModelName}` + `Vision capability added to model after provider refresh: ${importedModelName}` ) } } @@ -257,33 +261,36 @@ 
function ProviderDetail() { } } - const handleStartModel = (modelId: string) => { + const handleStartModel = async (modelId: string) => { // Add model to loading state setLoadingModels((prev) => [...prev, modelId]) - if (provider) - // Original: startModel(provider, modelId).then(() => { setActiveModels((prevModels) => [...prevModels, modelId]) }) - serviceHub - .models() - .startModel(provider, modelId) - .then(() => { - // Refresh active models after starting - serviceHub - .models() - .getActiveModels() - .then((models) => setActiveModels(models || [])) - }) - .catch((error) => { - console.error('Error starting model:', error) - if (error && typeof error === 'object' && 'message' in error) { - setModelLoadError(error) - } else { - setModelLoadError(`${error}`) - } - }) - .finally(() => { - // Remove model from loading state - setLoadingModels((prev) => prev.filter((id) => id !== modelId)) - }) + if (provider) { + try { + // Start the model with plan result + await serviceHub.models().startModel(provider, modelId) + + // Refresh active models after starting + serviceHub + .models() + .getActiveModels() + .then((models) => setActiveModels(models || [])) + } catch (error) { + console.error('Error starting model:', error) + if ( + error && + typeof error === 'object' && + 'message' in error && + typeof error.message === 'string' + ) { + setModelLoadError({ message: error.message }) + } else { + setModelLoadError(typeof error === 'string' ? error : `${error}`) + } + } finally { + // Remove model from loading state + setLoadingModels((prev) => prev.filter((id) => id !== modelId)) + } + } } const handleStopModel = (modelId: string) => { diff --git a/web-app/src/services/models/default.ts b/web-app/src/services/models/default.ts index d4322b971..54595d448 100644 --- a/web-app/src/services/models/default.ts +++ b/web-app/src/services/models/default.ts @@ -17,6 +17,7 @@ import type { HuggingFaceRepo, CatalogModel, ModelValidationResult, + ModelPlan, } from './types' // TODO: Replace this with the actual provider later @@ -491,4 +492,47 @@ export class DefaultModelsService implements ModelsService { } } } + + async planModelLoad( + modelPath: string, + requestedCtx?: number + ): Promise { + try { + const engine = this.getEngine('llamacpp') as AIEngine & { + planModelLoad?: ( + path: string, + requestedCtx?: number + ) => Promise + } + + if (engine && typeof engine.planModelLoad === 'function') { + // Get the full absolute path to the model file + const janDataFolderPath = await import('@janhq/core').then((core) => + core.getJanDataFolderPath() + ) + const joinPath = await import('@janhq/core').then( + (core) => core.joinPath + ) + const fullModelPath = await joinPath([janDataFolderPath, modelPath]) + return await engine.planModelLoad(fullModelPath, requestedCtx) + } + + // Fallback if method is not available + console.warn('planModelLoad method not available in llamacpp engine') + return { + gpuLayers: 0, + maxContextLength: 2048, + noOffloadKVCache: true, + mode: 'Unsupported', + } + } catch (error) { + console.error(`Error planning model load for path ${modelPath}:`, error) + return { + gpuLayers: 0, + maxContextLength: 2048, + noOffloadKVCache: true, + mode: 'Unsupported', + } + } + } } diff --git a/web-app/src/services/models/types.ts b/web-app/src/services/models/types.ts index 7d51d8b09..920cbfe81 100644 --- a/web-app/src/services/models/types.ts +++ b/web-app/src/services/models/types.ts @@ -81,10 +81,20 @@ export interface ModelValidationResult { metadata?: GgufMetadata } +export interface 
ModelPlan { + gpuLayers: number + maxContextLength: number + noOffloadKVCache: boolean + mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' +} + export interface ModelsService { fetchModels(): Promise fetchModelCatalog(): Promise - fetchHuggingFaceRepo(repoId: string, hfToken?: string): Promise + fetchHuggingFaceRepo( + repoId: string, + hfToken?: string + ): Promise convertHfRepoToCatalogModel(repo: HuggingFaceRepo): CatalogModel updateModel(model: Partial): Promise pullModel( @@ -107,14 +117,24 @@ export interface ModelsService { getActiveModels(provider?: string): Promise stopModel(model: string, provider?: string): Promise stopAllModels(): Promise - startModel(provider: ProviderObject, model: string): Promise + startModel( + provider: ProviderObject, + model: string + ): Promise isToolSupported(modelId: string): Promise checkMmprojExistsAndUpdateOffloadMMprojSetting( modelId: string, - updateProvider?: (providerName: string, data: Partial) => void, + updateProvider?: ( + providerName: string, + data: Partial + ) => void, getProviderByName?: (providerName: string) => ModelProvider | undefined ): Promise<{ exists: boolean; settingsUpdated: boolean }> checkMmprojExists(modelId: string): Promise - isModelSupported(modelPath: string, ctxSize?: number): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'> + isModelSupported( + modelPath: string, + ctxSize?: number + ): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'> validateGgufFile(filePath: string): Promise -} \ No newline at end of file + planModelLoad(modelPath: string, requestedCtx?: number): Promise +}
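A minimal sketch of how the new ModelPlan can be consumed. It mirrors what ModelSetting.tsx does after calling planModelLoad, but the helper name planToSettings, the import path, and the example model path are assumptions for illustration, not part of this change:

    import type { ModelPlan } from '@/services/models/types'

    // Map a plan onto the model setting keys this PR updates
    // ('ngl', 'ctx_len', 'no_kv_offload').
    function planToSettings(
      plan: ModelPlan
    ): Array<{ key: string; value: number | boolean }> {
      if (plan.mode === 'Unsupported') {
        throw new Error('Model does not fit within the configured memory budget')
      }
      return [
        { key: 'ngl', value: plan.gpuLayers },
        { key: 'ctx_len', value: plan.maxContextLength },
        { key: 'no_kv_offload', value: plan.noOffloadKVCache },
      ]
    }

    // Usage (hypothetical model path):
    // const plan = await serviceHub.models().planModelLoad('llamacpp/models/my-model/model.gguf')
    // planToSettings(plan).forEach(({ key, value }) => handleSettingChange(key, value))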