feat: Smart model management (#6390)

* feat: Smart model management

* **New UI option** – `memory_util` added to `settings.json` with a dropdown (high / medium / low) to let users control how aggressively the engine uses system memory.
* **Configuration updates** – `LlamacppConfig` now includes `memory_util`; the extension class stores it in a new `memoryMode` property and handles updates through `updateConfig`.
* **System memory handling**
  * Introduced `SystemMemory` interface and `getTotalSystemMemory()` to report combined VRAM + RAM.
  * Added helper methods `getKVCachePerToken` and `getLayerSize`, along with a new `ModelPlan` type.
* **Smart model‑load planner** – `planModelLoad()` computes (a usage sketch follows this list):
  * Number of GPU layers that can fit in usable VRAM.
  * Maximum context length based on KV‑cache size and the selected memory utilization mode (high/medium/low).
  * Whether KV‑cache must be off‑loaded to CPU and the overall loading mode (GPU, Hybrid, CPU, Unsupported).
  * Detailed logging of the planning decision.
* **Improved support check** – `isModelSupported()` now:
  * Uses the combined VRAM/RAM totals from `getTotalSystemMemory()`.
  * Applies an 80% usable‑memory heuristic.
  * Returns **GREEN** only when both weights and KV‑cache fit in VRAM, **YELLOW** when they fit only in total memory or require CPU off‑load, and **RED** when the model cannot fit at all.
* **Cleanup** – Removed unused `GgufMetadata` import; updated imports and type definitions accordingly.
* **Documentation/comments** – Added explanatory JSDoc comments for the new methods and clarified the return semantics of `isModelSupported`.
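
A minimal sketch of how a caller could consume the planner's output. The `ModelPlan` shape mirrors the type added in this PR; `applyPlanToSettings` and the example numbers are hypothetical illustrations, not code from the diff.

```ts
// Shape returned by planModelLoad() (mirrors the type introduced in this PR).
type ModelPlan = {
  gpuLayers: number
  maxContextLength: number
  noOffloadKVCache: boolean
  noOffloadMmproj?: boolean
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}

// Hypothetical helper: translate a plan into the model settings the UI's
// "Auto-Optimize Settings" button updates (ngl, ctx_len, no_kv_offload).
function applyPlanToSettings(plan: ModelPlan): Record<string, number | boolean> {
  if (plan.mode === 'Unsupported') {
    throw new Error('Model does not fit in available memory')
  }
  return {
    ngl: plan.gpuLayers, // layers off-loaded to the GPU
    ctx_len: plan.maxContextLength, // context window the plan budgets for
    no_kv_offload: plan.noOffloadKVCache, // true => keep KV cache in system RAM
  }
}

// Example plan for a model that only partially fits in VRAM.
const plan: ModelPlan = {
  gpuLayers: 24,
  maxContextLength: 8192,
  noOffloadKVCache: false,
  mode: 'Hybrid',
}
console.log(applyPlanToSettings(plan))
// -> { ngl: 24, ctx_len: 8192, no_kv_offload: false }
```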

* chore: migrate no_kv_offload from llamacpp setting to model setting

* chore: add UI auto optimize model setting

* feat: improve model loading planner with mmproj support and smarter memory budgeting

* Extend `ModelPlan` with optional `noOffloadMmproj` flag to indicate when a multimodal projector can stay in VRAM.
* Add `mmprojPath` parameter to `planModelLoad` and calculate its size, attempting to keep it on GPU when possible.
* Refactor system memory detection:
  * Use `used_memory` (actual free RAM) instead of total RAM for budgeting.
  * Introduce a `usableRAM` placeholder for future use (removed again in the follow-up cleanup below).
* Rewrite KV‑cache size calculation (a worked example follows this list):
  * Properly handle GQA models via `attention.head_count_kv`.
  * Compute bytes per token as `nHeadKV * headDim * 2 * 2 * nLayer`.
* Replace the old 70 % VRAM heuristic with a more flexible budget:
  * Reserve a fixed VRAM amount and apply an overhead factor.
  * Derive usable system RAM from total memory minus VRAM.
* Implement a robust allocation algorithm:
  * Prioritize placing the mmproj in VRAM.
  * Search for the best balance of GPU layers and context length.
  * Fall back to hybrid and pure‑CPU modes with detailed safety checks.
* Add extensive validation of model size, KV‑cache size, layer size, and memory mode.
* Improve logging throughout the planning process for easier debugging.
* Adjust final plan return shape to include the new `noOffloadMmproj` field.
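
To make the new KV‑cache math concrete, here is a hedged worked example. The per‑token formula and the 0.5 GiB VRAM reserve / 80% overhead factor match the code in the diff below; the Llama‑3‑8B‑style head/layer counts and the 8 GiB GPU are illustrative assumptions.

```ts
// bytesPerToken = nHeadKV * headDim * 2 (K and V) * 2 (fp16 bytes) * nLayer
const nLayer = 32
const nHead = 32
const nHeadKV = 8 // GQA: attention.head_count_kv
const embeddingLen = 4096
const headDim = embeddingLen / nHead // 128

const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // 131,072 B ≈ 128 KiB per token
const ctx = 8192
const kvCacheBytes = ctx * bytesPerToken // 1 GiB of KV cache for an 8k context

// VRAM budget sketch using the same constants as the planner:
// reserve 0.5 GiB, then keep 20% headroom for GPU overhead.
const totalVRAM = 8 * 1024 ** 3 // assumed 8 GiB GPU
const usableVRAM = Math.max(0, (totalVRAM - 0.5 * 1024 ** 3) * 0.8) // ≈ 6 GiB

console.log({ bytesPerToken, kvCacheBytes, usableVRAM })
```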

* remove unused variable

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
Akarshan Biswas 2025-09-11 09:48:03 +05:30 committed by GitHub
parent 3158722a63
commit 7a174e621a
8 changed files with 741 additions and 94 deletions

View File

@ -36,6 +36,21 @@
"controllerType": "checkbox",
"controllerProps": { "value": true }
},
{
"key": "memory_util",
"title": "Smart Memory utilization",
"description": "Smart memory utilization mode for running local GGUF models",
"controllerType": "dropdown",
"controllerProps": {
"value": "high",
"options": [
{ "value": "high", "name": "High" },
{ "value": "medium", "name": "Medium" },
{ "value": "low", "name": "Low" }
],
"recommended": "high"
}
},
{
"key": "threads",
"title": "Threads",
@ -178,15 +193,6 @@
"value": false
}
},
{
"key": "no_kv_offload",
"title": "Disable KV Offload",
"description": "Disable KV cache offload to GPU (if GPU is used).",
"controllerType": "checkbox",
"controllerProps": {
"value": false
}
},
{
"key": "cache_type_k",
"title": "KV Cache K Type",

View File

@ -35,10 +35,7 @@ import {
import { invoke } from '@tauri-apps/api/core'
import { getProxyConfig } from './util'
import { basename } from '@tauri-apps/api/path'
import {
GgufMetadata,
readGgufMetadata,
} from '@janhq/tauri-plugin-llamacpp-api'
import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api'
import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api'
type LlamacppConfig = {
@ -46,6 +43,7 @@ type LlamacppConfig = {
auto_update_engine: boolean
auto_unload: boolean
llamacpp_env: string
memory_util: string
chat_template: string
n_gpu_layers: number
offload_mmproj: boolean
@ -74,6 +72,14 @@ type LlamacppConfig = {
ctx_shift: boolean
}
type ModelPlan = {
gpuLayers: number
maxContextLength: number
noOffloadKVCache: boolean
noOffloadMmproj?: boolean
mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}
interface DownloadItem {
url: string
save_path: string
@ -116,6 +122,12 @@ interface DeviceList {
free: number
}
interface SystemMemory {
totalVRAM: number
totalRAM: number
totalMemory: number
}
/**
* Override the default app.log function to use Jan's logging system.
* @param args
@ -159,6 +171,7 @@ export default class llamacpp_extension extends AIEngine {
provider: string = 'llamacpp'
autoUnload: boolean = true
llamacpp_env: string = ''
memoryMode: string = 'high'
readonly providerId: string = 'llamacpp'
private config: LlamacppConfig
@ -190,6 +203,7 @@ export default class llamacpp_extension extends AIEngine {
this.autoUnload = this.config.auto_unload
this.llamacpp_env = this.config.llamacpp_env
this.memoryMode = this.config.memory_util
// This sets the base directory where model files for this provider are stored.
this.providerPath = await joinPath([
@ -836,6 +850,8 @@ export default class llamacpp_extension extends AIEngine {
this.autoUnload = value as boolean
} else if (key === 'llamacpp_env') {
this.llamacpp_env = value as string
} else if (key === 'memory_util') {
this.memoryMode = value as string
}
}
@ -1864,10 +1880,368 @@ export default class llamacpp_extension extends AIEngine {
'tokenizer.chat_template'
]?.includes('tools')
}
/**
* Get total system memory including both VRAM and RAM
*/
private async getTotalSystemMemory(): Promise<SystemMemory> {
const devices = await this.getDevices()
let totalVRAM = 0
if (devices.length > 0) {
// Sum total VRAM across all GPUs
totalVRAM = devices
.map((d) => d.mem * 1024 * 1024)
.reduce((a, b) => a + b, 0)
}
// Get system RAM
const sys = await getSystemUsage()
const totalRAM = sys.used_memory * 1024 * 1024
const totalMemory = totalVRAM + totalRAM
logger.info(
`Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
)
return {
totalVRAM,
totalRAM,
totalMemory,
}
}
private async getKVCachePerToken(
meta: Record<string, string>
): Promise<number> {
const arch = meta['general.architecture']
const nLayer = Number(meta[`${arch}.block_count`])
const nHead = Number(meta[`${arch}.attention.head_count`])
// Get head dimensions
const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead
const embeddingLen = Number(meta[`${arch}.embedding_length`])
const headDim = embeddingLen / nHead
// KV cache uses head_count_kv (for GQA models) or head_count
// Each token needs K and V, both are fp16 (2 bytes)
const bytesPerToken = nHeadKV * headDim * 2 * 2 * nLayer // K+V, fp16, all layers
return bytesPerToken
}
private async getLayerSize(
path: string,
meta: Record<string, string>
): Promise<{ layerSize: number; totalLayers: number }> {
const modelSize = await this.getModelSize(path)
const arch = meta['general.architecture']
const totalLayers = Number(meta[`${arch}.block_count`])
if (!totalLayers) throw new Error('Invalid metadata: block_count not found')
return { layerSize: modelSize / totalLayers, totalLayers }
}
async planModelLoad(
path: string,
requestedCtx?: number,
mmprojPath?: string
): Promise<ModelPlan> {
const modelSize = await this.getModelSize(path)
const memoryInfo = await this.getTotalSystemMemory()
const gguf = await readGgufMetadata(path)
// Get mmproj size if provided
let mmprojSize = 0
if (mmprojPath) {
mmprojSize = await this.getModelSize(mmprojPath)
}
const { layerSize, totalLayers } = await this.getLayerSize(
path,
gguf.metadata
)
// Fixed KV cache calculation
const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)
// Debug logging
logger.info(
`Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
)
// Validate critical values
if (!modelSize || modelSize <= 0) {
throw new Error(`Invalid model size: ${modelSize}`)
}
if (!kvCachePerToken || kvCachePerToken <= 0) {
throw new Error(`Invalid KV cache per token: ${kvCachePerToken}`)
}
if (!layerSize || layerSize <= 0) {
throw new Error(`Invalid layer size: ${layerSize}`)
}
// GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
const GPU_OVERHEAD_FACTOR = 0.8
// VRAM budget with overhead consideration
const VRAM_RESERVE_GB = 0.5
const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
const usableVRAM = Math.max(
0,
(memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
)
// Get model's maximum context length
const arch = gguf.metadata['general.architecture']
const modelMaxContextLength =
Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback
// Set minimum context length
const MIN_CONTEXT_LENGTH = 2048 // Reduced from 4096 for better compatibility
// System RAM budget
const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 }
logger.info(
`Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
)
// Validate memory info
if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
}
if (!memoryInfo.totalVRAM || isNaN(memoryInfo.totalVRAM)) {
throw new Error(`Invalid total VRAM: ${memoryInfo.totalVRAM}`)
}
if (!this.memoryMode || !(this.memoryMode in memoryPercentages)) {
throw new Error(
`Invalid memory mode: ${this.memoryMode}. Must be 'high', 'medium', or 'low'`
)
}
// Calculate actual system RAM
const actualSystemRAM = Math.max(
0,
memoryInfo.totalMemory - memoryInfo.totalVRAM
)
const usableSystemMemory =
actualSystemRAM * memoryPercentages[this.memoryMode]
logger.info(
`Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
)
// --- Priority 1: Allocate mmproj (if exists) ---
let noOffloadMmproj = false
let remainingVRAM = usableVRAM
if (mmprojSize > 0) {
if (mmprojSize <= remainingVRAM) {
noOffloadMmproj = true
remainingVRAM -= mmprojSize
logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
} else {
logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
}
}
// --- Priority 2: Calculate optimal layer/context balance ---
let gpuLayers = 0
let maxContextLength = MIN_CONTEXT_LENGTH
let noOffloadKVCache = false
let mode: ModelPlan['mode'] = 'Unsupported'
// Calculate how much VRAM we need for different context sizes
const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
const targetContext = requestedCtx || modelMaxContextLength
// Find the best balance of layers and context
let bestConfig = {
layers: 0,
context: MIN_CONTEXT_LENGTH,
vramUsed: 0,
}
for (const ctxSize of contextSizes) {
if (ctxSize > targetContext) break
const kvCacheSize = ctxSize * kvCachePerToken
const availableForLayers = remainingVRAM - kvCacheSize
if (availableForLayers <= 0) continue
const possibleLayers = Math.min(
Math.floor(availableForLayers / layerSize),
totalLayers
)
if (possibleLayers > 0) {
const totalVramNeeded = possibleLayers * layerSize + kvCacheSize
// Verify this fits with some margin
if (totalVramNeeded <= remainingVRAM * 0.95) {
bestConfig = {
layers: possibleLayers,
context: ctxSize,
vramUsed: totalVramNeeded,
}
}
}
}
// Apply the best configuration found
if (bestConfig.layers > 0) {
gpuLayers = bestConfig.layers
maxContextLength = bestConfig.context
noOffloadKVCache = false
mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'
logger.info(
`Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
`VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
)
} else {
// Fallback: Try minimal GPU layers with KV cache on CPU
gpuLayers = Math.min(
Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
totalLayers
)
if (gpuLayers > 0) {
// Calculate available system RAM for KV cache
const cpuLayers = totalLayers - gpuLayers
const modelCPUSize = cpuLayers * layerSize
const mmprojCPUSize =
mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
const systemRAMUsed = modelCPUSize + mmprojCPUSize
const availableSystemRAMForKVCache = Math.max(
0,
usableSystemMemory - systemRAMUsed
)
// Calculate context that fits in system RAM
const systemRAMContext = Math.min(
Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
targetContext
)
if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
maxContextLength = systemRAMContext
noOffloadKVCache = true
mode = 'Hybrid'
logger.info(
`Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
`${maxContextLength} context on CPU RAM`
)
} else {
// Can't fit reasonable context even with CPU RAM
// Reduce GPU layers further
gpuLayers = Math.floor(gpuLayers / 2)
maxContextLength = MIN_CONTEXT_LENGTH
noOffloadKVCache = true
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
}
} else {
// Pure CPU mode
gpuLayers = 0
noOffloadKVCache = true
// Calculate context for pure CPU mode
const totalCPUMemoryNeeded = modelSize + (mmprojSize || 0)
const availableForKVCache = Math.max(
0,
usableSystemMemory - totalCPUMemoryNeeded
)
maxContextLength = Math.min(
Math.max(
MIN_CONTEXT_LENGTH,
Math.floor(availableForKVCache / kvCachePerToken)
),
targetContext
)
mode = maxContextLength >= MIN_CONTEXT_LENGTH ? 'CPU' : 'Unsupported'
}
}
// Safety check: Verify total GPU memory usage
if (gpuLayers > 0 && !noOffloadKVCache) {
let estimatedGPUUsage =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(noOffloadMmproj ? mmprojSize : 0)
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
logger.warn(
`GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
)
// Reduce context first, recomputing the estimate after each halving
while (
maxContextLength > MIN_CONTEXT_LENGTH &&
estimatedGPUUsage > memoryInfo.totalVRAM * 0.9
) {
maxContextLength = Math.floor(maxContextLength / 2)
estimatedGPUUsage =
gpuLayers * layerSize +
maxContextLength * kvCachePerToken +
(noOffloadMmproj ? mmprojSize : 0)
}
// If context reduction alone wasn't enough, reduce layers
if (estimatedGPUUsage > memoryInfo.totalVRAM * 0.9) {
gpuLayers = Math.floor(gpuLayers * 0.7)
mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
noOffloadKVCache = true // Move KV cache to CPU
}
}
}
// Apply user-requested context limit if specified
if (requestedCtx && requestedCtx > 0) {
maxContextLength = Math.min(maxContextLength, requestedCtx)
logger.info(
`User requested context: ${requestedCtx}, final: ${maxContextLength}`
)
}
// Ensure we never exceed model's maximum context
maxContextLength = Math.min(maxContextLength, modelMaxContextLength)
// Final validation
if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
mode = 'Unsupported'
}
// Ensure maxContextLength is valid
maxContextLength = isNaN(maxContextLength)
? MIN_CONTEXT_LENGTH
: Math.max(MIN_CONTEXT_LENGTH, maxContextLength)
// Log final plan
const mmprojInfo = mmprojPath
? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}`
: ''
logger.info(
`Final plan for ${path}: gpuLayers=${gpuLayers}/${totalLayers}, ` +
`maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, ` +
`mode=${mode}${mmprojInfo}`
)
return {
gpuLayers,
maxContextLength,
noOffloadKVCache,
mode,
noOffloadMmproj,
}
}
/**
* estimate KVCache size of from a given metadata
*
* estimate KVCache size from a given metadata
*/
private async estimateKVCache(
meta: Record<string, string>,
@ -1907,6 +2281,7 @@ export default class llamacpp_extension extends AIEngine {
`Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}`
)
}
let ctxLen: number
if (!ctx_size) {
ctxLen = Number(meta[`${arch}.context_length`])
@ -1941,13 +2316,13 @@ export default class llamacpp_extension extends AIEngine {
}
}
/*
* check the support status of a model by its path (local/remote)
/**
* Check the support status of a model by its path (local/remote)
*
* * Returns:
* - "RED" weights don't fit
* - "YELLOW" weights fit, KV cache doesn't
* - "GREEN" both weights + KV cache fit
* Returns:
* - "RED" weights don't fit in total memory
* - "YELLOW" weights fit in VRAM but need system RAM, or KV cache doesn't fit
* - "GREEN" both weights + KV cache fit in VRAM
*/
async isModelSupported(
path: string,
@ -1955,46 +2330,48 @@ export default class llamacpp_extension extends AIEngine {
): Promise<'RED' | 'YELLOW' | 'GREEN'> {
try {
const modelSize = await this.getModelSize(path)
const memoryInfo = await this.getTotalSystemMemory()
logger.info(`modelSize: ${modelSize}`)
let gguf: GgufMetadata
gguf = await readGgufMetadata(path)
const gguf = await readGgufMetadata(path)
let kvCacheSize: number
if (ctx_size) {
kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
} else {
kvCacheSize = await this.estimateKVCache(gguf.metadata)
}
// total memory consumption = model weights + kvcache + a small buffer for outputs
// output buffer is small so not considering here
// Total memory consumption = model weights + kvcache
const totalRequired = modelSize + kvCacheSize
logger.info(
`isModelSupported: Total memory requirement: ${totalRequired} for ${path}`
)
let totalMemBytes: number
const devices = await this.getDevices()
if (devices.length > 0) {
// Sum total memory across all GPUs
totalMemBytes = devices
.map((d) => d.mem * 1024 * 1024)
.reduce((a, b) => a + b, 0)
} else {
// CPU fallback
const sys = await getSystemUsage()
totalMemBytes = sys.total_memory * 1024 * 1024
}
// Use 80% of total memory as the usable limit
const USABLE_MEMORY_PERCENTAGE = 0.8
const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE
const usableTotalMemory =
memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
// check model size wrt 80% of system memory
if (modelSize > usableMemBytes) {
// Check if model fits in total memory at all
if (modelSize > usableTotalMemory) {
return 'RED'
} else if (modelSize + kvCacheSize > usableMemBytes) {
return 'YELLOW'
} else {
}
// Check if everything fits in VRAM (ideal case)
if (totalRequired <= usableVRAM) {
return 'GREEN'
}
// Check if model fits in VRAM but total requirement exceeds VRAM
// OR if total requirement fits in total memory but not in VRAM
if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) {
return 'YELLOW'
}
// If we get here, nothing fits properly
return 'RED'
} catch (e) {
throw new Error(String(e))
}
@ -2006,39 +2383,42 @@ export default class llamacpp_extension extends AIEngine {
async validateGgufFile(filePath: string): Promise<{
isValid: boolean
error?: string
metadata?: GgufMetadata
metadata?: any
}> {
try {
logger.info(`Validating GGUF file: ${filePath}`)
const metadata = await readGgufMetadata(filePath)
// Log full metadata for debugging
logger.info('Full GGUF metadata:', JSON.stringify(metadata, null, 2))
// Check if architecture is 'clip' which is not supported for text generation
const architecture = metadata.metadata?.['general.architecture']
logger.info(`Model architecture: ${architecture}`)
if (architecture === 'clip') {
const errorMessage = 'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.'
const errorMessage =
'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.'
logger.error('CLIP architecture detected:', architecture)
return {
isValid: false,
error: errorMessage,
metadata
metadata,
}
}
logger.info('Model validation passed. Architecture:', architecture)
return {
isValid: true,
metadata
metadata,
}
} catch (error) {
logger.error('Failed to validate GGUF file:', error)
return {
isValid: false,
error: `Failed to read model metadata: ${error instanceof Error ? error.message : 'Unknown error'}`
error: `Failed to read model metadata: ${
error instanceof Error ? error.message : 'Unknown error'
}`,
}
}
}

View File

@ -1,5 +1,6 @@
import { IconSettings } from '@tabler/icons-react'
import { IconSettings, IconLoader } from '@tabler/icons-react'
import debounce from 'lodash.debounce'
import { useState } from 'react'
import {
Sheet,
@ -9,6 +10,7 @@ import {
SheetTitle,
SheetTrigger,
} from '@/components/ui/sheet'
import { Button } from '@/components/ui/button'
import { DynamicControllerSetting } from '@/containers/dynamicControllerSetting'
import { useModelProvider } from '@/hooks/useModelProvider'
import { useServiceHub } from '@/hooks/useServiceHub'
@ -30,11 +32,134 @@ export function ModelSetting({
const { t } = useTranslation()
const serviceHub = useServiceHub()
const [isPlanning, setIsPlanning] = useState(false)
// Create a debounced version of stopModel that waits 500ms after the last call
const debouncedStopModel = debounce((modelId: string) => {
serviceHub.models().stopModel(modelId)
}, 500)
const handlePlanModelLoad = async () => {
if (provider.provider !== 'llamacpp') {
console.warn('planModelLoad is only available for llamacpp provider')
return
}
setIsPlanning(true)
try {
// Read the model config to get the actual model path
const modelConfig = await serviceHub.app().readYaml<{
model_path: string
}>(`llamacpp/models/${model.id}/model.yml`)
if (modelConfig && modelConfig.model_path) {
const result = await serviceHub
.models()
.planModelLoad(modelConfig.model_path)
// Apply the recommended settings to the model sequentially to avoid race conditions
const settingsToUpdate: Array<{
key: string
value: number | boolean
}> = []
if (model.settings?.ngl && result.gpuLayers !== undefined) {
settingsToUpdate.push({ key: 'ngl', value: result.gpuLayers })
}
if (model.settings?.ctx_len && result.maxContextLength !== undefined) {
settingsToUpdate.push({
key: 'ctx_len',
value: result.maxContextLength,
})
}
if (
model.settings?.no_kv_offload &&
result.noOffloadKVCache !== undefined
) {
settingsToUpdate.push({
key: 'no_kv_offload',
value: result.noOffloadKVCache,
})
}
// Apply all settings in a single update to avoid race conditions
if (settingsToUpdate.length > 0) {
handleMultipleSettingsChange(settingsToUpdate)
}
} else {
console.warn('No model_path found in config for', model.id)
}
} catch (error) {
console.error('Error calling planModelLoad:', error)
} finally {
setIsPlanning(false)
}
}
const handleMultipleSettingsChange = (
settingsToUpdate: Array<{ key: string; value: number | boolean }>
) => {
if (!provider) return
// Create a copy of the model with ALL updated settings at once
let updatedModel = { ...model }
settingsToUpdate.forEach(({ key, value }) => {
const existingSetting = updatedModel.settings?.[key] as ProviderSetting
updatedModel = {
...updatedModel,
settings: {
...updatedModel.settings,
[key]: {
...existingSetting,
controller_props: {
...existingSetting?.controller_props,
value: value,
},
} as ProviderSetting,
},
}
})
// Find the model index in the provider's models array
const modelIndex = provider.models.findIndex((m) => m.id === model.id)
if (modelIndex !== -1) {
// Create a copy of the provider's models array
const updatedModels = [...provider.models]
// Update the specific model in the array
updatedModels[modelIndex] = updatedModel as Model
// Update the provider with the new models array
updateProvider(provider.provider, {
models: updatedModels,
})
// Check if any of the updated settings require a model restart
const requiresRestart = settingsToUpdate.some(
({ key }) =>
key === 'ctx_len' ||
key === 'ngl' ||
key === 'chat_template' ||
key === 'offload_mmproj'
)
if (requiresRestart) {
// Check if model is running before stopping it
serviceHub
.models()
.getActiveModels()
.then((activeModels) => {
if (activeModels.includes(model.id)) {
debouncedStopModel(model.id)
}
})
}
}
}
const handleSettingChange = (
key: string,
value: string | boolean | number
@ -72,8 +197,22 @@ export function ModelSetting({
})
// Call debounced stopModel only when updating ctx_len, ngl, chat_template, or offload_mmproj
if (key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || key === 'offload_mmproj') {
debouncedStopModel(model.id)
// and only if the model is currently running
if (
key === 'ctx_len' ||
key === 'ngl' ||
key === 'chat_template' ||
key === 'offload_mmproj'
) {
// Check if model is running before stopping it
serviceHub
.models()
.getActiveModels()
.then((activeModels) => {
if (activeModels.includes(model.id)) {
debouncedStopModel(model.id)
}
})
}
}
}
@ -98,7 +237,36 @@ export function ModelSetting({
<SheetDescription>
{t('common:modelSettings.description')}
</SheetDescription>
{/* Model Load Planning Section - Only show for llamacpp provider */}
{provider.provider === 'llamacpp' && (
<div className="pb-4 border-b border-main-view-fg/10 my-4">
<div>
<h3 className="font-medium mb-1">Optimize Settings</h3>
<p className="text-main-view-fg/70 text-xs mb-3">
Analyze your system and model, then apply optimal loading
settings automatically
</p>
<Button
onClick={handlePlanModelLoad}
disabled={isPlanning}
variant="default"
className="w-full"
>
{isPlanning ? (
<>
<IconLoader size={16} className="mr-2 animate-spin" />
Optimizing...
</>
) : (
<>Auto-Optimize Settings</>
)}
</Button>
</div>
</div>
)}
</SheetHeader>
<div className="px-4 space-y-6">
{Object.entries(model.settings || {}).map(([key, value]) => {
const config = value as ProviderSetting

View File

@ -93,7 +93,11 @@ export const useModelProvider = create<ModelProviderState>()(
? legacyModels
: models
).find(
(m) => m.id.split(':').slice(0, 2).join(getServiceHub().path().sep()) === model.id
(m) =>
m.id
.split(':')
.slice(0, 2)
.join(getServiceHub().path().sep()) === model.id
)?.settings || model.settings
const existingModel = models.find((m) => m.id === model.id)
return {
@ -227,7 +231,7 @@ export const useModelProvider = create<ModelProviderState>()(
>
}
if (version === 0 && state?.providers) {
if (version <= 1 && state?.providers) {
state.providers.forEach((provider) => {
// Update cont_batching description for llamacpp provider
if (provider.provider === 'llamacpp' && provider.settings) {
@ -270,6 +274,15 @@ export const useModelProvider = create<ModelProviderState>()(
},
}
}
if (!model.settings.no_kv_offload) {
model.settings.no_kv_offload = {
...modelSettings.no_kv_offload,
controller_props: {
...modelSettings.no_kv_offload.controller_props,
},
}
}
})
}
})
@ -277,7 +290,7 @@ export const useModelProvider = create<ModelProviderState>()(
return state
},
version: 1,
version: 2,
}
)
)

View File

@ -144,4 +144,13 @@ export const modelSettings = {
type: 'text',
},
},
no_kv_offload: {
key: 'no_kv_offload',
title: 'Disable KV Offload',
description: 'Disable KV cache offload to GPU (if GPU is used).',
controller_type: 'checkbox',
controller_props: {
value: false,
},
},
}

View File

@ -1,3 +1,4 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { Card, CardItem } from '@/containers/Card'
import HeaderPage from '@/containers/HeaderPage'
import SettingsMenu from '@/containers/SettingsMenu'
@ -116,22 +117,25 @@ function ProviderDetail() {
// Add 'vision' capability if not already present AND if user hasn't manually configured capabilities
// Check if model has a custom capabilities config flag
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const hasUserConfiguredCapabilities = (model as any)._userConfiguredCapabilities === true
if (!capabilities.includes('vision') && !hasUserConfiguredCapabilities) {
const hasUserConfiguredCapabilities =
(model as any)._userConfiguredCapabilities === true
if (
!capabilities.includes('vision') &&
!hasUserConfiguredCapabilities
) {
const updatedModels = [...llamacppProvider.models]
updatedModels[modelIndex] = {
...model,
capabilities: [...capabilities, 'vision'],
// Mark this as auto-detected, not user-configured
_autoDetectedVision: true,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} as any
updateProviderState('llamacpp', { models: updatedModels })
console.log(
`Vision capability auto-added to model after provider refresh: ${importedModelName}`
`Vision capability added to model after provider refresh: ${importedModelName}`
)
}
}
@ -257,33 +261,36 @@ function ProviderDetail() {
}
}
const handleStartModel = (modelId: string) => {
const handleStartModel = async (modelId: string) => {
// Add model to loading state
setLoadingModels((prev) => [...prev, modelId])
if (provider)
// Original: startModel(provider, modelId).then(() => { setActiveModels((prevModels) => [...prevModels, modelId]) })
serviceHub
.models()
.startModel(provider, modelId)
.then(() => {
// Refresh active models after starting
serviceHub
.models()
.getActiveModels()
.then((models) => setActiveModels(models || []))
})
.catch((error) => {
console.error('Error starting model:', error)
if (error && typeof error === 'object' && 'message' in error) {
setModelLoadError(error)
} else {
setModelLoadError(`${error}`)
}
})
.finally(() => {
// Remove model from loading state
setLoadingModels((prev) => prev.filter((id) => id !== modelId))
})
if (provider) {
try {
// Start the model with plan result
await serviceHub.models().startModel(provider, modelId)
// Refresh active models after starting
serviceHub
.models()
.getActiveModels()
.then((models) => setActiveModels(models || []))
} catch (error) {
console.error('Error starting model:', error)
if (
error &&
typeof error === 'object' &&
'message' in error &&
typeof error.message === 'string'
) {
setModelLoadError({ message: error.message })
} else {
setModelLoadError(typeof error === 'string' ? error : `${error}`)
}
} finally {
// Remove model from loading state
setLoadingModels((prev) => prev.filter((id) => id !== modelId))
}
}
}
const handleStopModel = (modelId: string) => {

View File

@ -17,6 +17,7 @@ import type {
HuggingFaceRepo,
CatalogModel,
ModelValidationResult,
ModelPlan,
} from './types'
// TODO: Replace this with the actual provider later
@ -491,4 +492,47 @@ export class DefaultModelsService implements ModelsService {
}
}
}
async planModelLoad(
modelPath: string,
requestedCtx?: number
): Promise<ModelPlan> {
try {
const engine = this.getEngine('llamacpp') as AIEngine & {
planModelLoad?: (
path: string,
requestedCtx?: number
) => Promise<ModelPlan>
}
if (engine && typeof engine.planModelLoad === 'function') {
// Get the full absolute path to the model file
const janDataFolderPath = await import('@janhq/core').then((core) =>
core.getJanDataFolderPath()
)
const joinPath = await import('@janhq/core').then(
(core) => core.joinPath
)
const fullModelPath = await joinPath([janDataFolderPath, modelPath])
return await engine.planModelLoad(fullModelPath, requestedCtx)
}
// Fallback if method is not available
console.warn('planModelLoad method not available in llamacpp engine')
return {
gpuLayers: 0,
maxContextLength: 2048,
noOffloadKVCache: true,
mode: 'Unsupported',
}
} catch (error) {
console.error(`Error planning model load for path ${modelPath}:`, error)
return {
gpuLayers: 0,
maxContextLength: 2048,
noOffloadKVCache: true,
mode: 'Unsupported',
}
}
}
}

View File

@ -81,10 +81,20 @@ export interface ModelValidationResult {
metadata?: GgufMetadata
}
export interface ModelPlan {
gpuLayers: number
maxContextLength: number
noOffloadKVCache: boolean
mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}
export interface ModelsService {
fetchModels(): Promise<modelInfo[]>
fetchModelCatalog(): Promise<ModelCatalog>
fetchHuggingFaceRepo(repoId: string, hfToken?: string): Promise<HuggingFaceRepo | null>
fetchHuggingFaceRepo(
repoId: string,
hfToken?: string
): Promise<HuggingFaceRepo | null>
convertHfRepoToCatalogModel(repo: HuggingFaceRepo): CatalogModel
updateModel(model: Partial<CoreModel>): Promise<void>
pullModel(
@ -107,14 +117,24 @@ export interface ModelsService {
getActiveModels(provider?: string): Promise<string[]>
stopModel(model: string, provider?: string): Promise<void>
stopAllModels(): Promise<void>
startModel(provider: ProviderObject, model: string): Promise<SessionInfo | undefined>
startModel(
provider: ProviderObject,
model: string
): Promise<SessionInfo | undefined>
isToolSupported(modelId: string): Promise<boolean>
checkMmprojExistsAndUpdateOffloadMMprojSetting(
modelId: string,
updateProvider?: (providerName: string, data: Partial<ModelProvider>) => void,
updateProvider?: (
providerName: string,
data: Partial<ModelProvider>
) => void,
getProviderByName?: (providerName: string) => ModelProvider | undefined
): Promise<{ exists: boolean; settingsUpdated: boolean }>
checkMmprojExists(modelId: string): Promise<boolean>
isModelSupported(modelPath: string, ctxSize?: number): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
isModelSupported(
modelPath: string,
ctxSize?: number
): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
validateGgufFile(filePath: string): Promise<ModelValidationResult>
}
planModelLoad(modelPath: string, requestedCtx?: number): Promise<ModelPlan>
}