feat: Smart model management (#6390)
* feat: Smart model management * **New UI option** – `memory_util` added to `settings.json` with a dropdown (high / medium / low) to let users control how aggressively the engine uses system memory. * **Configuration updates** – `LlamacppConfig` now includes `memory_util`; the extension class stores it in a new `memoryMode` property and handles updates through `updateConfig`. * **System memory handling** * Introduced `SystemMemory` interface and `getTotalSystemMemory()` to report combined VRAM + RAM. * Added helper methods `getKVCachePerToken`, `getLayerSize`, and a new `ModelPlan` type. * **Smart model‑load planner** – `planModelLoad()` computes: * Number of GPU layers that can fit in usable VRAM. * Maximum context length based on KV‑cache size and the selected memory utilization mode (high/medium/low). * Whether KV‑cache must be off‑loaded to CPU and the overall loading mode (GPU, Hybrid, CPU, Unsupported). * Detailed logging of the planning decision. * **Improved support check** – `isModelSupported()` now: * Uses the combined VRAM/RAM totals from `getTotalSystemMemory()`. * Applies an 80% usable‑memory heuristic. * Returns **GREEN** only when both weights and KV‑cache fit in VRAM, **YELLOW** when they fit only in total memory or require CPU off‑load, and **RED** when the model cannot fit at all. * **Cleanup** – Removed unused `GgufMetadata` import; updated imports and type definitions accordingly. * **Documentation/comments** – Added explanatory JSDoc comments for the new methods and clarified the return semantics of `isModelSupported`. * chore: migrate no_kv_offload from llamacpp setting to model setting * chore: add UI auto optimize model setting * feat: improve model loading planner with mmproj support and smarter memory budgeting * Extend `ModelPlan` with optional `noOffloadMmproj` flag to indicate when a multimodal projector can stay in VRAM. * Add `mmprojPath` parameter to `planModelLoad` and calculate its size, attempting to keep it on GPU when possible. 
* Refactor system memory detection: * Use `used_memory` (actual free RAM) instead of total RAM for budgeting. * Introduced `usableRAM` placeholder for future use. * Rewrite KV‑cache size calculation: * Properly handle GQA models via `attention.head_count_kv`. * Compute bytes per token as `nHeadKV * headDim * 2 * 2 * nLayer`. * Replace the old 70 % VRAM heuristic with a more flexible budget: * Reserve a fixed VRAM amount and apply an overhead factor. * Derive usable system RAM from total memory minus VRAM. * Implement a robust allocation algorithm: * Prioritize placing the mmproj in VRAM. * Search for the best balance of GPU layers and context length. * Fallback strategies for hybrid and pure‑CPU modes with detailed safety checks. * Add extensive validation of model size, KV‑cache size, layer size, and memory mode. * Improve logging throughout the planning process for easier debugging. * Adjust final plan return shape to include the new `noOffloadMmproj` field. * remove unused variable --------- Co-authored-by: Faisal Amir <urmauur@gmail.com>
This commit is contained in:
parent
3158722a63
commit
7a174e621a
@ -36,6 +36,21 @@
|
|||||||
"controllerType": "checkbox",
|
"controllerType": "checkbox",
|
||||||
"controllerProps": { "value": true }
|
"controllerProps": { "value": true }
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"key": "memory_util",
|
||||||
|
"title": "Smart Memory utilization",
|
||||||
|
"description": "Smart memory utilization mode for running local GGUF models",
|
||||||
|
"controllerType": "dropdown",
|
||||||
|
"controllerProps": {
|
||||||
|
"value": "high",
|
||||||
|
"options": [
|
||||||
|
{ "value": "high", "name": "High" },
|
||||||
|
{ "value": "medium", "name": "Medium" },
|
||||||
|
{ "value": "low", "name": "Low" }
|
||||||
|
],
|
||||||
|
"recommended": "high"
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"key": "threads",
|
"key": "threads",
|
||||||
"title": "Threads",
|
"title": "Threads",
|
||||||
@ -178,15 +193,6 @@
|
|||||||
"value": false
|
"value": false
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"key": "no_kv_offload",
|
|
||||||
"title": "Disable KV Offload",
|
|
||||||
"description": "Disable KV cache offload to GPU (if GPU is used).",
|
|
||||||
"controllerType": "checkbox",
|
|
||||||
"controllerProps": {
|
|
||||||
"value": false
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"key": "cache_type_k",
|
"key": "cache_type_k",
|
||||||
"title": "KV Cache K Type",
|
"title": "KV Cache K Type",
|
||||||
|
|||||||
@ -35,10 +35,7 @@ import {
|
|||||||
import { invoke } from '@tauri-apps/api/core'
|
import { invoke } from '@tauri-apps/api/core'
|
||||||
import { getProxyConfig } from './util'
|
import { getProxyConfig } from './util'
|
||||||
import { basename } from '@tauri-apps/api/path'
|
import { basename } from '@tauri-apps/api/path'
|
||||||
import {
|
import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api'
|
||||||
GgufMetadata,
|
|
||||||
readGgufMetadata,
|
|
||||||
} from '@janhq/tauri-plugin-llamacpp-api'
|
|
||||||
import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api'
|
import { getSystemUsage } from '@janhq/tauri-plugin-hardware-api'
|
||||||
|
|
||||||
type LlamacppConfig = {
|
type LlamacppConfig = {
|
||||||
@ -46,6 +43,7 @@ type LlamacppConfig = {
|
|||||||
auto_update_engine: boolean
|
auto_update_engine: boolean
|
||||||
auto_unload: boolean
|
auto_unload: boolean
|
||||||
llamacpp_env: string
|
llamacpp_env: string
|
||||||
|
memory_util: string
|
||||||
chat_template: string
|
chat_template: string
|
||||||
n_gpu_layers: number
|
n_gpu_layers: number
|
||||||
offload_mmproj: boolean
|
offload_mmproj: boolean
|
||||||
@ -74,6 +72,14 @@ type LlamacppConfig = {
|
|||||||
ctx_shift: boolean
|
ctx_shift: boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Outcome of the model-load planner (`planModelLoad`).
 */
type ModelPlan = {
  /** Where the model ends up running; 'Unsupported' means no workable plan was found. */
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
  /** Number of model layers the planner assigns to the GPU (0 for pure CPU). */
  gpuLayers: number
  /** Context length (in tokens) the planner settled on. */
  maxContextLength: number
  /** True when the planner decided the KV cache should live in system RAM instead of VRAM. */
  noOffloadKVCache: boolean
  /**
   * Set by the planner to true when a multimodal projector was supplied and it
   * fits in the VRAM budget; false or absent otherwise.
   */
  noOffloadMmproj?: boolean
}
|
||||||
|
|
||||||
interface DownloadItem {
|
interface DownloadItem {
|
||||||
url: string
|
url: string
|
||||||
save_path: string
|
save_path: string
|
||||||
@ -116,6 +122,12 @@ interface DeviceList {
|
|||||||
free: number
|
free: number
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Memory figures (all in bytes) consumed by the model-load planner and the
 * model support check.
 */
interface SystemMemory {
  /** VRAM summed over every detected GPU device; 0 when no device is reported. */
  totalVRAM: number
  /**
   * System RAM figure used for budgeting.
   * NOTE(review): the producer in this file fills it from `used_memory`, not
   * `total_memory` - confirm which quantity consumers should expect.
   */
  totalRAM: number
  /** Sum of `totalVRAM` and `totalRAM`. */
  totalMemory: number
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Override the default app.log function to use Jan's logging system.
|
* Override the default app.log function to use Jan's logging system.
|
||||||
* @param args
|
* @param args
|
||||||
@ -159,6 +171,7 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
provider: string = 'llamacpp'
|
provider: string = 'llamacpp'
|
||||||
autoUnload: boolean = true
|
autoUnload: boolean = true
|
||||||
llamacpp_env: string = ''
|
llamacpp_env: string = ''
|
||||||
|
memoryMode: string = 'high'
|
||||||
readonly providerId: string = 'llamacpp'
|
readonly providerId: string = 'llamacpp'
|
||||||
|
|
||||||
private config: LlamacppConfig
|
private config: LlamacppConfig
|
||||||
@ -190,6 +203,7 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
|
|
||||||
this.autoUnload = this.config.auto_unload
|
this.autoUnload = this.config.auto_unload
|
||||||
this.llamacpp_env = this.config.llamacpp_env
|
this.llamacpp_env = this.config.llamacpp_env
|
||||||
|
this.memoryMode = this.config.memory_util
|
||||||
|
|
||||||
// This sets the base directory where model files for this provider are stored.
|
// This sets the base directory where model files for this provider are stored.
|
||||||
this.providerPath = await joinPath([
|
this.providerPath = await joinPath([
|
||||||
@ -836,6 +850,8 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
this.autoUnload = value as boolean
|
this.autoUnload = value as boolean
|
||||||
} else if (key === 'llamacpp_env') {
|
} else if (key === 'llamacpp_env') {
|
||||||
this.llamacpp_env = value as string
|
this.llamacpp_env = value as string
|
||||||
|
} else if (key === 'memory_util') {
|
||||||
|
this.memoryMode = value as string
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1864,10 +1880,368 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
'tokenizer.chat_template'
|
'tokenizer.chat_template'
|
||||||
]?.includes('tools')
|
]?.includes('tools')
|
||||||
}
|
}
|
||||||
|
/**
 * Collect the memory figures used for model-load planning.
 *
 * VRAM is summed over every device returned by `getDevices()`; the RAM figure
 * comes from `getSystemUsage()`. Both are converted to bytes.
 *
 * @returns VRAM, RAM and their sum, all in bytes.
 */
private async getTotalSystemMemory(): Promise<SystemMemory> {
  // Device and system figures are multiplied by 1024 * 1024 to obtain bytes
  // (presumably they are reported in MiB - TODO confirm against the plugins).
  const BYTES_PER_UNIT = 1024 * 1024

  // Sum VRAM across all GPUs; an empty device list naturally yields 0.
  const devices = await this.getDevices()
  const totalVRAM = devices
    .map((d) => d.mem * BYTES_PER_UNIT)
    .reduce((a, b) => a + b, 0)

  // NOTE(review): this reads `used_memory`, yet the value is named and logged
  // as "Total RAM". Kept as-is to preserve the current budgeting behavior;
  // confirm whether `total_memory` (or total - used) was intended.
  const sys = await getSystemUsage()
  const totalRAM = sys.used_memory * BYTES_PER_UNIT

  const totalMemory = totalVRAM + totalRAM

  // The previous message also interpolated an undeclared `usableRAM`
  // variable, which made this call throw before anything was returned.
  logger.info(
    `Total VRAM: ${totalVRAM} bytes, Total RAM: ${totalRAM} bytes, Total Memory: ${totalMemory} bytes`
  )

  return {
    totalVRAM,
    totalRAM,
    totalMemory,
  }
}
|
||||||
|
/**
 * Estimate how many bytes of KV cache a single token occupies across all layers.
 *
 * Uses `attention.head_count_kv` when present (GQA models) and falls back to
 * `attention.head_count`. Per-head sizes come from the optional
 * `attention.key_length` / `attention.value_length` keys when the model
 * provides them, otherwise from `embedding_length / head_count`.
 * Cache entries are assumed to be fp16 (2 bytes per element).
 *
 * @param meta - GGUF metadata key/value map.
 * @returns Bytes per token; NaN or 0 when required metadata is missing, which
 *          callers are expected to validate.
 */
private async getKVCachePerToken(
  meta: Record<string, string>
): Promise<number> {
  const BYTES_PER_ELEMENT = 2 // fp16

  const arch = meta['general.architecture']
  const nLayer = Number(meta[`${arch}.block_count`])
  const nHead = Number(meta[`${arch}.attention.head_count`])

  // KV cache uses head_count_kv (for GQA models) or head_count
  const nHeadKV = Number(meta[`${arch}.attention.head_count_kv`]) || nHead

  // Default head dimension when the model does not state K/V sizes explicitly
  const embeddingLen = Number(meta[`${arch}.embedding_length`])
  const defaultHeadDim = embeddingLen / nHead

  // Some architectures use K/V head sizes that differ from
  // embedding_length / head_count and declare them explicitly.
  const keyLen =
    Number(meta[`${arch}.attention.key_length`]) || defaultHeadDim
  const valueLen =
    Number(meta[`${arch}.attention.value_length`]) || defaultHeadDim

  // One K vector and one V vector per KV head, for every layer
  const bytesPerToken =
    nHeadKV * (keyLen + valueLen) * BYTES_PER_ELEMENT * nLayer

  return bytesPerToken
}
|
||||||
|
|
||||||
|
/**
 * Approximate the size of one model layer by dividing the model file size
 * evenly across the block count declared in the GGUF metadata.
 *
 * @param path - Path of the model file whose size is measured.
 * @param meta - GGUF metadata key/value map.
 * @returns The average bytes per layer and the number of layers.
 * @throws Error when the metadata has no usable `block_count`.
 */
private async getLayerSize(
  path: string,
  meta: Record<string, string>
): Promise<{ layerSize: number; totalLayers: number }> {
  const fileBytes = await this.getModelSize(path)

  const blockCountKey = `${meta['general.architecture']}.block_count`
  const totalLayers = Number(meta[blockCountKey])
  if (!totalLayers) {
    throw new Error('Invalid metadata: block_count not found')
  }

  const layerSize = fileBytes / totalLayers
  return { layerSize, totalLayers }
}
|
||||||
|
|
||||||
|
/**
 * Decide how a GGUF model should be split between GPU and CPU memory.
 *
 * Outline:
 *  1. Read the model size, GGUF metadata, average layer size and per-token
 *     KV-cache cost.
 *  2. Derive a VRAM budget (total VRAM minus a fixed reserve, scaled by an
 *     overhead factor) and a system-RAM budget (scaled by `memoryMode`).
 *  3. Reserve VRAM for the mmproj first when one is given and it fits.
 *  4. Pick the largest standard context size (up to the target) that still
 *     leaves room for at least one GPU layer next to its KV cache.
 *  5. Otherwise keep some layers on the GPU with the KV cache in system RAM,
 *     and as a last resort plan a pure-CPU load.
 *
 * A machine without any GPU (0 bytes of VRAM) is planned as pure CPU.
 *
 * @param path - Path of the GGUF model file.
 * @param requestedCtx - Optional upper bound for the context length; values
 *        that are missing or not positive fall back to the model's maximum.
 * @param mmprojPath - Optional path of a multimodal projector file.
 * @returns The planned GPU layer count, context length, KV-cache/mmproj
 *          placement flags and overall mode.
 * @throws Error when model size, layer size, KV-cache cost, memory figures
 *         or the configured memory mode are invalid.
 */
async planModelLoad(
  path: string,
  requestedCtx?: number,
  mmprojPath?: string
): Promise<ModelPlan> {
  const modelSize = await this.getModelSize(path)
  const memoryInfo = await this.getTotalSystemMemory()
  const gguf = await readGgufMetadata(path)

  // Get mmproj size if provided
  let mmprojSize = 0
  if (mmprojPath) {
    mmprojSize = await this.getModelSize(mmprojPath)
  }

  const { layerSize, totalLayers } = await this.getLayerSize(
    path,
    gguf.metadata
  )

  const kvCachePerToken = await this.getKVCachePerToken(gguf.metadata)

  // Debug logging
  logger.info(
    `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}`
  )

  // Validate critical values
  if (!modelSize || modelSize <= 0) {
    throw new Error(`Invalid model size: ${modelSize}`)
  }
  if (!kvCachePerToken || kvCachePerToken <= 0) {
    throw new Error(`Invalid KV cache per token: ${kvCachePerToken}`)
  }
  if (!layerSize || layerSize <= 0) {
    throw new Error(`Invalid layer size: ${layerSize}`)
  }

  // GPU overhead factor (20% reserved for GPU operations, alignment, etc.)
  const GPU_OVERHEAD_FACTOR = 0.8

  // VRAM budget with overhead consideration
  const VRAM_RESERVE_GB = 0.5
  const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024
  const usableVRAM = Math.max(
    0,
    (memoryInfo.totalVRAM - VRAM_RESERVE_BYTES) * GPU_OVERHEAD_FACTOR
  )

  // Get model's maximum context length
  const arch = gguf.metadata['general.architecture']
  const modelMaxContextLength =
    Number(gguf.metadata[`${arch}.context_length`]) || 131072 // Default fallback

  // Set minimum context length
  const MIN_CONTEXT_LENGTH = 2048

  // Fraction of system RAM the planner may use for each memory mode
  const memoryPercentages: Record<string, number> = {
    high: 0.7,
    medium: 0.5,
    low: 0.4,
  }

  logger.info(
    `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}`
  )

  // Validate memory info
  if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) {
    throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`)
  }
  // 0 is a legitimate value here (no GPU detected) and leads to a CPU plan;
  // only reject figures that are not usable numbers.
  if (!Number.isFinite(memoryInfo.totalVRAM) || memoryInfo.totalVRAM < 0) {
    throw new Error(`Invalid total VRAM: ${memoryInfo.totalVRAM}`)
  }
  // Own-property lookup so inherited keys such as "toString" are rejected
  const memoryFraction = Object.prototype.hasOwnProperty.call(
    memoryPercentages,
    this.memoryMode
  )
    ? memoryPercentages[this.memoryMode]
    : undefined
  if (memoryFraction === undefined) {
    throw new Error(
      `Invalid memory mode: ${this.memoryMode}. Must be 'high', 'medium', or 'low'`
    )
  }

  // Calculate actual system RAM
  const actualSystemRAM = Math.max(
    0,
    memoryInfo.totalMemory - memoryInfo.totalVRAM
  )
  const usableSystemMemory = actualSystemRAM * memoryFraction

  logger.info(
    `Actual System RAM: ${actualSystemRAM}, Usable VRAM: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}`
  )

  // --- Priority 1: Allocate mmproj (if exists) ---
  let noOffloadMmproj = false
  let remainingVRAM = usableVRAM

  if (mmprojSize > 0) {
    if (mmprojSize <= remainingVRAM) {
      noOffloadMmproj = true
      remainingVRAM -= mmprojSize
      logger.info(`MMProj allocated to VRAM: ${mmprojSize} bytes`)
    } else {
      logger.info(`MMProj will use CPU RAM: ${mmprojSize} bytes`)
    }
  }

  // --- Priority 2: Calculate optimal layer/context balance ---
  let gpuLayers = 0
  let maxContextLength = MIN_CONTEXT_LENGTH
  let noOffloadKVCache = false
  let mode: ModelPlan['mode'] = 'Unsupported'

  // Candidate context sizes, tried from smallest to largest
  const contextSizes = [2048, 4096, 8192, 16384, 32768, 65536, 131072]
  // Ignore a missing or non-positive request and aim for the model maximum
  const targetContext =
    requestedCtx && requestedCtx > 0 ? requestedCtx : modelMaxContextLength

  // Later (larger) contexts overwrite earlier ones, so the loop ends with the
  // largest context that still leaves room for at least one GPU layer.
  let bestConfig = {
    layers: 0,
    context: MIN_CONTEXT_LENGTH,
    vramUsed: 0,
  }

  for (const ctxSize of contextSizes) {
    if (ctxSize > targetContext) break

    const kvCacheSize = ctxSize * kvCachePerToken
    const availableForLayers = remainingVRAM - kvCacheSize

    if (availableForLayers <= 0) continue

    const possibleLayers = Math.min(
      Math.floor(availableForLayers / layerSize),
      totalLayers
    )

    if (possibleLayers > 0) {
      const totalVramNeeded = possibleLayers * layerSize + kvCacheSize

      // Verify this fits with some margin
      if (totalVramNeeded <= remainingVRAM * 0.95) {
        bestConfig = {
          layers: possibleLayers,
          context: ctxSize,
          vramUsed: totalVramNeeded,
        }
      }
    }
  }

  // Apply the best configuration found
  if (bestConfig.layers > 0) {
    gpuLayers = bestConfig.layers
    maxContextLength = bestConfig.context
    noOffloadKVCache = false
    mode = gpuLayers === totalLayers ? 'GPU' : 'Hybrid'

    logger.info(
      `Best GPU config: ${gpuLayers}/${totalLayers} layers, ${maxContextLength} context, ` +
        `VRAM used: ${bestConfig.vramUsed}/${remainingVRAM} bytes`
    )
  } else {
    // Fallback: Try minimal GPU layers with KV cache on CPU
    gpuLayers = Math.min(
      Math.floor((remainingVRAM * 0.9) / layerSize), // Use 90% for layers
      totalLayers
    )

    if (gpuLayers > 0) {
      // Calculate available system RAM for KV cache
      const cpuLayers = totalLayers - gpuLayers
      const modelCPUSize = cpuLayers * layerSize
      const mmprojCPUSize =
        mmprojSize > 0 && !noOffloadMmproj ? mmprojSize : 0
      const systemRAMUsed = modelCPUSize + mmprojCPUSize
      const availableSystemRAMForKVCache = Math.max(
        0,
        usableSystemMemory - systemRAMUsed
      )

      // Calculate context that fits in system RAM
      const systemRAMContext = Math.min(
        Math.floor(availableSystemRAMForKVCache / kvCachePerToken),
        targetContext
      )

      if (systemRAMContext >= MIN_CONTEXT_LENGTH) {
        maxContextLength = systemRAMContext
        noOffloadKVCache = true
        mode = 'Hybrid'

        logger.info(
          `Hybrid mode: ${gpuLayers}/${totalLayers} layers on GPU, ` +
            `${maxContextLength} context on CPU RAM`
        )
      } else {
        // Can't fit reasonable context even with CPU RAM
        // Reduce GPU layers further
        gpuLayers = Math.floor(gpuLayers / 2)
        maxContextLength = MIN_CONTEXT_LENGTH
        noOffloadKVCache = true
        mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
      }
    } else {
      // Pure CPU mode
      gpuLayers = 0
      noOffloadKVCache = true

      // Everything (weights, mmproj, KV cache) has to fit in the RAM budget
      const totalCPUMemoryNeeded = modelSize + mmprojSize
      const availableForKVCache = Math.max(
        0,
        usableSystemMemory - totalCPUMemoryNeeded
      )
      const cpuContext = Math.floor(availableForKVCache / kvCachePerToken)

      if (cpuContext >= MIN_CONTEXT_LENGTH) {
        maxContextLength = Math.min(cpuContext, targetContext)
        mode = 'CPU'
      } else {
        // Not even the minimum context fits next to the weights. Clamping
        // the context up to the minimum before this check used to hide it.
        maxContextLength = MIN_CONTEXT_LENGTH
        mode = 'Unsupported'
      }
    }
  }

  // Safety check: Verify total GPU memory usage
  if (gpuLayers > 0 && !noOffloadKVCache) {
    const safeVRAMLimit = memoryInfo.totalVRAM * 0.9
    const mmprojOnGPU = noOffloadMmproj ? mmprojSize : 0
    // Recomputed from the current values each time it is called
    const estimateGPUUsage = (): number =>
      gpuLayers * layerSize + maxContextLength * kvCachePerToken + mmprojOnGPU

    let estimatedGPUUsage = estimateGPUUsage()

    if (estimatedGPUUsage > safeVRAMLimit) {
      logger.warn(
        `GPU memory usage (${estimatedGPUUsage}) exceeds safe limit. Adjusting...`
      )

      // Reduce context first, re-measuring after every halving
      while (
        maxContextLength > MIN_CONTEXT_LENGTH &&
        estimatedGPUUsage > safeVRAMLimit
      ) {
        maxContextLength = Math.max(
          MIN_CONTEXT_LENGTH,
          Math.floor(maxContextLength / 2)
        )
        estimatedGPUUsage = estimateGPUUsage()
      }

      // If still too much, reduce layers
      if (estimatedGPUUsage > safeVRAMLimit) {
        gpuLayers = Math.floor(gpuLayers * 0.7)
        mode = gpuLayers > 0 ? 'Hybrid' : 'CPU'
        noOffloadKVCache = true // Move KV cache to CPU
      }
    }
  }

  // Apply user-requested context limit if specified
  if (requestedCtx && requestedCtx > 0) {
    maxContextLength = Math.min(maxContextLength, requestedCtx)
    logger.info(
      `User requested context: ${requestedCtx}, final: ${maxContextLength}`
    )
  }

  // Ensure we never exceed model's maximum context
  maxContextLength = Math.min(maxContextLength, modelMaxContextLength)

  // Final validation
  if (gpuLayers <= 0 && maxContextLength < MIN_CONTEXT_LENGTH) {
    mode = 'Unsupported'
  }

  // Ensure maxContextLength is valid
  maxContextLength = isNaN(maxContextLength)
    ? MIN_CONTEXT_LENGTH
    : Math.max(MIN_CONTEXT_LENGTH, maxContextLength)

  // Log final plan
  const mmprojInfo = mmprojPath
    ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, noOffloadMmproj=${noOffloadMmproj}`
    : ''

  logger.info(
    `Final plan for ${path}: gpuLayers=${gpuLayers}/${totalLayers}, ` +
      `maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, ` +
      `mode=${mode}${mmprojInfo}`
  )

  return {
    gpuLayers,
    maxContextLength,
    noOffloadKVCache,
    mode,
    noOffloadMmproj,
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* estimate KVCache size of from a given metadata
|
* estimate KVCache size from a given metadata
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
private async estimateKVCache(
|
private async estimateKVCache(
|
||||||
meta: Record<string, string>,
|
meta: Record<string, string>,
|
||||||
@ -1907,6 +2281,7 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
`Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}`
|
`Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}`
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
let ctxLen: number
|
let ctxLen: number
|
||||||
if (!ctx_size) {
|
if (!ctx_size) {
|
||||||
ctxLen = Number(meta[`${arch}.context_length`])
|
ctxLen = Number(meta[`${arch}.context_length`])
|
||||||
@ -1941,13 +2316,13 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* check the support status of a model by its path (local/remote)
|
* Check the support status of a model by its path (local/remote)
|
||||||
*
|
*
|
||||||
* * Returns:
|
* Returns:
|
||||||
* - "RED" → weights don't fit
|
* - "RED" → weights don't fit in total memory
|
||||||
* - "YELLOW" → weights fit, KV cache doesn't
|
* - "YELLOW" → weights fit in VRAM but need system RAM, or KV cache doesn't fit
|
||||||
* - "GREEN" → both weights + KV cache fit
|
* - "GREEN" → both weights + KV cache fit in VRAM
|
||||||
*/
|
*/
|
||||||
async isModelSupported(
|
async isModelSupported(
|
||||||
path: string,
|
path: string,
|
||||||
@ -1955,46 +2330,48 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
): Promise<'RED' | 'YELLOW' | 'GREEN'> {
|
): Promise<'RED' | 'YELLOW' | 'GREEN'> {
|
||||||
try {
|
try {
|
||||||
const modelSize = await this.getModelSize(path)
|
const modelSize = await this.getModelSize(path)
|
||||||
|
const memoryInfo = await this.getTotalSystemMemory()
|
||||||
|
|
||||||
logger.info(`modelSize: ${modelSize}`)
|
logger.info(`modelSize: ${modelSize}`)
|
||||||
let gguf: GgufMetadata
|
|
||||||
gguf = await readGgufMetadata(path)
|
const gguf = await readGgufMetadata(path)
|
||||||
let kvCacheSize: number
|
let kvCacheSize: number
|
||||||
if (ctx_size) {
|
if (ctx_size) {
|
||||||
kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
|
kvCacheSize = await this.estimateKVCache(gguf.metadata, ctx_size)
|
||||||
} else {
|
} else {
|
||||||
kvCacheSize = await this.estimateKVCache(gguf.metadata)
|
kvCacheSize = await this.estimateKVCache(gguf.metadata)
|
||||||
}
|
}
|
||||||
// total memory consumption = model weights + kvcache + a small buffer for outputs
|
|
||||||
// output buffer is small so not considering here
|
// Total memory consumption = model weights + kvcache
|
||||||
const totalRequired = modelSize + kvCacheSize
|
const totalRequired = modelSize + kvCacheSize
|
||||||
logger.info(
|
logger.info(
|
||||||
`isModelSupported: Total memory requirement: ${totalRequired} for ${path}`
|
`isModelSupported: Total memory requirement: ${totalRequired} for ${path}`
|
||||||
)
|
)
|
||||||
let totalMemBytes: number
|
|
||||||
const devices = await this.getDevices()
|
|
||||||
if (devices.length > 0) {
|
|
||||||
// Sum total memory across all GPUs
|
|
||||||
totalMemBytes = devices
|
|
||||||
.map((d) => d.mem * 1024 * 1024)
|
|
||||||
.reduce((a, b) => a + b, 0)
|
|
||||||
} else {
|
|
||||||
// CPU fallback
|
|
||||||
const sys = await getSystemUsage()
|
|
||||||
totalMemBytes = sys.total_memory * 1024 * 1024
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use 80% of total memory as the usable limit
|
// Use 80% of total memory as the usable limit
|
||||||
const USABLE_MEMORY_PERCENTAGE = 0.8
|
const USABLE_MEMORY_PERCENTAGE = 0.8
|
||||||
const usableMemBytes = totalMemBytes * USABLE_MEMORY_PERCENTAGE
|
const usableTotalMemory =
|
||||||
|
memoryInfo.totalMemory * USABLE_MEMORY_PERCENTAGE
|
||||||
|
const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE
|
||||||
|
|
||||||
// check model size wrt 80% of system memory
|
// Check if model fits in total memory at all
|
||||||
if (modelSize > usableMemBytes) {
|
if (modelSize > usableTotalMemory) {
|
||||||
return 'RED'
|
return 'RED'
|
||||||
} else if (modelSize + kvCacheSize > usableMemBytes) {
|
}
|
||||||
return 'YELLOW'
|
|
||||||
} else {
|
// Check if everything fits in VRAM (ideal case)
|
||||||
|
if (totalRequired <= usableVRAM) {
|
||||||
return 'GREEN'
|
return 'GREEN'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if model fits in VRAM but total requirement exceeds VRAM
|
||||||
|
// OR if total requirement fits in total memory but not in VRAM
|
||||||
|
if (modelSize <= usableVRAM || totalRequired <= usableTotalMemory) {
|
||||||
|
return 'YELLOW'
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we get here, nothing fits properly
|
||||||
|
return 'RED'
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
throw new Error(String(e))
|
throw new Error(String(e))
|
||||||
}
|
}
|
||||||
@ -2006,39 +2383,42 @@ export default class llamacpp_extension extends AIEngine {
|
|||||||
async validateGgufFile(filePath: string): Promise<{
  isValid: boolean
  error?: string
  metadata?: Awaited<ReturnType<typeof readGgufMetadata>>
}> {
  // Reads the GGUF metadata of `filePath` and rejects files whose
  // architecture is 'clip'. Never throws: read failures are reported through
  // `isValid: false` plus an `error` message. `metadata` is returned whenever
  // it could be read, including for rejected CLIP files.
  try {
    logger.info(`Validating GGUF file: ${filePath}`)
    const metadata = await readGgufMetadata(filePath)

    // Log full metadata for debugging
    logger.info('Full GGUF metadata:', JSON.stringify(metadata, null, 2))

    // Check if architecture is 'clip' which is not supported for text generation
    const architecture = metadata.metadata?.['general.architecture']
    logger.info(`Model architecture: ${architecture}`)

    if (architecture === 'clip') {
      const errorMessage =
        'This model has CLIP architecture and cannot be imported as a text generation model. CLIP models are designed for vision tasks and require different handling.'
      logger.error('CLIP architecture detected:', architecture)
      return {
        isValid: false,
        error: errorMessage,
        metadata,
      }
    }

    logger.info('Model validation passed. Architecture:', architecture)
    return {
      isValid: true,
      metadata,
    }
  } catch (error) {
    logger.error('Failed to validate GGUF file:', error)
    // Rejections are not guaranteed to be Error instances (a plain string is
    // possible), so stringify them instead of reporting "Unknown error".
    const reason = error instanceof Error ? error.message : String(error)
    return {
      isValid: false,
      error: `Failed to read model metadata: ${reason}`,
    }
  }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import { IconSettings } from '@tabler/icons-react'
|
import { IconSettings, IconLoader } from '@tabler/icons-react'
|
||||||
import debounce from 'lodash.debounce'
|
import debounce from 'lodash.debounce'
|
||||||
|
import { useState } from 'react'
|
||||||
|
|
||||||
import {
|
import {
|
||||||
Sheet,
|
Sheet,
|
||||||
@ -9,6 +10,7 @@ import {
|
|||||||
SheetTitle,
|
SheetTitle,
|
||||||
SheetTrigger,
|
SheetTrigger,
|
||||||
} from '@/components/ui/sheet'
|
} from '@/components/ui/sheet'
|
||||||
|
import { Button } from '@/components/ui/button'
|
||||||
import { DynamicControllerSetting } from '@/containers/dynamicControllerSetting'
|
import { DynamicControllerSetting } from '@/containers/dynamicControllerSetting'
|
||||||
import { useModelProvider } from '@/hooks/useModelProvider'
|
import { useModelProvider } from '@/hooks/useModelProvider'
|
||||||
import { useServiceHub } from '@/hooks/useServiceHub'
|
import { useServiceHub } from '@/hooks/useServiceHub'
|
||||||
@ -30,11 +32,134 @@ export function ModelSetting({
|
|||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
const serviceHub = useServiceHub()
|
const serviceHub = useServiceHub()
|
||||||
|
|
||||||
|
const [isPlanning, setIsPlanning] = useState(false)
|
||||||
|
|
||||||
// Create a debounced version of stopModel that waits 500ms after the last call
|
// Create a debounced version of stopModel that waits 500ms after the last call
|
||||||
const debouncedStopModel = debounce((modelId: string) => {
|
const debouncedStopModel = debounce((modelId: string) => {
|
||||||
serviceHub.models().stopModel(modelId)
|
serviceHub.models().stopModel(modelId)
|
||||||
}, 500)
|
}, 500)
|
||||||
|
|
||||||
|
/**
 * Asks the llamacpp planner for recommended load settings for the current
 * model and applies them to the model's settings in a single batch.
 *
 * Only runs for the llamacpp provider; other providers get a console warning.
 * A missing `model_path` in the model's `model.yml` is also only warned about.
 * Errors are logged rather than rethrown, and `isPlanning` is switched back
 * off once a planning attempt finishes, whatever its outcome.
 */
const handlePlanModelLoad = async () => {
  if (provider.provider !== 'llamacpp') {
    console.warn('planModelLoad is only available for llamacpp provider')
    return
  }

  setIsPlanning(true)
  try {
    // model.yml carries the model path that the planner is given
    const modelConfig = await serviceHub.app().readYaml<{
      model_path: string
    }>(`llamacpp/models/${model.id}/model.yml`)

    if (!modelConfig || !modelConfig.model_path) {
      console.warn('No model_path found in config for', model.id)
      return
    }

    const plan = await serviceHub.models().planModelLoad(modelConfig.model_path)

    // Pair each setting key with the planner's recommendation for it
    const recommendations: Array<[string, number | boolean | undefined]> = [
      ['ngl', plan.gpuLayers],
      ['ctx_len', plan.maxContextLength],
      ['no_kv_offload', plan.noOffloadKVCache],
    ]

    // Only touch settings the model already exposes and the plan provides
    const settingsToUpdate: Array<{ key: string; value: number | boolean }> = []
    for (const [key, value] of recommendations) {
      if (model.settings?.[key] && value !== undefined) {
        settingsToUpdate.push({ key, value })
      }
    }

    // One combined update avoids racing per-setting writes against each other
    if (settingsToUpdate.length > 0) {
      handleMultipleSettingsChange(settingsToUpdate)
    }
  } catch (error) {
    console.error('Error calling planModelLoad:', error)
  } finally {
    setIsPlanning(false)
  }
}
|
||||||
|
|
||||||
|
/**
 * Writes several setting values onto the current model with one provider
 * update, then stops the model if any changed setting only takes effect on
 * reload and the model is currently active.
 *
 * @param settingsToUpdate - `{ key, value }` pairs; each value is stored under
 *   `settings[key].controller_props.value`, keeping the setting's other fields.
 */
const handleMultipleSettingsChange = (
  settingsToUpdate: Array<{ key: string; value: number | boolean }>
) => {
  if (!provider) return

  // Settings whose change triggers a (debounced) stop of a running model.
  // NOTE(review): `no_kv_offload` is not in this list — confirm whether
  // changing it should also stop a running model.
  const restartKeys = ['ctx_len', 'ngl', 'chat_template', 'offload_mmproj']

  // Build one copy of the model that carries ALL updated settings
  let updatedModel = { ...model }
  for (const { key, value } of settingsToUpdate) {
    const existingSetting = updatedModel.settings?.[key] as ProviderSetting
    updatedModel = {
      ...updatedModel,
      settings: {
        ...updatedModel.settings,
        [key]: {
          ...existingSetting,
          controller_props: {
            ...existingSetting?.controller_props,
            value,
          },
        } as ProviderSetting,
      },
    }
  }

  // Locate the model inside the provider's list; nothing to do if it is gone
  const modelIndex = provider.models.findIndex((m) => m.id === model.id)
  if (modelIndex === -1) return

  // Replace the entry on a copy of the list and publish it in one update
  const updatedModels = [...provider.models]
  updatedModels[modelIndex] = updatedModel as Model
  updateProvider(provider.provider, {
    models: updatedModels,
  })

  const requiresRestart = settingsToUpdate.some(({ key }) =>
    restartKeys.includes(key)
  )
  if (!requiresRestart) return

  // Only stop the model when it is actually running. The lookup is
  // fire-and-forget, so its rejection is handled here instead of surfacing
  // as an unhandled promise rejection.
  serviceHub
    .models()
    .getActiveModels()
    .then((activeModels) => {
      if (activeModels.includes(model.id)) {
        debouncedStopModel(model.id)
      }
    })
    .catch((error) => {
      console.error('Error checking active models:', error)
    })
}
|
||||||
|
|
||||||
const handleSettingChange = (
|
const handleSettingChange = (
|
||||||
key: string,
|
key: string,
|
||||||
value: string | boolean | number
|
value: string | boolean | number
|
||||||
@ -72,8 +197,22 @@ export function ModelSetting({
|
|||||||
})
|
})
|
||||||
|
|
||||||
// Call debounced stopModel only when updating ctx_len, ngl, chat_template, or offload_mmproj
|
// Call debounced stopModel only when updating ctx_len, ngl, chat_template, or offload_mmproj
|
||||||
if (key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || key === 'offload_mmproj') {
|
// and only if the model is currently running
|
||||||
debouncedStopModel(model.id)
|
if (
|
||||||
|
key === 'ctx_len' ||
|
||||||
|
key === 'ngl' ||
|
||||||
|
key === 'chat_template' ||
|
||||||
|
key === 'offload_mmproj'
|
||||||
|
) {
|
||||||
|
// Check if model is running before stopping it
|
||||||
|
serviceHub
|
||||||
|
.models()
|
||||||
|
.getActiveModels()
|
||||||
|
.then((activeModels) => {
|
||||||
|
if (activeModels.includes(model.id)) {
|
||||||
|
debouncedStopModel(model.id)
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -98,7 +237,36 @@ export function ModelSetting({
|
|||||||
<SheetDescription>
|
<SheetDescription>
|
||||||
{t('common:modelSettings.description')}
|
{t('common:modelSettings.description')}
|
||||||
</SheetDescription>
|
</SheetDescription>
|
||||||
|
|
||||||
|
{/* Model Load Planning Section - Only show for llamacpp provider */}
|
||||||
|
{provider.provider === 'llamacpp' && (
|
||||||
|
<div className="pb-4 border-b border-main-view-fg/10 my-4">
|
||||||
|
<div>
|
||||||
|
<h3 className="font-medium mb-1">Optimize Settings</h3>
|
||||||
|
<p className="text-main-view-fg/70 text-xs mb-3">
|
||||||
|
Analyze your system and model, then apply optimal loading
|
||||||
|
settings automatically
|
||||||
|
</p>
|
||||||
|
<Button
|
||||||
|
onClick={handlePlanModelLoad}
|
||||||
|
disabled={isPlanning}
|
||||||
|
variant="default"
|
||||||
|
className="w-full"
|
||||||
|
>
|
||||||
|
{isPlanning ? (
|
||||||
|
<>
|
||||||
|
<IconLoader size={16} className="mr-2 animate-spin" />
|
||||||
|
Optimizing...
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<>Auto-Optimize Settings</>
|
||||||
|
)}
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</SheetHeader>
|
</SheetHeader>
|
||||||
|
|
||||||
<div className="px-4 space-y-6">
|
<div className="px-4 space-y-6">
|
||||||
{Object.entries(model.settings || {}).map(([key, value]) => {
|
{Object.entries(model.settings || {}).map(([key, value]) => {
|
||||||
const config = value as ProviderSetting
|
const config = value as ProviderSetting
|
||||||
|
|||||||
@ -93,7 +93,11 @@ export const useModelProvider = create<ModelProviderState>()(
|
|||||||
? legacyModels
|
? legacyModels
|
||||||
: models
|
: models
|
||||||
).find(
|
).find(
|
||||||
(m) => m.id.split(':').slice(0, 2).join(getServiceHub().path().sep()) === model.id
|
(m) =>
|
||||||
|
m.id
|
||||||
|
.split(':')
|
||||||
|
.slice(0, 2)
|
||||||
|
.join(getServiceHub().path().sep()) === model.id
|
||||||
)?.settings || model.settings
|
)?.settings || model.settings
|
||||||
const existingModel = models.find((m) => m.id === model.id)
|
const existingModel = models.find((m) => m.id === model.id)
|
||||||
return {
|
return {
|
||||||
@ -227,7 +231,7 @@ export const useModelProvider = create<ModelProviderState>()(
|
|||||||
>
|
>
|
||||||
}
|
}
|
||||||
|
|
||||||
if (version === 0 && state?.providers) {
|
if (version <= 1 && state?.providers) {
|
||||||
state.providers.forEach((provider) => {
|
state.providers.forEach((provider) => {
|
||||||
// Update cont_batching description for llamacpp provider
|
// Update cont_batching description for llamacpp provider
|
||||||
if (provider.provider === 'llamacpp' && provider.settings) {
|
if (provider.provider === 'llamacpp' && provider.settings) {
|
||||||
@ -270,6 +274,15 @@ export const useModelProvider = create<ModelProviderState>()(
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!model.settings.no_kv_offload) {
|
||||||
|
model.settings.no_kv_offload = {
|
||||||
|
...modelSettings.no_kv_offload,
|
||||||
|
controller_props: {
|
||||||
|
...modelSettings.no_kv_offload.controller_props,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@ -277,7 +290,7 @@ export const useModelProvider = create<ModelProviderState>()(
|
|||||||
|
|
||||||
return state
|
return state
|
||||||
},
|
},
|
||||||
version: 1,
|
version: 2,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@ -144,4 +144,13 @@ export const modelSettings = {
|
|||||||
type: 'text',
|
type: 'text',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
no_kv_offload: {
|
||||||
|
key: 'no_kv_offload',
|
||||||
|
title: 'Disable KV Offload',
|
||||||
|
description: 'Disable KV cache offload to GPU (if GPU is used).',
|
||||||
|
controller_type: 'checkbox',
|
||||||
|
controller_props: {
|
||||||
|
value: false,
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
/* eslint-disable @typescript-eslint/no-explicit-any */
|
||||||
import { Card, CardItem } from '@/containers/Card'
|
import { Card, CardItem } from '@/containers/Card'
|
||||||
import HeaderPage from '@/containers/HeaderPage'
|
import HeaderPage from '@/containers/HeaderPage'
|
||||||
import SettingsMenu from '@/containers/SettingsMenu'
|
import SettingsMenu from '@/containers/SettingsMenu'
|
||||||
@ -116,22 +117,25 @@ function ProviderDetail() {
|
|||||||
|
|
||||||
// Add 'vision' capability if not already present AND if user hasn't manually configured capabilities
|
// Add 'vision' capability if not already present AND if user hasn't manually configured capabilities
|
||||||
// Check if model has a custom capabilities config flag
|
// Check if model has a custom capabilities config flag
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const hasUserConfiguredCapabilities = (model as any)._userConfiguredCapabilities === true
|
const hasUserConfiguredCapabilities =
|
||||||
|
(model as any)._userConfiguredCapabilities === true
|
||||||
if (!capabilities.includes('vision') && !hasUserConfiguredCapabilities) {
|
|
||||||
|
if (
|
||||||
|
!capabilities.includes('vision') &&
|
||||||
|
!hasUserConfiguredCapabilities
|
||||||
|
) {
|
||||||
const updatedModels = [...llamacppProvider.models]
|
const updatedModels = [...llamacppProvider.models]
|
||||||
updatedModels[modelIndex] = {
|
updatedModels[modelIndex] = {
|
||||||
...model,
|
...model,
|
||||||
capabilities: [...capabilities, 'vision'],
|
capabilities: [...capabilities, 'vision'],
|
||||||
// Mark this as auto-detected, not user-configured
|
// Mark this as auto-detected, not user-configured
|
||||||
_autoDetectedVision: true,
|
_autoDetectedVision: true,
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
} as any
|
} as any
|
||||||
|
|
||||||
updateProviderState('llamacpp', { models: updatedModels })
|
updateProviderState('llamacpp', { models: updatedModels })
|
||||||
console.log(
|
console.log(
|
||||||
`Vision capability auto-added to model after provider refresh: ${importedModelName}`
|
`Vision capability added to model after provider refresh: ${importedModelName}`
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -257,33 +261,36 @@ function ProviderDetail() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const handleStartModel = (modelId: string) => {
|
const handleStartModel = async (modelId: string) => {
|
||||||
// Add model to loading state
|
// Add model to loading state
|
||||||
setLoadingModels((prev) => [...prev, modelId])
|
setLoadingModels((prev) => [...prev, modelId])
|
||||||
if (provider)
|
if (provider) {
|
||||||
// Original: startModel(provider, modelId).then(() => { setActiveModels((prevModels) => [...prevModels, modelId]) })
|
try {
|
||||||
serviceHub
|
// Start the model with plan result
|
||||||
.models()
|
await serviceHub.models().startModel(provider, modelId)
|
||||||
.startModel(provider, modelId)
|
|
||||||
.then(() => {
|
// Refresh active models after starting
|
||||||
// Refresh active models after starting
|
serviceHub
|
||||||
serviceHub
|
.models()
|
||||||
.models()
|
.getActiveModels()
|
||||||
.getActiveModels()
|
.then((models) => setActiveModels(models || []))
|
||||||
.then((models) => setActiveModels(models || []))
|
} catch (error) {
|
||||||
})
|
console.error('Error starting model:', error)
|
||||||
.catch((error) => {
|
if (
|
||||||
console.error('Error starting model:', error)
|
error &&
|
||||||
if (error && typeof error === 'object' && 'message' in error) {
|
typeof error === 'object' &&
|
||||||
setModelLoadError(error)
|
'message' in error &&
|
||||||
} else {
|
typeof error.message === 'string'
|
||||||
setModelLoadError(`${error}`)
|
) {
|
||||||
}
|
setModelLoadError({ message: error.message })
|
||||||
})
|
} else {
|
||||||
.finally(() => {
|
setModelLoadError(typeof error === 'string' ? error : `${error}`)
|
||||||
// Remove model from loading state
|
}
|
||||||
setLoadingModels((prev) => prev.filter((id) => id !== modelId))
|
} finally {
|
||||||
})
|
// Remove model from loading state
|
||||||
|
setLoadingModels((prev) => prev.filter((id) => id !== modelId))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const handleStopModel = (modelId: string) => {
|
const handleStopModel = (modelId: string) => {
|
||||||
|
|||||||
@ -17,6 +17,7 @@ import type {
|
|||||||
HuggingFaceRepo,
|
HuggingFaceRepo,
|
||||||
CatalogModel,
|
CatalogModel,
|
||||||
ModelValidationResult,
|
ModelValidationResult,
|
||||||
|
ModelPlan,
|
||||||
} from './types'
|
} from './types'
|
||||||
|
|
||||||
// TODO: Replace this with the actual provider later
|
// TODO: Replace this with the actual provider later
|
||||||
@ -491,4 +492,47 @@ export class DefaultModelsService implements ModelsService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Asks the llamacpp engine to plan how a model should be loaded.
 *
 * `modelPath` is joined onto the Jan data folder path before being handed to
 * the engine. This method does not throw: when the engine has no
 * `planModelLoad` function, or when anything fails, it logs and returns a
 * fallback plan with mode `'Unsupported'`.
 *
 * @param modelPath - Model file path, joined onto the Jan data folder path.
 * @param requestedCtx - Optional context length passed through to the engine.
 * @returns The engine's plan, or the fallback plan described above.
 */
async planModelLoad(
  modelPath: string,
  requestedCtx?: number
): Promise<ModelPlan> {
  // Plan handed back whenever planning cannot be performed
  const fallbackPlan: ModelPlan = {
    gpuLayers: 0,
    maxContextLength: 2048,
    noOffloadKVCache: true,
    mode: 'Unsupported',
  }

  try {
    const engine = this.getEngine('llamacpp') as AIEngine & {
      planModelLoad?: (
        path: string,
        requestedCtx?: number
      ) => Promise<ModelPlan>
    }

    if (!engine || typeof engine.planModelLoad !== 'function') {
      // Fallback if method is not available
      console.warn('planModelLoad method not available in llamacpp engine')
      return fallbackPlan
    }

    // Resolve the full path to the model file under the Jan data folder
    const { getJanDataFolderPath, joinPath } = await import('@janhq/core')
    const janDataFolderPath = await getJanDataFolderPath()
    const fullModelPath = await joinPath([janDataFolderPath, modelPath])

    return await engine.planModelLoad(fullModelPath, requestedCtx)
  } catch (error) {
    console.error(`Error planning model load for path ${modelPath}:`, error)
    return fallbackPlan
  }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -81,10 +81,20 @@ export interface ModelValidationResult {
|
|||||||
metadata?: GgufMetadata
|
metadata?: GgufMetadata
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Loading plan produced by the llamacpp model-load planner.
 */
export interface ModelPlan {
  /** Number of model layers to place on the GPU. */
  gpuLayers: number
  /** Maximum context length recommended for the model. */
  maxContextLength: number
  /** When true, KV-cache offload to the GPU should be disabled. */
  noOffloadKVCache: boolean
  /**
   * Optional flag some planner versions report for multimodal models.
   * NOTE(review): presumably true when the mmproj projector can stay in
   * VRAM — confirm against the engine's planner.
   */
  noOffloadMmproj?: boolean
  /** Overall loading mode chosen by the planner. */
  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
}
|
||||||
|
|
||||||
export interface ModelsService {
|
export interface ModelsService {
|
||||||
fetchModels(): Promise<modelInfo[]>
|
fetchModels(): Promise<modelInfo[]>
|
||||||
fetchModelCatalog(): Promise<ModelCatalog>
|
fetchModelCatalog(): Promise<ModelCatalog>
|
||||||
fetchHuggingFaceRepo(repoId: string, hfToken?: string): Promise<HuggingFaceRepo | null>
|
fetchHuggingFaceRepo(
|
||||||
|
repoId: string,
|
||||||
|
hfToken?: string
|
||||||
|
): Promise<HuggingFaceRepo | null>
|
||||||
convertHfRepoToCatalogModel(repo: HuggingFaceRepo): CatalogModel
|
convertHfRepoToCatalogModel(repo: HuggingFaceRepo): CatalogModel
|
||||||
updateModel(model: Partial<CoreModel>): Promise<void>
|
updateModel(model: Partial<CoreModel>): Promise<void>
|
||||||
pullModel(
|
pullModel(
|
||||||
@ -107,14 +117,24 @@ export interface ModelsService {
|
|||||||
getActiveModels(provider?: string): Promise<string[]>
|
getActiveModels(provider?: string): Promise<string[]>
|
||||||
stopModel(model: string, provider?: string): Promise<void>
|
stopModel(model: string, provider?: string): Promise<void>
|
||||||
stopAllModels(): Promise<void>
|
stopAllModels(): Promise<void>
|
||||||
startModel(provider: ProviderObject, model: string): Promise<SessionInfo | undefined>
|
startModel(
|
||||||
|
provider: ProviderObject,
|
||||||
|
model: string
|
||||||
|
): Promise<SessionInfo | undefined>
|
||||||
isToolSupported(modelId: string): Promise<boolean>
|
isToolSupported(modelId: string): Promise<boolean>
|
||||||
checkMmprojExistsAndUpdateOffloadMMprojSetting(
|
checkMmprojExistsAndUpdateOffloadMMprojSetting(
|
||||||
modelId: string,
|
modelId: string,
|
||||||
updateProvider?: (providerName: string, data: Partial<ModelProvider>) => void,
|
updateProvider?: (
|
||||||
|
providerName: string,
|
||||||
|
data: Partial<ModelProvider>
|
||||||
|
) => void,
|
||||||
getProviderByName?: (providerName: string) => ModelProvider | undefined
|
getProviderByName?: (providerName: string) => ModelProvider | undefined
|
||||||
): Promise<{ exists: boolean; settingsUpdated: boolean }>
|
): Promise<{ exists: boolean; settingsUpdated: boolean }>
|
||||||
checkMmprojExists(modelId: string): Promise<boolean>
|
checkMmprojExists(modelId: string): Promise<boolean>
|
||||||
isModelSupported(modelPath: string, ctxSize?: number): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
|
isModelSupported(
|
||||||
|
modelPath: string,
|
||||||
|
ctxSize?: number
|
||||||
|
): Promise<'RED' | 'YELLOW' | 'GREEN' | 'GREY'>
|
||||||
validateGgufFile(filePath: string): Promise<ModelValidationResult>
|
validateGgufFile(filePath: string): Promise<ModelValidationResult>
|
||||||
}
|
planModelLoad(modelPath: string, requestedCtx?: number): Promise<ModelPlan>
|
||||||
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user