diff --git a/.github/workflows/template-tauri-build-linux-x64.yml b/.github/workflows/template-tauri-build-linux-x64.yml index 9e30d5627..bd9b38369 100644 --- a/.github/workflows/template-tauri-build-linux-x64.yml +++ b/.github/workflows/template-tauri-build-linux-x64.yml @@ -53,7 +53,7 @@ on: value: ${{ jobs.build-linux-x64.outputs.APPIMAGE_FILE_NAME }} jobs: build-linux-x64: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: DEB_SIG: ${{ steps.packageinfo.outputs.DEB_SIG }} APPIMAGE_SIG: ${{ steps.packageinfo.outputs.APPIMAGE_SIG }} diff --git a/core/src/browser/extensions/engines/AIEngine.ts b/core/src/browser/extensions/engines/AIEngine.ts index 855f6e4dc..af63d9b19 100644 --- a/core/src/browser/extensions/engines/AIEngine.ts +++ b/core/src/browser/extensions/engines/AIEngine.ts @@ -289,11 +289,6 @@ export abstract class AIEngine extends BaseExtension { */ abstract getLoadedModels(): Promise - /** - * Optional method to get the underlying chat client - */ - getChatClient?(sessionId: string): any - /** * Check if a tool is supported by the model * @param modelId diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json index 2bca12c0f..ce5fc62e4 100644 --- a/extensions/llamacpp-extension/settings.json +++ b/extensions/llamacpp-extension/settings.json @@ -96,18 +96,6 @@ "textAlign": "right" } }, - { - "key": "batch_size", - "title": "Batch Size", - "description": "Logical maximum batch size for processing prompts.", - "controllerType": "input", - "controllerProps": { - "value": 2048, - "placeholder": "2048", - "type": "number", - "textAlign": "right" - } - }, { "key": "ubatch_size", "title": "uBatch Size", diff --git a/extensions/llamacpp-extension/src/backend.ts b/extensions/llamacpp-extension/src/backend.ts index 7f5e8a22b..d60ecc138 100644 --- a/extensions/llamacpp-extension/src/backend.ts +++ b/extensions/llamacpp-extension/src/backend.ts @@ -46,7 +46,6 @@ export async function getLocalInstalledBackends(): Promise< } } } - console.debug(local) return local } diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index 8fad4fd87..78e7c04f3 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -37,7 +37,13 @@ import { import { invoke } from '@tauri-apps/api/core' import { getProxyConfig } from './util' import { basename } from '@tauri-apps/api/path' -import { readGgufMetadata } from '@janhq/tauri-plugin-llamacpp-api' +import { + readGgufMetadata, + estimateKVCacheSize, + getModelSize, + isModelSupported, + planModelLoadInternal, +} from '@janhq/tauri-plugin-llamacpp-api' import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api' // Error message constant - matches web-app/src/utils/error.ts @@ -82,6 +88,7 @@ type ModelPlan = { maxContextLength: number noOffloadKVCache: boolean offloadMmproj?: boolean + batchSize: number mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' } @@ -2006,11 +2013,6 @@ export default class llamacpp_extension extends AIEngine { return responseData as EmbeddingResponse } - // Optional method for direct client access - override getChatClient(sessionId: string): any { - throw new Error('method not implemented yet') - } - /** * Check if a tool is supported by the model * Currently read from GGUF chat_template @@ -2073,7 +2075,7 @@ export default class llamacpp_extension extends AIEngine { path: string, meta: Record ): Promise<{ layerSize: number; totalLayers: number }> { - const modelSize = await 
this.getModelSize(path) + const modelSize = await getModelSize(path) const arch = meta['general.architecture'] const totalLayers = Number(meta[`${arch}.block_count`]) + 2 // 1 for lm_head layer and 1 for embedding layer if (!totalLayers) throw new Error('Invalid metadata: block_count not found') @@ -2089,335 +2091,27 @@ export default class llamacpp_extension extends AIEngine { /^\/\/[^/]+/.test(norm) // UNC path //server/share ) } - + /* + * if (!this.isAbsolutePath(path)) + path = await joinPath([await getJanDataFolderPath(), path]) + if (mmprojPath && !this.isAbsolutePath(mmprojPath)) + mmprojPath = await joinPath([await getJanDataFolderPath(), path]) + */ async planModelLoad( path: string, mmprojPath?: string, requestedCtx?: number ): Promise { - if (!this.isAbsolutePath(path)) + if (!this.isAbsolutePath(path)) { path = await joinPath([await getJanDataFolderPath(), path]) + } if (mmprojPath && !this.isAbsolutePath(mmprojPath)) mmprojPath = await joinPath([await getJanDataFolderPath(), path]) - const modelSize = await this.getModelSize(path) - const memoryInfo = await this.getTotalSystemMemory() - const gguf = await readGgufMetadata(path) - - // Get mmproj size if provided - let mmprojSize = 0 - if (mmprojPath) { - mmprojSize = await this.getModelSize(mmprojPath) - } - - const { layerSize, totalLayers } = await this.getLayerSize( - path, - gguf.metadata - ) - - const kvCachePerToken = (await this.estimateKVCache(gguf.metadata)) - .perTokenSize - - logger.info( - `Model size: ${modelSize}, Layer size: ${layerSize}, Total layers: ${totalLayers}, KV cache per token: ${kvCachePerToken}` - ) - - // Validate critical values - if (!modelSize || modelSize <= 0) { - throw new Error(`Invalid model size: ${modelSize}`) - } - if (!kvCachePerToken || kvCachePerToken <= 0) { - throw new Error(`Invalid KV cache per token: ${kvCachePerToken}`) - } - if (!layerSize || layerSize <= 0) { - throw new Error(`Invalid layer size: ${layerSize}`) - } - - // Reserve memory for OS, other applications, and fixed engine overhead. - const VRAM_RESERVE_GB = 0.5 - const VRAM_RESERVE_BYTES = VRAM_RESERVE_GB * 1024 * 1024 * 1024 - const ENGINE_FIXED_OVERHEAD_BYTES = 0.2 * 1024 * 1024 * 1024 // For scratch buffers etc. - - // Get model's maximum context length - const arch = gguf.metadata['general.architecture'] - const modelMaxContextLength = - Number(gguf.metadata[`${arch}.context_length`]) || 8192 - - const MIN_CONTEXT_LENGTH = 1024 - - // Memory percentages applied to both VRAM and RAM - const memoryPercentages = { high: 0.7, medium: 0.5, low: 0.4 } - - logger.info( - `Memory info - Total (VRAM + RAM): ${memoryInfo.totalMemory}, Total VRAM: ${memoryInfo.totalVRAM}, Mode: ${this.memoryMode}` - ) - - if (!memoryInfo.totalMemory || isNaN(memoryInfo.totalMemory)) { - throw new Error(`Invalid total memory: ${memoryInfo.totalMemory}`) - } - if (!memoryInfo.totalVRAM || isNaN(memoryInfo.totalVRAM)) { - throw new Error(`Invalid total VRAM: ${memoryInfo.totalVRAM}`) - } - if (!this.memoryMode || !(this.memoryMode in memoryPercentages)) { - throw new Error( - `Invalid memory mode: ${this.memoryMode}. 
Must be 'high', 'medium', or 'low'` - ) - } - - // Apply memory mode to both VRAM and RAM separately - const memoryModeMultiplier = memoryPercentages[this.memoryMode] - const usableVRAM = Math.max( - 0, - memoryInfo.totalVRAM * memoryModeMultiplier - - VRAM_RESERVE_BYTES - - ENGINE_FIXED_OVERHEAD_BYTES - ) - - const actualSystemRAM = Math.max(0, memoryInfo.totalRAM) - const usableSystemMemory = actualSystemRAM * memoryModeMultiplier - - logger.info( - `Actual System RAM: ${actualSystemRAM}, Usable VRAM for plan: ${usableVRAM}, Usable System Memory: ${usableSystemMemory}` - ) - - let gpuLayers = 0 - let maxContextLength = 0 - let noOffloadKVCache = false - let mode: ModelPlan['mode'] = 'Unsupported' - let offloadMmproj = false - - let remainingVRAM = usableVRAM - if (mmprojSize > 0 && mmprojSize <= remainingVRAM) { - offloadMmproj = true - remainingVRAM -= mmprojSize - } - const vramForMinContext = ( - await this.estimateKVCache(gguf.metadata, MIN_CONTEXT_LENGTH) - ).size - - const ramForModel = modelSize + (offloadMmproj ? 0 : mmprojSize) - if (ramForModel + vramForMinContext > usableSystemMemory + usableVRAM) { - logger.error( - `Model unsupported. Not enough resources for model and min context.` - ) - return { - gpuLayers: 0, - maxContextLength: 0, - noOffloadKVCache: true, - mode: 'Unsupported', - offloadMmproj: false, - } - } - - const targetContext = Math.min( - requestedCtx || modelMaxContextLength, - modelMaxContextLength - ) - - let targetContextSize = ( - await this.estimateKVCache(gguf.metadata, targetContext) - ).size - - // Use `kvCachePerToken` for all VRAM calculations - if (modelSize + targetContextSize <= remainingVRAM) { - mode = 'GPU' - gpuLayers = totalLayers - maxContextLength = targetContext - noOffloadKVCache = false - logger.info( - 'Planning: Ideal case fits. All layers and target context in VRAM.' - ) - } else if (modelSize <= remainingVRAM) { - mode = 'GPU' - gpuLayers = totalLayers - noOffloadKVCache = false - const vramLeftForContext = remainingVRAM - modelSize - maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken) - - // Add safety check to prevent OOM - const safetyBuffer = 0.9 // Use 90% of calculated context to be safe - maxContextLength = Math.floor(maxContextLength * safetyBuffer) - - logger.info( - `Planning: All layers fit in VRAM, but context must be reduced. VRAM left: ${vramLeftForContext}, kvCachePerToken: ${kvCachePerToken}, calculated context: ${maxContextLength}` - ) - } else { - const vramAvailableForLayers = remainingVRAM - vramForMinContext - - if (vramAvailableForLayers >= layerSize) { - mode = 'Hybrid' - gpuLayers = Math.min( - Math.floor(vramAvailableForLayers / layerSize), - totalLayers - ) - noOffloadKVCache = false - const vramUsedByLayers = gpuLayers * layerSize - const vramLeftForContext = remainingVRAM - vramUsedByLayers - maxContextLength = Math.floor(vramLeftForContext / kvCachePerToken) - - logger.info( - 'Planning: Hybrid mode. Offloading layers to fit context in VRAM.' - ) - } - } - - // Fallback logic: try different configurations if no VRAM-based plan worked - if (mode === 'Unsupported') { - logger.info('Planning: Trying fallback configurations...') - - // Try putting some layers on GPU with KV cache in RAM - const possibleGpuLayers = Math.floor(remainingVRAM / layerSize) - if (possibleGpuLayers > 0) { - gpuLayers = Math.min(possibleGpuLayers, totalLayers) - const ramUsedByCpuLayers = (totalLayers - gpuLayers) * layerSize - const ramUsedByMmproj = !offloadMmproj ? 
mmprojSize : 0 - const availableRamForKv = - usableSystemMemory - (ramUsedByCpuLayers + ramUsedByMmproj) - // Note: Use `kvCachePerToken` for RAM calculation, as the overhead is GPU-specific - const contextInRam = Math.floor(availableRamForKv / kvCachePerToken) - - if (contextInRam >= MIN_CONTEXT_LENGTH) { - mode = 'Hybrid' - maxContextLength = contextInRam - noOffloadKVCache = true - logger.info( - `Planning: Fallback hybrid - GPU layers: ${gpuLayers}, Context in RAM: ${maxContextLength}` - ) - } - } - - // If still unsupported, try pure CPU mode - if (mode === 'Unsupported') { - gpuLayers = 0 - noOffloadKVCache = true - offloadMmproj = false - const ramUsedByModel = modelSize + mmprojSize - const availableRamForKv = usableSystemMemory - ramUsedByModel - maxContextLength = Math.floor(availableRamForKv / kvCachePerToken) - if (maxContextLength >= MIN_CONTEXT_LENGTH) { - mode = 'CPU' - logger.info(`Planning: CPU mode - Context: ${maxContextLength}`) - } - } - } - - if (mode === 'CPU' || noOffloadKVCache) { - offloadMmproj = false - } - - if (requestedCtx && requestedCtx > 0) { - maxContextLength = Math.min(maxContextLength, requestedCtx) - } - - maxContextLength = Math.min(maxContextLength, modelMaxContextLength) - - if (maxContextLength < MIN_CONTEXT_LENGTH) { - mode = 'Unsupported' - } - - if (mode === 'Unsupported') { - gpuLayers = 0 - maxContextLength = 0 - } - - maxContextLength = isNaN(maxContextLength) - ? 0 - : Math.floor(maxContextLength) - - const mmprojInfo = mmprojPath - ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed( - 2 - )}MB, offloadMmproj=${offloadMmproj}` - : '' - - logger.info( - `Final plan for ${path}: gpuLayers=${gpuLayers}/${totalLayers}, ` + - `maxContextLength=${maxContextLength}, noOffloadKVCache=${noOffloadKVCache}, ` + - `mode=${mode}${mmprojInfo}` - ) - - return { - gpuLayers, - maxContextLength, - noOffloadKVCache, - mode, - offloadMmproj, - } - } - /** - * estimate KVCache size from a given metadata - */ - private async estimateKVCache( - meta: Record, - ctx_size?: number - ): Promise<{ size: number; perTokenSize: number }> { - const arch = meta['general.architecture'] - if (!arch) throw new Error('Invalid metadata: architecture not found') - - const nLayer = Number(meta[`${arch}.block_count`]) - if (!nLayer) throw new Error('Invalid metadata: block_count not found') - - const nHead = Number(meta[`${arch}.attention.head_count`]) - if (!nHead) throw new Error('Invalid metadata: head_count not found') - - // Try to get key/value lengths first (more accurate) - const keyLen = Number(meta[`${arch}.attention.key_length`]) - const valLen = Number(meta[`${arch}.attention.value_length`]) - - let headDim: number - - if (keyLen && valLen) { - // Use explicit key/value lengths if available - logger.info( - `Using explicit key_length: ${keyLen}, value_length: ${valLen}` - ) - headDim = keyLen + valLen - } else { - // Fall back to embedding_length estimation - const embeddingLen = Number(meta[`${arch}.embedding_length`]) - if (!embeddingLen) - throw new Error('Invalid metadata: embedding_length not found') - - // Standard transformer: head_dim = embedding_dim / num_heads - // For KV cache: we need both K and V, so 2 * head_dim per head - headDim = (embeddingLen / nHead) * 2 - logger.info( - `Using embedding_length estimation: ${embeddingLen}, calculated head_dim: ${headDim}` - ) - } - - const maxCtx = Number(meta[`${arch}.context_length`]) - if (!maxCtx) throw new Error('Invalid metadata: context_length not found') - - // b) If the user supplied a value, clamp it 
to the model's max - let ctxLen = ctx_size ? Math.min(ctx_size, maxCtx) : maxCtx - - logger.info(`Final context length used for KV size: ${ctxLen}`) - logger.info(`nLayer: ${nLayer}, nHead: ${nHead}, headDim (K+V): ${headDim}`) - - logger.info(`ctxLen: ${ctxLen}`) - logger.info(`nLayer: ${nLayer}`) - logger.info(`nHead: ${nHead}`) - logger.info(`headDim: ${headDim}`) - - // Consider f16 by default - // Can be extended by checking cache-type-v and cache-type-k - // but we are checking overall compatibility with the default settings - // fp16 = 8 bits * 2 = 16 - const bytesPerElement = 2 - - // Total KV cache size per token = nHead * headDim * bytesPerElement * nLayer - const kvPerToken = nHead * headDim * bytesPerElement * nLayer - - return { size: ctxLen * kvPerToken, perTokenSize: kvPerToken } - } - - private async getModelSize(path: string): Promise { - if (path.startsWith('https://')) { - const res = await fetch(path, { method: 'HEAD' }) - const len = res.headers.get('content-length') - return len ? parseInt(len, 10) : 0 - } else { - return (await fs.fileStat(path)).size + try { + const result = await planModelLoadInternal(path, this.memoryMode, mmprojPath, requestedCtx) + return result + } catch (e) { + throw new Error(String(e)) } } @@ -2431,50 +2125,11 @@ export default class llamacpp_extension extends AIEngine { */ async isModelSupported( path: string, - ctx_size?: number + ctxSize?: number ): Promise<'RED' | 'YELLOW' | 'GREEN'> { try { - const modelSize = await this.getModelSize(path) - const memoryInfo = await this.getTotalSystemMemory() - - logger.info(`modelSize: ${modelSize}`) - - const gguf = await readGgufMetadata(path) - let kvCacheSize: number - if (ctx_size) { - kvCacheSize = (await this.estimateKVCache(gguf.metadata, ctx_size)).size - } else { - kvCacheSize = (await this.estimateKVCache(gguf.metadata)).size - } - - // Total memory consumption = model weights + kvcache - const totalRequired = modelSize + kvCacheSize - logger.info( - `isModelSupported: Total memory requirement: ${totalRequired} for ${path}` - ) - - // Use 80% of total memory as the usable limit - const USABLE_MEMORY_PERCENTAGE = 0.9 - const usableTotalMemory = - memoryInfo.totalRAM * USABLE_MEMORY_PERCENTAGE + - memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE - const usableVRAM = memoryInfo.totalVRAM * USABLE_MEMORY_PERCENTAGE - - // Check if model fits in total memory at all (this is the hard limit) - if (totalRequired > usableTotalMemory) { - return 'RED' // Truly impossible to run - } - - // Check if everything fits in VRAM (ideal case) - if (totalRequired <= usableVRAM) { - return 'GREEN' - } - - // If we get here, it means: - // - Total requirement fits in combined memory - // - But doesn't fit entirely in VRAM - // This is the CPU-GPU hybrid scenario - return 'YELLOW' + const result = await isModelSupported(path, Number(ctxSize)) + return result } catch (e) { throw new Error(String(e)) } diff --git a/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs b/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs index 8f0427a6b..228a3731e 100644 --- a/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs +++ b/src-tauri/plugins/tauri-plugin-hardware/src/lib.rs @@ -15,6 +15,8 @@ use tauri::Runtime; static SYSTEM_INFO: OnceLock = OnceLock::new(); +pub use commands::get_system_info; + /// Initialize the hardware plugin pub fn init() -> tauri::plugin::TauriPlugin { tauri::plugin::Builder::new("hardware") diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml 
b/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml index fd58f6225..e1a57b962 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml +++ b/src-tauri/plugins/tauri-plugin-llamacpp/Cargo.toml @@ -24,6 +24,7 @@ tauri = { version = "2.5.0", default-features = false, features = [] } thiserror = "2.0.12" tokio = { version = "1", features = ["full"] } reqwest = { version = "0.11", features = ["json", "blocking", "stream"] } +tauri-plugin-hardware = { path = "../tauri-plugin-hardware" } # Unix-specific dependencies [target.'cfg(unix)'.dependencies] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/build.rs b/src-tauri/plugins/tauri-plugin-llamacpp/build.rs index ca32eb4d5..93c0f405b 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/build.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/build.rs @@ -14,6 +14,10 @@ const COMMANDS: &[&str] = &[ "get_session_by_model", // GGUF commands "read_gguf_metadata", + "estimate_kv_cache_size", + "get_model_size", + "is_model_supported", + "plan_model_load" ]; fn main() { diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts b/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts index 0380e4fe7..957839a63 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts +++ b/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts @@ -2,28 +2,28 @@ import { invoke } from '@tauri-apps/api/core' // Types export interface SessionInfo { - pid: number; - port: number; - model_id: string; - model_path: string; - api_key: string; + pid: number + port: number + model_id: string + model_path: string + api_key: string } export interface DeviceInfo { - id: string; - name: string; - memory: number; + id: string + name: string + memory: number } export interface GgufMetadata { - version: number; - tensor_count: number; - metadata: Record; + version: number + tensor_count: number + metadata: Record } // Cleanup commands export async function cleanupLlamaProcesses(): Promise { - return await invoke('plugin:llamacpp|cleanup_llama_processes'); + return await invoke('plugin:llamacpp|cleanup_llama_processes') } // LlamaCpp server commands @@ -35,12 +35,12 @@ export async function loadLlamaModel( return await invoke('plugin:llamacpp|load_llama_model', { backendPath, libraryPath, - args - }); + args, + }) } export async function unloadLlamaModel(pid: number): Promise { - return await invoke('plugin:llamacpp|unload_llama_model', { pid }); + return await invoke('plugin:llamacpp|unload_llama_model', { pid }) } export async function getDevices( @@ -49,8 +49,8 @@ export async function getDevices( ): Promise { return await invoke('plugin:llamacpp|get_devices', { backendPath, - libraryPath - }); + libraryPath, + }) } export async function generateApiKey( @@ -59,35 +59,84 @@ export async function generateApiKey( ): Promise { return await invoke('plugin:llamacpp|generate_api_key', { modelId, - apiSecret - }); + apiSecret, + }) } export async function isProcessRunning(pid: number): Promise { - return await invoke('plugin:llamacpp|is_process_running', { pid }); + return await invoke('plugin:llamacpp|is_process_running', { pid }) } export async function getRandomPort(): Promise { - return await invoke('plugin:llamacpp|get_random_port'); + return await invoke('plugin:llamacpp|get_random_port') } -export async function findSessionByModel(modelId: string): Promise { - return await invoke('plugin:llamacpp|find_session_by_model', { modelId }); +export async function findSessionByModel( + modelId: string +): Promise { + return await 
invoke('plugin:llamacpp|find_session_by_model', { modelId }) } export async function getLoadedModels(): Promise { - return await invoke('plugin:llamacpp|get_loaded_models'); + return await invoke('plugin:llamacpp|get_loaded_models') } export async function getAllSessions(): Promise { - return await invoke('plugin:llamacpp|get_all_sessions'); + return await invoke('plugin:llamacpp|get_all_sessions') } -export async function getSessionByModel(modelId: string): Promise { - return await invoke('plugin:llamacpp|get_session_by_model', { modelId }); +export async function getSessionByModel( + modelId: string +): Promise { + return await invoke('plugin:llamacpp|get_session_by_model', { modelId }) } // GGUF commands export async function readGgufMetadata(path: string): Promise { - return await invoke('plugin:llamacpp|read_gguf_metadata', { path }); + return await invoke('plugin:llamacpp|read_gguf_metadata', { path }) +} + +export async function estimateKVCacheSize( + meta: Record, + ctxSize?: number +): Promise<{ size: number; per_token_size: number }> { + return await invoke('plugin:llamacpp|estimate_kv_cache_size', { + meta, + ctxSize, + }) +} + +export async function getModelSize(path: string): Promise { + return await invoke('plugin:llamacpp|get_model_size', { path }) +} + +export async function isModelSupported( + path: string, + ctxSize?: number +): Promise<'RED' | 'YELLOW' | 'GREEN'> { + return await invoke('plugin:llamacpp|is_model_supported', { + path, + ctxSize, + }) +} + +export async function planModelLoadInternal( + path: string, + memoryMode: string, + mmprojPath?: string, + requestedContext?: number +): Promise<{ + gpuLayers: number + maxContextLength: number + noOffloadKVCache: boolean + offloadMmproj?: boolean + batchSize: number + mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' +}> { + return await invoke('plugin:llamacpp|plan_model_load', { + path, + memoryMode, + mmprojPath, + requestedContext, + }) } diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/estimate_kv_cache_size.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/estimate_kv_cache_size.toml new file mode 100644 index 000000000..39d419214 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/estimate_kv_cache_size.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-estimate-kv-cache-size" +description = "Enables the estimate_kv_cache_size command without any pre-configured scope." +commands.allow = ["estimate_kv_cache_size"] + +[[permission]] +identifier = "deny-estimate-kv-cache-size" +description = "Denies the estimate_kv_cache_size command without any pre-configured scope." +commands.deny = ["estimate_kv_cache_size"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/get_model_size.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/get_model_size.toml new file mode 100644 index 000000000..7ba81c38e --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/get_model_size.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-get-model-size" +description = "Enables the get_model_size command without any pre-configured scope." 
+commands.allow = ["get_model_size"] + +[[permission]] +identifier = "deny-get-model-size" +description = "Denies the get_model_size command without any pre-configured scope." +commands.deny = ["get_model_size"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/is_model_supported.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/is_model_supported.toml new file mode 100644 index 000000000..e9c5ad23f --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/is_model_supported.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-is-model-supported" +description = "Enables the is_model_supported command without any pre-configured scope." +commands.allow = ["is_model_supported"] + +[[permission]] +identifier = "deny-is-model-supported" +description = "Denies the is_model_supported command without any pre-configured scope." +commands.deny = ["is_model_supported"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/plan_model_load.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/plan_model_load.toml new file mode 100644 index 000000000..a1a315967 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/commands/plan_model_load.toml @@ -0,0 +1,13 @@ +# Automatically generated - DO NOT EDIT! + +"$schema" = "../../schemas/schema.json" + +[[permission]] +identifier = "allow-plan-model-load" +description = "Enables the plan_model_load command without any pre-configured scope." +commands.allow = ["plan_model_load"] + +[[permission]] +identifier = "deny-plan-model-load" +description = "Denies the plan_model_load command without any pre-configured scope." +commands.deny = ["plan_model_load"] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md index 898cfe530..faab24f25 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/autogenerated/reference.md @@ -16,6 +16,10 @@ Default permissions for the llamacpp plugin - `allow-get-all-sessions` - `allow-get-session-by-model` - `allow-read-gguf-metadata` +- `allow-estimate-kv-cache-size` +- `allow-get-model-size` +- `allow-is-model-supported` +- `allow-plan-model-load` ## Permission Table @@ -55,6 +59,32 @@ Denies the cleanup_llama_processes command without any pre-configured scope. +`llamacpp:allow-estimate-kv-cache-size` + + + + +Enables the estimate_kv_cache_size command without any pre-configured scope. + + + + + + + +`llamacpp:deny-estimate-kv-cache-size` + + + + +Denies the estimate_kv_cache_size command without any pre-configured scope. + + + + + + + `llamacpp:allow-find-session-by-model` @@ -185,6 +215,32 @@ Denies the get_loaded_models command without any pre-configured scope. +`llamacpp:allow-get-model-size` + + + + +Enables the get_model_size command without any pre-configured scope. + + + + + + + +`llamacpp:deny-get-model-size` + + + + +Denies the get_model_size command without any pre-configured scope. + + + + + + + `llamacpp:allow-get-random-port` @@ -237,6 +293,32 @@ Denies the get_session_by_model command without any pre-configured scope. 
+`llamacpp:allow-is-model-supported` + + + + +Enables the is_model_supported command without any pre-configured scope. + + + + + + + +`llamacpp:deny-is-model-supported` + + + + +Denies the is_model_supported command without any pre-configured scope. + + + + + + + `llamacpp:allow-is-process-running` @@ -289,6 +371,32 @@ Denies the load_llama_model command without any pre-configured scope. +`llamacpp:allow-plan-model-load` + + + + +Enables the plan_model_load command without any pre-configured scope. + + + + + + + +`llamacpp:deny-plan-model-load` + + + + +Denies the plan_model_load command without any pre-configured scope. + + + + + + + `llamacpp:allow-read-gguf-metadata` diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml index 08339b766..91938e047 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/default.toml @@ -3,10 +3,10 @@ description = "Default permissions for the llamacpp plugin" permissions = [ # Cleanup commands "allow-cleanup-llama-processes", - + # LlamaCpp server commands "allow-load-llama-model", - "allow-unload-llama-model", + "allow-unload-llama-model", "allow-get-devices", "allow-generate-api-key", "allow-is-process-running", @@ -15,7 +15,11 @@ permissions = [ "allow-get-loaded-models", "allow-get-all-sessions", "allow-get-session-by-model", - + # GGUF commands - "allow-read-gguf-metadata" + "allow-read-gguf-metadata", + "allow-estimate-kv-cache-size", + "allow-get-model-size", + "allow-is-model-supported", + "allow-plan-model-load" ] diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json index 70ccaf6f7..0dd243ff0 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json +++ b/src-tauri/plugins/tauri-plugin-llamacpp/permissions/schemas/schema.json @@ -306,6 +306,18 @@ "const": "deny-cleanup-llama-processes", "markdownDescription": "Denies the cleanup_llama_processes command without any pre-configured scope." }, + { + "description": "Enables the estimate_kv_cache_size command without any pre-configured scope.", + "type": "string", + "const": "allow-estimate-kv-cache-size", + "markdownDescription": "Enables the estimate_kv_cache_size command without any pre-configured scope." + }, + { + "description": "Denies the estimate_kv_cache_size command without any pre-configured scope.", + "type": "string", + "const": "deny-estimate-kv-cache-size", + "markdownDescription": "Denies the estimate_kv_cache_size command without any pre-configured scope." + }, { "description": "Enables the find_session_by_model command without any pre-configured scope.", "type": "string", @@ -366,6 +378,18 @@ "const": "deny-get-loaded-models", "markdownDescription": "Denies the get_loaded_models command without any pre-configured scope." }, + { + "description": "Enables the get_model_size command without any pre-configured scope.", + "type": "string", + "const": "allow-get-model-size", + "markdownDescription": "Enables the get_model_size command without any pre-configured scope." + }, + { + "description": "Denies the get_model_size command without any pre-configured scope.", + "type": "string", + "const": "deny-get-model-size", + "markdownDescription": "Denies the get_model_size command without any pre-configured scope." 
+ }, { "description": "Enables the get_random_port command without any pre-configured scope.", "type": "string", @@ -390,6 +414,18 @@ "const": "deny-get-session-by-model", "markdownDescription": "Denies the get_session_by_model command without any pre-configured scope." }, + { + "description": "Enables the is_model_supported command without any pre-configured scope.", + "type": "string", + "const": "allow-is-model-supported", + "markdownDescription": "Enables the is_model_supported command without any pre-configured scope." + }, + { + "description": "Denies the is_model_supported command without any pre-configured scope.", + "type": "string", + "const": "deny-is-model-supported", + "markdownDescription": "Denies the is_model_supported command without any pre-configured scope." + }, { "description": "Enables the is_process_running command without any pre-configured scope.", "type": "string", @@ -414,6 +450,18 @@ "const": "deny-load-llama-model", "markdownDescription": "Denies the load_llama_model command without any pre-configured scope." }, + { + "description": "Enables the plan_model_load command without any pre-configured scope.", + "type": "string", + "const": "allow-plan-model-load", + "markdownDescription": "Enables the plan_model_load command without any pre-configured scope." + }, + { + "description": "Denies the plan_model_load command without any pre-configured scope.", + "type": "string", + "const": "deny-plan-model-load", + "markdownDescription": "Denies the plan_model_load command without any pre-configured scope." + }, { "description": "Enables the read_gguf_metadata command without any pre-configured scope.", "type": "string", @@ -439,10 +487,10 @@ "markdownDescription": "Denies the unload_llama_model command without any pre-configured scope." 
}, { - "description": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`", + "description": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`\n- `allow-estimate-kv-cache-size`\n- `allow-get-model-size`\n- `allow-is-model-supported`\n- `allow-plan-model-load`", "type": "string", "const": "default", - "markdownDescription": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`" + "markdownDescription": "Default permissions for the llamacpp plugin\n#### This default permission set includes:\n\n- `allow-cleanup-llama-processes`\n- `allow-load-llama-model`\n- `allow-unload-llama-model`\n- `allow-get-devices`\n- `allow-generate-api-key`\n- `allow-is-process-running`\n- `allow-get-random-port`\n- `allow-find-session-by-model`\n- `allow-get-loaded-models`\n- `allow-get-all-sessions`\n- `allow-get-session-by-model`\n- `allow-read-gguf-metadata`\n- `allow-estimate-kv-cache-size`\n- `allow-get-model-size`\n- `allow-is-model-supported`\n- `allow-plan-model-load`" } ] } diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs index ae38f56f3..c636fa8bd 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/commands.rs @@ -1,58 +1,141 @@ -use super::helpers; use super::types::GgufMetadata; -use reqwest; -use std::fs::File; -use std::io::BufReader; - +use super::utils::{estimate_kv_cache_internal, read_gguf_metadata_internal}; +use crate::gguf::types::{KVCacheError, KVCacheEstimate, ModelSupportStatus}; +use std::collections::HashMap; +use std::fs; +use tauri::Runtime; +use tauri_plugin_hardware::get_system_info; /// Read GGUF metadata from a model file #[tauri::command] pub async fn read_gguf_metadata(path: String) -> Result { - if path.starts_with("http://") || path.starts_with("https://") { - // Remote: read in 2MB chunks until successful + return read_gguf_metadata_internal(path).await; +} + +#[tauri::command] +pub async fn estimate_kv_cache_size( + meta: HashMap, + ctx_size: Option, +) -> Result { + estimate_kv_cache_internal(meta, ctx_size).await +} + +#[tauri::command] +pub async fn get_model_size(path: String) -> Result { + if path.starts_with("https://") { + // Handle remote URL let client = reqwest::Client::new(); - let chunk_size = 2 * 1024 * 1024; // Fixed 2MB chunks - let max_total_size 
= 120 * 1024 * 1024; // Don't exceed 120MB total - let mut total_downloaded = 0; - let mut accumulated_data = Vec::new(); + let response = client + .head(&path) + .send() + .await + .map_err(|e| format!("Failed to fetch HEAD request: {}", e))?; - while total_downloaded < max_total_size { - let start = total_downloaded; - let end = std::cmp::min(start + chunk_size - 1, max_total_size - 1); - - let resp = client - .get(&path) - .header("Range", format!("bytes={}-{}", start, end)) - .send() - .await - .map_err(|e| format!("Failed to fetch chunk {}-{}: {}", start, end, e))?; - - let chunk_data = resp - .bytes() - .await - .map_err(|e| format!("Failed to read chunk response: {}", e))?; - - accumulated_data.extend_from_slice(&chunk_data); - total_downloaded += chunk_data.len(); - - // Try parsing after each chunk - let cursor = std::io::Cursor::new(&accumulated_data); - if let Ok(metadata) = helpers::read_gguf_metadata(cursor) { - return Ok(metadata); - } - - // If we got less data than expected, we've reached EOF - if chunk_data.len() < chunk_size { - break; - } + if let Some(content_length) = response.headers().get("content-length") { + let content_length_str = content_length + .to_str() + .map_err(|e| format!("Invalid content-length header: {}", e))?; + content_length_str + .parse::() + .map_err(|e| format!("Failed to parse content-length: {}", e)) + } else { + Ok(0) } - Err("Could not parse GGUF metadata from downloaded data".to_string()) } else { - // Local: use streaming file reader - let file = - File::open(&path).map_err(|e| format!("Failed to open local file {}: {}", path, e))?; - let reader = BufReader::new(file); - - helpers::read_gguf_metadata(reader) - .map_err(|e| format!("Failed to parse GGUF metadata: {}", e)) + // Handle local file using standard fs + let metadata = + fs::metadata(&path).map_err(|e| format!("Failed to get file metadata: {}", e))?; + Ok(metadata.len()) } } + +#[tauri::command] +pub async fn is_model_supported( + path: String, + ctx_size: Option, + app_handle: tauri::AppHandle, +) -> Result { + // Get model size + let model_size = get_model_size(path.clone()).await?; + + // Get system info + let system_info = get_system_info(app_handle.clone()); + + log::info!("modelSize: {}", model_size); + + // Read GGUF metadata + let gguf = read_gguf_metadata(path.clone()).await?; + + // Calculate KV cache size + let kv_cache_size = if let Some(ctx_size) = ctx_size { + log::info!("Using ctx_size: {}", ctx_size); + estimate_kv_cache_internal(gguf.metadata, Some(ctx_size as u64)) + .await + .map_err(|e| e.to_string())? + .size + } else { + estimate_kv_cache_internal(gguf.metadata, None) + .await + .map_err(|e| e.to_string())? 
+ .size + }; + + // Total memory consumption = model weights + kvcache + let total_required = model_size + kv_cache_size; + log::info!( + "isModelSupported: Total memory requirement: {} for {}; Got kvCacheSize: {} from BE", + total_required, + path, + kv_cache_size + ); + + const RESERVE_BYTES: u64 = 2288490189; + let total_system_memory = system_info.total_memory * 1024 * 1024; + // Calculate total VRAM from all GPUs + let total_vram: u64 = if system_info.gpus.is_empty() { + // On macOS with unified memory, GPU info may be empty + // Use total RAM as VRAM since memory is shared + log::info!("No GPUs detected (likely unified memory system), using total RAM as VRAM"); + total_system_memory + } else { + system_info + .gpus + .iter() + .map(|g| g.total_memory * 1024 * 1024) + .sum::() + }; + + log::info!("Total VRAM reported/calculated (in bytes): {}", &total_vram); + + let usable_vram = if total_vram > RESERVE_BYTES { + total_vram - RESERVE_BYTES + } else { + 0 + }; + + let usable_total_memory = if total_system_memory > RESERVE_BYTES { + (total_system_memory - RESERVE_BYTES) + usable_vram + } else { + 0 + }; + log::info!("System RAM: {} bytes", &total_system_memory); + log::info!("Total VRAM: {} bytes", &total_vram); + log::info!("Usable total memory: {} bytes", &usable_total_memory); + log::info!("Usable VRAM: {} bytes", &usable_vram); + log::info!("Required: {} bytes", &total_required); + + // Check if model fits in total memory at all (this is the hard limit) + if total_required > usable_total_memory { + return Ok(ModelSupportStatus::Red); // Truly impossible to run + } + + // Check if everything fits in VRAM (ideal case) + if total_required <= usable_vram { + return Ok(ModelSupportStatus::Green); + } + + // If we get here, it means: + // - Total requirement fits in combined memory + // - But doesn't fit entirely in VRAM + // This is the CPU-GPU hybrid scenario + Ok(ModelSupportStatus::Yellow) +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs index 44fa1911f..935b17b83 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/mod.rs @@ -1,3 +1,5 @@ pub mod commands; pub mod helpers; pub mod types; +pub mod utils; +pub mod model_planner; diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/model_planner.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/model_planner.rs new file mode 100644 index 000000000..118894871 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/model_planner.rs @@ -0,0 +1,318 @@ +use crate::gguf::commands::get_model_size; +use crate::gguf::utils::estimate_kv_cache_internal; +use crate::gguf::utils::read_gguf_metadata_internal; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use tauri::Runtime; +use tauri_plugin_hardware::get_system_info; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct ModelPlan { + pub gpu_layers: u64, + pub max_context_length: u64, + pub no_offload_kv_cache: bool, + pub offload_mmproj: bool, + pub batch_size: u64, + pub mode: ModelMode, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "UPPERCASE")] +pub enum ModelMode { + GPU, + Hybrid, + CPU, + Unsupported, +} + +#[tauri::command] +pub async fn plan_model_load( + path: String, + memory_mode: String, + mmproj_path: Option, + requested_ctx: Option, + app: tauri::AppHandle, +) -> Result { + let model_size = 
get_model_size(path.clone()).await?; + let sys_info = get_system_info(app.clone()); + let gguf = read_gguf_metadata_internal(path.clone()).await?; + + let mut mmproj_size: u64 = 0; + if let Some(ref mmproj) = mmproj_path { + mmproj_size = get_model_size(mmproj.clone()).await?; + } + + let arch = gguf + .metadata + .get("general.architecture") + .ok_or("Missing architecture")?; + let repeating_layers: u64 = gguf + .metadata + .get(&format!("{arch}.block_count")) + .ok_or("Missing block_count")? + .parse() + .map_err(|_| "Invalid block_count")?; + let total_layers = repeating_layers + 1; + let layer_size = model_size / total_layers; + + let kv_cache = estimate_kv_cache_internal(gguf.metadata.clone(), None) + .await + .map_err(|e| e.to_string())?; + let kv_cache_per_token = kv_cache.per_token_size; + + if model_size == 0 || layer_size == 0 || kv_cache_per_token == 0 { + return Err("Invalid model/layer/cache sizes".into()); + } + + const RESERVE_BYTES: u64 = 2288490189; + const MIN_CONTEXT_LENGTH: u64 = 2048; + + let model_max_ctx: u64 = gguf + .metadata + .get(&format!("{arch}.context_length")) + .and_then(|s| s.parse().ok()) + .unwrap_or(8192); + + let memory_percentages = HashMap::from([("high", 0.7), ("medium", 0.5), ("low", 0.4)]); + + let multiplier = *memory_percentages + .get(memory_mode.as_str()) + .ok_or("Invalid memory mode")?; + + log::info!("Got GPUs:\n{:?}", &sys_info.gpus); + + let total_ram: u64 = sys_info.total_memory * 1024 * 1024; + log::info!( + "Total system memory reported from tauri_plugin_hardware(in bytes): {}", + &total_ram + ); + + let total_vram: u64 = if sys_info.gpus.is_empty() { + // On macOS with unified memory, GPU info may be empty + // Use total RAM as VRAM since memory is shared + log::info!("No GPUs detected (likely unified memory system), using total RAM as VRAM"); + total_ram + } else { + sys_info + .gpus + .iter() + .map(|g| g.total_memory * 1024 * 1024) + .sum::() + }; + + log::info!("Total VRAM reported/calculated (in bytes): {}", &total_vram); + let usable_vram: u64 = if total_vram > RESERVE_BYTES { + (((total_vram - RESERVE_BYTES) as f64) * multiplier) as u64 + } else { + 0 + }; + log::info!("Usable vram calculated: {}", &usable_vram); + + let usable_ram: u64 = if total_ram > RESERVE_BYTES { + (((total_ram - RESERVE_BYTES) as f64) * multiplier).max(0.0) as u64 + } else { + 0 + }; + log::info!("Usable ram calculated (in bytes): {}", &usable_ram); + + let mut gpu_layers = 0; + let mut max_ctx_len = 0; + let mut no_offload_kv_cache = false; + let mut mode = ModelMode::Unsupported; + let mut offload_mmproj = false; + let mut batch_size = 2048; + + let total_available_mem = usable_vram.saturating_add(usable_ram); + if model_size + mmproj_size > total_available_mem { + log::info!("Model not supported in this system!"); + return Ok(ModelPlan { + gpu_layers: 0, + max_context_length: 0, + no_offload_kv_cache: true, + batch_size: 64, + mode: ModelMode::Unsupported, + offload_mmproj: false, + }); + } + if mmproj_size > 0 { + offload_mmproj = true; + } + + let kv_min_size = estimate_kv_cache_internal(gguf.metadata.clone(), Some(MIN_CONTEXT_LENGTH)) + .await + .map_err(|e| e.to_string())? 
+ .size; + + if model_size + kv_min_size + mmproj_size <= usable_vram { + log::info!("Planning mode: Full GPU offload is possible."); + mode = ModelMode::GPU; + gpu_layers = total_layers; + let vram_left_for_ctx = usable_vram.saturating_sub(model_size); + let max_ctx_by_vram = (vram_left_for_ctx / kv_cache_per_token) as u64; + let requested_target = requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx); + max_ctx_len = requested_target.min(max_ctx_by_vram); + no_offload_kv_cache = false; + offload_mmproj = true; + } else { + let mut found_plan = false; + + log::info!("Attempting VRAM-Maximized Hybrid plan (KV cache in VRAM only)."); + for candidate_gpu_layers in (0..=total_layers).rev() { + let vram_used_by_layers = candidate_gpu_layers.saturating_mul(layer_size); + if vram_used_by_layers > usable_vram { + continue; + } + + let ram_used_by_cpu_layers = + (total_layers.saturating_sub(candidate_gpu_layers)).saturating_mul(layer_size); + let ram_used_by_mmproj = if offload_mmproj { 0 } else { mmproj_size }; + let required_ram_for_model = ram_used_by_cpu_layers.saturating_add(ram_used_by_mmproj); + + if required_ram_for_model > usable_ram { + continue; + } + + let vram_left_for_kv = usable_vram.saturating_sub(vram_used_by_layers); + let ctx_in_vram_only = (vram_left_for_kv / kv_cache_per_token) as u64; + + if ctx_in_vram_only >= MIN_CONTEXT_LENGTH { + log::info!( + "Found VRAM-Maximized Hybrid plan with {} GPU layers.", + candidate_gpu_layers + ); + mode = ModelMode::Hybrid; + gpu_layers = candidate_gpu_layers; + let requested_target = requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx); + max_ctx_len = requested_target.min(ctx_in_vram_only); + no_offload_kv_cache = false; + found_plan = true; + break; + } + } + + if !found_plan { + log::info!("VRAM-Maximized plan not feasible. Falling back to Standard Hybrid (KV cache in VRAM+RAM)."); + for candidate_gpu_layers in (0..=total_layers).rev() { + let vram_used_by_layers = candidate_gpu_layers.saturating_mul(layer_size); + if vram_used_by_layers > usable_vram { + continue; + } + let vram_left_for_kv = usable_vram.saturating_sub(vram_used_by_layers); + let kv_in_vram = (vram_left_for_kv / kv_cache_per_token) as u64; + + let ram_used_by_cpu_layers = + (total_layers.saturating_sub(candidate_gpu_layers)).saturating_mul(layer_size); + let ram_used_by_mmproj = if offload_mmproj { 0 } else { mmproj_size }; + let required_ram_for_model = + ram_used_by_cpu_layers.saturating_add(ram_used_by_mmproj); + + if required_ram_for_model > usable_ram { + continue; + } + + let available_ram_for_kv = usable_ram.saturating_sub(required_ram_for_model); + let kv_in_ram = (available_ram_for_kv / kv_cache_per_token) as u64; + + let total_kv_tokens = kv_in_vram.saturating_add(kv_in_ram); + + if total_kv_tokens >= MIN_CONTEXT_LENGTH { + log::info!( + "Found Standard Hybrid plan with {} GPU layers.", + candidate_gpu_layers + ); + mode = if candidate_gpu_layers > 0 { + ModelMode::Hybrid + } else { + ModelMode::CPU + }; + gpu_layers = candidate_gpu_layers; + let requested_target = + requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx); + let max_possible_ctx = total_kv_tokens.min(model_max_ctx); + max_ctx_len = requested_target.min(max_possible_ctx); + no_offload_kv_cache = kv_in_ram > 0 && kv_in_vram == 0; + found_plan = true; + break; + } + } + } + + if !found_plan { + log::info!("No hybrid plan found. 
Attempting CPU-only plan."); + if model_size + mmproj_size <= usable_ram { + let available_ram_for_kv = usable_ram.saturating_sub(model_size + mmproj_size); + let kv_tokens = (available_ram_for_kv / kv_cache_per_token) as u64; + if kv_tokens >= MIN_CONTEXT_LENGTH { + mode = ModelMode::CPU; + gpu_layers = 0; + max_ctx_len = kv_tokens + .min(requested_ctx.unwrap_or(model_max_ctx)) + .min(model_max_ctx); + no_offload_kv_cache = true; + offload_mmproj = false; + } + } + } + } + + if let Some(req) = requested_ctx { + if req > 0 { + max_ctx_len = max_ctx_len.min(req); + } + } + max_ctx_len = max_ctx_len.min(model_max_ctx); + + if max_ctx_len > 0 { + log::info!("Max context before power-of-2 adjustment: {}", max_ctx_len); + max_ctx_len = 1u64 << (63 - max_ctx_len.leading_zeros()); + log::info!("Adjusted max context to power of 2: {}", max_ctx_len); + } + + if mode == ModelMode::Unsupported { + if max_ctx_len >= MIN_CONTEXT_LENGTH { + // do nothing, plan is viable but wasn't assigned a mode + } else { + gpu_layers = 0; + max_ctx_len = 0; + offload_mmproj = false; + } + } else if max_ctx_len < MIN_CONTEXT_LENGTH { + log::info!( + "Final context length {} is less than minimum required {}. Marking as unsupported.", + max_ctx_len, + MIN_CONTEXT_LENGTH + ); + mode = ModelMode::Unsupported; + gpu_layers = 0; + max_ctx_len = 0; + offload_mmproj = false; + } + + if mode == ModelMode::Hybrid { + batch_size = 256; + } else if mode == ModelMode::CPU || no_offload_kv_cache || mode == ModelMode::Unsupported { + batch_size = 64; + } + + if max_ctx_len > 0 { + batch_size = batch_size.min(max_ctx_len); + } else { + batch_size = 64; + } + + if mode == ModelMode::CPU || no_offload_kv_cache { + offload_mmproj = false; + } + + log::info!("Planned model load params: GPU Layers: {}, max_ctx_len: {}, kv_cache offload: {}, offload mmproj: {}, batch_size: {}", + gpu_layers, max_ctx_len, !no_offload_kv_cache, offload_mmproj, batch_size); + Ok(ModelPlan { + gpu_layers, + max_context_length: max_ctx_len, + no_offload_kv_cache, + offload_mmproj, + batch_size, + mode, + }) +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs index a2bc73c59..49a497cf1 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/types.rs @@ -1,4 +1,4 @@ -use serde::Serialize; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::convert::TryFrom; use std::io; @@ -52,3 +52,42 @@ pub struct GgufMetadata { pub tensor_count: u64, pub metadata: HashMap, } + +#[derive(Debug, Serialize, Deserialize)] +pub struct KVCacheEstimate { + pub size: u64, + pub per_token_size: u64, +} +#[derive(Debug, thiserror::Error)] +pub enum KVCacheError { + #[error("Invalid metadata: architecture not found")] + ArchitectureNotFound, + #[error("Invalid metadata: block_count not found or invalid")] + BlockCountInvalid, + #[error("Invalid metadata: head_count not found or invalid")] + HeadCountInvalid, + #[error("Invalid metadata: embedding_length not found or invalid")] + EmbeddingLengthInvalid, + #[error("Invalid metadata: context_length not found or invalid")] + ContextLengthInvalid, +} + +impl serde::Serialize for KVCacheError { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + + +#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize)] +pub enum ModelSupportStatus { + #[serde(rename = "RED")] + Red, + 
#[serde(rename = "YELLOW")] + Yellow, + #[serde(rename = "GREEN")] + Green, +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs new file mode 100644 index 000000000..50e3f4a14 --- /dev/null +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs @@ -0,0 +1,164 @@ +use crate::gguf::helpers; +use crate::gguf::types::{GgufMetadata, KVCacheError, KVCacheEstimate}; +use std::collections::HashMap; +use std::fs::File; +use std::io::BufReader; + +// read gguf metadata +pub async fn read_gguf_metadata_internal(path: String) -> Result { + if path.starts_with("http://") || path.starts_with("https://") { + // Remote: read in 2MB chunks until successful + let client = reqwest::Client::new(); + let chunk_size = 2 * 1024 * 1024; // Fixed 2MB chunks + let max_total_size = 120 * 1024 * 1024; // Don't exceed 120MB total + let mut total_downloaded = 0; + let mut accumulated_data = Vec::new(); + + while total_downloaded < max_total_size { + let start = total_downloaded; + let end = std::cmp::min(start + chunk_size - 1, max_total_size - 1); + + let resp = client + .get(&path) + .header("Range", format!("bytes={}-{}", start, end)) + .send() + .await + .map_err(|e| format!("Failed to fetch chunk {}-{}: {}", start, end, e))?; + + let chunk_data = resp + .bytes() + .await + .map_err(|e| format!("Failed to read chunk response: {}", e))?; + + accumulated_data.extend_from_slice(&chunk_data); + total_downloaded += chunk_data.len(); + + // Try parsing after each chunk + let cursor = std::io::Cursor::new(&accumulated_data); + if let Ok(metadata) = helpers::read_gguf_metadata(cursor) { + return Ok(metadata); + } + + // If we got less data than expected, we've reached EOF + if chunk_data.len() < chunk_size { + break; + } + } + Err("Could not parse GGUF metadata from downloaded data".to_string()) + } else { + // Local: use streaming file reader + let file = + File::open(&path).map_err(|e| format!("Failed to open local file {}: {}", path, e))?; + let reader = BufReader::new(file); + + helpers::read_gguf_metadata(reader) + .map_err(|e| format!("Failed to parse GGUF metadata: {}", e)) + } +} + +/// Estimate KVCache size from a given metadata +pub async fn estimate_kv_cache_internal( + meta: HashMap, + ctx_size: Option, +) -> Result { + log::info!("Received ctx_size parameter: {:?}", ctx_size); + let arch = meta + .get("general.architecture") + .ok_or(KVCacheError::ArchitectureNotFound)?; + + // Number of layers + let n_layer_key = format!("{}.block_count", arch); + let n_layer = meta + .get(&n_layer_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0) + .ok_or(KVCacheError::BlockCountInvalid)?; + + // Attention heads (use kv heads if present, else full heads) + let n_head_key = format!("{}.attention.head_count", arch); + let n_head_kv_key = format!("{}.attention.head_count_kv", arch); + let n_head = meta + .get(&n_head_kv_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0) + .unwrap_or_else(|| { + meta.get(&n_head_key) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0) + }); + if n_head == 0 { + return Err(KVCacheError::HeadCountInvalid); + } + + // Key/value dimensions + let key_len_key = format!("{}.attention.key_length", arch); + let val_len_key = format!("{}.attention.value_length", arch); + + let key_len = meta + .get(&key_len_key) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + let val_len = meta + .get(&val_len_key) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + + if key_len == 0 || val_len == 0 { + 
return Err(KVCacheError::EmbeddingLengthInvalid); + } + + // Context length + let max_ctx_key = format!("{}.context_length", arch); + let max_ctx = meta + .get(&max_ctx_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0) + .ok_or(KVCacheError::ContextLengthInvalid)?; + let ctx_len = ctx_size.map(|size| size.min(max_ctx)).unwrap_or(max_ctx); + + // Sliding window if present + let sliding_key = format!("{}.attention.sliding_window", arch); + let sliding_window = meta + .get(&sliding_key) + .and_then(|s| s.parse::().ok()) + .filter(|&n| n > 0); + + // Assume fp16 + const BYTES_PER_ELEMENT: u64 = 2; + + // Per-token KV size + let kv_per_token = n_layer * n_head * (key_len + val_len) * BYTES_PER_ELEMENT; + + // Pure full-attention cost + let full_cost = ctx_len * kv_per_token; + + // Pure sliding-window cost (tiny, only keeps last W tokens) + let sliding_cost = sliding_window.map(|w| w * kv_per_token); + + // Middle estimate: average of sliding + full if sliding_window is present + let chosen_size = if let Some(slide) = sliding_cost { + let middle = (full_cost + slide) / 2; + log::info!( + "KV estimates -> sliding: {} bytes (~{:.2} MB), full: {} bytes (~{:.2} MB), middle: {} bytes (~{:.2} MB)", + slide, + slide as f64 / (1024.0 * 1024.0), + full_cost, + full_cost as f64 / (1024.0 * 1024.0), + middle, + middle as f64 / (1024.0 * 1024.0) + ); + middle + } else { + log::info!( + "KV estimate (no SWA detected) -> full: {} bytes (~{:.2} MB)", + full_cost, + full_cost as f64 / (1024.0 * 1024.0) + ); + full_cost + }; + + Ok(KVCacheEstimate { + size: chosen_size, + per_token_size: kv_per_token, + }) +} diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs index d35cb24cf..0cd09cb2b 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/lib.rs @@ -33,6 +33,10 @@ pub fn init() -> TauriPlugin { commands::get_session_by_model, // GGUF commands gguf::commands::read_gguf_metadata, + gguf::commands::estimate_kv_cache_size, + gguf::commands::get_model_size, + gguf::commands::is_model_supported, + gguf::model_planner::plan_model_load ]) .setup(|app, _api| { // Initialize and manage the plugin state diff --git a/src-tauri/src/core/filesystem/commands.rs b/src-tauri/src/core/filesystem/commands.rs index 7cf7803c6..6bb3f534a 100644 --- a/src-tauri/src/core/filesystem/commands.rs +++ b/src-tauri/src/core/filesystem/commands.rs @@ -193,7 +193,7 @@ pub fn decompress(app: tauri::AppHandle, path: &str, output_dir: &str) -> Result fs::File::open(&path_buf).map_err(|e| e.to_string())? 
} }; - + #[cfg(not(windows))] let file = fs::File::open(&path_buf).map_err(|e| e.to_string())?; if path.ends_with(".tar.gz") { @@ -222,7 +222,10 @@ pub fn decompress(app: tauri::AppHandle, path: &str, output_dir: &str) -> Result { use std::os::unix::fs::PermissionsExt; if let Some(mode) = entry.unix_mode() { - let _ = std::fs::set_permissions(&outpath, std::fs::Permissions::from_mode(mode)); + let _ = std::fs::set_permissions( + &outpath, + std::fs::Permissions::from_mode(mode), + ); } } } diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx index 39a587cbc..9a3bfd814 100644 --- a/web-app/src/containers/ModelSetting.tsx +++ b/web-app/src/containers/ModelSetting.tsx @@ -103,6 +103,13 @@ export function ModelSetting({ }) } + if (model.settings?.batch_size && result.batchSize !== undefined) { + settingsToUpdate.push({ + key: 'batch_size', + value: result.batchSize, + }) + } + // Apply all settings in a single update to avoid race conditions if (settingsToUpdate.length > 0) { handleMultipleSettingsChange(settingsToUpdate) @@ -163,7 +170,8 @@ export function ModelSetting({ key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || - key === 'offload_mmproj' + key === 'offload_mmproj' || + key === 'batch_size' ) if (requiresRestart) { @@ -222,7 +230,8 @@ export function ModelSetting({ key === 'ctx_len' || key === 'ngl' || key === 'chat_template' || - key === 'offload_mmproj' + key === 'offload_mmproj' || + key === 'batch_size' ) { // Check if model is running before stopping it serviceHub diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts index 4d476ae7c..bd3dbc49b 100644 --- a/web-app/src/hooks/useModelProvider.ts +++ b/web-app/src/hooks/useModelProvider.ts @@ -288,9 +288,40 @@ export const useModelProvider = create()( }) } + if (version <= 2 && state?.providers) { + state.providers.forEach((provider) => { + // Update cont_batching description for llamacpp provider + if (provider.provider === 'llamacpp' && provider.settings) { + const contBatchingSetting = provider.settings.find( + (s) => s.key === 'cont_batching' + ) + if (contBatchingSetting) { + contBatchingSetting.description = + 'Enable continuous batching (a.k.a dynamic batching) for concurrent requests.' 
+ } + } + + // Migrate model settings + if (provider.models && provider.provider === 'llamacpp') { + provider.models.forEach((model) => { + if (!model.settings) model.settings = {} + + if (!model.settings.batch_size) { + model.settings.batch_size = { + ...modelSettings.batch_size, + controller_props: { + ...modelSettings.batch_size.controller_props, + }, + } + } + }) + } + }) + } + return state }, - version: 2, + version: 3, } ) ) diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts index 32d05d70c..1b90ee732 100644 --- a/web-app/src/lib/predefined.ts +++ b/web-app/src/lib/predefined.ts @@ -153,4 +153,16 @@ export const modelSettings = { value: false, }, }, + batch_size: { + key: 'batch_size', + title: 'Batch Size', + description: 'Logical maximum batch size for processing prompts.', + controller_type: 'input', + controller_props: { + value: 2048, + placeholder: '2048', + type: 'number', + textAlign: 'right', + }, + }, } diff --git a/web-app/src/services/models/default.ts b/web-app/src/services/models/default.ts index 186706334..5d18e2985 100644 --- a/web-app/src/services/models/default.ts +++ b/web-app/src/services/models/default.ts @@ -533,19 +533,21 @@ export class DefaultModelsService implements ModelsService { // Fallback if method is not available console.warn('planModelLoad method not available in llamacpp engine') return { - gpuLayers: 0, + gpuLayers: 100, maxContextLength: 2048, - noOffloadKVCache: true, + noOffloadKVCache: false, offloadMmproj: false, + batchSize: 2048, mode: 'Unsupported', } } catch (error) { console.error(`Error planning model load for path ${modelPath}:`, error) return { - gpuLayers: 0, + gpuLayers: 100, maxContextLength: 2048, - noOffloadKVCache: true, + noOffloadKVCache: false, offloadMmproj: false, + batchSize: 2048, mode: 'Unsupported', } } diff --git a/web-app/src/services/models/types.ts b/web-app/src/services/models/types.ts index d92dae38a..6248e82ac 100644 --- a/web-app/src/services/models/types.ts +++ b/web-app/src/services/models/types.ts @@ -86,6 +86,7 @@ export interface ModelPlan { maxContextLength: number noOffloadKVCache: boolean offloadMmproj: boolean + batchSize: number mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported' }
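
Usage sketch (illustration only, not part of the patch): the helper below shows how the new guest-js bindings might be consumed end to end, combining isModelSupported with planModelLoadInternal and mapping the returned plan, including the new batchSize field, onto llama-server style arguments. Only the two imported functions, their signatures, the 'high' | 'medium' | 'low' memory modes, and the ModelPlan shape come from this diff; the buildLoadArgs helper name and the flag names (-ngl, -c, -b, --no-kv-offload) are assumptions.

import {
  isModelSupported,
  planModelLoadInternal,
} from '@janhq/tauri-plugin-llamacpp-api'

// Sketch: assumes modelPath is already an absolute path to a local GGUF file.
async function buildLoadArgs(
  modelPath: string,
  requestedCtx?: number
): Promise<string[]> {
  // Cheap RED/YELLOW/GREEN pre-check backed by the new is_model_supported command.
  const support = await isModelSupported(modelPath, requestedCtx)
  if (support === 'RED') throw new Error('Model does not fit in available memory')

  // Memory mode mirrors the extension setting ('high' | 'medium' | 'low').
  const plan = await planModelLoadInternal(
    modelPath,
    'high',
    undefined,
    requestedCtx
  )
  if (plan.mode === 'Unsupported') throw new Error('No viable load plan for this model')

  // Map the plan onto llama-server style arguments; the flag names here are
  // assumed for illustration and are not taken from this diff.
  const args = [
    '-ngl', String(plan.gpuLayers),
    '-c', String(plan.maxContextLength),
    '-b', String(plan.batchSize),
  ]
  if (plan.noOffloadKVCache) args.push('--no-kv-offload')
  return args
}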