diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 8590891b6..f1a750138 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -39,7 +39,6 @@ import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
 import {
   readGgufMetadata,
-  estimateKVCacheSize,
   getModelSize,
   isModelSupported,
   planModelLoadInternal,
@@ -58,6 +57,8 @@ type LlamacppConfig = {
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
+  cpu_moe: boolean
+  n_cpu_moe: number
   override_tensor_buffer_t: string
   ctx_size: number
   threads: number
@@ -1583,6 +1584,10 @@ export default class llamacpp_extension extends AIEngine {
     ])
     args.push('--jinja')
     args.push('-m', modelPath)
+    if (cfg.cpu_moe) args.push('--cpu-moe')
+    if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
+      args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
+    }
     // For overriding tensor buffer type, useful where
     // massive MOE models can be made faster by keeping attention on the GPU
     // and offloading the expert FFNs to the CPU.
diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx
index 079b735aa..3f3391d51 100644
--- a/web-app/src/containers/ModelSetting.tsx
+++ b/web-app/src/containers/ModelSetting.tsx
@@ -171,7 +171,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     )
 
     if (requiresRestart) {
@@ -231,7 +233,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     ) {
       // Check if model is running before stopping it
       serviceHub
@@ -261,7 +265,9 @@ export function ModelSetting({
       <DialogContent>
         <DialogHeader>
           <DialogTitle>
-            {t('common:modelSettings.title', { modelId: getModelDisplayName(model) })}
+            {t('common:modelSettings.title', {
+              modelId: getModelDisplayName(model),
+            })}
           </DialogTitle>
           <DialogDescription>
             {t('common:modelSettings.description')}
diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts
index 926cbdd0d..99380a6a7 100644
--- a/web-app/src/hooks/useModelProvider.ts
+++ b/web-app/src/hooks/useModelProvider.ts
@@ -364,7 +364,9 @@
             }
 
             if (provider.provider === 'cohere') {
-              if (provider.base_url === 'https://api.cohere.ai/compatibility/v1') {
+              if (
+                provider.base_url === 'https://api.cohere.ai/compatibility/v1'
+              ) {
                 provider.base_url = 'https://api.cohere.ai/v1'
               }
 
@@ -389,13 +391,41 @@
 
                 }
               }
             }
+          })
+        }
+        if (version <= 4 && state?.providers) {
+          state.providers.forEach((provider) => {
+            // Migrate model settings
+            if (provider.models && provider.provider === 'llamacpp') {
+              provider.models.forEach((model) => {
+                if (!model.settings) model.settings = {}
+
+                if (!model.settings.cpu_moe) {
+                  model.settings.cpu_moe = {
+                    ...modelSettings.cpu_moe,
+                    controller_props: {
+                      ...modelSettings.cpu_moe.controller_props,
+                    },
+                  }
+                }
+
+                if (!model.settings.n_cpu_moe) {
+                  model.settings.n_cpu_moe = {
+                    ...modelSettings.n_cpu_moe,
+                    controller_props: {
+                      ...modelSettings.n_cpu_moe.controller_props,
+                    },
+                  }
+                }
+              })
+            }
           })
         }
 
         return state
       },
-      version: 4,
+      version: 5,
     }
   )
 )
diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts
index 1b90ee732..bd1bf3d6b 100644
--- a/web-app/src/lib/predefined.ts
+++ b/web-app/src/lib/predefined.ts
@@ -133,6 +133,28 @@ export const modelSettings = {
       textAlign: 'right',
     },
   },
+  cpu_moe: {
+    key: 'cpu_moe',
+    title: 'Keep all Experts in CPU',
+    description:
+      'Keep all Mixture of Experts (MoE) weights in the CPU (if GPU is used).',
+    controller_type: 'checkbox',
+    controller_props: {
+      value: false,
+    },
+  },
+  n_cpu_moe: {
+    key: 'n_cpu_moe',
+    title: 'Number of MoE weights in the CPU',
+    description:
+      'Keep the Mixture of Experts (MoE) weights of the first N layers in the CPU (if GPU is used)',
+    controller_type: 'input',
+    controller_props: {
+      value: '',
+      placeholder: '24',
+      type: 'number',
+    },
+  },
   override_tensor_buffer_t: {
     key: 'override_tensor_buffer_t',
     title: 'Override Tensor Buffer Type',