feat: Add support for llamacpp MoE offloading setting (#6748)

* feat: Add support for llamacpp MoE offloading setting

Introduces the n_cpu_moe configuration setting for the llamacpp provider. This allows users to specify the number of Mixture of Experts (MoE) layers whose weights should be offloaded to the CPU via the --n-cpu-moe flag in llama.cpp.

This is useful for running large MoE models when GPU memory is constrained, for example by keeping attention on the GPU and offloading the expert FFNs to the CPU.

The changes include:

 - Updating the llamacpp-extension to accept and pass the --n-cpu-moe argument (see the sketch after this list).

 - Adding the input field to the Model Settings UI (ModelSetting.tsx).

 - Adding model-settings migration logic and bumping the persisted store version from 4 to 5.
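
For illustration, a minimal sketch of how the two settings map onto llama.cpp flags (the cfg shape mirrors the LlamacppConfig fields added below; the model path and -ngl value are placeholders, not part of this commit):

    // Sketch only; mirrors the argument-building logic added in this commit.
    const cfg = { cpu_moe: false, n_cpu_moe: 24 }
    const args: string[] = ['-m', '/models/moe-model.gguf', '-ngl', '99'] // placeholders
    if (cfg.cpu_moe) args.push('--cpu-moe') // keep all MoE expert weights on the CPU
    if (cfg.n_cpu_moe > 0) args.push('--n-cpu-moe', String(cfg.n_cpu_moe)) // first N layers
    // Resulting flags: -m /models/moe-model.gguf -ngl 99 --n-cpu-moe 24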

* remove unused import

* feat: add cpu-moe boolean flag

* chore: remove unused migration cont_batching

* chore: fix migration (delete old key and add new one)

* chore: fix migration

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
Akarshan Biswas 2025-10-07 19:37:58 +05:30 committed by GitHub
parent e5be683a97
commit 706dad2687
4 changed files with 69 additions and 6 deletions

@@ -39,7 +39,6 @@ import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
 import {
   readGgufMetadata,
-  estimateKVCacheSize,
   getModelSize,
   isModelSupported,
   planModelLoadInternal,
@@ -58,6 +57,8 @@ type LlamacppConfig = {
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
+  cpu_moe: boolean
+  n_cpu_moe: number
   override_tensor_buffer_t: string
   ctx_size: number
   threads: number
@@ -1583,6 +1584,10 @@ export default class llamacpp_extension extends AIEngine {
     ])
     args.push('--jinja')
     args.push('-m', modelPath)
+    if (cfg.cpu_moe) args.push('--cpu-moe')
+    if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
+      args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
+    }
     // For overriding tensor buffer type, useful where
     // massive MOE models can be made faster by keeping attention on the GPU
     // and offloading the expert FFNs to the CPU.
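
The committed change relies on llama.cpp to do the actual placement. As a hedged aside, --cpu-moe is understood here to behave roughly like a buffer-type override that pins expert FFN tensors to the CPU; a manual equivalent via llama.cpp's --override-tensor flag might look like this (the regex pattern is illustrative, not taken from this diff):

    // Assumption: --cpu-moe roughly corresponds to pinning expert FFN tensors
    // to CPU buffers via --override-tensor; the pattern below is illustrative only.
    const args: string[] = []
    args.push('--override-tensor', '\\.ffn_(up|down|gate)_exps=CPU')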

@@ -171,7 +171,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     )
     if (requiresRestart) {
@@ -231,7 +233,9 @@
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     ) {
       // Check if model is running before stopping it
       serviceHub
@@ -261,7 +265,9 @@
       <SheetContent className="h-[calc(100%-8px)] top-1 right-1 rounded-e-md overflow-y-auto">
         <SheetHeader>
           <SheetTitle>
-            {t('common:modelSettings.title', { modelId: getModelDisplayName(model) })}
+            {t('common:modelSettings.title', {
+              modelId: getModelDisplayName(model),
+            })}
           </SheetTitle>
           <SheetDescription>
             {t('common:modelSettings.description')}

@@ -364,7 +364,9 @@ export const useModelProvider = create<ModelProviderState>()(
             }
             if (provider.provider === 'cohere') {
-              if (provider.base_url === 'https://api.cohere.ai/compatibility/v1') {
+              if (
+                provider.base_url === 'https://api.cohere.ai/compatibility/v1'
+              ) {
                 provider.base_url = 'https://api.cohere.ai/v1'
               }
@@ -389,13 +391,41 @@
                 }
               }
             }
           })
         }
+        if (version <= 4 && state?.providers) {
+          state.providers.forEach((provider) => {
+            // Migrate model settings
+            if (provider.models && provider.provider === 'llamacpp') {
+              provider.models.forEach((model) => {
+                if (!model.settings) model.settings = {}
+                if (!model.settings.cpu_moe) {
+                  model.settings.cpu_moe = {
+                    ...modelSettings.cpu_moe,
+                    controller_props: {
+                      ...modelSettings.cpu_moe.controller_props,
+                    },
+                  }
+                }
+                if (!model.settings.n_cpu_moe) {
+                  model.settings.n_cpu_moe = {
+                    ...modelSettings.n_cpu_moe,
+                    controller_props: {
+                      ...modelSettings.n_cpu_moe.controller_props,
+                    },
+                  }
+                }
+              })
+            }
+          })
+        }
         return state
       },
-      version: 4,
+      version: 5,
     }
   )
 )
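
For context, a minimal sketch of how zustand's persist middleware (which this store uses) drives the migration above: when the persisted version is lower than the declared one, migrate() runs once on rehydration. The store name and state shape below are placeholders, not the app's actual ones:

    import { create } from 'zustand'
    import { persist } from 'zustand/middleware'

    type State = { providers: { provider: string; models?: unknown[] }[] }

    const useStore = create<State>()(
      persist(() => ({ providers: [] }), {
        name: 'model-provider-store', // placeholder storage key
        version: 5, // bumped from 4 by this commit
        migrate: (persisted, fromVersion) => {
          const state = persisted as State
          if (fromVersion <= 4) {
            // backfill cpu_moe / n_cpu_moe defaults on llamacpp models here
          }
          return state
        },
      })
    )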

@@ -133,6 +133,28 @@ export const modelSettings = {
       textAlign: 'right',
     },
   },
+  cpu_moe: {
+    key: 'cpu_moe',
+    title: 'Keep all Experts in CPU',
+    description:
+      'Keep all Mixture of Experts (MoE) weights in the CPU (if GPU is used).',
+    controller_type: 'checkbox',
+    controller_props: {
+      value: false,
+    },
+  },
+  n_cpu_moe: {
+    key: 'n_cpu_moe',
+    title: 'Number of MoE weights in the CPU',
+    description:
+      'Keep the Mixture of Experts (MoE) weights of the first N layers in the CPU (if GPU is used)',
+    controller_type: 'input',
+    controller_props: {
+      value: '',
+      placeholder: '24',
+      type: 'number',
+    },
+  },
   override_tensor_buffer_t: {
     key: 'override_tensor_buffer_t',
     title: 'Override Tensor Buffer Type',
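
For reference, the descriptor shape these entries follow, inferred from the visible fields (an assumption, not the project's actual type definition):

    // Inferred from this diff; not the project's actual type definition.
    type ControllerProps = {
      value: boolean | string | number
      placeholder?: string
      type?: string // e.g. 'number' for numeric inputs
      textAlign?: string
    }

    type SettingDescriptor = {
      key: string
      title: string
      description: string
      controller_type: 'checkbox' | 'input'
      controller_props: ControllerProps
    }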