feat: Add support for llamacpp MoE offloading setting (#6748)
* feat: Add support for llamacpp MoE offloading setting

  Introduces the n_cpu_moe configuration setting for the llamacpp provider. It lets users specify the number of Mixture of Experts (MoE) layers whose weights should be offloaded to the CPU via the --n-cpu-moe flag in llama.cpp. This is useful for running large MoE models with balanced resource usage, for example by keeping attention on the GPU and offloading the expert FFNs to the CPU.

  The changes include:
  - Updating the llamacpp-extension to accept and pass the --n-cpu-moe argument.
  - Adding the input field to the Model Settings UI (ModelSetting.tsx).
  - Adding model-setting migration logic and bumping the persisted store version.

* remove unused import

* feat: add cpu-moe boolean flag

* chore: remove unused migration cont_batching

* chore: fix migration: delete old key and add new one

* chore: fix migration

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
parent e5be683a97
commit 706dad2687
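For orientation before the diff: the new settings map directly onto llama.cpp's --cpu-moe and --n-cpu-moe server flags. A minimal TypeScript sketch of that mapping, distilled from the changes below (`buildMoeArgs` is an illustrative helper, not a function in the extension):

```ts
// Illustrative helper distilled from the diff below; not part of the extension itself.
function buildMoeArgs(cfg: { cpu_moe?: boolean; n_cpu_moe?: number }): string[] {
  const args: string[] = []
  // --cpu-moe: keep every MoE expert weight on the CPU.
  if (cfg.cpu_moe) args.push('--cpu-moe')
  // --n-cpu-moe N: keep only the expert weights of the first N layers on the CPU.
  if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
    args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
  }
  return args
}

console.log(buildMoeArgs({ n_cpu_moe: 24 })) // ['--n-cpu-moe', '24']
```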
@@ -39,7 +39,6 @@ import { getProxyConfig } from './util'
-import { basename } from '@tauri-apps/api/path'
 import {
   readGgufMetadata,
   estimateKVCacheSize,
   getModelSize,
   isModelSupported,
   planModelLoadInternal,
@@ -58,6 +57,8 @@ type LlamacppConfig = {
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
+  cpu_moe: boolean
+  n_cpu_moe: number
   override_tensor_buffer_t: string
   ctx_size: number
   threads: number
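The two new fields slot into the existing LlamacppConfig type. A hypothetical config fragment showing how they combine with GPU layer offloading (field values are examples only, not defaults from the app):

```ts
// Example values only; the field names come from the LlamacppConfig type above.
const cfg: Partial<LlamacppConfig> = {
  n_gpu_layers: 99, // put as many layers as possible on the GPU
  cpu_moe: false,   // leave per-layer control to n_cpu_moe
  n_cpu_moe: 24,    // expert FFNs of the first 24 layers stay on the CPU
}
```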
@@ -1583,6 +1584,10 @@ export default class llamacpp_extension extends AIEngine {
     ])
     args.push('--jinja')
     args.push('-m', modelPath)
+    if (cfg.cpu_moe) args.push('--cpu-moe')
+    if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
+      args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
+    }
     // For overriding tensor buffer type, useful where
     // massive MOE models can be made faster by keeping attention on the GPU
     // and offloading the expert FFNs to the CPU.
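Note the truthiness guard on n_cpu_moe: 0, undefined, and NaN (e.g. presumably from parsing an empty input) all skip the flag, so --n-cpu-moe is only emitted for a positive count. A quick standalone check of that behavior:

```ts
// Values that should and shouldn't emit the flag, mirroring the guard above.
for (const n of [undefined, 0, NaN, 24]) {
  const args: string[] = []
  if (n && n > 0) args.push('--n-cpu-moe', String(n))
  console.log(n, '->', args) // only 24 yields ['--n-cpu-moe', '24']
}
```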
@@ -171,7 +171,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     )
 
     if (requiresRestart) {
@@ -231,7 +233,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     ) {
       // Check if model is running before stopping it
       serviceHub
@@ -261,7 +265,9 @@
       <SheetContent className="h-[calc(100%-8px)] top-1 right-1 rounded-e-md overflow-y-auto">
         <SheetHeader>
           <SheetTitle>
-            {t('common:modelSettings.title', { modelId: getModelDisplayName(model) })}
+            {t('common:modelSettings.title', {
+              modelId: getModelDisplayName(model),
+            })}
           </SheetTitle>
           <SheetDescription>
             {t('common:modelSettings.description')}
@@ -364,7 +364,9 @@ export const useModelProvider = create<ModelProviderState>()(
           }
 
           if (provider.provider === 'cohere') {
-            if (provider.base_url === 'https://api.cohere.ai/compatibility/v1') {
+            if (
+              provider.base_url === 'https://api.cohere.ai/compatibility/v1'
+            ) {
               provider.base_url = 'https://api.cohere.ai/v1'
             }
@@ -389,13 +391,41 @@ export const useModelProvider = create<ModelProviderState>()(
               }
             }
           }
         })
       }
 
+      if (version <= 4 && state?.providers) {
+        state.providers.forEach((provider) => {
+          // Migrate model settings
+          if (provider.models && provider.provider === 'llamacpp') {
+            provider.models.forEach((model) => {
+              if (!model.settings) model.settings = {}
+
+              if (!model.settings.cpu_moe) {
+                model.settings.cpu_moe = {
+                  ...modelSettings.cpu_moe,
+                  controller_props: {
+                    ...modelSettings.cpu_moe.controller_props,
+                  },
+                }
+              }
+
+              if (!model.settings.n_cpu_moe) {
+                model.settings.n_cpu_moe = {
+                  ...modelSettings.n_cpu_moe,
+                  controller_props: {
+                    ...modelSettings.n_cpu_moe.controller_props,
+                  },
+                }
+              }
+            })
+          }
+        })
+      }
+
       return state
     },
-    version: 4,
+    version: 5,
   }
 )
)
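The migration above follows zustand's persist middleware versioning: the migrate callback receives the persisted state and its stored version, and runs when that version is older than the configured one. A self-contained sketch of the same pattern (store name, state shape, and the seeded default here are hypothetical, not the app's real types):

```ts
import { create } from 'zustand'
import { persist } from 'zustand/middleware'

// Hypothetical state shape; the real store's types live in the app.
type DemoState = { settings: Record<string, unknown> }

export const useDemoStore = create<DemoState>()(
  persist(
    () => ({ settings: {} }),
    {
      name: 'demo-store', // hypothetical storage key
      version: 5,
      migrate: (state: any, version: number) => {
        // Runs only when the persisted version is older than `version` above.
        if (version <= 4 && state?.settings && !state.settings.n_cpu_moe) {
          state.settings.n_cpu_moe = { value: '' } // seed the new default
        }
        return state
      },
    }
  )
)
```

Copying each default's controller_props in the diff (rather than assigning the shared modelSettings object directly) keeps per-model settings from aliasing the global defaults.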
@@ -133,6 +133,28 @@ export const modelSettings = {
       textAlign: 'right',
     },
   },
+  cpu_moe: {
+    key: 'cpu_moe',
+    title: 'Keep all Experts in CPU',
+    description:
+      'Keep all Mixture of Experts (MoE) weights in the CPU (if GPU is used).',
+    controller_type: 'checkbox',
+    controller_props: {
+      value: false,
+    },
+  },
+  n_cpu_moe: {
+    key: 'n_cpu_moe',
+    title: 'Number of MoE weights in the CPU',
+    description:
+      'Keep the Mixture of Experts (MoE) weights of the first N layers in the CPU (if GPU is used)',
+    controller_type: 'input',
+    controller_props: {
+      value: '',
+      placeholder: '24',
+      type: 'number',
+    },
+  },
   override_tensor_buffer_t: {
     key: 'override_tensor_buffer_t',
     title: 'Override Tensor Buffer Type',
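One caveat worth noting: the n_cpu_moe input controller stores its value as a string (value: '', type: 'number'), so the setting presumably needs a numeric coercion before it reaches the extension's `n_cpu_moe > 0` guard. An illustrative coercion, not taken from the diff:

```ts
// Illustrative coercion; the real conversion happens elsewhere in the app.
const toCount = (v: string | number): number =>
  typeof v === 'number' ? v : parseInt(v, 10) || 0

toCount('24') // 24
toCount('')   // 0, so the extension's guard skips --n-cpu-moe
```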