feat: Add support for llamacpp MoE offloading setting (#6748)
* feat: Add support for llamacpp MoE offloading setting

  Introduces the n_cpu_moe configuration setting for the llamacpp provider. It lets users specify the number of Mixture of Experts (MoE) layers whose weights should be offloaded to the CPU via the --n-cpu-moe flag in llama.cpp. This is useful for running large MoE models with balanced resource usage, for example by keeping attention on the GPU and offloading the expert FFNs to the CPU.

  The changes include:
  - Updating the llamacpp-extension to accept and pass the --n-cpu-moe argument.
  - Adding the input field to the Model Settings UI (ModelSetting.tsx).
  - Adding model-setting migration logic and bumping the persisted store version.

* remove unused import

* feat: add cpu-moe boolean flag

* chore: remove unused migration cont_batching

* chore: fix migration: delete old key and add new one

* chore: fix migration

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
parent e5be683a97
commit 706dad2687
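For orientation before the diff: the new settings map directly onto llama.cpp's --cpu-moe and --n-cpu-moe server flags. A minimal TypeScript sketch of that mapping, distilled from the changes below (`buildMoeArgs` is an illustrative helper, not a function in the extension):

```ts
// Illustrative helper distilled from the diff below; not part of the extension itself.
function buildMoeArgs(cfg: { cpu_moe?: boolean; n_cpu_moe?: number }): string[] {
  const args: string[] = []
  // --cpu-moe: keep every MoE expert weight on the CPU.
  if (cfg.cpu_moe) args.push('--cpu-moe')
  // --n-cpu-moe N: keep only the expert weights of the first N layers on the CPU.
  if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
    args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
  }
  return args
}

console.log(buildMoeArgs({ n_cpu_moe: 24 })) // ['--n-cpu-moe', '24']
```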
@@ -39,7 +39,6 @@ import { getProxyConfig } from './util'
-import { basename } from '@tauri-apps/api/path'
 import {
   readGgufMetadata,
   estimateKVCacheSize,
   getModelSize,
   isModelSupported,
   planModelLoadInternal,
@@ -58,6 +57,8 @@ type LlamacppConfig = {
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
+  cpu_moe: boolean
+  n_cpu_moe: number
   override_tensor_buffer_t: string
   ctx_size: number
   threads: number
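The two new fields slot into the existing LlamacppConfig type. A hypothetical config fragment showing how they combine with GPU layer offloading (field values are examples only, not defaults from the app):

```ts
// Example values only; the field names come from the LlamacppConfig type above.
const cfg: Partial<LlamacppConfig> = {
  n_gpu_layers: 99, // put as many layers as possible on the GPU
  cpu_moe: false,   // leave per-layer control to n_cpu_moe
  n_cpu_moe: 24,    // expert FFNs of the first 24 layers stay on the CPU
}
```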
@@ -1583,6 +1584,10 @@ export default class llamacpp_extension extends AIEngine {
     ])
     args.push('--jinja')
     args.push('-m', modelPath)
+    if (cfg.cpu_moe) args.push('--cpu-moe')
+    if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
+      args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
+    }
     // For overriding tensor buffer type, useful where
     // massive MOE models can be made faster by keeping attention on the GPU
     // and offloading the expert FFNs to the CPU.
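Note the truthiness guard on n_cpu_moe: 0, undefined, and NaN (e.g. presumably from parsing an empty input) all skip the flag, so --n-cpu-moe is only emitted for a positive count. A quick standalone check of that behavior:

```ts
// Values that should and shouldn't emit the flag, mirroring the guard above.
for (const n of [undefined, 0, NaN, 24]) {
  const args: string[] = []
  if (n && n > 0) args.push('--n-cpu-moe', String(n))
  console.log(n, '->', args) // only 24 yields ['--n-cpu-moe', '24']
}
```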
@@ -171,7 +171,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     )
 
     if (requiresRestart) {
@@ -231,7 +233,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     ) {
       // Check if model is running before stopping it
       serviceHub
@@ -261,7 +265,9 @@
       <SheetContent className="h-[calc(100%-8px)] top-1 right-1 rounded-e-md overflow-y-auto">
         <SheetHeader>
           <SheetTitle>
-            {t('common:modelSettings.title', { modelId: getModelDisplayName(model) })}
+            {t('common:modelSettings.title', {
+              modelId: getModelDisplayName(model),
+            })}
           </SheetTitle>
           <SheetDescription>
             {t('common:modelSettings.description')}
@@ -364,7 +364,9 @@ export const useModelProvider = create<ModelProviderState>()(
           }
 
           if (provider.provider === 'cohere') {
-            if (provider.base_url === 'https://api.cohere.ai/compatibility/v1') {
+            if (
+              provider.base_url === 'https://api.cohere.ai/compatibility/v1'
+            ) {
               provider.base_url = 'https://api.cohere.ai/v1'
             }
@@ -389,13 +391,41 @@ export const useModelProvider = create<ModelProviderState>()(
               }
             }
           }
         })
       }
 
+      if (version <= 4 && state?.providers) {
+        state.providers.forEach((provider) => {
+          // Migrate model settings
+          if (provider.models && provider.provider === 'llamacpp') {
+            provider.models.forEach((model) => {
+              if (!model.settings) model.settings = {}
+
+              if (!model.settings.cpu_moe) {
+                model.settings.cpu_moe = {
+                  ...modelSettings.cpu_moe,
+                  controller_props: {
+                    ...modelSettings.cpu_moe.controller_props,
+                  },
+                }
+              }
+
+              if (!model.settings.n_cpu_moe) {
+                model.settings.n_cpu_moe = {
+                  ...modelSettings.n_cpu_moe,
+                  controller_props: {
+                    ...modelSettings.n_cpu_moe.controller_props,
+                  },
+                }
+              }
+            })
+          }
+        })
+      }
+
       return state
     },
-    version: 4,
+    version: 5,
   }
 )
)
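The migration above follows zustand's persist middleware versioning: the migrate callback receives the persisted state and its stored version, and runs when that version is older than the configured one. A self-contained sketch of the same pattern (store name, state shape, and the seeded default here are hypothetical, not the app's real types):

```ts
import { create } from 'zustand'
import { persist } from 'zustand/middleware'

// Hypothetical state shape; the real store's types live in the app.
type DemoState = { settings: Record<string, unknown> }

export const useDemoStore = create<DemoState>()(
  persist(
    () => ({ settings: {} }),
    {
      name: 'demo-store', // hypothetical storage key
      version: 5,
      migrate: (state: any, version: number) => {
        // Runs only when the persisted version is older than `version` above.
        if (version <= 4 && state?.settings && !state.settings.n_cpu_moe) {
          state.settings.n_cpu_moe = { value: '' } // seed the new default
        }
        return state
      },
    }
  )
)
```

Copying each default's controller_props in the diff (rather than assigning the shared modelSettings object directly) keeps per-model settings from aliasing the global defaults.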
@@ -133,6 +133,28 @@ export const modelSettings = {
       textAlign: 'right',
     },
   },
+  cpu_moe: {
+    key: 'cpu_moe',
+    title: 'Keep all Experts in CPU',
+    description:
+      'Keep all Mixture of Experts (MoE) weights in the CPU (if GPU is used).',
+    controller_type: 'checkbox',
+    controller_props: {
+      value: false,
+    },
+  },
+  n_cpu_moe: {
+    key: 'n_cpu_moe',
+    title: 'Number of MoE weights in the CPU',
+    description:
+      'Keep the Mixture of Experts (MoE) weights of the first N layers in the CPU (if GPU is used)',
+    controller_type: 'input',
+    controller_props: {
+      value: '',
+      placeholder: '24',
+      type: 'number',
+    },
+  },
   override_tensor_buffer_t: {
     key: 'override_tensor_buffer_t',
     title: 'Override Tensor Buffer Type',
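One caveat worth noting: the n_cpu_moe input controller stores its value as a string (value: '', type: 'number'), so the setting presumably needs a numeric coercion before it reaches the extension's `n_cpu_moe > 0` guard. An illustrative coercion, not taken from the diff:

```ts
// Illustrative coercion; the real conversion happens elsewhere in the app.
const toCount = (v: string | number): number =>
  typeof v === 'number' ? v : parseInt(v, 10) || 0

toCount('24') // 24
toCount('')   // 0, so the extension's guard skips --n-cpu-moe
```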