From 706dad268794018fd0600849c0abbbe4a182a7ab Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Tue, 7 Oct 2025 19:37:58 +0530
Subject: [PATCH] feat: Add support for llamacpp MoE offloading setting (#6748)

* feat: Add support for llamacpp MoE offloading setting

Introduces the n_cpu_moe configuration setting for the llamacpp provider.
This allows users to specify the number of Mixture of Experts (MoE) layers
whose weights should be offloaded to the CPU via the --n-cpu-moe flag in
llama.cpp. This is useful for running large MoE models by balancing
resource usage, for example, by keeping attention on the GPU and
offloading expert FFNs to the CPU.

The changes include:
- Updating the llamacpp-extension to accept and pass the --n-cpu-moe argument.
- Adding the input field to the Model Settings UI (ModelSetting.tsx).
- Including model setting migration logic and bumping the store version to 4.

* remove unused import

* feat: add cpu-moe boolean flag

* chore: remove unused migration cont_batching

* chore: fix migration delete old key and add new one

* chore: fix migration

---------

Co-authored-by: Faisal Amir
---
 extensions/llamacpp-extension/src/index.ts |  7 ++++-
 web-app/src/containers/ModelSetting.tsx    | 12 ++++++--
 web-app/src/hooks/useModelProvider.ts      | 34 ++++++++++++++++++++--
 web-app/src/lib/predefined.ts              | 22 ++++++++++++++
 4 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 8590891b6..f1a750138 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -39,7 +39,6 @@ import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
 import {
   readGgufMetadata,
-  estimateKVCacheSize,
   getModelSize,
   isModelSupported,
   planModelLoadInternal,
@@ -58,6 +57,8 @@ type LlamacppConfig = {
   chat_template: string
   n_gpu_layers: number
   offload_mmproj: boolean
+  cpu_moe: boolean
+  n_cpu_moe: number
   override_tensor_buffer_t: string
   ctx_size: number
   threads: number
@@ -1583,6 +1584,10 @@ export default class llamacpp_extension extends AIEngine {
     ])
     args.push('--jinja')
     args.push('-m', modelPath)
+    if (cfg.cpu_moe) args.push('--cpu-moe')
+    if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
+      args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
+    }
     // For overriding tensor buffer type, useful where
     // massive MOE models can be made faster by keeping attention on the GPU
     // and offloading the expert FFNs to the CPU.
diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx
index 079b735aa..3f3391d51 100644
--- a/web-app/src/containers/ModelSetting.tsx
+++ b/web-app/src/containers/ModelSetting.tsx
@@ -171,7 +171,9 @@ export function ModelSetting({
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     )

     if (requiresRestart) {
@@ -231,7 +233,9 @@
       key === 'ngl' ||
       key === 'chat_template' ||
       key === 'offload_mmproj' ||
-      key === 'batch_size'
+      key === 'batch_size' ||
+      key === 'cpu_moe' ||
+      key === 'n_cpu_moe'
     ) {
       // Check if model is running before stopping it
       serviceHub
@@ -261,7 +265,9 @@
-            {t('common:modelSettings.title', { modelId: getModelDisplayName(model) })}
+            {t('common:modelSettings.title', {
+              modelId: getModelDisplayName(model),
+            })}
             {t('common:modelSettings.description')}
diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts
index 926cbdd0d..99380a6a7 100644
--- a/web-app/src/hooks/useModelProvider.ts
+++ b/web-app/src/hooks/useModelProvider.ts
@@ -364,7 +364,9 @@ export const useModelProvider = create()(
             }

             if (provider.provider === 'cohere') {
-              if (provider.base_url === 'https://api.cohere.ai/compatibility/v1') {
+              if (
+                provider.base_url === 'https://api.cohere.ai/compatibility/v1'
+              ) {
                 provider.base_url = 'https://api.cohere.ai/v1'
               }

@@ -389,13 +391,41 @@
                 }
               }
             }
+          })
+        }
+        if (version <= 4 && state?.providers) {
+          state.providers.forEach((provider) => {
+            // Migrate model settings
+            if (provider.models && provider.provider === 'llamacpp') {
+              provider.models.forEach((model) => {
+                if (!model.settings) model.settings = {}
+
+                if (!model.settings.cpu_moe) {
+                  model.settings.cpu_moe = {
+                    ...modelSettings.cpu_moe,
+                    controller_props: {
+                      ...modelSettings.cpu_moe.controller_props,
+                    },
+                  }
+                }
+
+                if (!model.settings.n_cpu_moe) {
+                  model.settings.n_cpu_moe = {
+                    ...modelSettings.n_cpu_moe,
+                    controller_props: {
+                      ...modelSettings.n_cpu_moe.controller_props,
+                    },
+                  }
+                }
+              })
+            }
           })
         }

         return state
       },
-      version: 4,
+      version: 5,
     }
   )
 )
diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts
index 1b90ee732..bd1bf3d6b 100644
--- a/web-app/src/lib/predefined.ts
+++ b/web-app/src/lib/predefined.ts
@@ -133,6 +133,28 @@ export const modelSettings = {
       textAlign: 'right',
     },
   },
+  cpu_moe: {
+    key: 'cpu_moe',
+    title: 'Keep all Experts in CPU',
+    description:
+      'Keep all Mixture of Experts (MoE) weights in the CPU (if GPU is used).',
+    controller_type: 'checkbox',
+    controller_props: {
+      value: false,
+    },
+  },
+  n_cpu_moe: {
+    key: 'n_cpu_moe',
+    title: 'Number of MoE weights in the CPU',
+    description:
+      'Keep the Mixture of Experts (MoE) weights of the first N layers in the CPU (if GPU is used)',
+    controller_type: 'input',
+    controller_props: {
+      value: '',
+      placeholder: '24',
+      type: 'number',
+    },
+  },
   override_tensor_buffer_t: {
     key: 'override_tensor_buffer_t',
     title: 'Override Tensor Buffer Type',