From 1f1605bdf98cf88433647ac04827c0afedaf526b Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Thu, 7 Aug 2025 10:31:34 +0530 Subject: [PATCH] feat: Add support for overriding tensor buffer type (#6062) * feat: Add support for overriding tensor buffer type This commit introduces a new configuration option, `override_tensor_buffer_t`, which allows users to specify a regex for matching tensor names to override their buffer type. This is an advanced setting primarily useful for optimizing the performance of large models, particularly Mixture of Experts (MoE) models. By overriding the tensor buffer type, users can keep critical parts of the model, like the attention layers, on the GPU while offloading other parts, such as the expert feed-forward networks, to the CPU. This can lead to significant speed improvements for massive models. Additionally, this change refines the error message to be more specific when a model fails to load. The previous message "Failed to load llama-server" has been updated to "Failed to load model" to be more accurate. 
* chore: update FE to support override-tensor --------- Co-authored-by: Faisal Amir --- extensions/llamacpp-extension/src/index.ts | 13 +++++++++-- web-app/src/containers/ModelSetting.tsx | 6 +++-- web-app/src/hooks/useModelProvider.ts | 27 +++++++++++++++++++++- web-app/src/lib/predefined.ts | 11 +++++++++ 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index 140b08418..91fe4dd34 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -39,6 +39,7 @@ type LlamacppConfig = { auto_unload: boolean chat_template: string n_gpu_layers: number + override_tensor_buffer_t: string ctx_size: number threads: number threads_batch: number @@ -1262,6 +1263,14 @@ export default class llamacpp_extension extends AIEngine { args.push('--jinja') args.push('--reasoning-format', 'none') args.push('-m', modelPath) + // For overriding tensor buffer type, useful where + // massive MOE models can be made faster by keeping attention on the GPU + // and offloading the expert FFNs to the CPU. + // This is an expert-level setting and should only be used by people + // who know what they are doing. 
+ // Takes a regex with matching tensor name as input + if (cfg.override_tensor_buffer_t) + args.push('--override-tensor', cfg.override_tensor_buffer_t) args.push('-a', modelId) args.push('--port', String(port)) if (modelConfig.mmproj_path) { @@ -1340,8 +1349,8 @@ export default class llamacpp_extension extends AIEngine { return sInfo } catch (error) { - logger.error('Error loading llama-server:\n', error) - throw new Error(`Failed to load llama-server: ${error}`) + logger.error('Error in load command:\n', error) + throw new Error(`Failed to load model:\n${error}`) } } diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx index 9d8406801..29d996382 100644 --- a/web-app/src/containers/ModelSetting.tsx +++ b/web-app/src/containers/ModelSetting.tsx @@ -106,8 +106,10 @@ export function ModelSetting({
diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts index e6c1ae74c..b1a988183 100644 --- a/web-app/src/hooks/useModelProvider.ts +++ b/web-app/src/hooks/useModelProvider.ts @@ -276,9 +276,34 @@ export const useModelProvider = create()( }) } + // Migration for override_tensor_buffer_t key (version 2 -> 3) + if (version === 2 && state?.providers) { + state.providers.forEach((provider) => { + if (provider.models) { + provider.models.forEach((model) => { + // Initialize settings if it doesn't exist + if (!model.settings) { + model.settings = {} + } + + // Add missing override_tensor_buffer_t setting if it doesn't exist + if (!model.settings.override_tensor_buffer_t) { + model.settings.override_tensor_buffer_t = { + ...modelSettings.override_tensor_buffer_t, + controller_props: { + ...modelSettings.override_tensor_buffer_t + .controller_props, + }, + } + } + }) + } + }) + } + return state }, - version: 2, + version: 3, } ) ) diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts index 1d73fbacc..b4d5164e7 100644 --- a/web-app/src/lib/predefined.ts +++ b/web-app/src/lib/predefined.ts @@ -133,4 +133,15 @@ export const modelSettings = { textAlign: 'right', }, }, + override_tensor_buffer_t: { + key: 'override_tensor_buffer_t', + title: 'Override Tensor Buffer Type', + description: 'Override the tensor buffer type for the model', + controller_type: 'input', + controller_props: { + value: '', + placeholder: 'e.g., layers\\.\\d+\\.ffn_.*=CPU', + type: 'text', + }, + }, }