feat: Add support for overriding tensor buffer type (#6062)

* feat: Add support for overriding tensor buffer type

This commit introduces a new configuration option, `override_tensor_buffer_t`, which allows users to specify a regex for matching tensor names to override their buffer type. This is an advanced setting primarily useful for optimizing the performance of large models, particularly Mixture of Experts (MoE) models.

By overriding the tensor buffer type, users can keep critical parts of the model, like the attention layers, on the GPU while offloading other parts, such as the expert feed-forward networks, to the CPU. This can lead to significant speed improvements for massive models.

Additionally, this change refines the error message to be more specific when a model fails to load. The previous message "Failed to load llama-server" has been updated to "Failed to load model" to be more accurate.

* chore: update FE to support override-tensor

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
This commit is contained in:
Akarshan Biswas 2025-08-07 10:31:34 +05:30 committed by GitHub
parent c1cdc434a8
commit 1f1605bdf9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 52 additions and 5 deletions

View File

@ -39,6 +39,7 @@ type LlamacppConfig = {
auto_unload: boolean auto_unload: boolean
chat_template: string chat_template: string
n_gpu_layers: number n_gpu_layers: number
override_tensor_buffer_t: string
ctx_size: number ctx_size: number
threads: number threads: number
threads_batch: number threads_batch: number
@ -1262,6 +1263,14 @@ export default class llamacpp_extension extends AIEngine {
args.push('--jinja') args.push('--jinja')
args.push('--reasoning-format', 'none') args.push('--reasoning-format', 'none')
args.push('-m', modelPath) args.push('-m', modelPath)
// For overriding the tensor buffer type, useful where
// massive MoE models can be made faster by keeping attention on the GPU
// and offloading the expert FFNs to the CPU.
// This is an expert-level setting and should only be used by people
// who know what they are doing.
// Takes a regex matching tensor names as input
if (cfg.override_tensor_buffer_t)
args.push('--override-tensor', cfg.override_tensor_buffer_t)
args.push('-a', modelId) args.push('-a', modelId)
args.push('--port', String(port)) args.push('--port', String(port))
if (modelConfig.mmproj_path) { if (modelConfig.mmproj_path) {
@ -1340,8 +1349,8 @@ export default class llamacpp_extension extends AIEngine {
return sInfo return sInfo
} catch (error) { } catch (error) {
logger.error('Error loading llama-server:\n', error) logger.error('Error in load command:\n', error)
throw new Error(`Failed to load llama-server: ${error}`) throw new Error(`Failed to load model:\n${error}`)
} }
} }

View File

@ -106,8 +106,10 @@ export function ModelSetting({
<div key={key} className="space-y-2"> <div key={key} className="space-y-2">
<div <div
className={cn( className={cn(
'flex items-start justify-between gap-8', 'flex items-start justify-between gap-8 last:mb-2',
key === 'chat_template' && 'flex-col gap-1' (key === 'chat_template' ||
key === 'override_tensor_buffer_t') &&
'flex-col gap-1 w-full'
)} )}
> >
<div className="space-y-1 mb-2"> <div className="space-y-1 mb-2">

View File

@ -276,9 +276,34 @@ export const useModelProvider = create<ModelProviderState>()(
}) })
} }
// Migration for override_tensor_buffer_type key (version 2 -> 3)
if (version === 2 && state?.providers) {
state.providers.forEach((provider) => {
if (provider.models) {
provider.models.forEach((model) => {
// Initialize settings if it doesn't exist
if (!model.settings) {
model.settings = {}
}
// Add missing override_tensor_buffer_type setting if it doesn't exist
if (!model.settings.override_tensor_buffer_t) {
model.settings.override_tensor_buffer_t = {
...modelSettings.override_tensor_buffer_t,
controller_props: {
...modelSettings.override_tensor_buffer_t
.controller_props,
},
}
}
})
}
})
}
return state return state
}, },
version: 2, version: 3,
} }
) )
) )

View File

@ -133,4 +133,15 @@ export const modelSettings = {
textAlign: 'right', textAlign: 'right',
}, },
}, },
override_tensor_buffer_t: {
key: 'override_tensor_buffer_t',
title: 'Override Tensor Buffer Type',
description: 'Override the tensor buffer type for the model',
controller_type: 'input',
controller_props: {
value: '',
placeholder: 'e.g., layers\\.\\d+\\.ffn_.*=CPU',
type: 'text',
},
},
} }