diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 140b08418..91fe4dd34 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -39,6 +39,7 @@ type LlamacppConfig = {
   auto_unload: boolean
   chat_template: string
   n_gpu_layers: number
+  override_tensor_buffer_t: string
   ctx_size: number
   threads: number
   threads_batch: number
@@ -1262,6 +1263,14 @@ export default class llamacpp_extension extends AIEngine {
     args.push('--jinja')
     args.push('--reasoning-format', 'none')
     args.push('-m', modelPath)
+    // Override the tensor buffer type: useful for massive MoE models, which
+    // can be made faster by keeping attention on the GPU and offloading the
+    // expert FFNs to the CPU.
+    // This is an expert-level setting and should only be used by people
+    // who know what they are doing.
+    // Takes a regex matching tensor names as input.
+    if (cfg.override_tensor_buffer_t)
+      args.push('--override-tensor', cfg.override_tensor_buffer_t)
     args.push('-a', modelId)
     args.push('--port', String(port))
     if (modelConfig.mmproj_path) {
@@ -1340,8 +1349,8 @@ export default class llamacpp_extension extends AIEngine {

       return sInfo
     } catch (error) {
-      logger.error('Error loading llama-server:\n', error)
-      throw new Error(`Failed to load llama-server: ${error}`)
+      logger.error('Error in load command:\n', error)
+      throw new Error(`Failed to load model:\n${error}`)
     }
   }

diff --git a/web-app/src/containers/ModelSetting.tsx b/web-app/src/containers/ModelSetting.tsx
index 9d8406801..29d996382 100644
--- a/web-app/src/containers/ModelSetting.tsx
+++ b/web-app/src/containers/ModelSetting.tsx
@@ -106,8 +106,10 @@ export function ModelSetting({
diff --git a/web-app/src/hooks/useModelProvider.ts b/web-app/src/hooks/useModelProvider.ts
index e6c1ae74c..b1a988183 100644
--- a/web-app/src/hooks/useModelProvider.ts
+++ b/web-app/src/hooks/useModelProvider.ts
@@ -276,9 +276,34 @@ export const useModelProvider = create()(
           })
         }

+        // Migration for the override_tensor_buffer_t key (version 2 -> 3)
+        if (version === 2 && state?.providers) {
+          state.providers.forEach((provider) => {
+            if (provider.models) {
+              provider.models.forEach((model) => {
+                // Initialize settings if it doesn't exist
+                if (!model.settings) {
+                  model.settings = {}
+                }
+
+                // Add the override_tensor_buffer_t setting if it doesn't exist
+                if (!model.settings.override_tensor_buffer_t) {
+                  model.settings.override_tensor_buffer_t = {
+                    ...modelSettings.override_tensor_buffer_t,
+                    controller_props: {
+                      ...modelSettings.override_tensor_buffer_t
+                        .controller_props,
+                    },
+                  }
+                }
+              })
+            }
+          })
+        }
+
         return state
       },
-      version: 2,
+      version: 3,
     }
   )
 )
diff --git a/web-app/src/lib/predefined.ts b/web-app/src/lib/predefined.ts
index 1d73fbacc..b4d5164e7 100644
--- a/web-app/src/lib/predefined.ts
+++ b/web-app/src/lib/predefined.ts
@@ -133,4 +133,15 @@ export const modelSettings = {
       textAlign: 'right',
     },
   },
+  override_tensor_buffer_t: {
+    key: 'override_tensor_buffer_t',
+    title: 'Override Tensor Buffer Type',
+    description: 'Override the buffer type for tensors whose names match this regex',
+    controller_type: 'input',
+    controller_props: {
+      value: '',
+      placeholder: 'e.g., layers\\.\\d+\\.ffn_.*=CPU',
+      type: 'text',
+    },
+  },
 }
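Note (not part of the diff): a minimal sketch of how the new setting is expected to flow into the llama-server invocation, assuming the extension forwards the raw string unchanged. The helper name, config shape, and example value below are hypothetical; the regex mirrors the placeholder added in predefined.ts, and llama.cpp's --override-tensor flag treats it as pattern=buffer-type pairs.

// Illustrative only: hypothetical helper showing the intended argument flow.
type OverrideConfig = {
  override_tensor_buffer_t?: string
}

function buildOverrideArgs(cfg: OverrideConfig): string[] {
  const args: string[] = []
  // Forward the raw regex straight to llama-server; validation happens there.
  if (cfg.override_tensor_buffer_t) {
    args.push('--override-tensor', cfg.override_tensor_buffer_t)
  }
  return args
}

// Example: keep attention tensors on the GPU, place per-layer FFN tensors on CPU.
const example = buildOverrideArgs({
  override_tensor_buffer_t: 'layers\\.\\d+\\.ffn_.*=CPU',
})
// → ['--override-tensor', 'layers\\.\\d+\\.ffn_.*=CPU']
console.log(example)

Since the value is passed through verbatim, an invalid pattern only surfaces when llama-server fails to load the model.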