chore: allow users to set model offload (#5134)
* chore: allow users to set model offload
* chore: apply model.yaml configurations to default model settings
* chore: fall back to default value
This commit is contained in:
parent 1b3f16b3e1
commit a1111033d9
@@ -1,4 +1,13 @@
 [
+  {
+    "key": "auto_unload_models",
+    "title": "Auto-Unload Old Models",
+    "description": "Automatically unloads models that are not in use to free up memory. Ensure only one model is loaded at a time.",
+    "controllerType": "checkbox",
+    "controllerProps": {
+      "value": true
+    }
+  },
   {
     "key": "cont_batching",
     "title": "Continuous Batching",
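For orientation, here is a rough TypeScript shape of the checkbox entry added above. The field names mirror the JSON, but the concrete descriptor type in the Jan codebase may be named and structured differently (this is an assumption, not the library's actual definition):

// Hypothetical descriptor shape for the settings.json entry above;
// the real type in the extension framework may differ in detail.
interface CheckboxSettingDescriptor {
  key: string                          // e.g. 'auto_unload_models'
  title: string                        // label shown in the settings UI
  description: string
  controllerType: 'checkbox'
  controllerProps: { value: boolean }  // default state of the checkbox
}

const autoUnloadSetting: CheckboxSettingDescriptor = {
  key: 'auto_unload_models',
  title: 'Auto-Unload Old Models',
  description: 'Automatically unloads models that are not in use to free up memory.',
  controllerType: 'checkbox',
  controllerProps: { value: true },
}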
@@ -37,6 +37,7 @@ enum Settings {
   use_mmap = 'use_mmap',
   cpu_threads = 'cpu_threads',
   huggingfaceToken = 'hugging-face-access-token',
+  auto_unload_models = 'auto_unload_models',
 }

 type LoadedModelResponse = { data: { engine: string; id: string }[] }
@@ -61,7 +62,7 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
   use_mmap: boolean = true
   cache_type: string = 'f16'
   cpu_threads?: number
-
+  auto_unload_models: boolean = true
   /**
    * The URL for making inference requests.
    */
@@ -126,6 +127,10 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
     this.flash_attn = await this.getSetting<boolean>(Settings.flash_attn, true)
     this.use_mmap = await this.getSetting<boolean>(Settings.use_mmap, true)
     this.cache_type = await this.getSetting<string>(Settings.cache_type, 'f16')
+    this.auto_unload_models = await this.getSetting<boolean>(
+      Settings.auto_unload_models,
+      true
+    )
     const threads_number = Number(
       await this.getSetting<string>(Settings.cpu_threads, '')
     )
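The second argument to getSetting is the fallback used when nothing has been persisted yet, which is the "fall back to default value" part of the commit. A minimal sketch of that pattern follows; getSettingSketch and its Map-backed store are hypothetical stand-ins, not the actual implementation behind getSetting:

// Hypothetical stand-in for getSetting<T>(key, defaultValue): return the
// persisted value when one exists, otherwise the supplied default.
async function getSettingSketch<T>(
  store: Map<string, unknown>,
  key: string,
  defaultValue: T
): Promise<T> {
  const stored = store.get(key)
  return stored === undefined ? defaultValue : (stored as T)
}

// Usage mirroring the diff: true unless the user has toggled the checkbox.
// const autoUnload = await getSettingSketch(store, 'auto_unload_models', true)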
@@ -176,6 +181,8 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       if (!Number.isNaN(threads_number)) this.cpu_threads = threads_number
     } else if (key === Settings.huggingfaceToken) {
       this.updateCortexConfig({ huggingface_token: value })
+    } else if (key === Settings.auto_unload_models) {
+      this.auto_unload_models = value as boolean
     }
   }

@@ -205,7 +212,15 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       console.log(`Model ${model.id} already loaded`)
       return
     }
-
+    if (this.auto_unload_models) {
+      // Unload the last used model if it is not the same as the current one
+      for (const lastUsedModel of loadedModels) {
+        if (lastUsedModel.id !== model.id) {
+          console.log(`Unloading last used model: ${lastUsedModel.id}`)
+          await this.unloadModel(lastUsedModel as Model)
+        }
+      }
+    }
     return await this.apiInstance().then((api) =>
       api
         .post('v1/models/start', {
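The unload loop above, pulled out as a self-contained sketch. LoadedModelSketch and the unload callback are stand-ins; the extension actually uses Model from @janhq/core and its own unloadModel method:

// Stand-in type for the loaded-model entries returned by the models API.
type LoadedModelSketch = { id: string }

async function ensureSingleModel(
  loaded: LoadedModelSketch[],
  next: LoadedModelSketch,
  autoUnload: boolean,
  unload: (m: LoadedModelSketch) => Promise<void>
): Promise<void> {
  if (!autoUnload) return // user opted out: keep existing models in memory
  for (const m of loaded) {
    if (m.id !== next.id) await unload(m) // free memory before starting `next`
  }
}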
@@ -126,11 +126,9 @@ export const useChat = () => {
     let availableTools = selectedModel?.capabilities?.includes('tools')
       ? tools
       : []
-    while (
-      !isCompleted &&
-      !abortController.signal.aborted
-      // TODO: Max attempts can be set in the provider settings later
-    ) {
+    // TODO: Later replaced by Agent setup?
+    const followUpWithToolUse = true
+    while (!isCompleted && !abortController.signal.aborted) {
       const completion = await sendCompletion(
         activeThread,
         provider,
@@ -200,7 +198,8 @@ export const useChat = () => {
         addMessage(updatedMessage ?? finalContent)

         isCompleted = !toolCalls.length
-        availableTools = []
+        // Do not create agent loop if there is no need for it
+        if (!followUpWithToolUse) availableTools = []
       }
     } catch (error) {
       toast.error(
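Taken together, these two useChat hunks introduce a followUpWithToolUse flag that decides whether tools remain available across loop iterations. A self-contained sketch of that control flow, with stand-in values for the hook's real state (a real run gets toolCalls from the completion response and availableTools from the provider):

async function agentLoopSketch(signal: AbortSignal): Promise<void> {
  const followUpWithToolUse = true // TODO in the diff: later replaced by Agent setup
  let availableTools: string[] = ['tool-from-provider']
  let isCompleted = false
  while (!isCompleted && !signal.aborted) {
    const toolCalls: unknown[] = [] // pretend this completion made no tool calls
    isCompleted = !toolCalls.length
    // When follow-up is disabled, drop the tools so no agent loop is created.
    if (!followUpWithToolUse) availableTools = []
  }
  void availableTools
}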
@@ -308,7 +308,7 @@ export const postMessageProcessing = async (
         }
         builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
-        // update message metadata
+        return message
       }
       return message
     }
   }
@@ -77,7 +77,22 @@ export const getProviders = async (): Promise<ModelProvider[]> => {
             ? (model.capabilities as string[])
             : [ModelCapabilities.COMPLETION],
           provider: providerName,
-          settings: modelSettings,
+          settings: Object.values(modelSettings).reduce(
+            (acc, setting) => {
+              const value = model[
+                setting.key as keyof typeof model
+              ] as keyof typeof setting.controller_props.value
+              acc[setting.key] = {
+                ...setting,
+                controller_props: {
+                  ...setting.controller_props,
+                  value: value ?? setting.controller_props.value,
+                },
+              }
+              return acc
+            },
+            {} as Record<string, ProviderSetting>
+          ),
         })),
       }
       runtimeProviders.push(provider)
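This hunk is the "apply model.yaml configurations to default model settings" piece: for each default setting, a value parsed from the model's own metadata (if present) overrides the controller's default. A worked sketch of the same reduce with simplified types and illustrative data (SettingSketch stands in for the real ProviderSetting, and the ngl/ctx_len values are made up):

// Simplified ProviderSetting; the real type carries more fields.
type SettingSketch = { key: string; controller_props: { value: unknown } }

const modelSettings: Record<string, SettingSketch> = {
  ngl: { key: 'ngl', controller_props: { value: 100 } },          // default
  ctx_len: { key: 'ctx_len', controller_props: { value: 2048 } }, // default
}
const model: Record<string, unknown> = { ngl: 24 } // value from model.yaml (illustrative)

const merged = Object.values(modelSettings).reduce((acc, setting) => {
  const value = model[setting.key]
  acc[setting.key] = {
    ...setting,
    controller_props: {
      ...setting.controller_props,
      value: value ?? setting.controller_props.value, // fall back to the default
    },
  }
  return acc
}, {} as Record<string, SettingSketch>)
// merged.ngl.controller_props.value === 24; ctx_len keeps its default 2048.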