chore: allow users to set model offload (#5134)

* chore: allow users to set model offload

* chore: apply model.yaml configurations to default model settings

* chore: fallback default value
Louis 2025-05-29 13:29:32 +07:00 committed by GitHub
parent 1b3f16b3e1
commit a1111033d9
5 changed files with 48 additions and 10 deletions


@@ -1,4 +1,13 @@
 [
+  {
+    "key": "auto_unload_models",
+    "title": "Auto-Unload Old Models",
+    "description": "Automatically unloads models that are not in use to free up memory. Ensure only one model is loaded at a time.",
+    "controllerType": "checkbox",
+    "controllerProps": {
+      "value": true
+    }
+  },
   {
     "key": "cont_batching",
     "title": "Continuous Batching",


@@ -37,6 +37,7 @@ enum Settings {
   use_mmap = 'use_mmap',
   cpu_threads = 'cpu_threads',
   huggingfaceToken = 'hugging-face-access-token',
+  auto_unload_models = 'auto_unload_models',
 }

 type LoadedModelResponse = { data: { engine: string; id: string }[] }
@@ -61,7 +62,7 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
   use_mmap: boolean = true
   cache_type: string = 'f16'
   cpu_threads?: number
+  auto_unload_models: boolean = true
   /**
    * The URL for making inference requests.
    */
@@ -126,6 +127,10 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
     this.flash_attn = await this.getSetting<boolean>(Settings.flash_attn, true)
     this.use_mmap = await this.getSetting<boolean>(Settings.use_mmap, true)
     this.cache_type = await this.getSetting<string>(Settings.cache_type, 'f16')
+    this.auto_unload_models = await this.getSetting<boolean>(
+      Settings.auto_unload_models,
+      true
+    )
     const threads_number = Number(
       await this.getSetting<string>(Settings.cpu_threads, '')
     )
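Each call above passes a fallback default as the second argument, which is what the "fallback default value" commit in this PR relies on. A hedged sketch of the assumed getSetting contract follows; the Map-based store is a hypothetical stand-in, not the extension's real storage:

// Hedged sketch of the assumed getSetting contract: resolve the stored
// value for a key, or the provided default when nothing is stored.
async function getSetting<T>(
  store: Map<string, unknown>,
  key: string,
  defaultValue: T
): Promise<T> {
  const stored = store.get(key) as T | undefined
  // Nullish coalescing keeps legitimate falsy values like `false` or 0
  return stored ?? defaultValue
}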
@@ -176,6 +181,8 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       if (!Number.isNaN(threads_number)) this.cpu_threads = threads_number
     } else if (key === Settings.huggingfaceToken) {
       this.updateCortexConfig({ huggingface_token: value })
+    } else if (key === Settings.auto_unload_models) {
+      this.auto_unload_models = value as boolean
     }
   }
@@ -205,7 +212,15 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       console.log(`Model ${model.id} already loaded`)
       return
     }
+    if (this.auto_unload_models) {
+      // Unload the last used model if it is not the same as the current one
+      for (const lastUsedModel of loadedModels) {
+        if (lastUsedModel.id !== model.id) {
+          console.log(`Unloading last used model: ${lastUsedModel.id}`)
+          await this.unloadModel(lastUsedModel as Model)
+        }
+      }
+    }
     return await this.apiInstance().then((api) =>
       api
         .post('v1/models/start', {
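Extracted as a standalone sketch, the policy added above unloads every loaded model other than the one being started, so at most one model stays resident while the setting is on. `loadedModels` and `unloadModel` are the names used in the diff; the harness around them is assumed:

// Standalone sketch of the auto-unload policy (harness assumed).
type LoadedModel = { id: string }

async function autoUnloadOthers(
  loadedModels: LoadedModel[],
  current: LoadedModel,
  unloadModel: (m: LoadedModel) => Promise<void>
): Promise<void> {
  // Unload any previously used model that is not the current one
  for (const lastUsedModel of loadedModels) {
    if (lastUsedModel.id !== current.id) {
      console.log(`Unloading last used model: ${lastUsedModel.id}`)
      await unloadModel(lastUsedModel)
    }
  }
}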


@@ -126,11 +126,9 @@ export const useChat = () => {
       let availableTools = selectedModel?.capabilities?.includes('tools')
         ? tools
         : []
-      while (
-        !isCompleted &&
-        !abortController.signal.aborted
-        // TODO: Max attempts can be set in the provider settings later
-      ) {
+      // TODO: Later replaced by Agent setup?
+      const followUpWithToolUse = true
+      while (!isCompleted && !abortController.signal.aborted) {
         const completion = await sendCompletion(
           activeThread,
           provider,
@@ -200,7 +198,8 @@ export const useChat = () => {
         addMessage(updatedMessage ?? finalContent)
         isCompleted = !toolCalls.length
-        availableTools = []
+        // Do not create agent loop if there is no need for it
+        if (!followUpWithToolUse) availableTools = []
       }
     } catch (error) {
       toast.error(
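The net effect of the two hunks in this file: the loop keeps running while tool calls are pending, and availableTools is only cleared when follow-up tool use is disabled, so tools stay offered on every turn. A simplified, self-contained sketch of that loop shape (sendCompletion here is a hypothetical stand-in that reports pending tool calls):

// Simplified sketch of the agent loop after this change.
async function agentLoop(
  sendCompletion: (tools: string[]) => Promise<{ toolCalls: unknown[] }>,
  tools: string[],
  abortController: AbortController
): Promise<void> {
  let availableTools = tools
  let isCompleted = false
  // TODO (from the diff): later replaced by Agent setup?
  const followUpWithToolUse = true

  while (!isCompleted && !abortController.signal.aborted) {
    const completion = await sendCompletion(availableTools)
    isCompleted = !completion.toolCalls.length
    // Only strip the tools when no follow-up tool use is wanted
    if (!followUpWithToolUse) availableTools = []
  }
}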


@@ -308,7 +308,7 @@ export const postMessageProcessing = async (
         }
         builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
         // update message metadata
+        return message
       }
-      return message
     }
   }
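The hunk above moves `return message` inside the tool-call loop, so the function now returns right after the first tool result is appended instead of after iterating every tool call; remaining calls are then picked up on the loop's next turn. A minimal sketch of the new control flow, with the tool-dispatch helper as a hypothetical parameter:

// Minimal sketch of the new control flow; callTool is hypothetical.
type ToolCall = { id: string }
type ToolResult = { content: { text?: string }[] }

async function processToolCalls<M>(
  toolCalls: ToolCall[],
  callTool: (c: ToolCall) => Promise<ToolResult>,
  addToolMessage: (text: string, id: string) => void,
  message: M
): Promise<M> {
  for (const toolCall of toolCalls) {
    const result = await callTool(toolCall)
    addToolMessage(result.content[0]?.text ?? '', toolCall.id)
    // New behavior: return after the first processed tool call
    return message
  }
  return message
}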


@@ -77,7 +77,22 @@ export const getProviders = async (): Promise<ModelProvider[]> => {
             ? (model.capabilities as string[])
             : [ModelCapabilities.COMPLETION],
           provider: providerName,
-          settings: modelSettings,
+          settings: Object.values(modelSettings).reduce(
+            (acc, setting) => {
+              const value = model[
+                setting.key as keyof typeof model
+              ] as keyof typeof setting.controller_props.value
+              acc[setting.key] = {
+                ...setting,
+                controller_props: {
+                  ...setting.controller_props,
+                  value: value ?? setting.controller_props.value,
+                },
+              }
+              return acc
+            },
+            {} as Record<string, ProviderSetting>
+          ),
         })),
       }
       runtimeProviders.push(provider)
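This reduce is what realizes the second commit message ("apply model.yaml configurations to default model settings"): for every default setting, a matching per-model value overrides controller_props.value, and the default is kept when the model omits the key. A self-contained sketch of the same merge; ProviderSetting is simplified here, the real type lives in the app's code:

// Self-contained sketch of the merge performed by the reduce above.
type ProviderSetting = {
  key: string
  controller_props: { value?: unknown }
}

function applyModelOverrides(
  defaults: Record<string, ProviderSetting>,
  model: Record<string, unknown>
): Record<string, ProviderSetting> {
  const merged: Record<string, ProviderSetting> = {}
  for (const setting of Object.values(defaults)) {
    const override = model[setting.key]
    merged[setting.key] = {
      ...setting,
      controller_props: {
        ...setting.controller_props,
        // fall back to the default when the model omits the key
        value: override ?? setting.controller_props.value,
      },
    }
  }
  return merged
}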