chore: allow users to set model offload (#5134)
* chore: allow users to set model offload * chore: apply model.yaml configurations to default model settings * chore: fall back to default value
This commit is contained in:
parent
1b3f16b3e1
commit
a1111033d9
@ -1,4 +1,13 @@
|
|||||||
[
|
[
|
||||||
|
{
|
||||||
|
"key": "auto_unload_models",
|
||||||
|
"title": "Auto-Unload Old Models",
|
||||||
|
"description": "Automatically unloads models that are not in use to free up memory. Ensure only one model is loaded at a time.",
|
||||||
|
"controllerType": "checkbox",
|
||||||
|
"controllerProps": {
|
||||||
|
"value": true
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"key": "cont_batching",
|
"key": "cont_batching",
|
||||||
"title": "Continuous Batching",
|
"title": "Continuous Batching",
|
||||||
|
|||||||
@ -37,6 +37,7 @@ enum Settings {
|
|||||||
use_mmap = 'use_mmap',
|
use_mmap = 'use_mmap',
|
||||||
cpu_threads = 'cpu_threads',
|
cpu_threads = 'cpu_threads',
|
||||||
huggingfaceToken = 'hugging-face-access-token',
|
huggingfaceToken = 'hugging-face-access-token',
|
||||||
|
auto_unload_models = 'auto_unload_models',
|
||||||
}
|
}
|
||||||
|
|
||||||
type LoadedModelResponse = { data: { engine: string; id: string }[] }
|
type LoadedModelResponse = { data: { engine: string; id: string }[] }
|
||||||
@ -61,7 +62,7 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
|
|||||||
use_mmap: boolean = true
|
use_mmap: boolean = true
|
||||||
cache_type: string = 'f16'
|
cache_type: string = 'f16'
|
||||||
cpu_threads?: number
|
cpu_threads?: number
|
||||||
|
auto_unload_models: boolean = true
|
||||||
/**
|
/**
|
||||||
* The URL for making inference requests.
|
* The URL for making inference requests.
|
||||||
*/
|
*/
|
||||||
@ -126,6 +127,10 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
|
|||||||
this.flash_attn = await this.getSetting<boolean>(Settings.flash_attn, true)
|
this.flash_attn = await this.getSetting<boolean>(Settings.flash_attn, true)
|
||||||
this.use_mmap = await this.getSetting<boolean>(Settings.use_mmap, true)
|
this.use_mmap = await this.getSetting<boolean>(Settings.use_mmap, true)
|
||||||
this.cache_type = await this.getSetting<string>(Settings.cache_type, 'f16')
|
this.cache_type = await this.getSetting<string>(Settings.cache_type, 'f16')
|
||||||
|
this.auto_unload_models = await this.getSetting<boolean>(
|
||||||
|
Settings.auto_unload_models,
|
||||||
|
true
|
||||||
|
)
|
||||||
const threads_number = Number(
|
const threads_number = Number(
|
||||||
await this.getSetting<string>(Settings.cpu_threads, '')
|
await this.getSetting<string>(Settings.cpu_threads, '')
|
||||||
)
|
)
|
||||||
@ -176,6 +181,8 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
|
|||||||
if (!Number.isNaN(threads_number)) this.cpu_threads = threads_number
|
if (!Number.isNaN(threads_number)) this.cpu_threads = threads_number
|
||||||
} else if (key === Settings.huggingfaceToken) {
|
} else if (key === Settings.huggingfaceToken) {
|
||||||
this.updateCortexConfig({ huggingface_token: value })
|
this.updateCortexConfig({ huggingface_token: value })
|
||||||
|
} else if (key === Settings.auto_unload_models) {
|
||||||
|
this.auto_unload_models = value as boolean
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -205,7 +212,15 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
|
|||||||
console.log(`Model ${model.id} already loaded`)
|
console.log(`Model ${model.id} already loaded`)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if (this.auto_unload_models) {
|
||||||
|
// Unload every currently loaded model whose id differs from the model being loaded
|
||||||
|
for (const lastUsedModel of loadedModels) {
|
||||||
|
if (lastUsedModel.id !== model.id) {
|
||||||
|
console.log(`Unloading last used model: ${lastUsedModel.id}`)
|
||||||
|
await this.unloadModel(lastUsedModel as Model)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return await this.apiInstance().then((api) =>
|
return await this.apiInstance().then((api) =>
|
||||||
api
|
api
|
||||||
.post('v1/models/start', {
|
.post('v1/models/start', {
|
||||||
|
|||||||
@ -126,11 +126,9 @@ export const useChat = () => {
|
|||||||
let availableTools = selectedModel?.capabilities?.includes('tools')
|
let availableTools = selectedModel?.capabilities?.includes('tools')
|
||||||
? tools
|
? tools
|
||||||
: []
|
: []
|
||||||
while (
|
// TODO: Later replaced by Agent setup?
|
||||||
!isCompleted &&
|
const followUpWithToolUse = true
|
||||||
!abortController.signal.aborted
|
while (!isCompleted && !abortController.signal.aborted) {
|
||||||
// TODO: Max attempts can be set in the provider settings later
|
|
||||||
) {
|
|
||||||
const completion = await sendCompletion(
|
const completion = await sendCompletion(
|
||||||
activeThread,
|
activeThread,
|
||||||
provider,
|
provider,
|
||||||
@ -200,7 +198,8 @@ export const useChat = () => {
|
|||||||
addMessage(updatedMessage ?? finalContent)
|
addMessage(updatedMessage ?? finalContent)
|
||||||
|
|
||||||
isCompleted = !toolCalls.length
|
isCompleted = !toolCalls.length
|
||||||
availableTools = []
|
// Do not create agent loop if there is no need for it
|
||||||
|
if (!followUpWithToolUse) availableTools = []
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
toast.error(
|
toast.error(
|
||||||
|
|||||||
@ -308,7 +308,7 @@ export const postMessageProcessing = async (
|
|||||||
}
|
}
|
||||||
builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
|
builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
|
||||||
// update message metadata
|
// update message metadata
|
||||||
return message
|
|
||||||
}
|
}
|
||||||
|
return message
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -77,7 +77,22 @@ export const getProviders = async (): Promise<ModelProvider[]> => {
|
|||||||
? (model.capabilities as string[])
|
? (model.capabilities as string[])
|
||||||
: [ModelCapabilities.COMPLETION],
|
: [ModelCapabilities.COMPLETION],
|
||||||
provider: providerName,
|
provider: providerName,
|
||||||
settings: modelSettings ,
|
settings: Object.values(modelSettings).reduce(
|
||||||
|
(acc, setting) => {
|
||||||
|
const value = model[
|
||||||
|
setting.key as keyof typeof model
|
||||||
|
] as keyof typeof setting.controller_props.value
|
||||||
|
acc[setting.key] = {
|
||||||
|
...setting,
|
||||||
|
controller_props: {
|
||||||
|
...setting.controller_props,
|
||||||
|
value: value ?? setting.controller_props.value,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return acc
|
||||||
|
},
|
||||||
|
{} as Record<string, ProviderSetting>
|
||||||
|
),
|
||||||
})),
|
})),
|
||||||
}
|
}
|
||||||
runtimeProviders.push(provider)
|
runtimeProviders.push(provider)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user