chore: allow users to set model offload (#5134)
* chore: allow users to set model offload
* chore: apply model.yaml configurations to default model settings
* chore: fall back to default value
This commit is contained in:
parent 1b3f16b3e1
commit a1111033d9
@@ -1,4 +1,13 @@
 [
+  {
+    "key": "auto_unload_models",
+    "title": "Auto-Unload Old Models",
+    "description": "Automatically unloads models that are not in use to free up memory. Ensure only one model is loaded at a time.",
+    "controllerType": "checkbox",
+    "controllerProps": {
+      "value": true
+    }
+  },
   {
     "key": "cont_batching",
     "title": "Continuous Batching",
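For orientation, here is a rough TypeScript shape of the checkbox entry added above. The field names mirror the JSON, but the concrete descriptor type in the Jan codebase may be named and structured differently (this is an assumption, not the library's actual definition):

// Hypothetical descriptor shape for the settings.json entry above;
// the real type in the extension framework may differ in detail.
interface CheckboxSettingDescriptor {
  key: string                          // e.g. 'auto_unload_models'
  title: string                        // label shown in the settings UI
  description: string
  controllerType: 'checkbox'
  controllerProps: { value: boolean }  // default state of the checkbox
}

const autoUnloadSetting: CheckboxSettingDescriptor = {
  key: 'auto_unload_models',
  title: 'Auto-Unload Old Models',
  description: 'Automatically unloads models that are not in use to free up memory.',
  controllerType: 'checkbox',
  controllerProps: { value: true },
}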
@@ -37,6 +37,7 @@ enum Settings {
   use_mmap = 'use_mmap',
   cpu_threads = 'cpu_threads',
   huggingfaceToken = 'hugging-face-access-token',
+  auto_unload_models = 'auto_unload_models',
 }

 type LoadedModelResponse = { data: { engine: string; id: string }[] }
@@ -61,7 +62,7 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
   use_mmap: boolean = true
   cache_type: string = 'f16'
   cpu_threads?: number
-
+  auto_unload_models: boolean = true
   /**
    * The URL for making inference requests.
    */
@@ -126,6 +127,10 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
     this.flash_attn = await this.getSetting<boolean>(Settings.flash_attn, true)
     this.use_mmap = await this.getSetting<boolean>(Settings.use_mmap, true)
     this.cache_type = await this.getSetting<string>(Settings.cache_type, 'f16')
+    this.auto_unload_models = await this.getSetting<boolean>(
+      Settings.auto_unload_models,
+      true
+    )
     const threads_number = Number(
       await this.getSetting<string>(Settings.cpu_threads, '')
     )
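The second argument to getSetting is the fallback used when nothing has been persisted yet, which is the "fall back to default value" part of the commit. A minimal sketch of that pattern follows; getSettingSketch and its Map-backed store are hypothetical stand-ins, not the actual implementation behind getSetting:

// Hypothetical stand-in for getSetting<T>(key, defaultValue): return the
// persisted value when one exists, otherwise the supplied default.
async function getSettingSketch<T>(
  store: Map<string, unknown>,
  key: string,
  defaultValue: T
): Promise<T> {
  const stored = store.get(key)
  return stored === undefined ? defaultValue : (stored as T)
}

// Usage mirroring the diff: true unless the user has toggled the checkbox.
// const autoUnload = await getSettingSketch(store, 'auto_unload_models', true)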
@@ -176,6 +181,8 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       if (!Number.isNaN(threads_number)) this.cpu_threads = threads_number
     } else if (key === Settings.huggingfaceToken) {
       this.updateCortexConfig({ huggingface_token: value })
+    } else if (key === Settings.auto_unload_models) {
+      this.auto_unload_models = value as boolean
     }
   }

@@ -205,7 +212,15 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       console.log(`Model ${model.id} already loaded`)
       return
     }
-
+    if (this.auto_unload_models) {
+      // Unload the last used model if it is not the same as the current one
+      for (const lastUsedModel of loadedModels) {
+        if (lastUsedModel.id !== model.id) {
+          console.log(`Unloading last used model: ${lastUsedModel.id}`)
+          await this.unloadModel(lastUsedModel as Model)
+        }
+      }
+    }
     return await this.apiInstance().then((api) =>
       api
         .post('v1/models/start', {
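The unload loop above, pulled out as a self-contained sketch. LoadedModelSketch and the unload callback are stand-ins; the extension actually uses Model from @janhq/core and its own unloadModel method:

// Stand-in type for the loaded-model entries returned by the models API.
type LoadedModelSketch = { id: string }

async function ensureSingleModel(
  loaded: LoadedModelSketch[],
  next: LoadedModelSketch,
  autoUnload: boolean,
  unload: (m: LoadedModelSketch) => Promise<void>
): Promise<void> {
  if (!autoUnload) return // user opted out: keep existing models in memory
  for (const m of loaded) {
    if (m.id !== next.id) await unload(m) // free memory before starting `next`
  }
}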
@@ -126,11 +126,9 @@ export const useChat = () => {
     let availableTools = selectedModel?.capabilities?.includes('tools')
       ? tools
       : []
-    while (
-      !isCompleted &&
-      !abortController.signal.aborted
-      // TODO: Max attempts can be set in the provider settings later
-    ) {
+    // TODO: Later replaced by Agent setup?
+    const followUpWithToolUse = true
+    while (!isCompleted && !abortController.signal.aborted) {
       const completion = await sendCompletion(
         activeThread,
         provider,
@@ -200,7 +198,8 @@ export const useChat = () => {
         addMessage(updatedMessage ?? finalContent)

         isCompleted = !toolCalls.length
-        availableTools = []
+        // Do not create agent loop if there is no need for it
+        if (!followUpWithToolUse) availableTools = []
       }
     } catch (error) {
       toast.error(
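Taken together, these two useChat hunks introduce a followUpWithToolUse flag that decides whether tools remain available across loop iterations. A self-contained sketch of that control flow, with stand-in values for the hook's real state (a real run gets toolCalls from the completion response and availableTools from the provider):

async function agentLoopSketch(signal: AbortSignal): Promise<void> {
  const followUpWithToolUse = true // TODO in the diff: later replaced by Agent setup
  let availableTools: string[] = ['tool-from-provider']
  let isCompleted = false
  while (!isCompleted && !signal.aborted) {
    const toolCalls: unknown[] = [] // pretend this completion made no tool calls
    isCompleted = !toolCalls.length
    // When follow-up is disabled, drop the tools so no agent loop is created.
    if (!followUpWithToolUse) availableTools = []
  }
  void availableTools
}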
@@ -308,7 +308,7 @@ export const postMessageProcessing = async (
         }
         builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
-        // update message metadata
+        return message
       }
       return message
     }
   }
@@ -77,7 +77,22 @@ export const getProviders = async (): Promise<ModelProvider[]> => {
             ? (model.capabilities as string[])
             : [ModelCapabilities.COMPLETION],
           provider: providerName,
-          settings: modelSettings,
+          settings: Object.values(modelSettings).reduce(
+            (acc, setting) => {
+              const value = model[
+                setting.key as keyof typeof model
+              ] as keyof typeof setting.controller_props.value
+              acc[setting.key] = {
+                ...setting,
+                controller_props: {
+                  ...setting.controller_props,
+                  value: value ?? setting.controller_props.value,
+                },
+              }
+              return acc
+            },
+            {} as Record<string, ProviderSetting>
+          ),
         })),
       }
       runtimeProviders.push(provider)
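This hunk is the "apply model.yaml configurations to default model settings" piece: for each default setting, a value parsed from the model's own metadata (if present) overrides the controller's default. A worked sketch of the same reduce with simplified types and illustrative data (SettingSketch stands in for the real ProviderSetting, and the ngl/ctx_len values are made up):

// Simplified ProviderSetting; the real type carries more fields.
type SettingSketch = { key: string; controller_props: { value: unknown } }

const modelSettings: Record<string, SettingSketch> = {
  ngl: { key: 'ngl', controller_props: { value: 100 } },          // default
  ctx_len: { key: 'ctx_len', controller_props: { value: 2048 } }, // default
}
const model: Record<string, unknown> = { ngl: 24 } // value from model.yaml (illustrative)

const merged = Object.values(modelSettings).reduce((acc, setting) => {
  const value = model[setting.key]
  acc[setting.key] = {
    ...setting,
    controller_props: {
      ...setting.controller_props,
      value: value ?? setting.controller_props.value, // fall back to the default
    },
  }
  return acc
}, {} as Record<string, SettingSketch>)
// merged.ngl.controller_props.value === 24; ctx_len keeps its default 2048.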