From 0cbf35dc77d5a1a915657103c66ac2a69bdea479 Mon Sep 17 00:00:00 2001
From: Akarshan
Date: Mon, 23 Jun 2025 20:43:54 +0530
Subject: [PATCH] Add auto-unload setting to llamacpp-extension

---
 extensions/llamacpp-extension/settings.json | 37 ++++++++++++---------
 extensions/llamacpp-extension/src/index.ts  |  4 ++-
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json
index 206a73ab3..3d57fbdf9 100644
--- a/extensions/llamacpp-extension/settings.json
+++ b/extensions/llamacpp-extension/settings.json
@@ -9,6 +9,13 @@
       "options": []
     }
   },
+  {
+    "key": "auto_unload_models",
+    "title": "Auto-Unload Old Models",
+    "description": "Automatically unloads models that are not in use to free up memory, ensuring only one model is loaded at a time.",
+    "controllerType": "checkbox",
+    "controllerProps": { "value": true }
+  },
   {
     "key": "threads",
     "title": "Threads",
@@ -21,7 +28,7 @@
       "textAlign": "right"
     }
   },
-  { 
+  {
     "key": "threads_batch",
     "title": "Threads (Batch)",
     "description": "Number of threads for batch and prompt processing (default: same as Threads).",
@@ -69,7 +76,7 @@
       "textAlign": "right"
     }
   },
-  { 
+  {
     "key": "ubatch_size",
     "title": "uBatch Size",
     "description": "Physical maximum batch size for processing prompts.",
@@ -93,7 +100,7 @@
       "textAlign": "right"
     }
   },
-  { 
+  {
     "key": "device",
     "title": "Devices for Offload",
     "description": "Comma-separated list of devices to use for offloading (e.g., 'cuda:0', 'cuda:0,cuda:1'). Leave empty to use default/CPU only.",
@@ -118,7 +125,7 @@
       ]
     }
   },
-  { 
+  {
     "key": "main_gpu",
     "title": "Main GPU Index",
     "description": "The GPU to use for the model (split-mode=none) or intermediate results (split-mode=row).",
@@ -145,7 +152,7 @@
     "description": "Enable continuous batching (a.k.a dynamic batching) for concurrent requests (default: enabled).",
     "controllerType": "checkbox",
     "controllerProps": {
-      "value": true
+      "value": false
     }
   },
   {
@@ -256,7 +263,7 @@
       "step": 0.01
     }
   },
-  { 
+  {
     "key": "rope_freq_base",
     "title": "RoPE Frequency Base",
     "description": "RoPE base frequency (0 = loaded from model).",
@@ -268,7 +275,7 @@
       "textAlign": "right"
     }
   },
-  { 
+  {
     "key": "rope_freq_scale",
     "title": "RoPE Frequency Scale Factor",
     "description": "RoPE frequency scaling factor.",
@@ -408,7 +415,7 @@
       ]
     }
   },
-  { 
+  {
     "key": "mirostat_lr",
     "title": "Mirostat Learning Rate",
    "description": "Mirostat learning rate (eta).",
@@ -436,7 +443,7 @@
       "step": 0.01
     }
   },
-  { 
+  {
     "key": "grammar_file",
     "title": "Grammar File",
     "description": "Path to a BNF-like grammar file to constrain generations.",
@@ -447,7 +454,7 @@
       "type": "text"
     }
   },
-  { 
+  {
     "key": "json_schema_file",
     "title": "JSON Schema File",
     "description": "Path to a JSON schema file to constrain generations.",
@@ -464,11 +471,11 @@
     "description": "Mirostat target entropy (tau).",
     "controllerType": "input",
     "controllerProps": {
-    "value": 0,
-    "options": [
-      { "value": -1, "name": "unrestricted thinking budget" },
-      { "value": 0, "name": "disable thinking" }
-    ]
+      "value": 0,
+      "options": [
+        { "value": -1, "name": "unrestricted thinking budget" },
+        { "value": 0, "name": "disable thinking" }
+      ]
     }
   }
 ]
diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 06f8d7f34..95bd5d19c 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -30,6 +30,7 @@ import { invoke } from '@tauri-apps/api/core'
 
 type LlamacppConfig = {
   version_backend: string
+  auto_unload: boolean
   n_gpu_layers: number
   ctx_size: number
   threads: number
@@ -106,6 +107,7 @@ interface EmbeddingData {
 
 export default class llamacpp_extension extends AIEngine {
   provider: string = 'llamacpp'
+  autoUnload: boolean = true
   readonly providerId: string = 'llamacpp'
 
   private config: LlamacppConfig
@@ -132,7 +134,7 @@ export default class llamacpp_extension extends AIEngine {
         })
       }
     }
-
+    this.autoUnload = await this.getSetting('auto_unload_models', true)
    this.registerSettings(settings)
 
    let config = {}
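
Note for reviewers (not part of the patch): the diff above only registers the
'auto_unload_models' setting and caches it in this.autoUnload; the
unload-before-load behavior itself lives elsewhere in the extension. The
sketch below is a minimal, self-contained TypeScript illustration of the
pattern the flag enables. SessionInfo, activeSessions, and unloadSession are
hypothetical stand-ins for illustration only, not the extension's actual API.

  // Hypothetical sketch of an auto-unload guard in a model-load path.
  // Only the autoUnload flag mirrors the patch; every other name is invented.

  interface SessionInfo {
    modelId: string
  }

  class ModelManager {
    // Mirrors the patched setting: defaults to true, read from 'auto_unload_models'.
    autoUnload: boolean = true
    private activeSessions: SessionInfo[] = []

    async load(modelId: string): Promise<SessionInfo> {
      if (this.autoUnload) {
        // Evict every previously loaded model so only one occupies memory.
        for (const session of this.activeSessions) {
          await this.unloadSession(session)
        }
        this.activeSessions = []
      }
      const session: SessionInfo = { modelId }
      this.activeSessions.push(session)
      return session
    }

    private async unloadSession(session: SessionInfo): Promise<void> {
      // Stand-in for whatever teardown the llama.cpp backend requires.
      console.log(`unloading ${session.modelId}`)
    }
  }

With the checkbox unticked, load() skips the eviction loop and multiple
models may stay resident at once, matching the setting's description.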