diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json index b8b6ddd14..af8a42c51 100644 --- a/extensions/llamacpp-extension/settings.json +++ b/extensions/llamacpp-extension/settings.json @@ -1,45 +1,121 @@ [ { - "key": "port", - "title": "Port", - "description": "Port", + "key": "threads", + "title": "Threads", + "description": "Number of threads to use during generation (-1 for logical cores).", "controllerType": "input", "controllerProps": { - "value": "8080", - "placeholder": "8080", + "value": -1, + "placeholder": "-1", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "threads_batch", + "title": "Threads (Batch)", + "description": "Number of threads for batch and prompt processing (default: same as Threads).", + "controllerType": "input", + "controllerProps": { + "value": -1, + "placeholder": "-1 (same as Threads)", "type": "number", "textAlign": "right" } }, { - "key": "cont_batching", - "title": "Continuous Batching", - "description": "Allows processing prompts in parallel with text generation, which usually improves performance.", - "controllerType": "checkbox", - "controllerProps": { - "value": true - } - }, - { - "key": "n_parallel", - "title": "Parallel Operations", - "description": "Number of prompts that can be processed simultaneously by the model.", + "key": "ctx_size", + "title": "Context Size", + "description": "Size of the prompt context (0 = loaded from model).", "controllerType": "input", "controllerProps": { - "value": "4", - "placeholder": "4", + "value": 8192, + "placeholder": "8192", "type": "number", "textAlign": "right" } }, { - "key": "cpu_threads", - "title": "CPU Threads", - "description": "Number of CPU cores used for model processing when running without GPU.", + "key": "n_predict", + "title": "Max Tokens to Predict", + "description": "Maximum number of tokens to generate (-1 = infinity).", + "controllerType": "input", + "controllerProps": { + "value": -1, + 
"placeholder": "-1", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "batch_size", + "title": "Batch Size", + "description": "Logical maximum batch size for processing prompts.", + "controllerType": "input", + "controllerProps": { + "value": 2048, + "placeholder": "2048", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "ubatch_size", + "title": "uBatch Size", + "description": "Physical maximum batch size for processing prompts.", + "controllerType": "input", + "controllerProps": { + "value": 512, + "placeholder": "512", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "n_gpu_layers", + "title": "GPU Layers", + "description": "Number of model layers to offload to the GPU (-1 for all layers, 0 for CPU only).", + "controllerType": "input", + "controllerProps": { + "value": -1, + "placeholder": "-1", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "device", + "title": "Devices for Offload", + "description": "Comma-separated list of devices to use for offloading (e.g., 'cuda:0', 'cuda:0,cuda:1'). 
Leave empty to use default/CPU only.", "controllerType": "input", "controllerProps": { "value": "", - "placeholder": "Number of CPU threads", + "placeholder": "cuda:0", + "type": "text" + } + }, + { + "key": "split_mode", + "title": "GPU Split Mode", + "description": "How to split the model across multiple GPUs.", + "controllerType": "dropdown", + "controllerProps": { + "value": "layer", + "options": [ + { "value": "none", "name": "None" }, + { "value": "layer", "name": "Layer" }, + { "value": "row", "name": "Row" } + ] + } + }, + { + "key": "main_gpu", + "title": "Main GPU Index", + "description": "The GPU to use for the model (split-mode=none) or intermediate results (split-mode=row).", + "controllerType": "input", + "controllerProps": { + "value": 0, + "placeholder": "0", + "type": "number", + "textAlign": "right" } @@ -47,52 +123,330 @@ { "key": "flash_attn", "title": "Flash Attention", - "description": "Optimizes memory usage and speeds up model inference using an efficient attention implementation.", + "description": "Enable Flash Attention for optimized performance.", "controllerType": "checkbox", "controllerProps": { - "value": true + "value": false } }, - { - "key": "caching_enabled", - "title": "Caching", - "description": "Stores recent prompts and responses to improve speed when similar questions are asked.", + { + "key": "cont_batching", + "title": "Continuous Batching", + "description": "Enable continuous batching (a.k.a dynamic batching) for concurrent requests (default: enabled).", "controllerType": "checkbox", "controllerProps": { "value": true } }, { - "key": "cache_type", - "title": "KV Cache Type", - "description": "Controls memory usage and precision trade-off.", + "key": "no_mmap", + "title": "Disable mmap", + "description": "Do not memory-map model (slower load but may reduce pageouts if not using mlock).", + "controllerType": "checkbox", + "controllerProps": { + "value": false + } + }, + { + "key": "mlock", + "title": "MLock", + "description": "Force 
system to keep model in RAM, preventing swapping/compression.", + "controllerType": "checkbox", + "controllerProps": { + "value": false + } + }, + { + "key": "no_kv_offload", + "title": "Disable KV Offload", + "description": "Disable KV cache offload to GPU (if GPU is used).", + "controllerType": "checkbox", + "controllerProps": { + "value": false + } + }, + { + "key": "cache_type_k", + "title": "KV Cache K Type", + "description": "KV cache data type for Keys (default: f16).", "controllerType": "dropdown", "controllerProps": { "value": "f16", "options": [ - { - "value": "q4_0", - "name": "q4_0" - }, - { - "value": "q8_0", - "name": "q8_0" - }, - { - "value": "f16", - "name": "f16" - } + { "value": "f32", "name": "f32" }, + { "value": "f16", "name": "f16" }, + { "value": "bf16", "name": "bf16" }, + { "value": "q8_0", "name": "q8_0" }, + { "value": "q4_0", "name": "q4_0" }, + { "value": "q4_1", "name": "q4_1" }, + { "value": "iq4_nl", "name": "iq4_nl" }, + { "value": "q5_0", "name": "q5_0" }, + { "value": "q5_1", "name": "q5_1" } ] } }, { - "key": "use_mmap", - "title": "mmap", - "description": "Loads model files more efficiently by mapping them to memory, reducing RAM usage.", - "controllerType": "checkbox", + "key": "cache_type_v", + "title": "KV Cache V Type", + "description": "KV cache data type for Values (default: f16).", + "controllerType": "dropdown", "controllerProps": { - "value": true + "value": "f16", + "options": [ + { "value": "f32", "name": "f32" }, + { "value": "f16", "name": "f16" }, + { "value": "bf16", "name": "bf16" }, + { "value": "q8_0", "name": "q8_0" }, + { "value": "q4_0", "name": "q4_0" }, + { "value": "q4_1", "name": "q4_1" }, + { "value": "iq4_nl", "name": "iq4_nl" }, + { "value": "q5_0", "name": "q5_0" }, + { "value": "q5_1", "name": "q5_1" } + ] + } + }, + { + "key": "defrag_thold", + "title": "KV Cache Defragmentation Threshold", + "description": "Threshold for KV cache defragmentation (< 0 to disable).", + "controllerType": "input", + 
"controllerProps": { + "value": 0.1, + "placeholder": "0.1", + "type": "number", + "textAlign": "right", + "step": 0.01 + } + }, + { + "key": "rope_scaling", + "title": "RoPE Scaling Method", + "description": "RoPE frequency scaling method.", + "controllerType": "dropdown", + "controllerProps": { + "value": "none", + "options": [ + { "value": "none", "name": "None" }, + { "value": "linear", "name": "Linear" }, + { "value": "yarn", "name": "YaRN" } + ] + } + }, + { + "key": "rope_scale", + "title": "RoPE Scale Factor", + "description": "RoPE context scaling factor.", + "controllerType": "input", + "controllerProps": { + "value": 1.0, + "placeholder": "1.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "rope_freq_base", + "title": "RoPE Frequency Base", + "description": "RoPE base frequency (0 = loaded from model).", + "controllerType": "input", + "controllerProps": { + "value": 0, + "placeholder": "0 (model default)", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "rope_freq_scale", + "title": "RoPE Frequency Scale Factor", + "description": "RoPE frequency scaling factor.", + "controllerType": "input", + "controllerProps": { + "value": 1.0, + "placeholder": "1.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "temp", + "title": "Temperature", + "description": "Temperature for sampling (higher = more random).", + "controllerType": "input", + "controllerProps": { + "value": 0.8, + "placeholder": "0.8", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "top_k", + "title": "Top K", + "description": "Top-K sampling (0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 40, + "placeholder": "40", + "type": "number", + "textAlign": "right", + "min": 0 + } + }, + { + "key": "top_p", + "title": "Top P", + "description": "Top-P sampling (1.0 = disabled).", + "controllerType": "input", + 
"controllerProps": { + "value": 0.9, + "placeholder": "0.9", + "type": "number", + "textAlign": "right", + "min": 0, + "max": 1.0, + "step": 0.01 + } + }, + { + "key": "min_p", + "title": "Min P", + "description": "Min-P sampling (0.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 0.1, + "placeholder": "0.1", + "type": "number", + "textAlign": "right", + "min": 0, + "max": 1.0, + "step": 0.01 + } + }, + { + "key": "repeat_last_n", + "title": "Repeat Last N", + "description": "Number of tokens to consider for repeat penalty (0 = disabled, -1 = ctx_size).", + "controllerType": "input", + "controllerProps": { + "value": 64, + "placeholder": "64", + "type": "number", + "textAlign": "right", + "min": -1 + } + }, + { + "key": "repeat_penalty", + "title": "Repeat Penalty", + "description": "Penalize repeating token sequences (1.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 1.0, + "placeholder": "1.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "presence_penalty", + "title": "Presence Penalty", + "description": "Repeat alpha presence penalty (0.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 0.0, + "placeholder": "0.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "frequency_penalty", + "title": "Frequency Penalty", + "description": "Repeat alpha frequency penalty (0.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 0.0, + "placeholder": "0.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "mirostat", + "title": "Mirostat Mode", + "description": "Use Mirostat sampling (0: disabled, 1: Mirostat V1, 2: Mirostat V2).", + "controllerType": "dropdown", + "controllerProps": { + "value": 0, + "options": [ + { "value": 0, "name": "Disabled" }, + { "value": 1, "name": "Mirostat V1" }, + { "value": 2, "name": 
"Mirostat V2" } + ] + } + }, + { + "key": "mirostat_lr", + "title": "Mirostat Learning Rate", + "description": "Mirostat learning rate (eta).", + "controllerType": "input", + "controllerProps": { + "value": 0.1, + "placeholder": "0.1", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "mirostat_ent", + "title": "Mirostat Target Entropy", + "description": "Mirostat target entropy (tau).", + "controllerType": "input", + "controllerProps": { + "value": 5.0, + "placeholder": "5.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "grammar_file", + "title": "Grammar File", + "description": "Path to a BNF-like grammar file to constrain generations.", + "controllerType": "input", + "controllerProps": { + "value": "", + "placeholder": "path/to/grammar.gbnf", + "type": "text" + } + }, + { + "key": "json_schema_file", + "title": "JSON Schema File", + "description": "Path to a JSON schema file to constrain generations.", + "controllerType": "input", + "controllerProps": { + "value": "", + "placeholder": "path/to/schema.json", + "type": "text" } } ]