The 'Auto-Unload Old Models' setting in the llama.cpp extension failed to persist because of a typo in its key name in `settings.json`: the key was declared as `auto_unload_models` instead of `auto_unload`. This commit corrects the key name to `auto_unload`, so user changes to this setting are properly saved, retrieved, and persisted across application restarts. This resolves the issue where the setting appeared to revert to its previous value after being changed.
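The root cause is a simple key mismatch: the value was persisted under one name but looked up under another, so every read fell through to the declared default. A minimal, hypothetical TypeScript sketch of that failure mode (the helper names and lookup logic below are illustrative assumptions, not the extension's actual code):

```typescript
// Hypothetical sketch of the bug, not the extension's real implementation.
type SettingDef = { key: string; value: unknown };

// Defaults as declared in settings.json (pre-fix, with the wrong key name).
const declared: SettingDef[] = [{ key: "auto_unload_models", value: true }];

// Values persisted when the user changes a setting.
const persisted = new Map<string, unknown>();

function saveSetting(key: string, value: unknown): void {
  persisted.set(key, value);
}

function loadSetting(key: string): unknown {
  // Fall back to the declared default when nothing was persisted under `key`.
  return persisted.get(key) ?? declared.find((d) => d.key === key)?.value;
}

// The runtime saves under "auto_unload"...
saveSetting("auto_unload", false); // user unticks the checkbox
// ...but a re-read via the declared key never sees the change:
console.log(loadSetting("auto_unload_models")); // true — appears to revert
```

Renaming the declared key to `auto_unload` makes the write and read paths agree, so the persisted value is found again on reload.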
`settings.json` (361 lines, 9.6 KiB):
[
  {
    "key": "version_backend",
    "title": "Version & Backend",
    "description": "Version and Backend for llama.cpp",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "none",
      "options": []
    }
  },
  {
    "key": "auto_update_engine",
    "title": "Auto update engine",
    "description": "Automatically update the llamacpp engine to the latest version",
    "controllerType": "checkbox",
    "controllerProps": { "value": true }
  },
  {
    "key": "auto_unload",
    "title": "Auto-Unload Old Models",
    "description": "Automatically unloads models that are not in use to free up memory. Ensures only one model is loaded at a time.",
    "controllerType": "checkbox",
    "controllerProps": { "value": true }
  },
  {
    "key": "chat_template",
    "title": "Custom Jinja Chat Template",
    "description": "Custom Jinja chat_template to be used for the model",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "e.g., {% for message in messages %}...{% endfor %} (default is read from GGUF)",
      "type": "text",
      "textAlign": "right"
    }
  },
  {
    "key": "threads",
    "title": "Threads",
    "description": "Number of threads to use during generation (-1 for logical cores).",
    "controllerType": "input",
    "controllerProps": {
      "value": -1,
      "placeholder": "-1",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "threads_batch",
    "title": "Threads (Batch)",
    "description": "Number of threads for batch and prompt processing (default: same as Threads).",
    "controllerType": "input",
    "controllerProps": {
      "value": -1,
      "placeholder": "-1 (same as Threads)",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "ctx_shift",
    "title": "Context Shift",
    "description": "Allow the model to cut text from the beginning to accommodate new text in its context",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "n_predict",
    "title": "Max Tokens to Predict",
    "description": "Maximum number of tokens to generate (-1 = infinity).",
    "controllerType": "input",
    "controllerProps": {
      "value": -1,
      "placeholder": "-1",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "batch_size",
    "title": "Batch Size",
    "description": "Logical maximum batch size for processing prompts.",
    "controllerType": "input",
    "controllerProps": {
      "value": 2048,
      "placeholder": "2048",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "ubatch_size",
    "title": "uBatch Size",
    "description": "Physical maximum batch size for processing prompts.",
    "controllerType": "input",
    "controllerProps": {
      "value": 512,
      "placeholder": "512",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "device",
    "title": "Devices for Offload",
    "description": "Comma-separated list of devices to use for offloading (e.g., 'CUDA0', 'CUDA0,CUDA1'). Leave empty to use default/CPU only.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "CUDA0",
      "type": "text"
    }
  },
  {
    "key": "split_mode",
    "title": "GPU Split Mode",
    "description": "How to split the model across multiple GPUs.",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "layer",
      "options": [
        { "value": "none", "name": "None" },
        { "value": "layer", "name": "Layer" },
        { "value": "row", "name": "Row" }
      ]
    }
  },
  {
    "key": "main_gpu",
    "title": "Main GPU Index",
    "description": "The GPU to use for the model (split-mode=none) or intermediate results (split-mode=row).",
    "controllerType": "input",
    "controllerProps": {
      "value": 0,
      "placeholder": "0",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "flash_attn",
    "title": "Flash Attention",
    "description": "Enable Flash Attention for optimized performance.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "cont_batching",
    "title": "Continuous Batching",
    "description": "Enable continuous batching (a.k.a. dynamic batching) for concurrent requests (default: enabled).",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "no_mmap",
    "title": "Disable mmap",
    "description": "Do not memory-map the model (slower load but may reduce pageouts if not using mlock).",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "mlock",
    "title": "MLock",
    "description": "Force the system to keep the model in RAM, preventing swapping/compression.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "no_kv_offload",
    "title": "Disable KV Offload",
    "description": "Disable KV cache offload to GPU (if GPU is used).",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "cache_type_k",
    "title": "KV Cache K Type",
    "description": "KV cache data type for Keys (default: f16).",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "f16",
      "options": [
        { "value": "f32", "name": "f32" },
        { "value": "f16", "name": "f16" },
        { "value": "bf16", "name": "bf16" },
        { "value": "q8_0", "name": "q8_0" },
        { "value": "q4_0", "name": "q4_0" },
        { "value": "q4_1", "name": "q4_1" },
        { "value": "iq4_nl", "name": "iq4_nl" },
        { "value": "q5_0", "name": "q5_0" },
        { "value": "q5_1", "name": "q5_1" }
      ]
    }
  },
  {
    "key": "cache_type_v",
    "title": "KV Cache V Type",
    "description": "KV cache data type for Values (default: f16).",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "f16",
      "options": [
        { "value": "f32", "name": "f32" },
        { "value": "f16", "name": "f16" },
        { "value": "bf16", "name": "bf16" },
        { "value": "q8_0", "name": "q8_0" },
        { "value": "q4_0", "name": "q4_0" },
        { "value": "q4_1", "name": "q4_1" },
        { "value": "iq4_nl", "name": "iq4_nl" },
        { "value": "q5_0", "name": "q5_0" },
        { "value": "q5_1", "name": "q5_1" }
      ]
    }
  },
  {
    "key": "defrag_thold",
    "title": "KV Cache Defragmentation Threshold",
    "description": "Threshold for KV cache defragmentation (< 0 to disable).",
    "controllerType": "input",
    "controllerProps": {
      "value": 0.1,
      "placeholder": "0.1",
      "type": "number",
      "textAlign": "right",
      "step": 0.01
    }
  },
  {
    "key": "rope_scaling",
    "title": "RoPE Scaling Method",
    "description": "RoPE frequency scaling method.",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "none",
      "options": [
        { "value": "none", "name": "None" },
        { "value": "linear", "name": "Linear" },
        { "value": "yarn", "name": "YaRN" }
      ]
    }
  },
  {
    "key": "rope_scale",
    "title": "RoPE Scale Factor",
    "description": "RoPE context scaling factor.",
    "controllerType": "input",
    "controllerProps": {
      "value": 1.0,
      "placeholder": "1.0",
      "type": "number",
      "textAlign": "right",
      "min": 0,
      "step": 0.01
    }
  },
  {
    "key": "rope_freq_base",
    "title": "RoPE Frequency Base",
    "description": "RoPE base frequency (0 = loaded from model).",
    "controllerType": "input",
    "controllerProps": {
      "value": 0,
      "placeholder": "0 (model default)",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "rope_freq_scale",
    "title": "RoPE Frequency Scale Factor",
    "description": "RoPE frequency scaling factor.",
    "controllerType": "input",
    "controllerProps": {
      "value": 1.0,
      "placeholder": "1.0",
      "type": "number",
      "textAlign": "right",
      "min": 0,
      "step": 0.01
    }
  },
  {
    "key": "mirostat",
    "title": "Mirostat Mode",
    "description": "Use Mirostat sampling (0: disabled, 1: Mirostat V1, 2: Mirostat V2).",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": 0,
      "options": [
        { "value": 0, "name": "Disabled" },
        { "value": 1, "name": "Mirostat V1" },
        { "value": 2, "name": "Mirostat V2" }
      ]
    }
  },
  {
    "key": "mirostat_lr",
    "title": "Mirostat Learning Rate",
    "description": "Mirostat learning rate (eta).",
    "controllerType": "input",
    "controllerProps": {
      "value": 0.1,
      "placeholder": "0.1",
      "type": "number",
      "textAlign": "right",
      "min": 0,
      "step": 0.01
    }
  },
  {
    "key": "mirostat_ent",
    "title": "Mirostat Target Entropy",
    "description": "Mirostat target entropy (tau).",
    "controllerType": "input",
    "controllerProps": {
      "value": 5.0,
      "placeholder": "5.0",
      "type": "number",
      "textAlign": "right",
      "min": 0,
      "step": 0.01
    }
  },
  {
    "key": "grammar_file",
    "title": "Grammar File",
    "description": "Path to a BNF-like grammar file to constrain generations.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "path/to/grammar.gbnf",
      "type": "text"
    }
  },
  {
    "key": "json_schema_file",
    "title": "JSON Schema File",
    "description": "Path to a JSON schema file to constrain generations.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "path/to/schema.json",
      "type": "text"
    }
  }
]
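A regression of this kind is cheap to guard against with a check over the declared keys. A hypothetical sketch (the file path and the expected-key list are assumptions for illustration, not an existing test in this repository):

```typescript
import { readFileSync } from "node:fs";

// Keys the runtime expects settings.json to declare (illustrative subset).
const expectedKeys = ["version_backend", "auto_update_engine", "auto_unload"];

// Parse the declared setting definitions and collect their key names.
const defs: { key: string }[] = JSON.parse(
  readFileSync("settings.json", "utf8")
);
const declared = new Set(defs.map((d) => d.key));

// Fail fast if any expected key is missing or misspelled.
for (const key of expectedKeys) {
  if (!declared.has(key)) {
    throw new Error(`settings.json is missing expected key: "${key}"`);
  }
}
console.log("settings.json declares all expected keys");
```

Run against the corrected file above, this passes; against the pre-fix file it would fail on `auto_unload`.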