diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json index b8b6ddd14..af8a42c51 100644 --- a/extensions/llamacpp-extension/settings.json +++ b/extensions/llamacpp-extension/settings.json @@ -1,45 +1,121 @@ [ { - "key": "port", - "title": "Port", - "description": "Port", + "key": "threads", + "title": "Threads", + "description": "Number of threads to use during generation (-1 for logical cores).", "controllerType": "input", "controllerProps": { - "value": "8080", - "placeholder": "8080", + "value": -1, + "placeholder": "-1", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "threads_batch", + "title": "Threads (Batch)", + "description": "Number of threads for batch and prompt processing (default: same as Threads).", + "controllerType": "input", + "controllerProps": { + "value": -1, + "placeholder": "-1 (same as Threads)", "type": "number", "textAlign": "right" } }, { - "key": "cont_batching", - "title": "Continuous Batching", - "description": "Allows processing prompts in parallel with text generation, which usually improves performance.", - "controllerType": "checkbox", - "controllerProps": { - "value": true - } - }, - { - "key": "n_parallel", - "title": "Parallel Operations", - "description": "Number of prompts that can be processed simultaneously by the model.", + "key": "ctx_size", + "title": "Context Size", + "description": "Size of the prompt context (0 = loaded from model).", "controllerType": "input", "controllerProps": { - "value": "4", - "placeholder": "4", + "value": 8192, + "placeholder": "8192", "type": "number", "textAlign": "right" } }, { - "key": "cpu_threads", - "title": "CPU Threads", - "description": "Number of CPU cores used for model processing when running without GPU.", + "key": "n_predict", + "title": "Max Tokens to Predict", + "description": "Maximum number of tokens to generate (-1 = infinity).", + "controllerType": "input", + "controllerProps": { + "value": -1, + 
"placeholder": "-1", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "batch_size", + "title": "Batch Size", + "description": "Logical maximum batch size for processing prompts.", + "controllerType": "input", + "controllerProps": { + "value": 2048, + "placeholder": "2048", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "ubatch_size", + "title": "uBatch Size", + "description": "Physical maximum batch size for processing prompts.", + "controllerType": "input", + "controllerProps": { + "value": 512, + "placeholder": "512", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "n_gpu_layers", + "title": "GPU Layers", + "description": "Number of model layers to offload to the GPU (-1 for all layers, 0 for CPU only).", + "controllerType": "input", + "controllerProps": { + "value": -1, + "placeholder": "-1", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "device", + "title": "Devices for Offload", + "description": "Comma-separated list of devices to use for offloading (e.g., 'cuda:0', 'cuda:0,cuda:1'). 
Leave empty to use default/CPU only.", "controllerType": "input", "controllerProps": { "value": "", - "placeholder": "Number of CPU threads", + "placeholder": "cuda:0", + "type": "text" + } + }, + { + "key": "split_mode", + "title": "GPU Split Mode", + "description": "How to split the model across multiple GPUs.", + "controllerType": "dropdown", + "controllerProps": { + "value": "layer", + "options": [ + { "value": "none", "name": "None" }, + { "value": "layer", "name": "Layer" }, + { "value": "row", "name": "Row" } + ] + } + }, + { + "key": "main_gpu", + "title": "Main GPU Index", + "description": "The GPU to use for the model (split-mode=none) or intermediate results (split-mode=row).", + "controllerType": "input", + "controllerProps": { + "value": 0, + "placeholder": "0", + "type": "number", + "textAlign": "right" } @@ -47,52 +123,330 @@ { "key": "flash_attn", "title": "Flash Attention", - "description": "Optimizes memory usage and speeds up model inference using an efficient attention implementation.", + "description": "Enable Flash Attention for optimized performance.", "controllerType": "checkbox", "controllerProps": { - "value": true + "value": false } }, - { - "key": "caching_enabled", - "title": "Caching", - "description": "Stores recent prompts and responses to improve speed when similar questions are asked.", + { + "key": "cont_batching", + "title": "Continuous Batching", + "description": "Enable continuous batching (a.k.a dynamic batching) for concurrent requests (default: enabled).", "controllerType": "checkbox", "controllerProps": { "value": true } }, { - "key": "cache_type", - "title": "KV Cache Type", - "description": "Controls memory usage and precision trade-off.", + "key": "no_mmap", + "title": "Disable mmap", + "description": "Do not memory-map model (slower load but may reduce pageouts if not using mlock).", + "controllerType": "checkbox", + "controllerProps": { + "value": false + } + }, + { + "key": "mlock", + "title": "MLock", + "description": "Force 
system to keep model in RAM, preventing swapping/compression.", + "controllerType": "checkbox", + "controllerProps": { + "value": false + } + }, + { + "key": "no_kv_offload", + "title": "Disable KV Offload", + "description": "Disable KV cache offload to GPU (if GPU is used).", + "controllerType": "checkbox", + "controllerProps": { + "value": false + } + }, + { + "key": "cache_type_k", + "title": "KV Cache K Type", + "description": "KV cache data type for Keys (default: f16).", "controllerType": "dropdown", "controllerProps": { "value": "f16", "options": [ - { - "value": "q4_0", - "name": "q4_0" - }, - { - "value": "q8_0", - "name": "q8_0" - }, - { - "value": "f16", - "name": "f16" - } + { "value": "f32", "name": "f32" }, + { "value": "f16", "name": "f16" }, + { "value": "bf16", "name": "bf16" }, + { "value": "q8_0", "name": "q8_0" }, + { "value": "q4_0", "name": "q4_0" }, + { "value": "q4_1", "name": "q4_1" }, + { "value": "iq4_nl", "name": "iq4_nl" }, + { "value": "q5_0", "name": "q5_0" }, + { "value": "q5_1", "name": "q5_1" } ] } }, { - "key": "use_mmap", - "title": "mmap", - "description": "Loads model files more efficiently by mapping them to memory, reducing RAM usage.", - "controllerType": "checkbox", + "key": "cache_type_v", + "title": "KV Cache V Type", + "description": "KV cache data type for Values (default: f16).", + "controllerType": "dropdown", "controllerProps": { - "value": true + "value": "f16", + "options": [ + { "value": "f32", "name": "f32" }, + { "value": "f16", "name": "f16" }, + { "value": "bf16", "name": "bf16" }, + { "value": "q8_0", "name": "q8_0" }, + { "value": "q4_0", "name": "q4_0" }, + { "value": "q4_1", "name": "q4_1" }, + { "value": "iq4_nl", "name": "iq4_nl" }, + { "value": "q5_0", "name": "q5_0" }, + { "value": "q5_1", "name": "q5_1" } + ] + } + }, + { + "key": "defrag_thold", + "title": "KV Cache Defragmentation Threshold", + "description": "Threshold for KV cache defragmentation (< 0 to disable).", + "controllerType": "input", + 
"controllerProps": { + "value": 0.1, + "placeholder": "0.1", + "type": "number", + "textAlign": "right", + "step": 0.01 + } + }, + { + "key": "rope_scaling", + "title": "RoPE Scaling Method", + "description": "RoPE frequency scaling method.", + "controllerType": "dropdown", + "controllerProps": { + "value": "none", + "options": [ + { "value": "none", "name": "None" }, + { "value": "linear", "name": "Linear" }, + { "value": "yarn", "name": "YaRN" } + ] + } + }, + { + "key": "rope_scale", + "title": "RoPE Scale Factor", + "description": "RoPE context scaling factor.", + "controllerType": "input", + "controllerProps": { + "value": 1.0, + "placeholder": "1.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "rope_freq_base", + "title": "RoPE Frequency Base", + "description": "RoPE base frequency (0 = loaded from model).", + "controllerType": "input", + "controllerProps": { + "value": 0, + "placeholder": "0 (model default)", + "type": "number", + "textAlign": "right" + } + }, + { + "key": "rope_freq_scale", + "title": "RoPE Frequency Scale Factor", + "description": "RoPE frequency scaling factor.", + "controllerType": "input", + "controllerProps": { + "value": 1.0, + "placeholder": "1.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "temp", + "title": "Temperature", + "description": "Temperature for sampling (higher = more random).", + "controllerType": "input", + "controllerProps": { + "value": 0.8, + "placeholder": "0.8", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "top_k", + "title": "Top K", + "description": "Top-K sampling (0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 40, + "placeholder": "40", + "type": "number", + "textAlign": "right", + "min": 0 + } + }, + { + "key": "top_p", + "title": "Top P", + "description": "Top-P sampling (1.0 = disabled).", + "controllerType": "input", + 
"controllerProps": { + "value": 0.9, + "placeholder": "0.9", + "type": "number", + "textAlign": "right", + "min": 0, + "max": 1.0, + "step": 0.01 + } + }, + { + "key": "min_p", + "title": "Min P", + "description": "Min-P sampling (0.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 0.1, + "placeholder": "0.1", + "type": "number", + "textAlign": "right", + "min": 0, + "max": 1.0, + "step": 0.01 + } + }, + { + "key": "repeat_last_n", + "title": "Repeat Last N", + "description": "Number of tokens to consider for repeat penalty (0 = disabled, -1 = ctx_size).", + "controllerType": "input", + "controllerProps": { + "value": 64, + "placeholder": "64", + "type": "number", + "textAlign": "right", + "min": -1 + } + }, + { + "key": "repeat_penalty", + "title": "Repeat Penalty", + "description": "Penalize repeating token sequences (1.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 1.0, + "placeholder": "1.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "presence_penalty", + "title": "Presence Penalty", + "description": "Repeat alpha presence penalty (0.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 0.0, + "placeholder": "0.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "frequency_penalty", + "title": "Frequency Penalty", + "description": "Repeat alpha frequency penalty (0.0 = disabled).", + "controllerType": "input", + "controllerProps": { + "value": 0.0, + "placeholder": "0.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "mirostat", + "title": "Mirostat Mode", + "description": "Use Mirostat sampling (0: disabled, 1: Mirostat V1, 2: Mirostat V2).", + "controllerType": "dropdown", + "controllerProps": { + "value": 0, + "options": [ + { "value": 0, "name": "Disabled" }, + { "value": 1, "name": "Mirostat V1" }, + { "value": 2, "name": 
"Mirostat V2" } + ] + } + }, + { + "key": "mirostat_lr", + "title": "Mirostat Learning Rate", + "description": "Mirostat learning rate (eta).", + "controllerType": "input", + "controllerProps": { + "value": 0.1, + "placeholder": "0.1", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "mirostat_ent", + "title": "Mirostat Target Entropy", + "description": "Mirostat target entropy (tau).", + "controllerType": "input", + "controllerProps": { + "value": 5.0, + "placeholder": "5.0", + "type": "number", + "textAlign": "right", + "min": 0, + "step": 0.01 + } + }, + { + "key": "grammar_file", + "title": "Grammar File", + "description": "Path to a BNF-like grammar file to constrain generations.", + "controllerType": "input", + "controllerProps": { + "value": "", + "placeholder": "path/to/grammar.gbnf", + "type": "text" + } + }, + { + "key": "json_schema_file", + "title": "JSON Schema File", + "description": "Path to a JSON schema file to constrain generations.", + "controllerType": "input", + "controllerProps": { + "value": "", + "placeholder": "path/to/schema.json", + "type": "text" } } ]