[ { "key": "threads", "title": "Threads", "description": "Number of threads to use during generation (-1 for logical cores).", "controllerType": "input", "controllerProps": { "value": -1, "placeholder": "-1", "type": "number", "textAlign": "right" } }, { "key": "threads_batch", "title": "Threads (Batch)", "description": "Number of threads for batch and prompt processing (default: same as Threads).", "controllerType": "input", "controllerProps": { "value": -1, "placeholder": "-1 (same as Threads)", "type": "number", "textAlign": "right" } }, { "key": "ctx_size", "title": "Context Size", "description": "Size of the prompt context (0 = loaded from model).", "controllerType": "input", "controllerProps": { "value": 8192, "placeholder": "8192", "type": "number", "textAlign": "right" } }, { "key": "n_predict", "title": "Max Tokens to Predict", "description": "Maximum number of tokens to generate (-1 = infinity).", "controllerType": "input", "controllerProps": { "value": -1, "placeholder": "-1", "type": "number", "textAlign": "right" } }, { "key": "batch_size", "title": "Batch Size", "description": "Logical maximum batch size for processing prompts.", "controllerType": "input", "controllerProps": { "value": 2048, "placeholder": "2048", "type": "number", "textAlign": "right" } }, { "key": "ubatch_size", "title": "uBatch Size", "description": "Physical maximum batch size for processing prompts.", "controllerType": "input", "controllerProps": { "value": 512, "placeholder": "512", "type": "number", "textAlign": "right" } }, { "key": "n_gpu_layers", "title": "GPU Layers", "description": "Number of model layers to offload to the GPU (-1 for all layers, 0 for CPU only).", "controllerType": "input", "controllerProps": { "value": -1, "placeholder": "-1", "type": "number", "textAlign": "right" } },
{ "key": "device", "title": "Devices for Offload", "description": "Comma-separated list of devices to use for offloading (e.g., 'cuda:0', 'cuda:0,cuda:1'). Leave empty to use default/CPU only.", "controllerType": "input", "controllerProps": { "value": "", "placeholder": "cuda:0", "type": "text" } }, { "key": "split_mode", "title": "GPU Split Mode", "description": "How to split the model across multiple GPUs.", "controllerType": "dropdown", "controllerProps": { "value": "layer", "options": [ { "value": "none", "name": "None" }, { "value": "layer", "name": "Layer" }, { "value": "row", "name": "Row" } ] } }, { "key": "main_gpu", "title": "Main GPU Index", "description": "The GPU to use for the model (split-mode=none) or intermediate results (split-mode=row).", "controllerType": "input", "controllerProps": { "value": 0, "placeholder": "0", "type": "number", "textAlign": "right" } }, { "key": "flash_attn", "title": "Flash Attention", "description": "Enable Flash Attention for optimized performance.", "controllerType": "checkbox", "controllerProps": { "value": false } }, { "key": "cont_batching", "title": "Continuous Batching", "description": "Enable continuous batching (a.k.a. dynamic batching) for concurrent requests (default: enabled).", "controllerType": "checkbox", "controllerProps": { "value": true } }, { "key": "no_mmap", "title": "Disable mmap", "description": "Do not memory-map model (slower load but may reduce pageouts if not using mlock).", "controllerType": "checkbox", "controllerProps": { "value": false } }, { "key": "mlock", "title": "MLock", "description": "Force system to keep model in RAM, preventing swapping/compression.", "controllerType": "checkbox", "controllerProps": { "value": false } }, { "key": "no_kv_offload", "title": "Disable KV Offload", "description": "Disable KV cache offload to GPU (if GPU is used).", "controllerType": "checkbox", "controllerProps": { "value": false } }, { "key": "cache_type_k", "title": "KV Cache K Type", "description": "KV cache data type for Keys (default: f16).", "controllerType": "dropdown", "controllerProps": { "value": "f16", "options": [ { "value": "f32", "name": "f32"
}, { "value": "f16", "name": "f16" }, { "value": "bf16", "name": "bf16" }, { "value": "q8_0", "name": "q8_0" }, { "value": "q4_0", "name": "q4_0" }, { "value": "q4_1", "name": "q4_1" }, { "value": "iq4_nl", "name": "iq4_nl" }, { "value": "q5_0", "name": "q5_0" }, { "value": "q5_1", "name": "q5_1" } ] } }, { "key": "cache_type_v", "title": "KV Cache V Type", "description": "KV cache data type for Values (default: f16).", "controllerType": "dropdown", "controllerProps": { "value": "f16", "options": [ { "value": "f32", "name": "f32" }, { "value": "f16", "name": "f16" }, { "value": "bf16", "name": "bf16" }, { "value": "q8_0", "name": "q8_0" }, { "value": "q4_0", "name": "q4_0" }, { "value": "q4_1", "name": "q4_1" }, { "value": "iq4_nl", "name": "iq4_nl" }, { "value": "q5_0", "name": "q5_0" }, { "value": "q5_1", "name": "q5_1" } ] } }, { "key": "defrag_thold", "title": "KV Cache Defragmentation Threshold", "description": "Threshold for KV cache defragmentation (< 0 to disable).", "controllerType": "input", "controllerProps": { "value": 0.1, "placeholder": "0.1", "type": "number", "textAlign": "right", "step": 0.01 } }, { "key": "rope_scaling", "title": "RoPE Scaling Method", "description": "RoPE frequency scaling method.", "controllerType": "dropdown", "controllerProps": { "value": "none", "options": [ { "value": "none", "name": "None" }, { "value": "linear", "name": "Linear" }, { "value": "yarn", "name": "YaRN" } ] } }, { "key": "rope_scale", "title": "RoPE Scale Factor", "description": "RoPE context scaling factor.", "controllerType": "input", "controllerProps": { "value": 1.0, "placeholder": "1.0", "type": "number", "textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "rope_freq_base", "title": "RoPE Frequency Base", "description": "RoPE base frequency (0 = loaded from model).", "controllerType": "input", "controllerProps": { "value": 0, "placeholder": "0 (model default)", "type": "number", "textAlign": "right" } },
{ "key": "rope_freq_scale", "title": "RoPE Frequency Scale Factor", "description": "RoPE frequency scaling factor.", "controllerType": "input", "controllerProps": { "value": 1.0, "placeholder": "1.0", "type": "number", "textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "temp", "title": "Temperature", "description": "Temperature for sampling (higher = more random).", "controllerType": "input", "controllerProps": { "value": 0.8, "placeholder": "0.8", "type": "number", "textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "top_k", "title": "Top K", "description": "Top-K sampling (0 = disabled).", "controllerType": "input", "controllerProps": { "value": 40, "placeholder": "40", "type": "number", "textAlign": "right", "min": 0 } }, { "key": "top_p", "title": "Top P", "description": "Top-P sampling (1.0 = disabled).", "controllerType": "input", "controllerProps": { "value": 0.9, "placeholder": "0.9", "type": "number", "textAlign": "right", "min": 0, "max": 1.0, "step": 0.01 } }, { "key": "min_p", "title": "Min P", "description": "Min-P sampling (0.0 = disabled).", "controllerType": "input", "controllerProps": { "value": 0.1, "placeholder": "0.1", "type": "number", "textAlign": "right", "min": 0, "max": 1.0, "step": 0.01 } }, { "key": "repeat_last_n", "title": "Repeat Last N", "description": "Number of tokens to consider for repeat penalty (0 = disabled, -1 = ctx_size).", "controllerType": "input", "controllerProps": { "value": 64, "placeholder": "64", "type": "number", "textAlign": "right", "min": -1 } }, { "key": "repeat_penalty", "title": "Repeat Penalty", "description": "Penalize repeating token sequences (1.0 = disabled).", "controllerType": "input", "controllerProps": { "value": 1.0, "placeholder": "1.0", "type": "number", "textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "presence_penalty", "title": "Presence Penalty", "description": "Repeat alpha presence penalty (0.0 = disabled).", "controllerType": "input", "controllerProps": { "value": 0.0, "placeholder": "0.0", "type": "number",
"textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "frequency_penalty", "title": "Frequency Penalty", "description": "Repeat alpha frequency penalty (0.0 = disabled).", "controllerType": "input", "controllerProps": { "value": 0.0, "placeholder": "0.0", "type": "number", "textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "mirostat", "title": "Mirostat Mode", "description": "Use Mirostat sampling (0: disabled, 1: Mirostat V1, 2: Mirostat V2).", "controllerType": "dropdown", "controllerProps": { "value": 0, "options": [ { "value": 0, "name": "Disabled" }, { "value": 1, "name": "Mirostat V1" }, { "value": 2, "name": "Mirostat V2" } ] } }, { "key": "mirostat_lr", "title": "Mirostat Learning Rate", "description": "Mirostat learning rate (eta).", "controllerType": "input", "controllerProps": { "value": 0.1, "placeholder": "0.1", "type": "number", "textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "mirostat_ent", "title": "Mirostat Target Entropy", "description": "Mirostat target entropy (tau).", "controllerType": "input", "controllerProps": { "value": 5.0, "placeholder": "5.0", "type": "number", "textAlign": "right", "min": 0, "step": 0.01 } }, { "key": "grammar_file", "title": "Grammar File", "description": "Path to a BNF-like grammar file to constrain generations.", "controllerType": "input", "controllerProps": { "value": "", "placeholder": "path/to/grammar.gbnf", "type": "text" } }, { "key": "json_schema_file", "title": "JSON Schema File", "description": "Path to a JSON schema file to constrain generations.", "controllerType": "input", "controllerProps": { "value": "", "placeholder": "path/to/schema.json", "type": "text" } } ]