[ { "key": "cont_batching", "title": "Continuous Batching", "description": "Allows processing prompts in parallel with text generation, which usually improves performance.", "controllerType": "checkbox", "controllerProps": { "value": true } }, { "key": "n_parallel", "title": "Parallel Operations", "description": "Number of prompts that can be processed simultaneously by the model.", "controllerType": "input", "controllerProps": { "value": "4", "placeholder": "4", "type": "number", "textAlign": "right" } }, { "key": "cpu_threads", "title": "CPU Threads", "description": "Number of CPU cores used for model processing when running without GPU.", "controllerType": "input", "controllerProps": { "value": "-1", "placeholder": "Number of CPU threads", "type": "number", "textAlign": "right" } }, { "key": "threads_batch", "title": "Threads (Batch)", "description": "Number of threads used for batch and prompt processing (default: -1, same as CPU Threads).", "controllerType": "input", "controllerProps": { "value": -1, "placeholder": "-1 (same as CPU Threads)", "type": "number" } }, { "key": "flash_attn", "title": "Flash Attention", "description": "Optimizes memory usage and speeds up model inference using an efficient attention implementation.", "controllerType": "checkbox", "controllerProps": { "value": true } }, { "key": "caching_enabled", "title": "Caching", "description": "Stores recent prompts and responses to improve speed when similar questions are asked.", "controllerType": "checkbox", "controllerProps": { "value": true } }, { "key": "cache_type", "title": "KV Cache Type", "description": "Controls memory usage and precision trade-off.", "controllerType": "dropdown", "controllerProps": { "value": "f16", "options": [ { "value": "q4_0", "name": "q4_0" }, { "value": "q8_0", "name": "q8_0" }, { "value": "f16", "name": "f16" } ] } }, { "key": "use_mmap", "title": "mmap", "description": "Loads model files more efficiently by mapping them to memory, reducing RAM usage.", "controllerType": 
"checkbox", "controllerProps": { "value": true } } ]