Louis a1111033d9
chore: allow users to setting model offload (#5134)
* chore: allow users to setting model offload

* chore: apply model.yaml configurations to default model settings

* chore: fallback default value
2025-05-29 13:29:32 +07:00

118 lines
3.2 KiB
JSON

[
{
"key": "auto_unload_models",
"title": "Auto-Unload Old Models",
"description": "Automatically unloads models that are not in use to free up memory. Ensure only one model is loaded at a time.",
"controllerType": "checkbox",
"controllerProps": {
"value": true
}
},
{
"key": "cont_batching",
"title": "Continuous Batching",
"description": "Allows processing prompts in parallel with text generation, which usually improves performance.",
"controllerType": "checkbox",
"controllerProps": {
"value": true
}
},
{
"key": "n_parallel",
"title": "Parallel Operations",
"description": "Number of prompts that can be processed simultaneously by the model.",
"controllerType": "input",
"controllerProps": {
"value": "4",
"placeholder": "4",
"type": "number",
"textAlign": "right"
}
},
{
"key": "cpu_threads",
"title": "CPU Threads",
"description": "Number of CPU cores used for model processing when running without GPU.",
"controllerType": "input",
"controllerProps": {
"value": "-1",
"placeholder": "Number of CPU threads",
"type": "number",
"textAlign": "right"
}
},
{
"key": "threads_batch",
"title": "Threads (Batch)",
"description": "Number of threads for batch and prompt processing (default: same as Threads).",
"controllerType": "input",
"controllerProps": {
"value": -1,
"placeholder": "-1 (same as Threads)",
"type": "number"
}
},
{
"key": "flash_attn",
"title": "Flash Attention",
"description": "Optimizes memory usage and speeds up model inference using an efficient attention implementation.",
"controllerType": "checkbox",
"controllerProps": {
"value": true
}
},
{
"key": "caching_enabled",
"title": "Caching",
"description": "Stores recent prompts and responses to improve speed when similar questions are asked.",
"controllerType": "checkbox",
"controllerProps": {
"value": true
}
},
{
"key": "cache_type",
"title": "KV Cache Type",
"description": "Controls memory usage and precision trade-off.",
"controllerType": "dropdown",
"controllerProps": {
"value": "f16",
"options": [
{
"value": "q4_0",
"name": "q4_0"
},
{
"value": "q8_0",
"name": "q8_0"
},
{
"value": "f16",
"name": "f16"
}
]
}
},
{
"key": "use_mmap",
"title": "mmap",
"description": "Loads model files more efficiently by mapping them to memory, reducing RAM usage.",
"controllerType": "checkbox",
"controllerProps": {
"value": true
}
},
{
"key": "hugging-face-access-token",
"title": "Hugging Face Access Token",
"description": "Access tokens programmatically authenticate your identity to the Hugging Face Hub, allowing applications to perform specific actions specified by the scope of permissions granted.",
"controllerType": "input",
"controllerProps": {
"value": "",
"placeholder": "hf_**********************************",
"type": "password",
"inputActions": ["unobscure", "copy"]
}
}
]