jan/extensions/inference-cortex-extension/resources/default_settings.json

[
  {
    "key": "auto_unload_models",
    "title": "Auto-Unload Old Models",
    "description": "Automatically unloads models that are not in use to free up memory. Ensures only one model is loaded at a time.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "cont_batching",
    "title": "Continuous Batching",
    "description": "Allows processing prompts in parallel with text generation, which usually improves performance.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "n_parallel",
    "title": "Parallel Operations",
    "description": "Number of prompts that can be processed simultaneously by the model.",
    "controllerType": "input",
    "controllerProps": {
      "value": "4",
      "placeholder": "4",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "cpu_threads",
    "title": "CPU Threads",
    "description": "Number of CPU cores used for model processing when running without a GPU.",
    "controllerType": "input",
    "controllerProps": {
      "value": "-1",
      "placeholder": "Number of CPU threads",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "threads_batch",
    "title": "Threads (Batch)",
    "description": "Number of threads for batch and prompt processing (default: same as Threads).",
    "controllerType": "input",
    "controllerProps": {
      "value": "-1",
      "placeholder": "-1 (same as Threads)",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "flash_attn",
    "title": "Flash Attention",
    "description": "Optimizes memory usage and speeds up model inference using an efficient attention implementation.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "caching_enabled",
    "title": "Caching",
    "description": "Stores recent prompts and responses to improve speed when similar questions are asked.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "cache_type",
    "title": "KV Cache Type",
    "description": "Controls memory usage and precision trade-off.",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "f16",
      "options": [
        {
          "value": "q4_0",
          "name": "q4_0"
        },
        {
          "value": "q8_0",
          "name": "q8_0"
        },
        {
          "value": "f16",
          "name": "f16"
        }
      ]
    }
  },
  {
    "key": "use_mmap",
    "title": "mmap",
    "description": "Loads model files more efficiently by mapping them to memory, reducing RAM usage.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "hugging-face-access-token",
    "title": "Hugging Face Access Token",
    "description": "Access tokens programmatically authenticate your identity to the Hugging Face Hub, allowing applications to perform specific actions specified by the scope of permissions granted.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "hf_**********************************",
      "type": "password",
      "inputActions": ["unobscure", "copy"]
    }
  }
]