# Reusable schema definitions for model objects and for the responses of the
# list / get / delete / start / stop / download model operations.
components:
  schemas:
    # Envelope for a collection of Model objects.
    ListModelsResponse:
      type: object
      properties:
        object:
          type: string
          enum: [list]
        data:
          type: array
          items:
            $ref: "#/components/schemas/Model"
      required:
        - object
        - data

    # Full model record. Note that `settings` and `parameters` carry their
    # values as strings (see the examples), unlike GetModelResponse below.
    Model:
      type: object
      properties:
        type:
          type: string
          default: "model"
          description: "The type of the object."
        version:
          type: string
          default: "1"
          description: "The version number of the model."
        id:
          type: string
          description: "Unique identifier used in chat-completions model_name, matches folder name."
          example: "zephyr-7b"
        name:
          type: string
          description: "Name of the model."
          example: "Zephyr 7B"
        owned_by:
          type: string
          description: "Compatibility field for OpenAI."
          default: ""
        created:
          type: integer
          format: int64
          description: "Unix timestamp representing the creation time."
        description:
          type: string
          description: "Description of the model."
        state:
          type: string
          # `null` is listed in the enum, so the property is also declared
          # nullable; otherwise `type: string` rejects a null value.
          # NOTE(review): `nullable` is OpenAPI 3.0 syntax - confirm the spec
          # version (3.1 would use `type: [string, "null"]` instead).
          nullable: true
          enum: [null, "downloading", "ready", "starting", "stopping"]
          description: "Current state of the model."
        format:
          type: string
          description: "State format of the model, distinct from the engine."
          example: "ggufv3"
        source_url:
          type: string
          format: uri
          description: "URL to the source of the model."
          example: "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/blob/main/zephyr-7b-beta.Q4_K_M.gguf"
        settings:
          type: object
          properties:
            ctx_len:
              type: string
              description: "Context length."
              example: "2048"
            ngl:
              type: string
              description: "Number of layers."
              example: "100"
            embedding:
              type: string
              description: "Indicates if embedding is enabled."
              example: "true"
            n_parallel:
              type: string
              description: "Number of parallel processes."
              example: "4"
          additionalProperties: false
        parameters:
          type: object
          properties:
            temperature:
              type: string
              description: "Temperature setting for the model."
              example: "0.7"
            token_limit:
              type: string
              description: "Token limit for the model."
              example: "2048"
            top_k:
              type: string
              description: "Top-k setting for the model."
              example: "0"
            top_p:
              type: string
              description: "Top-p setting for the model."
              example: "1"
            stream:
              type: string
              description: "Indicates if streaming is enabled."
              example: "true"
          additionalProperties: false
        metadata:
          type: object
          description: "Additional metadata."
        assets:
          type: array
          items:
            type: string
          description: "List of assets related to the model."
      required:
        - source_url

    # Minimal model summary: id, object type, creation time and owner.
    ModelObject:
      type: object
      properties:
        id:
          type: string
          # Plain scalars here: a `|` block scalar would embed the surrounding
          # double quotes and a trailing newline in the rendered description.
          description: "The identifier of the model."
          example: "zephyr-7b"
        object:
          type: string
          description: "The type of the object, indicating it's a model."
          default: "model"
        created:
          type: integer
          format: int64
          description: "Unix timestamp representing the creation time of the model."
          # Integer example so that it matches the declared integer/int64 type.
          example: 1253935178
        owned_by:
          type: string
          description: "The entity that owns the model."
          example: "_"

    # Detailed view of a single model with typed (integer / number / boolean)
    # engine and model parameters.
    GetModelResponse:
      type: object
      properties:
        id:
          type: string
          description: "The identifier of the model."
          example: "zephyr-7b"
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: "model"
        created:
          type: integer
          format: int64
          description: "Unix timestamp representing the creation time of the model."
        owned_by:
          type: string
          description: "The entity that owns the model."
          example: "_"
        state:
          type: string
          enum: [not_downloaded, downloaded, running, stopped]
          description: "The current state of the model."
        source_url:
          type: string
          format: uri
          description: "URL to the source of the model."
          example: "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/blob/main/zephyr-7b-beta.Q4_K_M.gguf"
        engine_parameters:
          type: object
          properties:
            pre_prompt:
              type: string
              description: "Predefined prompt used for setting up internal configurations."
              default: ""
              example: "Initial setup complete."
            system_prompt:
              type: string
              description: "Prefix used for system-level prompts."
              default: "SYSTEM: "
            user_prompt:
              type: string
              description: "Prefix used for user prompts."
              default: "USER: "
            ai_prompt:
              type: string
              description: "Prefix used for assistant prompts."
              default: "ASSISTANT: "
            ngl:
              type: integer
              description: "Number of neural network layers loaded onto the GPU for acceleration."
              minimum: 0
              maximum: 100
              default: 100
              example: 100
            ctx_len:
              type: integer
              description: "Context length for model operations, varies based on the specific model."
              minimum: 128
              maximum: 4096
              default: 2048
              example: 2048
            n_parallel:
              type: integer
              description: "Number of parallel operations, relevant when continuous batching is enabled."
              minimum: 1
              maximum: 10
              default: 1
              example: 4
            cont_batching:
              type: boolean
              description: "Indicates if continuous batching is used for processing."
              default: false
              example: false
            cpu_threads:
              type: integer
              description: "Number of threads allocated for CPU-based inference."
              minimum: 1
              example: 8
            embedding:
              type: boolean
              description: "Indicates if embedding layers are enabled in the model."
              default: true
              example: true
        model_parameters:
          type: object
          properties:
            ctx_len:
              type: integer
              description: "Maximum context length the model can handle."
              minimum: 0
              maximum: 4096
              default: 2048
              example: 2048
            ngl:
              type: integer
              description: "Number of layers in the neural network."
              minimum: 1
              maximum: 100
              default: 100
              example: 100
            embedding:
              type: boolean
              description: "Indicates if embedding layers are used."
              default: true
              example: true
            n_parallel:
              type: integer
              description: "Number of parallel processes the model can run."
              minimum: 1
              maximum: 10
              default: 1
              example: 4
            temperature:
              type: number
              description: >-
                Controls randomness in model's responses.
                Higher values lead to more random responses.
              minimum: 0.0
              maximum: 2.0
              default: 0.7
              example: 0.7
            token_limit:
              type: integer
              description: "Maximum number of tokens the model can generate in a single response."
              minimum: 1
              maximum: 4096
              default: 2048
              example: 2048
            top_k:
              type: integer
              description: "Limits the model to consider only the top k most likely next tokens at each step."
              minimum: 0
              maximum: 100
              default: 0
              example: 0
            top_p:
              type: number
              description: >-
                Nucleus sampling parameter.
                The model considers the smallest set of tokens whose
                cumulative probability exceeds the top_p value.
              minimum: 0.0
              maximum: 1.0
              default: 1.0
              example: 1.0
        metadata:
          type: object
          properties:
            engine:
              type: string
              description: "The engine used by the model."
              example: "llamacpp"
            quantization:
              type: string
              description: "Quantization parameter of the model."
              example: "Q3_K_L"
            size:
              type: string
              description: "Size of the model."
              example: "7B"
      required:
        - id
        - object
        - created
        - owned_by
        - state
        - source_url
        # NOTE(review): this list previously named `parameters`, which is not
        # a declared property of this schema; it now names the two declared
        # parameter objects. Confirm against the actual response payload.
        - engine_parameters
        - model_parameters
        - metadata

    # Result of deleting a model.
    DeleteModelResponse:
      type: object
      properties:
        id:
          type: string
          description: "The identifier of the model that was deleted."
          example: "model-zephyr-7B"
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: "model"
        deleted:
          type: boolean
          description: "Indicates whether the model was successfully deleted."
          example: true

    # Result of starting a model.
    StartModelResponse:
      type: object
      properties:
        id:
          type: string
          description: "The identifier of the model that was started."
          example: "model-zephyr-7B"
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: "model"
        state:
          type: string
          description: "The current state of the model after the start operation."
          example: "running"
      required:
        - id
        - object
        - state

    # Result of stopping a model.
    StopModelResponse:
      type: object
      properties:
        id:
          type: string
          description: "The identifier of the model that was stopped."
          example: "model-zephyr-7B"
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: "model"
        state:
          type: string
          description: "The current state of the model after the stop operation."
          example: "stopped"
      required:
        - id
        - object
        - state

    # Result of downloading a model.
    DownloadModelResponse:
      type: object
      properties:
        id:
          type: string
          description: "The identifier of the model that was downloaded."
          example: "model-zephyr-7B"
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: "model"
        state:
          type: string
          description: "The current state of the model after the download operation."
          example: "downloaded"