# Schema definitions for the model endpoints: the list wrapper, the Model
# object, and the get / delete / start / stop / download response bodies.
components:
  schemas:
    # List wrapper: `object` is always the literal `list`, `data` holds Model
    # entries.
    ListModelsResponse:
      type: object
      properties:
        object:
          type: string
          enum:
            - list
        data:
          type: array
          items:
            $ref: "#/components/schemas/Model"
      required:
        - object
        - data

    # Full model description. Only `source_url` is required.
    Model:
      type: object
      properties:
        type:
          type: string
          default: model
          description: The type of the object.
        version:
          type: string
          # Quoted so the default stays the string "1" rather than an integer.
          default: "1"
          description: The version number of the model.
        id:
          type: string
          description: >-
            Unique identifier used in chat-completions model_name, matches
            folder name.
          example: zephyr-7b
        name:
          type: string
          description: Name of the model.
          example: Zephyr 7B
        owned_by:
          type: string
          description: Compatibility field for OpenAI.
          default: ""
        created:
          type: integer
          format: int64
          description: Unix timestamp representing the creation time.
        description:
          type: string
          description: Description of the model.
        state:
          type: string
          # The first enum member is the YAML null value, not the string
          # "null". OpenAPI 3.0 only lets a typed schema accept null when
          # `nullable: true` is set, so it is declared explicitly.
          nullable: true
          enum:
            - null
            - downloading
            - ready
            - starting
            - stopping
          description: Current state of the model.
        format:
          type: string
          description: "State format of the model, distinct from the engine."
          example: ggufv3
        source_url:
          type: string
          format: uri
          description: URL to the source of the model.
          example: >-
            https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/blob/main/zephyr-7b-beta.Q4_K_M.gguf
        # Every settings value is typed as a string here, including the
        # numeric and boolean-looking ones; unknown keys are rejected.
        settings:
          type: object
          properties:
            ctx_len:
              type: string
              description: Context length.
              example: "2048"
            ngl:
              type: string
              description: Number of layers.
              example: "100"
            embedding:
              type: string
              description: Indicates if embedding is enabled.
              example: "true"
            n_parallel:
              type: string
              description: Number of parallel processes.
              example: "4"
          additionalProperties: false
        # Same convention as `settings`: string-typed values, closed key set.
        parameters:
          type: object
          properties:
            temperature:
              type: string
              description: Temperature setting for the model.
              example: "0.7"
            token_limit:
              type: string
              description: Token limit for the model.
              example: "2048"
            top_k:
              type: string
              description: Top-k setting for the model.
              example: "0"
            top_p:
              type: string
              description: Top-p setting for the model.
              example: "1"
            stream:
              type: string
              description: Indicates if streaming is enabled.
              example: "true"
          additionalProperties: false
        metadata:
          type: object
          description: Additional metadata.
        assets:
          type: array
          items:
            type: string
          description: List of assets related to the model.
      required:
        - source_url

    # Minimal model record: id, object tag, creation time and owner.
    ModelObject:
      type: object
      properties:
        id:
          type: string
          description: |
            The identifier of the model.
          example: zephyr-7b
        object:
          type: string
          description: |
            The type of the object, indicating it's a model.
          default: model
        created:
          type: integer
          format: int64
          description: |
            Unix timestamp representing the creation time of the model.
          example: 1253935178
        owned_by:
          type: string
          description: |
            The entity that owns the model.
          example: _

    # Detailed single-model response. Unlike `Model`, the engine and model
    # parameters are natively typed (integer / number / boolean) with bounds.
    GetModelResponse:
      type: object
      properties:
        id:
          type: string
          description: The identifier of the model.
          example: zephyr-7b
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: model
        created:
          type: integer
          format: int64
          description: Unix timestamp representing the creation time of the model.
        owned_by:
          type: string
          description: The entity that owns the model.
          example: _
        state:
          type: string
          enum:
            - not_downloaded
            - downloaded
            - running
            - stopped
          description: The current state of the model.
        source_url:
          type: string
          format: uri
          description: URL to the source of the model.
          example: >-
            https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/blob/main/zephyr-7b-beta.Q4_K_M.gguf
        engine_parameters:
          type: object
          properties:
            pre_prompt:
              type: string
              description: Predefined prompt used for setting up internal configurations.
              default: ""
              example: Initial setup complete.
            # The prompt prefixes below end in a trailing space; the quotes
            # keep it part of the default value.
            system_prompt:
              type: string
              description: Prefix used for system-level prompts.
              default: "SYSTEM: "
            user_prompt:
              type: string
              description: Prefix used for user prompts.
              default: "USER: "
            ai_prompt:
              type: string
              description: Prefix used for assistant prompts.
              default: "ASSISTANT: "
            ngl:
              type: integer
              description: >-
                Number of neural network layers loaded onto the GPU for
                acceleration.
              minimum: 0
              maximum: 100
              default: 100
              example: 100
            ctx_len:
              type: integer
              description: >-
                Context length for model operations, varies based on the
                specific model.
              minimum: 128
              maximum: 4096
              default: 2048
              example: 2048
            n_parallel:
              type: integer
              description: >-
                Number of parallel operations, relevant when continuous
                batching is enabled.
              minimum: 1
              maximum: 10
              default: 1
              example: 4
            cont_batching:
              type: boolean
              description: Indicates if continuous batching is used for processing.
              default: false
              example: false
            cpu_threads:
              type: integer
              description: Number of threads allocated for CPU-based inference.
              minimum: 1
              example: 8
            embedding:
              type: boolean
              description: Indicates if embedding layers are enabled in the model.
              default: true
              example: true
        model_parameters:
          type: object
          properties:
            ctx_len:
              type: integer
              description: Maximum context length the model can handle.
              minimum: 0
              maximum: 4096
              default: 2048
              example: 2048
            ngl:
              type: integer
              description: Number of layers in the neural network.
              minimum: 1
              maximum: 100
              default: 100
              example: 100
            embedding:
              type: boolean
              description: Indicates if embedding layers are used.
              default: true
              example: true
            n_parallel:
              type: integer
              description: Number of parallel processes the model can run.
              minimum: 1
              maximum: 10
              default: 1
              example: 4
            temperature:
              type: number
              description: >-
                Controls randomness in model's responses. Higher values lead
                to more random responses.
              minimum: 0
              maximum: 2
              default: 0.7
              example: 0.7
            token_limit:
              type: integer
              description: >-
                Maximum number of tokens the model can generate in a single
                response.
              minimum: 1
              maximum: 4096
              default: 2048
              example: 2048
            top_k:
              type: integer
              description: >-
                Limits the model to consider only the top k most likely next
                tokens at each step.
              minimum: 0
              maximum: 100
              default: 0
              example: 0
            top_p:
              type: number
              description: >-
                Nucleus sampling parameter. The model considers the smallest
                set of tokens whose cumulative probability exceeds the top_p
                value.
              minimum: 0
              maximum: 1
              default: 1
              example: 1
        metadata:
          type: object
          properties:
            engine:
              type: string
              description: The engine used by the model.
              enum:
                - nitro
                - openai
                - hf_inference
            quantization:
              type: string
              description: Quantization parameter of the model.
              example: Q3_K_L
            size:
              type: string
              description: Size of the model.
              example: 7B
      # Every name here is a property defined above. The list previously
      # named `parameters`, which this schema does not define; the parameter
      # objects it does define are `engine_parameters` and `model_parameters`.
      required:
        - id
        - object
        - created
        - owned_by
        - state
        - source_url
        - engine_parameters
        - model_parameters
        - metadata

    # Body of a delete response. No property is marked required.
    DeleteModelResponse:
      type: object
      properties:
        id:
          type: string
          description: The identifier of the model that was deleted.
          example: model-zephyr-7B
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: model
        deleted:
          type: boolean
          description: Indicates whether the model was successfully deleted.
          example: true

    # Body of a start response; all three properties are required.
    StartModelResponse:
      type: object
      properties:
        id:
          type: string
          description: The identifier of the model that was started.
          example: model-zephyr-7B
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: model
        state:
          type: string
          description: The current state of the model after the start operation.
          example: running
      required:
        - id
        - object
        - state

    # Body of a stop response; all three properties are required.
    StopModelResponse:
      type: object
      properties:
        id:
          type: string
          description: The identifier of the model that was stopped.
          example: model-zephyr-7B
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: model
        state:
          type: string
          description: The current state of the model after the stop operation.
          example: stopped
      required:
        - id
        - object
        - state

    # Body of a download response. No property is marked required.
    DownloadModelResponse:
      type: object
      properties:
        id:
          type: string
          description: The identifier of the model that was downloaded.
          example: model-zephyr-7B
        object:
          type: string
          description: "Type of the object, indicating it's a model."
          default: model
        state:
          type: string
          description: The current state of the model after the download operation.
          example: downloaded