Remove the inference-parameters doc and add its parameters to the retrieve model spec

2023-12-05 18:53:29 +07:00 · 2023-12-05 18:53:29 +07:00 · 6ac5b0c5f0
commit 6ac5b0c5f0
parent f99bf0f008
3 changed files with 96 additions and 201 deletions
--- a/docs/docs/specs/engineering/inference-parameters.md
+++ b/docs/docs/specs/engineering/inference-parameters.md
@ -1,171 +0,0 @@
 ---
 title: "Inference Parameters"
 slug: /specs/inference-parameters
 description: Exhaustive list of JSON schemas for engine and model parameters
 ---
 # model_parameters
 ```js
 {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "required": ["messages"],
  "properties": {
    "messages": {
      "type": "array",
      "items": {
        "type": "object"
      }
    },
    "model": {
      "type": "string"
    },
    "frequency_penalty": {
      "type": ["number", "null"],
      "minimum": -2.0,
      "maximum": 2.0,
      "default": 0
    },
    "logit_bias": {
      "type": ["object", "null"],
      "additionalProperties": {
        "type": "number",
        "minimum": -100,
        "maximum": 100
      },
      "default": null
    },
    "max_tokens": {
      "type": ["integer", "null"]
    },
    "n": {
      "type": ["integer", "null"],
      "default": 1
    },
    "presence_penalty": {
      "type": ["number", "null"],
      "minimum": -2.0,
      "maximum": 2.0,
      "default": 0
    },
    "response_format": {
      "type": ["object", "null"],
      "properties": {
        "type": {
          "type": "string"
        }
      }
    },
    "seed": {
      "type": ["integer", "null"]
    },
    "stop": {
      "type": ["string", "array", "null"],
      "items": {
        "type": "string"
      }
    },
    "stream": {
      "type": ["boolean", "null"],
      "default": false
    },
    "temperature": {
      "type": ["number", "null"],
      "minimum": 0,
      "maximum": 2,
      "default": 1
    },
    "top_p": {
      "type": ["number", "null"],
      "minimum": 0,
      "maximum": 1,
      "default": 1
    },
    "tools": {
      "type": ["array", "null"],
      "items": {
        "type": "object"
      }
    },
    "tool_choice": {
      "type": ["string", "object", "null"]
    },
    "user": {
      "type": ["string", "null"]
    },
    "function_call": {
      "type": ["string", "object", "null"],
      "deprecated": true
    },
    "functions": {
      "type": ["array", "null"],
      "items": {
        "type": "object"
      },
      "deprecated": true
    }
  }
 }
 ```
 # nitro engine_parameters
 ```js
 {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "pre_prompt": {
      "type": "string",
      "description": "The prompt to use for internal configuration."
    },
    "system_prompt": {
      "type": "string",
      "description": "The prefix for system prompt."
    },
    "user_prompt": {
      "type": "string",
      "description": "The prefix for user prompt."
    },
    "ai_prompt": {
      "type": "string",
      "description": "The prefix for assistant prompt."
    },
    "ngl": {
      "type": "integer",
      "default": 100,
      "minimum": 0,
      "maximum": 100,
      "description": "The number of layers to load onto the GPU for acceleration."
    },
    "ctx_len": {
      "type": "integer",
      "default": 2048,
      "minimum": 128,
      "maximum": 4096,
      "description": "The context length for model operations varies; the maximum depends on the specific model used."
    },
    "n_parallel": {
      "type": "integer",
      "default": 1,
      "description": "The number of parallel operations. Only set this when continuous batching is enabled."
    },
    "cont_batching": {
      "type": "boolean",
      "default": false,
      "description": "Whether to use continuous batching."
    },
    "cpu_threads": {
      "type": "integer",
      "description": "The number of threads for CPU-based inference."
    },
    "embedding": {
      "type": "boolean",
      "description": "Whether to enable embedding."
    }
  }
 }
 ```
--- a/docs/openapi/specs/models.yaml
+++ b/docs/openapi/specs/models.yaml
@ -169,53 +169,120 @@ components:
          format: uri
          description: "URL to the source of the model."
          example: "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/blob/main/zephyr-7b-beta.Q4_K_M.gguf"
-        parameters:
+        engine_parameters:
          type: object
          properties:
            pre_prompt:
              type: string
              description: "Predefined prompt used for setting up internal configurations."
              default: ""
              example: "Initial setup complete."
            system_prompt:
              type: string
              description: "Prefix used for system-level prompts."
              default: "SYSTEM: "
            user_prompt:
              type: string
              description: "Prefix used for user prompts."
              default:  "USER: "
            ai_prompt:
              type: string
              description: "Prefix used for assistant prompts."
              default: "ASSISTANT: "
            ngl:
              type: integer
              description: "Number of neural network layers loaded onto the GPU for acceleration."
              minimum: 0
              maximum: 100
              default: 100
              example: 100
            ctx_len:
              type: integer
              description: "Context length for model operations, varies based on the specific model."
              minimum: 128
              maximum: 4096
              default: 2048
              example: 2048
            n_parallel:
              type: integer
              description: "Number of parallel operations, relevant when continuous batching is enabled."
              minimum: 1
              maximum: 10
              default: 1
              example: 4
            cont_batching:
              type: boolean
              description: "Indicates if continuous batching is used for processing."
              default: false
              example: false
            cpu_threads:
              type: integer
              description: "Number of threads allocated for CPU-based inference."
              minimum: 1
              example: 8
            embedding:
              type: boolean
              description: "Indicates if embedding layers are enabled in the model."
              default: true
              example: true
        model_parameters:
          type: object
          properties:
            ctx_len:
              type: integer
-              description: "Context length."
+              description: "Maximum context length the model can handle."
              minimum: 0
              maximum: 4096
              default: 2048
              example: 2048
            ngl:
              type: integer
-              description: "Number of layers."
+              description: "Number of model layers to load onto the GPU for acceleration."
              minimum: 1
              maximum: 100
              default: 100
              example: 100
            embedding:
              type: boolean
-              description: "Indicates if embedding is enabled."
+              description: "Indicates if embedding layers are used."
              default: true
              example: true
            n_parallel:
              type: integer
-              description: "Number of parallel processes."
+              description: "Number of parallel processes the model can run."
              minimum: 1
              maximum: 10
              default: 1
              example: 4
            # pre_prompt:
            #   type: string
            #   description: "Predefined prompt for initiating the chat."
            #   example: "A chat between a curious user and an artificial intelligence"
            # user_prompt:
            #   type: string
            #   description: "Format of user's prompt."
            #   example: "USER: "
            # ai_prompt:
            #   type: string
            #   description: "Format of AI's response."
            #   example: "ASSISTANT: "
            temperature:
-              type: string
+              type: number
-              description: "Temperature setting for the model."
+              description: "Controls randomness in the model's responses. Higher values lead to more random responses."
-              example: "0.7"
+              minimum: 0.0
              maximum: 2.0
              default: 0.7
              example: 0.7
            token_limit:
-              type: string
+              type: integer
-              description: "Token limit for the model."
+              description: "Maximum number of tokens the model can generate in a single response."
-              example: "2048"
+              minimum: 1
              maximum: 4096
              default: 2048
              example: 2048
            top_k:
-              type: string
+              type: integer
-              description: "Top-k setting for the model."
+              description: "Limits the model to consider only the top k most likely next tokens at each step."
-              example: "0"
+              minimum: 0
              maximum: 100
              default: 0
              example: 0
            top_p:
-              type: string
+              type: number
-              description: "Top-p setting for the model."
+              description: "Nucleus sampling parameter. The model considers the smallest set of tokens whose cumulative probability exceeds the top_p value."
-              example: "1"
+              minimum: 0.0
              maximum: 1.0
              default: 1.0
              example: 1.0
        metadata:
          type: object
          properties:
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@ -82,7 +82,6 @@ const sidebars = {
        "specs/engineering/chats",
        "specs/engineering/models",
        "specs/engineering/engine",
        "specs/engineering/inference-parameters",
        "specs/engineering/threads",
        "specs/engineering/messages",
        "specs/engineering/assistants",