From 3d24af5504c7e1ae3a61b0e41bb43fb87fb2029c Mon Sep 17 00:00:00 2001
From: Arista Indrajaya
Date: Wed, 28 Feb 2024 08:55:15 +0700
Subject: [PATCH] docs: fix the customize engine section

---
 .../quickstart/models/customize-engine.mdx | 220 ++++--------------
 1 file changed, 49 insertions(+), 171 deletions(-)

diff --git a/docs/docs/quickstart/models/customize-engine.mdx b/docs/docs/quickstart/models/customize-engine.mdx
index ec38b0790..d56fde2d8 100644
--- a/docs/docs/quickstart/models/customize-engine.mdx
+++ b/docs/docs/quickstart/models/customize-engine.mdx
@@ -1,185 +1,63 @@
 ---
-sidebar_position: 3
+sidebar_position: 1
 ---
 
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-import janModel from './assets/jan-model-hub.png';
 
-# Manual Import
+# Customize Engine Settings
 
-:::warning
+In this guide, we'll walk you through the process of customizing your engine settings by tweaking the `nitro.json` file.
 
-This is currently under development.
+1. Navigate to the `App Settings` > `Advanced` > `Open App Directory` > `~/jan/engines` folder.
+
+  <Tabs groupId="operating-systems">
+    <TabItem value="mac" label="macOS">
+
+    ```sh
+    cd ~/jan/engines
+    ```
+
+    </TabItem>
+    <TabItem value="win" label="Windows">
+
+    ```sh
+    C:/Users/<username>/jan/engines
+    ```
+
+    </TabItem>
+    <TabItem value="linux" label="Linux">
+
+    ```sh
+    cd ~/jan/engines
+    ```
+
+    </TabItem>
+  </Tabs>
+
+2. Modify the `nitro.json` file based on your needs. The default settings are shown below.
+
+```json title="~/jan/engines/nitro.json"
+{
+  "ctx_len": 2048,
+  "ngl": 100,
+  "cpu_threads": 1,
+  "cont_batching": false,
+  "embedding": false
+}
+```
+
+The table below describes the parameters in the `nitro.json` file.
+
+| Parameter | Type | Description |
+| --------- | ---- | ----------- |
+| `ctx_len` | **Integer** | The context length for model operations. Typically set to `2048`, which provides ample context, comparable to models like `GPT-3.5`. (*Maximum*: `4096`, *Minimum*: `1`) |
+| `ngl` | **Integer** | The number of model layers to offload to the GPU. Defaults to `100`, which offloads all layers. |
+| `cpu_threads` | **Integer** | The number of CPU threads used for inference, limited by your hardware and OS. (*Maximum* determined by system) |
+| `cont_batching` | **Boolean** | Enables continuous batching, which improves throughput for LLM inference. |
+| `embedding` | **Boolean** | Enables embeddings for tasks like document-enhanced chat in RAG-based applications. |
+
+:::tip
+ - By default, `ngl` is set to `100`, which offloads all model layers to the GPU. If you wish to offload only about 50% of the layers to the GPU, you can set `ngl` to `15`, since most Mistral and Llama models have around 30 layers.
+ - To utilize the embedding feature, include the JSON parameter `"embedding": true`. This enables Nitro to process inferences with embedding capabilities. Please refer to [Embedding in the Nitro documentation](https://nitro.jan.ai/features/embed) for a more detailed explanation.
+ - To utilize the continuous batching feature for boosting throughput and minimizing latency in large language model (LLM) inference, include `"cont_batching": true`. For details, please refer to [Continuous Batching in the Nitro documentation](https://nitro.jan.ai/features/cont-batch).
 :::
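+
+For example, to offload roughly half of a ~30-layer model to the GPU while enabling both embeddings and continuous batching, you could use a configuration like the sketch below. The values are illustrative (including the `cpu_threads` count), so adjust them to your own hardware and model:
+
+```json title="~/jan/engines/nitro.json"
+{
+  "ctx_len": 2048,
+  "ngl": 15,
+  "cpu_threads": 4,
+  "cont_batching": true,
+  "embedding": true
+}
+```
+
+You may need to restart Jan after saving the file for the new settings to take effect.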
-
-This section will show you how to perform manual import. In this guide, we are using a GGUF model from [HuggingFace](https://huggingface.co/) and our latest model, [Trinity](https://huggingface.co/janhq/trinity-v1-GGUF), as an example.
-
-## Newer versions - nightly versions and v0.4.4+
-
-### 1. Create a Model Folder
-
-1. Navigate to the `App Settings` > `Advanced` > `Open App Directory` > `~/jan/models` folder.
-
-  <Tabs groupId="operating-systems">
-    <TabItem value="mac" label="macOS">
-
-    ```sh
-    cd ~/jan/models
-    ```
-
-    </TabItem>
-    <TabItem value="win" label="Windows">
-
-    ```sh
-    C:/Users/<username>/jan/models
-    ```
-
-    </TabItem>
-    <TabItem value="linux" label="Linux">
-
-    ```sh
-    cd ~/jan/models
-    ```
-
-    </TabItem>
-  </Tabs>
-
-2. In the `models` folder, create a folder with the name of the model.
-
-```sh
-mkdir trinity-v1-7b
-```
-
-### 2. Drag & Drop the Model
-
-Drag and drop your model binary into this folder, ensuring the `modelname.gguf` is the same name as the folder name, e.g. `models/modelname`.
-
-### 3. Done!
-
-If your model doesn't show up in the **Model Selector** in conversations, **restart the app** or contact us via our [Discord community](https://discord.gg/Dt7MxDyNNZ).
-
-## Older versions - before v0.4.4
-
-### 1. Create a Model Folder
-
-1. Navigate to the `App Settings` > `Advanced` > `Open App Directory` > `~/jan/models` folder.
-
-  <Tabs groupId="operating-systems">
-    <TabItem value="mac" label="macOS">
-
-    ```sh
-    cd ~/jan/models
-    ```
-
-    </TabItem>
-    <TabItem value="win" label="Windows">
-
-    ```sh
-    C:/Users/<username>/jan/models
-    ```
-
-    </TabItem>
-    <TabItem value="linux" label="Linux">
-
-    ```sh
-    cd ~/jan/models
-    ```
-
-    </TabItem>
-  </Tabs>
-
-2. In the `models` folder, create a folder with the name of the model.
-
-```sh
-mkdir trinity-v1-7b
-```
-
-### 2. Create a Model JSON
-
-Jan follows a folder-based, [standard model template](https://jan.ai/docs/engineering/models/) called a `model.json` to persist the model configurations on your local filesystem.
-
-This means that you can easily reconfigure your models, export them, and share your preferences transparently.
-
-  <Tabs groupId="operating-systems">
-    <TabItem value="mac" label="macOS">
-
-    ```sh
-    cd trinity-v1-7b
-    touch model.json
-    ```
-
-    </TabItem>
-    <TabItem value="win" label="Windows">
-
-    ```sh
-    cd trinity-v1-7b
-    echo {} > model.json
-    ```
-
-    </TabItem>
-    <TabItem value="linux" label="Linux">
-
-    ```sh
-    cd trinity-v1-7b
-    touch model.json
-    ```
-
-    </TabItem>
-  </Tabs>
-
-To update `model.json`:
-  - Match `id` with folder name.
-  - Ensure GGUF filename matches `id`.
-  - Set `source.url` to direct download link ending in `.gguf`. In HuggingFace, you can find the direct links in the `Files and versions` tab.
-  - Verify that you are using the correct `prompt_template`. This is usually provided in the HuggingFace model's description page.
-
-```json title="model.json"
-{
-  "sources": [
-    {
-      "filename": "trinity-v1.Q4_K_M.gguf",
-      "url": "https://huggingface.co/janhq/trinity-v1-GGUF/resolve/main/trinity-v1.Q4_K_M.gguf"
-    }
-  ],
-  "id": "trinity-v1-7b",
-  "object": "model",
-  "name": "Trinity-v1 7B Q4",
-  "version": "1.0",
-  "description": "Trinity is an experimental model merge of GreenNodeLM & LeoScorpius using the Slerp method. Recommended for daily assistance purposes.",
-  "format": "gguf",
-  "settings": {
-    "ctx_len": 4096,
-    "prompt_template": "{system_message}\n### Instruction:\n{prompt}\n### Response:",
-    "llama_model_path": "trinity-v1.Q4_K_M.gguf"
-  },
-  "parameters": {
-    "max_tokens": 4096
-  },
-  "metadata": {
-    "author": "Jan",
-    "tags": ["7B", "Merged"],
-    "size": 4370000000
-  },
-  "engine": "nitro"
-}
-```
-#### Regarding `model.json`
-
-- In `settings`, two crucial values are:
-  - `ctx_len`: Defined based on the model's context size.
-  - `prompt_template`: Defined based on the model's trained template (e.g., ChatML, Alpaca).
-  - To set up the `prompt_template`:
-    1. Visit Hugging Face.
-    2. Locate the model (e.g., [Gemma 7b it](https://huggingface.co/google/gemma-7b-it)).
-    3. Review the text and identify the template.
-- In `parameters`, consider the following options. The fields in `parameters` are typically general and can be the same across models. An example is provided below:
-
-```json
-"parameters":{
-  "temperature": 0.7,
-  "top_p": 0.95,
-  "stream": true,
-  "max_tokens": 4096,
-  "frequency_penalty": 0,
-  "presence_penalty": 0
-}
-```
-
-### 3. Download the Model
-
-1. Restart Jan and navigate to the Hub.
-2. Locate your model.
-3. Click **Download** button to download the model binary.
-
-<img src={janModel} alt="jan-model-hub" />
-
 :::info[Assistance and Support]
 
 If you have questions, please join our [Discord community](https://discord.gg/Dt7MxDyNNZ) for support, updates, and discussions.
 
 :::