diff --git a/extensions/inference-nitro-extension/package.json b/extensions/inference-nitro-extension/package.json
index 3cfdd3338..e15b09ec0 100644
--- a/extensions/inference-nitro-extension/package.json
+++ b/extensions/inference-nitro-extension/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@janhq/inference-nitro-extension",
   "productName": "Nitro Inference Engine",
-  "version": "1.0.4",
+  "version": "1.0.5",
   "description": "This extension embeds Nitro, a lightweight (3mb) inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
   "main": "dist/index.js",
   "node": "dist/node/index.cjs.js",
diff --git a/extensions/inference-nitro-extension/resources/models/codeninja-1.0-7b/model.json b/extensions/inference-nitro-extension/resources/models/codeninja-1.0-7b/model.json
index 4ffe355d1..b82a430a1 100644
--- a/extensions/inference-nitro-extension/resources/models/codeninja-1.0-7b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/codeninja-1.0-7b/model.json
@@ -8,11 +8,11 @@
   "id": "codeninja-1.0-7b",
   "object": "model",
   "name": "CodeNinja 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "CodeNinja is good for coding tasks and can handle various languages including Python, C, C++, Rust, Java, JavaScript, and more.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 8192,
     "prompt_template": "GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:",
     "llama_model_path": "codeninja-1.0-openchat-7b.Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 8192,
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
diff --git a/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json b/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json
index 2f4b5e0dc..031dc88d4 100644
--- a/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json
@@ -8,11 +8,11 @@
   "id": "command-r-34b",
   "object": "model",
   "name": "Command-R v01 34B Q4",
-  "version": "1.3",
+  "version": "1.4",
   "description": "C4AI Command-R developed by CohereAI is optimized for a variety of use cases including reasoning, summarization, and question answering.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 131072,
     "prompt_template": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
     "llama_model_path": "c4ai-command-r-v01-Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 131072,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
diff --git a/extensions/inference-nitro-extension/resources/models/deepseek-coder-1.3b/model.json b/extensions/inference-nitro-extension/resources/models/deepseek-coder-1.3b/model.json
index 365dbfd2f..49814adf2 100644
--- a/extensions/inference-nitro-extension/resources/models/deepseek-coder-1.3b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/deepseek-coder-1.3b/model.json
@@ -8,11 +8,11 @@
   "id": "deepseek-coder-1.3b",
   "object": "model",
   "name": "Deepseek Coder 1.3B Q8",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Deepseek Coder excelled in project-level code completion with advanced capabilities across multiple programming languages.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+ "ctx_len": 16384, "prompt_template": "### Instruction:\n{prompt}\n### Response:", "llama_model_path": "deepseek-coder-1.3b-instruct.Q8_0.gguf" }, @@ -20,7 +20,7 @@ "temperature": 0.7, "top_p": 0.95, "stream": true, - "max_tokens": 4096, + "max_tokens": 16384, "stop": [], "frequency_penalty": 0, "presence_penalty": 0 diff --git a/extensions/inference-nitro-extension/resources/models/deepseek-coder-34b/model.json b/extensions/inference-nitro-extension/resources/models/deepseek-coder-34b/model.json index 8e17b9563..9d3f6a4b2 100644 --- a/extensions/inference-nitro-extension/resources/models/deepseek-coder-34b/model.json +++ b/extensions/inference-nitro-extension/resources/models/deepseek-coder-34b/model.json @@ -1,26 +1,26 @@ { "sources": [ { - "filename": "deepseek-coder-33b-instruct.Q5_K_M.gguf", - "url": "https://huggingface.co/TheBloke/deepseek-coder-33B-instruct-GGUF/resolve/main/deepseek-coder-33b-instruct.Q5_K_M.gguf" + "filename": "deepseek-coder-33b-instruct.Q4_K_M.gguf", + "url": "https://huggingface.co/TheBloke/deepseek-coder-33B-instruct-GGUF/resolve/main/deepseek-coder-33b-instruct.Q4_K_M.gguf" } ], "id": "deepseek-coder-34b", "object": "model", - "name": "Deepseek Coder 33B Q5", - "version": "1.0", + "name": "Deepseek Coder 33B Q4", + "version": "1.1", "description": "Deepseek Coder excelled in project-level code completion with advanced capabilities across multiple programming languages.", "format": "gguf", "settings": { - "ctx_len": 4096, + "ctx_len": 16384, "prompt_template": "### Instruction:\n{prompt}\n### Response:", - "llama_model_path": "deepseek-coder-33b-instruct.Q5_K_M.gguf" + "llama_model_path": "deepseek-coder-33b-instruct.Q4_K_M.gguf" }, "parameters": { "temperature": 0.7, "top_p": 0.95, "stream": true, - "max_tokens": 4096, + "max_tokens": 16384, "stop": [], "frequency_penalty": 0, "presence_penalty": 0 diff --git a/extensions/inference-nitro-extension/resources/models/gemma-2b/model.json b/extensions/inference-nitro-extension/resources/models/gemma-2b/model.json index 5615d3358..9673b62dd 100644 --- a/extensions/inference-nitro-extension/resources/models/gemma-2b/model.json +++ b/extensions/inference-nitro-extension/resources/models/gemma-2b/model.json @@ -8,11 +8,11 @@ "id": "gemma-2b", "object": "model", "name": "Gemma 2B Q4", - "version": "1.0", + "version": "1.1", "description": "Gemma is built from the same technology with Google's Gemini.", "format": "gguf", "settings": { - "ctx_len": 4096, + "ctx_len": 8192, "prompt_template": "user\n{prompt}\nmodel", "llama_model_path": "gemma-2b-it-q4_k_m.gguf" }, @@ -20,7 +20,7 @@ "temperature": 0.7, "top_p": 0.95, "stream": true, - "max_tokens": 4096, + "max_tokens": 8192, "stop": [], "frequency_penalty": 0, "presence_penalty": 0 diff --git a/extensions/inference-nitro-extension/resources/models/gemma-7b/model.json b/extensions/inference-nitro-extension/resources/models/gemma-7b/model.json index 043c85b4a..d7384a8b9 100644 --- a/extensions/inference-nitro-extension/resources/models/gemma-7b/model.json +++ b/extensions/inference-nitro-extension/resources/models/gemma-7b/model.json @@ -8,11 +8,11 @@ "id": "gemma-7b", "object": "model", "name": "Gemma 7B Q4", - "version": "1.0", + "version": "1.1", "description": "Google's Gemma is built for multilingual purpose", "format": "gguf", "settings": { - "ctx_len": 4096, + "ctx_len": 8192, "prompt_template": "user\n{prompt}\nmodel", "llama_model_path": "gemma-7b-it-q4_K_M.gguf" }, @@ -20,7 +20,7 @@ "temperature": 0.7, "top_p": 0.95, "stream": true, - "max_tokens": 4096, + 
"max_tokens": 8192, "stop": [], "frequency_penalty": 0, "presence_penalty": 0 diff --git a/extensions/inference-nitro-extension/resources/models/mistral-ins-7b-q4/model.json b/extensions/inference-nitro-extension/resources/models/mistral-ins-7b-q4/model.json index 3f9cab127..73a908e17 100644 --- a/extensions/inference-nitro-extension/resources/models/mistral-ins-7b-q4/model.json +++ b/extensions/inference-nitro-extension/resources/models/mistral-ins-7b-q4/model.json @@ -8,11 +8,11 @@ "id": "mistral-ins-7b-q4", "object": "model", "name": "Mistral Instruct 7B Q4", - "version": "1.0", + "version": "1.1", "description": "Mistral Instruct 7b model, specifically designed for a comprehensive understanding of the world.", "format": "gguf", "settings": { - "ctx_len": 4096, + "ctx_len": 32768, "prompt_template": "[INST] {prompt} [/INST]", "llama_model_path": "mistral-7b-instruct-v0.2.Q4_K_M.gguf" }, @@ -20,8 +20,8 @@ "temperature": 0.7, "top_p": 0.95, "stream": true, - "max_tokens": 4096, - "stop": [], + "max_tokens": 32768, + "stop": ["[/INST]"], "frequency_penalty": 0, "presence_penalty": 0 }, diff --git a/extensions/inference-nitro-extension/resources/models/mixtral-8x7b-instruct/model.json b/extensions/inference-nitro-extension/resources/models/mixtral-8x7b-instruct/model.json index e0a0ee040..0935a25a3 100644 --- a/extensions/inference-nitro-extension/resources/models/mixtral-8x7b-instruct/model.json +++ b/extensions/inference-nitro-extension/resources/models/mixtral-8x7b-instruct/model.json @@ -8,11 +8,11 @@ "id": "mixtral-8x7b-instruct", "object": "model", "name": "Mixtral 8x7B Instruct Q4", - "version": "1.0", + "version": "1.1", "description": "The Mixtral-8x7B is a pretrained generative Sparse Mixture of Experts. The Mixtral-8x7B outperforms 70B models on most benchmarks.", "format": "gguf", "settings": { - "ctx_len": 4096, + "ctx_len": 32768, "prompt_template": "[INST] {prompt} [/INST]", "llama_model_path": "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf" }, @@ -20,7 +20,7 @@ "temperature": 0.7, "top_p": 0.95, "stream": true, - "max_tokens": 4096, + "max_tokens": 32768, "frequency_penalty": 0, "presence_penalty": 0 }, diff --git a/extensions/inference-nitro-extension/resources/models/noromaid-7b/model.json b/extensions/inference-nitro-extension/resources/models/noromaid-7b/model.json index 516bc62a9..ab0884936 100644 --- a/extensions/inference-nitro-extension/resources/models/noromaid-7b/model.json +++ b/extensions/inference-nitro-extension/resources/models/noromaid-7b/model.json @@ -8,11 +8,11 @@ "id": "noromaid-7b", "object": "model", "name": "Noromaid 7B Q4", - "version": "1.0", + "version": "1.1", "description": "The Noromaid 7b model is designed for role-playing with human-like behavior.", "format": "gguf", "settings": { - "ctx_len": 4096, + "ctx_len": 32768, "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant", "llama_model_path": "Noromaid-7B-0.4-DPO.q4_k_m.gguf" }, @@ -20,7 +20,7 @@ "temperature": 0.7, "top_p": 0.95, "stream": true, - "max_tokens": 4096, + "max_tokens": 32768, "stop": [], "frequency_penalty": 0, "presence_penalty": 0 diff --git a/extensions/inference-nitro-extension/resources/models/openchat-3.5-7b/model.json b/extensions/inference-nitro-extension/resources/models/openchat-3.5-7b/model.json index 1b4dbae19..b37a470d3 100644 --- a/extensions/inference-nitro-extension/resources/models/openchat-3.5-7b/model.json +++ b/extensions/inference-nitro-extension/resources/models/openchat-3.5-7b/model.json @@ 
@@ -8,11 +8,11 @@
   "id": "openchat-3.5-7b",
   "object": "model",
   "name": "Openchat-3.5 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "The performance of Openchat surpasses ChatGPT-3.5 and Grok-1 across various benchmarks.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 8192,
     "prompt_template": "GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:",
     "llama_model_path": "openchat-3.5-0106.Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 8192,
     "stop": ["<|end_of_turn|>"],
     "frequency_penalty": 0,
     "presence_penalty": 0
diff --git a/extensions/inference-nitro-extension/resources/models/phind-34b/model.json b/extensions/inference-nitro-extension/resources/models/phind-34b/model.json
index 6b0abe2a1..f79f4e2d0 100644
--- a/extensions/inference-nitro-extension/resources/models/phind-34b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/phind-34b/model.json
@@ -8,11 +8,11 @@
   "id": "phind-34b",
   "object": "model",
   "name": "Phind 34B Q4",
-  "version": "1.1",
+  "version": "1.2",
   "description": "Phind 34B is the best Open-source coding model.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 16384,
     "prompt_template": "### System Prompt\n{system_message}\n### User Message\n{prompt}\n### Assistant",
     "llama_model_path": "phind-codellama-34b-v2.Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 16384,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
diff --git a/extensions/inference-nitro-extension/resources/models/qwen-7b/model.json b/extensions/inference-nitro-extension/resources/models/qwen-7b/model.json
index 16def5b29..6c0f642e3 100644
--- a/extensions/inference-nitro-extension/resources/models/qwen-7b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/qwen-7b/model.json
@@ -8,11 +8,11 @@
   "id": "qwen-7b",
   "object": "model",
   "name": "Qwen Chat 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Qwen is optimized at Chinese, ideal for everyday tasks.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
     "llama_model_path": "qwen1_5-7b-chat-q4_k_m.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
diff --git a/extensions/inference-nitro-extension/resources/models/stealth-v1.2-7b/model.json b/extensions/inference-nitro-extension/resources/models/stealth-v1.2-7b/model.json
index 93fa6b610..574b4d893 100644
--- a/extensions/inference-nitro-extension/resources/models/stealth-v1.2-7b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/stealth-v1.2-7b/model.json
@@ -12,7 +12,7 @@
   "description": "This is a new experimental family designed to enhance Mathematical and Logical abilities.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
     "llama_model_path": "stealth-v1.3.Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
diff --git a/extensions/inference-nitro-extension/resources/models/trinity-v1.2-7b/model.json b/extensions/inference-nitro-extension/resources/models/trinity-v1.2-7b/model.json
index 14444fbd4..9c988e19a 100644
--- a/extensions/inference-nitro-extension/resources/models/trinity-v1.2-7b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/trinity-v1.2-7b/model.json
@@ -12,7 +12,7 @@
   "description": "Trinity is an experimental model merge using the Slerp method. Recommended for daily assistance purposes.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
     "llama_model_path": "trinity-v1.2.Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "frequency_penalty": 0,
     "presence_penalty": 0
   },
diff --git a/extensions/inference-nitro-extension/resources/models/vistral-7b/model.json b/extensions/inference-nitro-extension/resources/models/vistral-7b/model.json
index 83e0294c4..d435fb784 100644
--- a/extensions/inference-nitro-extension/resources/models/vistral-7b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/vistral-7b/model.json
@@ -8,11 +8,11 @@
   "id": "vistral-7b",
   "object": "model",
   "name": "Vistral 7B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Vistral 7B has a deep understanding of Vietnamese.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 32768,
     "prompt_template": "[INST] <<SYS>>\n{system_message}\n<</SYS>>\n{prompt} [/INST]",
     "llama_model_path": "vistral-7b-chat-dpo.Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 32768,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
diff --git a/extensions/inference-nitro-extension/resources/models/wizardcoder-13b/model.json b/extensions/inference-nitro-extension/resources/models/wizardcoder-13b/model.json
index cae96c26b..b7e84bf88 100644
--- a/extensions/inference-nitro-extension/resources/models/wizardcoder-13b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/wizardcoder-13b/model.json
@@ -12,7 +12,7 @@
   "description": "WizardCoder 13B is a Python coding model. This model demonstrate high proficiency in specific domains like coding and mathematics.",
   "format": "gguf",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 16384,
     "prompt_template": "### Instruction:\n{prompt}\n### Response:",
     "llama_model_path": "wizardcoder-python-13b-v1.0.Q4_K_M.gguf"
   },
@@ -20,7 +20,7 @@
     "temperature": 0.7,
     "top_p": 0.95,
     "stream": true,
-    "max_tokens": 4096,
+    "max_tokens": 16384,
     "stop": [],
     "frequency_penalty": 0,
     "presence_penalty": 0
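
Every model.json hunk above applies the same pattern: "ctx_len" in "settings" (the context window the inference engine is loaded with) and "max_tokens" in "parameters" (the default completion cap) are raised together to the model's native context length, usually alongside a version bump. Below is a minimal consistency-check sketch in TypeScript, assuming Node's standard fs/path APIs and only the fields visible in the hunks above; the checker itself is hypothetical and not part of this patch.

import { readFileSync, readdirSync } from "fs";
import { join } from "path";

// Only the fields this patch touches; the real model.json schema has more.
interface ModelConfig {
  id: string;
  version: string;
  settings: { ctx_len: number };
  parameters: { max_tokens: number };
}

// Directory layout taken from the file paths in the diff above.
const modelsDir = "extensions/inference-nitro-extension/resources/models";

// Invariant every hunk above maintains: the default completion cap
// must not exceed the context window the engine is configured with.
for (const dir of readdirSync(modelsDir)) {
  const raw = readFileSync(join(modelsDir, dir, "model.json"), "utf8");
  const model = JSON.parse(raw) as ModelConfig;
  const { ctx_len } = model.settings;
  const { max_tokens } = model.parameters;
  if (max_tokens > ctx_len) {
    throw new Error(
      `${model.id}: max_tokens (${max_tokens}) exceeds ctx_len (${ctx_len})`
    );
  }
  console.log(`${model.id} v${model.version}: ctx_len=${ctx_len}, max_tokens=${max_tokens}`);
}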