Merge pull request #3998 from janhq/chore/add-qwen-coder-models

chore: add qwen2.5-coder 14B and 32B models
Louis 2024-11-12 16:39:43 +07:00 committed by GitHub
commit ff2a81e41f
8 changed files with 154 additions and 61 deletions

View File

@@ -1,7 +1,7 @@
 {
   "name": "@janhq/inference-cortex-extension",
   "productName": "Cortex Inference Engine",
-  "version": "1.0.20",
+  "version": "1.0.21",
   "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
   "main": "dist/index.js",
   "node": "dist/node/index.cjs.js",

View File

@@ -0,0 +1,36 @@
{
  "sources": [
    {
      "filename": "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf",
      "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-14B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"
    }
  ],
  "id": "qwen2.5-coder-14b-instruct",
  "object": "model",
  "name": "Qwen2.5 Coder 14B Instruct Q4",
  "version": "1.0",
  "description": "Qwen2.5-Coder is the latest series of code-specific Qwen large language models, with significant improvements in code generation, code reasoning and code fixing.",
  "format": "gguf",
  "settings": {
    "ctx_len": 32768,
    "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
    "llama_model_path": "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf",
    "ngl": 29
  },
  "parameters": {
    "temperature": 0.7,
    "top_p": 0.95,
    "stream": true,
    "max_tokens": 32768,
    "stop": ["<|endoftext|>", "<|im_end|>"],
    "frequency_penalty": 0,
    "presence_penalty": 0
  },
  "metadata": {
    "author": "QwenLM",
    "tags": ["14B", "Featured"],
    "size": 8990000000
  },
  "engine": "llama-cpp"
}
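
Both new manifests (this 14B one and the 32B one below) share the same layout: download sources, llama.cpp load settings, and default inference parameters. As an orientation aid, a rough TypeScript shape for the fields used here (illustrative names only, not Jan's actual type definitions):

// Illustrative shape of the model.json manifests added in this PR.
interface ModelManifest {
  sources: { filename: string; url: string }[] // GGUF download locations
  id: string
  object: 'model'
  name: string
  version: string
  description: string
  format: 'gguf'
  settings: {
    ctx_len: number // context window in tokens
    prompt_template: string // ChatML template used by Qwen models
    llama_model_path: string // GGUF file to load
    ngl: number // number of layers offloaded to the GPU
  }
  parameters: {
    temperature: number
    top_p: number
    stream: boolean
    max_tokens: number
    stop: string[]
    frequency_penalty: number
    presence_penalty: number
  }
  metadata: { author: string; tags: string[]; size: number } // size in bytes
  engine: 'llama-cpp'
}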

View File

@@ -0,0 +1,36 @@
{
  "sources": [
    {
      "filename": "Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf",
      "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf"
    }
  ],
  "id": "qwen2.5-coder-32b-instruct",
  "object": "model",
  "name": "Qwen2.5 Coder 32B Instruct Q4",
  "version": "1.0",
  "description": "Qwen2.5-Coder is the latest series of code-specific Qwen large language models, with significant improvements in code generation, code reasoning and code fixing.",
  "format": "gguf",
  "settings": {
    "ctx_len": 32768,
    "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
    "llama_model_path": "Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf",
    "ngl": 29
  },
  "parameters": {
    "temperature": 0.7,
    "top_p": 0.95,
    "stream": true,
    "max_tokens": 32768,
    "stop": ["<|endoftext|>", "<|im_end|>"],
    "frequency_penalty": 0,
    "presence_penalty": 0
  },
  "metadata": {
    "author": "QwenLM",
    "tags": ["32B", "Featured"],
    "size": 19900000000
  },
  "engine": "llama-cpp"
}

View File

@@ -49,6 +49,8 @@ const llama321bJson = require('./resources/models/llama3.2-1b-instruct/model.json')
 const llama323bJson = require('./resources/models/llama3.2-3b-instruct/model.json')
 const qwen257bJson = require('./resources/models/qwen2.5-7b-instruct/model.json')
 const qwen25coder7bJson = require('./resources/models/qwen2.5-coder-7b-instruct/model.json')
+const qwen25coder14bJson = require('./resources/models/qwen2.5-coder-14b-instruct/model.json')
+const qwen25coder32bJson = require('./resources/models/qwen2.5-coder-32b-instruct/model.json')
 const qwen2514bJson = require('./resources/models/qwen2.5-14b-instruct/model.json')
 const qwen2532bJson = require('./resources/models/qwen2.5-32b-instruct/model.json')
 const qwen2572bJson = require('./resources/models/qwen2.5-72b-instruct/model.json')
@@ -108,6 +110,8 @@ export default [
       llama323bJson,
       qwen257bJson,
       qwen25coder7bJson,
+      qwen25coder14bJson,
+      qwen25coder32bJson,
       qwen2514bJson,
       qwen2532bJson,
       qwen2572bJson,
@@ -115,6 +119,7 @@ export default [
       NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
       DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson),
       CORTEX_API_URL: JSON.stringify('http://127.0.0.1:39291'),
+      CORTEX_SOCKET_URL: JSON.stringify('ws://127.0.0.1:39291'),
     }),
     // Allow json resolution
     json(),
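
The new CORTEX_SOCKET_URL constant is injected into the bundle at build time. A minimal sketch of that wiring, assuming the replace(...) call in this config is @rollup/plugin-replace (the plugin name is an assumption; the keys and values mirror the config above):

// Sketch, assuming @rollup/plugin-replace; not the full Jan rollup config.
import replace from '@rollup/plugin-replace'
import json from '@rollup/plugin-json'

export default {
  plugins: [
    replace({
      preventAssignment: true,
      // Values must be JSON.stringify'd so they land as string literals,
      // e.g. CORTEX_SOCKET_URL becomes 'ws://127.0.0.1:39291' in the bundle.
      CORTEX_API_URL: JSON.stringify('http://127.0.0.1:39291'),
      CORTEX_SOCKET_URL: JSON.stringify('ws://127.0.0.1:39291'),
    }),
    json(), // allow json resolution
  ],
}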

View File

@@ -1,5 +1,6 @@
 declare const NODE: string
 declare const CORTEX_API_URL: string
+declare const CORTEX_SOCKET_URL: string
 declare const DEFAULT_SETTINGS: Array<any>
 declare const MODELS: Array<any>
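
These ambient declarations are what let TypeScript accept the bundler-injected globals without imports. A two-line illustration of how CORTEX_SOCKET_URL is consumed (the /events path matches the extension code below):

declare const CORTEX_SOCKET_URL: string // satisfied at build time by the replace step
const url = `${CORTEX_SOCKET_URL}/events` // bundles to 'ws://127.0.0.1:39291/events'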

View File

@@ -16,17 +16,29 @@ import {
   getJanDataFolderPath,
   extractModelLoadParams,
   fs,
+  events,
+  ModelEvent,
 } from '@janhq/core'
 import PQueue from 'p-queue'
 import ky from 'ky'

+/**
+ * Event subscription types of Downloader
+ */
+enum DownloadTypes {
+  DownloadUpdated = 'onFileDownloadUpdate',
+  DownloadError = 'onFileDownloadError',
+  DownloadSuccess = 'onFileDownloadSuccess',
+  DownloadStopped = 'onFileDownloadStopped',
+  DownloadStarted = 'onFileDownloadStarted',
+}
+
 /**
  * A class that implements the InferenceExtension interface from the @janhq/core package.
  * The class provides methods for initializing and stopping a model, and for making inference requests.
  * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
  */
 export default class JanInferenceCortexExtension extends LocalOAIEngine {
+  // DEPRECATED
   nodeModule: string = 'node'

   queue = new PQueue({ concurrency: 1 })
@@ -38,6 +50,11 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
    */
   inferenceUrl = `${CORTEX_API_URL}/v1/chat/completions`

+  /**
+   * Socket instance of events subscription
+   */
+  socket?: WebSocket = undefined
+
   /**
    * Subscribes to events emitted by the @janhq/core package.
    */
@@ -55,6 +72,8 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
     this.queue.add(() => this.healthz())

+    this.subscribeToEvents()
+
     window.addEventListener('beforeunload', () => {
       this.clean()
     })
@@ -138,7 +157,7 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
         methods: ['get'],
       },
     })
-      .then(() => {})
+      .then(() => { })
   }

 /**
@@ -154,6 +173,50 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       // Do nothing
     })
   }

+  /**
+   * Subscribe to cortex.cpp websocket events
+   */
+  subscribeToEvents() {
+    this.queue.add(
+      () =>
+        new Promise<void>((resolve) => {
+          this.socket = new WebSocket(`${CORTEX_SOCKET_URL}/events`)
+
+          this.socket.addEventListener('message', (event) => {
+            const data = JSON.parse(event.data)
+            const transferred = data.task.items.reduce(
+              (acc: number, cur: any) => acc + cur.downloadedBytes,
+              0
+            )
+            const total = data.task.items.reduce(
+              (acc: number, cur: any) => acc + cur.bytes,
+              0
+            )
+            const percent = total > 0 ? transferred / total : 0
+
+            events.emit(DownloadTypes[data.type as keyof typeof DownloadTypes], {
+              modelId: data.task.id,
+              percent: percent,
+              size: {
+                transferred: transferred,
+                total: total,
+              },
+            })
+
+            // Update models list from Hub
+            if (data.type === DownloadTypes.DownloadSuccess) {
+              // Delay for the state update from cortex.cpp
+              // Just to be sure
+              setTimeout(() => {
+                events.emit(ModelEvent.OnModelsUpdate, {})
+              }, 500)
+            }
+          })
+
+          resolve()
+        })
+    )
+  }
 }

 /// Legacy
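
The message handler above assumes each websocket event carries a download task with per-file byte counts. A self-contained sketch of the aggregation it performs (the payload types are inferred from this handler, not a published cortex.cpp schema):

// Inferred payload shape: one task, many files, each reporting progress.
type DownloadItem = { downloadedBytes: number; bytes: number }
type DownloadTask = { id: string; items: DownloadItem[] }

// Mirrors the reduce logic in subscribeToEvents: sum bytes across all files,
// guarding against division by zero when no sizes are known yet.
function taskProgress(task: DownloadTask) {
  const transferred = task.items.reduce((acc, cur) => acc + cur.downloadedBytes, 0)
  const total = task.items.reduce((acc, cur) => acc + cur.bytes, 0)
  return { transferred, total, percent: total > 0 ? transferred / total : 0 }
}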

View File

@@ -1,6 +1,6 @@
 import PQueue from 'p-queue'
 import ky from 'ky'
-import { events, extractModelLoadParams, Model, ModelEvent } from '@janhq/core'
+import { extractModelLoadParams, Model } from '@janhq/core'
 import { extractInferenceParams } from '@janhq/core'

 /**
  * cortex.cpp Model APIs interface
@@ -24,21 +24,11 @@ type ModelList = {
   data: any[]
 }

-enum DownloadTypes {
-  DownloadUpdated = 'onFileDownloadUpdate',
-  DownloadError = 'onFileDownloadError',
-  DownloadSuccess = 'onFileDownloadSuccess',
-  DownloadStopped = 'onFileDownloadStopped',
-  DownloadStarted = 'onFileDownloadStarted',
-}
-
 export class CortexAPI implements ICortexAPI {
   queue = new PQueue({ concurrency: 1 })
-  socket?: WebSocket = undefined

   constructor() {
     this.queue.add(() => this.healthz())
-    this.subscribeToEvents()
   }

 /**
@@ -172,49 +162,6 @@ export class CortexAPI implements ICortexAPI {
       .then(() => {})
   }

-  /**
-   * Subscribe to cortex.cpp websocket events
-   */
-  subscribeToEvents() {
-    this.queue.add(
-      () =>
-        new Promise<void>((resolve) => {
-          this.socket = new WebSocket(`${SOCKET_URL}/events`)
-
-          this.socket.addEventListener('message', (event) => {
-            const data = JSON.parse(event.data)
-            const transferred = data.task.items.reduce(
-              (acc, cur) => acc + cur.downloadedBytes,
-              0
-            )
-            const total = data.task.items.reduce(
-              (acc, cur) => acc + cur.bytes,
-              0
-            )
-            const percent = total > 0 ? transferred / total : 0
-
-            events.emit(DownloadTypes[data.type], {
-              modelId: data.task.id,
-              percent: percent,
-              size: {
-                transferred: transferred,
-                total: total,
-              },
-            })
-
-            // Update models list from Hub
-            if (data.type === DownloadTypes.DownloadSuccess) {
-              // Delay for the state update from cortex.cpp
-              // Just to be sure
-              setTimeout(() => {
-                events.emit(ModelEvent.OnModelsUpdate, {})
-              }, 500)
-            }
-          })
-
-          resolve()
-        })
-    )
-  }
-
   /**
    * Transform model to the expected format (e.g. parameters, settings, metadata)
    * @param model

View File

@@ -189,7 +189,7 @@ const Advanced = () => {
    * @param gpuId
    * @returns
    */
-  const handleGPUChange = (gpuId: string) => {
+  const handleGPUChange = async (gpuId: string) => {
     let updatedGpusInUse = [...gpusInUse]
     if (updatedGpusInUse.includes(gpuId)) {
       updatedGpusInUse = updatedGpusInUse.filter((id) => id !== gpuId)
@@ -208,7 +208,7 @@ const Advanced = () => {
       updatedGpusInUse.push(gpuId)
     }
     setGpusInUse(updatedGpusInUse)
-    saveSettings({ gpusInUse: updatedGpusInUse })
+    await saveSettings({ gpusInUse: updatedGpusInUse })
     window.core?.api?.relaunch()
   }
@@ -306,8 +306,13 @@ const Advanced = () => {
           })
         }
         // Stop any running model to apply the changes
-        if (e.target.checked !== gpuEnabled)
-          stopModel().then(() => window.core?.api?.relaunch())
+        if (e.target.checked !== gpuEnabled) {
+          stopModel().finally(() => {
+            setTimeout(() => {
+              window.location.reload()
+            }, 300)
+          })
+        }
       }}
     />
 }
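
One behavioural note on the last hunk: switching from .then to .finally means the reload is scheduled whether or not stopModel() succeeds, so a failed stop no longer leaves the settings screen stale. A minimal illustration of the difference:

// .then(fn) runs fn only on fulfilment; .finally(fn) runs on both outcomes.
Promise.reject(new Error('stop failed'))
  .finally(() => console.log('reload scheduled anyway'))
  .catch(() => {}) // swallow the rejection so it is not reported as unhandled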