diff --git a/extensions/inference-cortex-extension/package.json b/extensions/inference-cortex-extension/package.json
index 5a9fc56e9..d262ad5ec 100644
--- a/extensions/inference-cortex-extension/package.json
+++ b/extensions/inference-cortex-extension/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@janhq/inference-cortex-extension",
   "productName": "Cortex Inference Engine",
-  "version": "1.0.20",
+  "version": "1.0.21",
   "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
   "main": "dist/index.js",
   "node": "dist/node/index.cjs.js",
diff --git a/extensions/inference-cortex-extension/resources/models/qwen2.5-coder-14b-instruct/model.json b/extensions/inference-cortex-extension/resources/models/qwen2.5-coder-14b-instruct/model.json
new file mode 100644
index 000000000..a445ee2db
--- /dev/null
+++ b/extensions/inference-cortex-extension/resources/models/qwen2.5-coder-14b-instruct/model.json
@@ -0,0 +1,36 @@
+{
+  "sources": [
+    {
+      "filename": "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf",
+      "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-14B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf"
+    }
+  ],
+  "id": "qwen2.5-coder-14b-instruct",
+  "object": "model",
+  "name": "Qwen2.5 Coder 14B Instruct Q4",
+  "version": "1.0",
+  "description": "Qwen2.5-Coder is the latest series of code-specific Qwen large language models, with significant improvements in code generation, code reasoning, and code fixing.",
+  "format": "gguf",
+  "settings": {
+    "ctx_len": 32768,
+    "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
+    "llama_model_path": "Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf",
+    "ngl": 29
+  },
+  "parameters": {
+    "temperature": 0.7,
+    "top_p": 0.95,
+    "stream": true,
+    "max_tokens": 32768,
+    "stop": ["<|endoftext|>", "<|im_end|>"],
+    "frequency_penalty": 0,
+    "presence_penalty": 0
+  },
+  "metadata": {
+    "author": "QwenLM",
+    "tags": ["14B", "Featured"],
+    "size": 8990000000
+  },
+  "engine": "llama-cpp"
+}
+ 
\ No newline at end of file
diff --git a/extensions/inference-cortex-extension/resources/models/qwen2.5-coder-32b-instruct/model.json b/extensions/inference-cortex-extension/resources/models/qwen2.5-coder-32b-instruct/model.json
new file mode 100644
index 000000000..cffdf03df
--- /dev/null
+++ b/extensions/inference-cortex-extension/resources/models/qwen2.5-coder-32b-instruct/model.json
@@ -0,0 +1,36 @@
+{
+  "sources": [
+    {
+      "filename": "Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf",
+      "url": "https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf"
+    }
+  ],
+  "id": "qwen2.5-coder-32b-instruct",
+  "object": "model",
+  "name": "Qwen2.5 Coder 32B Instruct Q4",
+  "version": "1.0",
+  "description": "Qwen2.5-Coder is the latest series of code-specific Qwen large language models, with significant improvements in code generation, code reasoning, and code fixing.",
+  "format": "gguf",
+  "settings": {
+    "ctx_len": 32768,
+    "prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
+    "llama_model_path": "Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf",
+    "ngl": 29
+  },
+  "parameters": {
+    "temperature": 0.7,
+    "top_p": 0.95,
+    "stream": true,
+    "max_tokens": 32768,
+    "stop": ["<|endoftext|>", "<|im_end|>"],
+    "frequency_penalty": 0,
+    "presence_penalty": 0
+  },
+  "metadata": {
+    "author": "QwenLM",
+    "tags": ["32B", "Featured"],
+    "size": 19900000000
+  },
+  "engine": "llama-cpp"
+}
+ 
\ No newline at end of file
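Both new descriptors share one shape; for readers following along, here is a minimal TypeScript type they satisfy. The type name is ours, and the field glosses are llama.cpp conventions rather than anything stated in the PR:

```ts
// Shape inferred from the two model.json files above; not a published Jan type.
interface ModelDescriptor {
  sources: { filename: string; url: string }[]
  id: string
  object: 'model'
  name: string
  version: string
  description: string
  format: 'gguf'
  settings: {
    ctx_len: number // context window in tokens
    prompt_template: string // ChatML-style template used by Qwen2.5
    llama_model_path: string
    ngl: number // layers to offload to the GPU (llama.cpp convention)
  }
  parameters: {
    temperature: number
    top_p: number
    stream: boolean
    max_tokens: number
    stop: string[]
    frequency_penalty: number
    presence_penalty: number
  }
  metadata: { author: string; tags: string[]; size: number } // size in bytes
  engine: 'llama-cpp'
}
```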
diff --git a/extensions/inference-cortex-extension/rollup.config.ts b/extensions/inference-cortex-extension/rollup.config.ts
index ea873990b..34ad9295d 100644
--- a/extensions/inference-cortex-extension/rollup.config.ts
+++ b/extensions/inference-cortex-extension/rollup.config.ts
@@ -49,6 +49,8 @@ const llama321bJson = require('./resources/models/llama3.2-1b-instruct/model.json')
 const llama323bJson = require('./resources/models/llama3.2-3b-instruct/model.json')
 const qwen257bJson = require('./resources/models/qwen2.5-7b-instruct/model.json')
 const qwen25coder7bJson = require('./resources/models/qwen2.5-coder-7b-instruct/model.json')
+const qwen25coder14bJson = require('./resources/models/qwen2.5-coder-14b-instruct/model.json')
+const qwen25coder32bJson = require('./resources/models/qwen2.5-coder-32b-instruct/model.json')
 const qwen2514bJson = require('./resources/models/qwen2.5-14b-instruct/model.json')
 const qwen2532bJson = require('./resources/models/qwen2.5-32b-instruct/model.json')
 const qwen2572bJson = require('./resources/models/qwen2.5-72b-instruct/model.json')
@@ -108,6 +110,8 @@ export default [
         llama323bJson,
         qwen257bJson,
         qwen25coder7bJson,
+        qwen25coder14bJson,
+        qwen25coder32bJson,
         qwen2514bJson,
         qwen2532bJson,
         qwen2572bJson,
@@ -115,6 +119,7 @@ export default [
       NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
       DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson),
       CORTEX_API_URL: JSON.stringify('http://127.0.0.1:39291'),
+      CORTEX_SOCKET_URL: JSON.stringify('ws://127.0.0.1:39291'),
     }),
     // Allow json resolution
     json(),
diff --git a/extensions/inference-cortex-extension/src/@types/global.d.ts b/extensions/inference-cortex-extension/src/@types/global.d.ts
index 64ae5a6e7..48dbcd780 100644
--- a/extensions/inference-cortex-extension/src/@types/global.d.ts
+++ b/extensions/inference-cortex-extension/src/@types/global.d.ts
@@ -1,5 +1,6 @@
 declare const NODE: string
 declare const CORTEX_API_URL: string
+declare const CORTEX_SOCKET_URL: string
 declare const DEFAULT_SETTINGS: Array
 declare const MODELS: Array
diff --git a/extensions/inference-cortex-extension/src/index.ts b/extensions/inference-cortex-extension/src/index.ts
index d070ff9a3..44ec423da 100644
--- a/extensions/inference-cortex-extension/src/index.ts
+++ b/extensions/inference-cortex-extension/src/index.ts
@@ -16,17 +16,29 @@ import {
   getJanDataFolderPath,
   extractModelLoadParams,
   fs,
+  events,
+  ModelEvent
 } from '@janhq/core'
 import PQueue from 'p-queue'
 import ky from 'ky'
 
+/**
+ * Event subscription types of the Downloader
+ */
+enum DownloadTypes {
+  DownloadUpdated = 'onFileDownloadUpdate',
+  DownloadError = 'onFileDownloadError',
+  DownloadSuccess = 'onFileDownloadSuccess',
+  DownloadStopped = 'onFileDownloadStopped',
+  DownloadStarted = 'onFileDownloadStarted',
+}
+
 /**
  * A class that implements the InferenceExtension interface from the @janhq/core package.
  * The class provides methods for initializing and stopping a model, and for making inference requests.
  * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
  */
 export default class JanInferenceCortexExtension extends LocalOAIEngine {
-  // DEPRECATED
   nodeModule: string = 'node'
 
   queue = new PQueue({ concurrency: 1 })
@@ -38,6 +50,11 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
    */
   inferenceUrl = `${CORTEX_API_URL}/v1/chat/completions`
 
+  /**
+   * Socket instance for event subscriptions
+   */
+  socket?: WebSocket = undefined
+
   /**
    * Subscribes to events emitted by the @janhq/core package.
    */
@@ -55,6 +72,8 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
 
     this.queue.add(() => this.healthz())
 
+    this.subscribeToEvents()
+
     window.addEventListener('beforeunload', () => {
       this.clean()
     })
@@ -138,7 +157,7 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
           methods: ['get'],
         },
       })
-      .then(() => {})
+      .then(() => { })
   }
 
   /**
@@ -154,6 +173,50 @@ export default class JanInferenceCortexExtension extends LocalOAIEngine {
       // Do nothing
     })
   }
+
+  /**
+   * Subscribe to cortex.cpp websocket events
+   */
+  subscribeToEvents() {
+    this.queue.add(
+      () =>
+        new Promise((resolve) => {
+          this.socket = new WebSocket(`${CORTEX_SOCKET_URL}/events`)
+
+          this.socket.addEventListener('message', (event) => {
+            const data = JSON.parse(event.data)
+            const transferred = data.task.items.reduce(
+              (acc: number, cur: any) => acc + cur.downloadedBytes,
+              0
+            )
+            const total = data.task.items.reduce(
+              (acc: number, cur: any) => acc + cur.bytes,
+              0
+            )
+            const percent = total > 0 ? transferred / total : 0
+
+            events.emit(DownloadTypes[data.type as keyof typeof DownloadTypes], {
+              modelId: data.task.id,
+              percent: percent,
+              size: {
+                transferred: transferred,
+                total: total,
+              },
+            })
+            // Update models list from Hub
+            if (data.type === DownloadTypes.DownloadSuccess) {
+              // Delay for the state update from cortex.cpp
+              // Just to be sure
+              setTimeout(() => {
+                events.emit(ModelEvent.OnModelsUpdate, {})
+              }, 500)
+            }
+          })
+          resolve()
+        })
+    )
+  }
 }
 
 /// Legacy
diff --git a/extensions/model-extension/src/cortex.ts b/extensions/model-extension/src/cortex.ts
index b7111c859..7a65e8e3f 100644
--- a/extensions/model-extension/src/cortex.ts
+++ b/extensions/model-extension/src/cortex.ts
@@ -1,6 +1,6 @@
 import PQueue from 'p-queue'
 import ky from 'ky'
-import { events, extractModelLoadParams, Model, ModelEvent } from '@janhq/core'
+import { extractModelLoadParams, Model } from '@janhq/core'
 import { extractInferenceParams } from '@janhq/core'
 /**
  * cortex.cpp Model APIs interface
@@ -24,21 +24,11 @@ type ModelList = {
   data: any[]
 }
 
-enum DownloadTypes {
-  DownloadUpdated = 'onFileDownloadUpdate',
-  DownloadError = 'onFileDownloadError',
-  DownloadSuccess = 'onFileDownloadSuccess',
-  DownloadStopped = 'onFileDownloadStopped',
-  DownloadStarted = 'onFileDownloadStarted',
-}
-
 export class CortexAPI implements ICortexAPI {
   queue = new PQueue({ concurrency: 1 })
-  socket?: WebSocket = undefined
 
   constructor() {
     this.queue.add(() => this.healthz())
-    this.subscribeToEvents()
   }
 
   /**
@@ -172,49 +162,6 @@ export class CortexAPI implements ICortexAPI {
       .then(() => {})
   }
 
-  /**
-   * Subscribe to cortex.cpp websocket events
-   */
-  subscribeToEvents() {
-    this.queue.add(
-      () =>
-        new Promise((resolve) => {
-          this.socket = new WebSocket(`${SOCKET_URL}/events`)
-
-          this.socket.addEventListener('message', (event) => {
-            const data = JSON.parse(event.data)
-            const transferred = data.task.items.reduce(
-              (acc, cur) => acc + cur.downloadedBytes,
-              0
-            )
-            const total = data.task.items.reduce(
-              (acc, cur) => acc + cur.bytes,
-              0
-            )
-            const percent = total > 0 ? transferred / total : 0
-
-            events.emit(DownloadTypes[data.type], {
-              modelId: data.task.id,
-              percent: percent,
-              size: {
-                transferred: transferred,
-                total: total,
-              },
-            })
-            // Update models list from Hub
-            if (data.type === DownloadTypes.DownloadSuccess) {
-              // Delay for the state update from cortex.cpp
-              // Just to be sure
-              setTimeout(() => {
-                events.emit(ModelEvent.OnModelsUpdate, {})
-              }, 500)
-            }
-          })
-          resolve()
-        })
-    )
-  }
-
   /**
    * Transform model to the expected format (e.g. parameters, settings, metadata)
    * @param model
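The handler that moved into the cortex extension computes progress by summing the task's items. Factored into a pure function, the arithmetic is easy to sanity-check; the item type below is assumed from the two fields the handler actually reads, not a documented cortex.cpp schema:

```ts
// Assumed payload shape: only downloadedBytes and bytes are read by the handler.
interface DownloadItem {
  downloadedBytes: number
  bytes: number
}

function downloadProgress(items: DownloadItem[]) {
  const transferred = items.reduce((acc, cur) => acc + cur.downloadedBytes, 0)
  const total = items.reduce((acc, cur) => acc + cur.bytes, 0)
  // Guard the division: before any sizes are known, total can be 0.
  const percent = total > 0 ? transferred / total : 0
  return { transferred, total, percent }
}

// downloadProgress([{ downloadedBytes: 5, bytes: 10 }]).percent === 0.5
```

Note also that `DownloadTypes[data.type as keyof typeof DownloadTypes]` assumes cortex.cpp sends the enum key (e.g. `'DownloadSuccess'`) in `data.type`, not the mapped Jan event name.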
diff --git a/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx b/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx
index 14055b535..3dfdff2f9 100644
--- a/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx
+++ b/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx
@@ -79,7 +79,7 @@ const SystemMonitor = () => {
       {showSystemMonitorPanel && (
[The body of this one-line hunk was garbled in extraction: the JSX markup is gone, leaving only the fragments "{ramUtilitized}%", "{gpus.length > 0 && (" and "{gpus.map((gpu, index) => {". Given the ".system-monitor-panel" querySelector introduced in ThreadCenterPanel below, the change most plausibly adds a system-monitor-panel class to the panel's container element, but the exact markup is not recoverable.]
diff --git a/web/screens/Settings/Advanced/index.tsx b/web/screens/Settings/Advanced/index.tsx
index 0b1438c47..150f70398 100644
--- a/web/screens/Settings/Advanced/index.tsx
+++ b/web/screens/Settings/Advanced/index.tsx
@@ -189,7 +189,7 @@ const Advanced = () => {
    * @param gpuId
    * @returns
    */
-  const handleGPUChange = (gpuId: string) => {
+  const handleGPUChange = async (gpuId: string) => {
     let updatedGpusInUse = [...gpusInUse]
     if (updatedGpusInUse.includes(gpuId)) {
       updatedGpusInUse = updatedGpusInUse.filter((id) => id !== gpuId)
@@ -208,7 +208,7 @@ const Advanced = () => {
       updatedGpusInUse.push(gpuId)
     }
     setGpusInUse(updatedGpusInUse)
-    saveSettings({ gpusInUse: updatedGpusInUse })
+    await saveSettings({ gpusInUse: updatedGpusInUse })
     window.core?.api?.relaunch()
   }
@@ -306,8 +306,13 @@ const Advanced = () => {
               })
             }
             // Stop any running model to apply the changes
-            if (e.target.checked !== gpuEnabled)
-              stopModel().then(() => window.core?.api?.relaunch())
+            if (e.target.checked !== gpuEnabled) {
+              stopModel().finally(() => {
+                setTimeout(() => {
+                  window.location.reload()
+                }, 300)
+              })
+            }
           }}
         />
       }
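Two behavior changes ride along in this hunk: `saveSettings` is now awaited, so the relaunch can no longer race the settings write, and the GPU toggle moved from `.then` to `.finally`, so the reload fires even if `stopModel` rejects. A standalone sketch of that last point (illustrative only, not app code):

```ts
// .then(fn) is skipped when the promise rejects; .finally(fn) always runs.
const stopModel = (): Promise<void> =>
  Promise.reject(new Error('no model running')) // worst case, for illustration

stopModel()
  .finally(() => {
    // Runs on resolve and reject alike, so the app still reloads.
    setTimeout(() => console.log('reloading...'), 300)
  })
  .catch(() => {}) // .finally re-throws the rejection; swallow it here
```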
diff --git a/web/screens/Thread/ThreadCenterPanel/index.tsx b/web/screens/Thread/ThreadCenterPanel/index.tsx
index 3f74181f7..1f23e9dc5 100644
--- a/web/screens/Thread/ThreadCenterPanel/index.tsx
+++ b/web/screens/Thread/ThreadCenterPanel/index.tsx
@@ -147,6 +147,20 @@ const ThreadCenterPanel = () => {
 
   const showSystemMonitorPanel = useAtomValue(showSystemMonitorPanelAtom)
 
+  const [height, setHeight] = useState(0)
+
+  useEffect(() => {
+    if (showSystemMonitorPanel) {
+      const element = document.querySelector('.system-monitor-panel')
+
+      if (element) {
+        setHeight(element.clientHeight) // You can also use offsetHeight if needed
+      }
+    } else {
+      setHeight(0)
+    }
+  }, [showSystemMonitorPanel])
+
   return (
[The render hunk that follows was garbled in extraction: its JSX is gone, leaving only the fragments "{ )}" and "{activeThread ? (". The measured height is presumably threaded into the panel's styling around the "{activeThread ? (" branch, but the exact markup is not recoverable.]
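One-shot `clientHeight` reads like the effect above only re-measure when the panel toggles. If the panel can change size while open (the GPU list expanding, for instance), a ResizeObserver keeps the value in sync. A hypothetical alternative, not what this PR ships — the hook name and signature are ours:

```tsx
import { useEffect, useState } from 'react'

// Hypothetical helper: measure an element by CSS selector and stay in
// sync as it resizes, instead of reading clientHeight once per toggle.
function useMeasuredHeight(selector: string, enabled: boolean): number {
  const [height, setHeight] = useState(0)

  useEffect(() => {
    if (!enabled) {
      setHeight(0)
      return
    }
    const element = document.querySelector(selector)
    if (!element) return
    const observer = new ResizeObserver(() => setHeight(element.clientHeight))
    observer.observe(element)
    return () => observer.disconnect()
  }, [selector, enabled])

  return height
}

// Usage in ThreadCenterPanel:
// const height = useMeasuredHeight('.system-monitor-panel', showSystemMonitorPanel)
```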