Resolved conflicts by keeping HEAD changes

parent 19274f7e69
commit a8abc9f9aa
@@ -1,2 +0,0 @@
bin
!version.txt
@@ -1,75 +0,0 @@
# Create a Jan Extension using TypeScript

Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀

## Create Your Own Extension

To create your own extension, you can use this repository as a template! Just follow the instructions below:

1. Click the Use this template button at the top of the repository
2. Select Create a new repository
3. Select an owner and name for your new repository
4. Click Create repository
5. Clone your new repository

## Initial Setup

After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.

> [!NOTE]
>
> You'll need to have a reasonably modern version of
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
> [`nodenv`](https://github.com/nodenv/nodenv) or
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
> root of your repository to install the version specified in
> [`package.json`](./package.json). Otherwise, 20.x or later should work!

1. :hammer_and_wrench: Install the dependencies

   ```bash
   npm install
   ```

1. :building_construction: Package the TypeScript for distribution

   ```bash
   npm run bundle
   ```

1. :white_check_mark: Check your artifact

   A `.tgz` file will now be present in your extension directory.

## Update the Extension Metadata

The [`package.json`](package.json) file defines metadata about your extension, such as
the extension name, main entry point, description, and version.

When you copy this repository, update `package.json` with the name and description of your extension.

## Update the Extension Code

The [`src/`](./src/) directory is the heart of your extension! This contains the
source code that will be run when your extension functions are invoked. You can replace the
contents of this directory with your own code.

There are a few things to keep in mind when writing your extension code:

- Most Jan Extension functions are processed asynchronously.
  In `index.ts`, you will see that the extension function returns a `Promise<any>`.

  ```typescript
  import { events, MessageEvent, MessageRequest } from '@janhq/core'

  function onStart(): Promise<any> {
    return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
      this.inference(data)
    )
  }
  ```
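
  The handler itself can be asynchronous. As a minimal sketch, the `inference` helper referenced above might look like this (the body shown is hypothetical):

  ```typescript
  import { MessageRequest } from '@janhq/core'

  async function inference(data: MessageRequest): Promise<void> {
    // Run any asynchronous work (model calls, file I/O, ...) here.
    console.log('Received message request:', data)
  }
  ```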

For more information about the Jan Extension Core module, see the
[documentation](https://github.com/menloresearch/jan/blob/main/core/README.md).

So, what are you waiting for? Go ahead and start customizing your extension!
@@ -1 +0,0 @@
1.0.13-rc9
@@ -1,40 +0,0 @@
@echo off
set BIN_PATH=./bin
set SHARED_PATH=./../../electron/shared
set /p CORTEX_VERSION=<./bin/version.txt
set ENGINE_VERSION=b5509

@REM Download llama.cpp binaries
set DOWNLOAD_URL=https://github.com/menloresearch/llama.cpp/releases/download/%ENGINE_VERSION%/llama-%ENGINE_VERSION%-bin-win
set DOWNLOAD_GGML_URL=https://github.com/ggml-org/llama.cpp/releases/download/%ENGINE_VERSION%/llama-%ENGINE_VERSION%-bin-win
set CUDA_DOWNLOAD_URL=https://github.com/menloresearch/llama.cpp/releases/download/%ENGINE_VERSION%
set SUBFOLDERS=win-noavx-cuda-cu12.0-x64 win-noavx-cuda-cu11.7-x64 win-avx2-cuda-cu12.0-x64 win-avx2-cuda-cu11.7-x64 win-noavx-x64 win-avx-x64 win-avx2-x64 win-avx512-x64 win-vulkan-x64

call .\node_modules\.bin\download -e --strip 1 -o %BIN_PATH% https://github.com/menloresearch/cortex.cpp/releases/download/v%CORTEX_VERSION%/cortex-%CORTEX_VERSION%-windows-amd64.tar.gz
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-cu12.0-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-avx2-cuda-cu12.0-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-cu11.7-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-avx2-cuda-cu11.7-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_URL%-noavx-cuda-cu12.0-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-noavx-cuda-cu12.0-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_URL%-noavx-cuda-cu11.7-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-noavx-cuda-cu11.7-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_URL%-noavx-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-noavx-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-avx-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-avx2-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_URL%-avx512-x64.tar.gz -e --strip 2 -o %SHARED_PATH%/engines/llama.cpp/win-avx512-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %DOWNLOAD_GGML_URL%-vulkan-x64.zip -e --strip 1 -o %SHARED_PATH%/engines/llama.cpp/win-vulkan-x64/%ENGINE_VERSION%
call .\node_modules\.bin\download %CUDA_DOWNLOAD_URL%/cudart-llama-bin-win-cu12.0-x64.tar.gz -e --strip 1 -o %BIN_PATH%
call .\node_modules\.bin\download %CUDA_DOWNLOAD_URL%/cudart-llama-bin-win-cu11.7-x64.tar.gz -e --strip 1 -o %BIN_PATH%

move %BIN_PATH%\cortex-server-beta.exe %BIN_PATH%\cortex-server.exe
del %BIN_PATH%\cortex-beta.exe
del %BIN_PATH%\cortex.exe

@REM Loop through each folder and move DLLs
for %%F in (%SUBFOLDERS%) do (
  echo Processing folder: %SHARED_PATH%\engines\llama.cpp\%%F\%ENGINE_VERSION%

  @REM Move cu*.dll files
  for %%D in (%SHARED_PATH%\engines\llama.cpp\%%F\%ENGINE_VERSION%\cu*.dll) do (
    move "%%D" "%BIN_PATH%"
  )
)

echo DLL files moved successfully.
@@ -1,50 +0,0 @@
#!/bin/bash

# Read CORTEX_VERSION
CORTEX_VERSION=$(cat ./bin/version.txt)
ENGINE_VERSION=b5509
CORTEX_RELEASE_URL="https://github.com/menloresearch/cortex.cpp/releases/download"
ENGINE_DOWNLOAD_URL=https://github.com/menloresearch/llama.cpp/releases/download/${ENGINE_VERSION}/llama-${ENGINE_VERSION}-bin
CUDA_DOWNLOAD_URL=https://github.com/menloresearch/llama.cpp/releases/download/${ENGINE_VERSION}
BIN_PATH=./bin
SHARED_PATH="../../electron/shared"
# Detect platform
OS_TYPE=$(uname)

if [ "$OS_TYPE" == "Linux" ]; then
  # Linux downloads
  download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-${CORTEX_VERSION}-linux-amd64.tar.gz" -e --strip 1 -o "./bin"
  mv ./bin/cortex-server-beta ./bin/cortex-server
  rm -rf ./bin/cortex
  rm -rf ./bin/cortex-beta
  chmod +x "./bin/cortex-server"

  # Download engines for Linux
  download "${ENGINE_DOWNLOAD_URL}-linux-noavx-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-noavx-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-avx-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-avx-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-avx2-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-avx2-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-avx512-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-avx512-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-avx2-cuda-cu12.0-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-avx2-cuda-cu12.0-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-avx2-cuda-cu11.7-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-avx2-cuda-cu11.7-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-noavx-cuda-cu12.0-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-noavx-cuda-cu12.0-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-noavx-cuda-cu11.7-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-noavx-cuda-cu11.7-x64/${ENGINE_VERSION}" 1
  download "${ENGINE_DOWNLOAD_URL}-linux-vulkan-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/linux-vulkan-x64/${ENGINE_VERSION}" 1
  download "${CUDA_DOWNLOAD_URL}/cudart-llama-bin-linux-cu12.0-x64.tar.gz" -e --strip 1 -o "${BIN_PATH}" 1
  download "${CUDA_DOWNLOAD_URL}/cudart-llama-bin-linux-cu11.7-x64.tar.gz" -e --strip 1 -o "${BIN_PATH}" 1

elif [ "$OS_TYPE" == "Darwin" ]; then
  # macOS downloads
  download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-${CORTEX_VERSION}-mac-universal.tar.gz" -e --strip 1 -o "./bin" 1
  mv ./bin/cortex-server-beta ./bin/cortex-server
  rm -rf ./bin/cortex
  rm -rf ./bin/cortex-beta
  chmod +x "./bin/cortex-server"

  # Download engines for macOS
  download "${ENGINE_DOWNLOAD_URL}-macos-arm64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/macos-arm64/${ENGINE_VERSION}"
  download "${ENGINE_DOWNLOAD_URL}-macos-x64.tar.gz" -e --strip 2 -o "${SHARED_PATH}/engines/llama.cpp/macos-x64/${ENGINE_VERSION}"

else
  echo "Unsupported operating system: $OS_TYPE"
  exit 1
fi
@@ -1,67 +0,0 @@
{
  "name": "@janhq/inference-cortex-extension",
  "productName": "Cortex Inference Engine",
  "version": "1.0.25",
  "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
  "main": "dist/index.js",
  "node": "dist/node/index.cjs.js",
  "author": "Jan <service@jan.ai>",
  "license": "AGPL-3.0",
  "scripts": {
    "test": "vitest run",
    "build": "rolldown -c rolldown.config.mjs",
    "downloadcortex:linux:darwin": "./download.sh",
    "downloadcortex:win32": "download.bat",
    "downloadcortex": "run-script-os",
    "build:publish:darwin": "rimraf *.tgz --glob || true && yarn build && ../../.github/scripts/auto-sign.sh && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish:win32:linux": "rimraf *.tgz --glob || true && yarn build && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish": "run-script-os"
  },
  "exports": {
    ".": "./dist/index.js",
    "./main": "./dist/node/index.cjs.js"
  },
  "devDependencies": {
    "@jest/globals": "^29.7.0",
    "@types/decompress": "^4.2.7",
    "@types/jest": "^29.5.12",
    "@types/node": "^20.11.4",
    "@types/os-utils": "^0.0.4",
    "@types/tcp-port-used": "^1.0.4",
    "cpx": "^1.5.0",
    "download-cli": "^1.1.1",
    "jest": "^29.7.0",
    "rimraf": "^3.0.2",
    "rolldown": "1.0.0-beta.1",
    "run-script-os": "^1.1.6",
    "ts-jest": "^29.1.2",
    "typescript": "^5.3.3",
    "vitest": "^3.0.8"
  },
  "dependencies": {
    "@janhq/core": "../../core/package.tgz",
    "fetch-retry": "^5.0.6",
    "ky": "^1.7.2",
    "p-queue": "^8.0.1",
    "rxjs": "^7.8.1",
    "ulidx": "^2.3.0"
  },
  "engines": {
    "node": ">=18.0.0"
  },
  "files": [
    "dist/*",
    "package.json",
    "README.md"
  ],
  "bundleDependencies": [
    "tcp-port-used",
    "fetch-retry",
    "@janhq/core",
    "decompress"
  ],
  "installConfig": {
    "hoistingLimits": "workspaces"
  },
  "packageManager": "yarn@4.5.3"
}
@@ -1,126 +0,0 @@
[
  {
    "key": "auto_unload_models",
    "title": "Auto-Unload Old Models",
    "description": "Automatically unloads models that are not in use to free up memory. Ensure only one model is loaded at a time.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "context_shift",
    "title": "Context Shift",
    "description": "Automatically shifts the context window when the model is unable to process the entire prompt, ensuring that the most relevant information is always included.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": false
    }
  },
  {
    "key": "cont_batching",
    "title": "Continuous Batching",
    "description": "Allows processing prompts in parallel with text generation, which usually improves performance.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": ""
    }
  },
  {
    "key": "n_parallel",
    "title": "Parallel Operations",
    "description": "Number of prompts that can be processed simultaneously by the model.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "1",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "cpu_threads",
    "title": "CPU Threads",
    "description": "Number of CPU cores used for model processing when running without GPU.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "-1 (auto-detect)",
      "type": "number",
      "textAlign": "right"
    }
  },
  {
    "key": "threads_batch",
    "title": "Threads (Batch)",
    "description": "Number of threads for batch and prompt processing (default: same as Threads).",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "-1 (same as Threads)",
      "type": "number"
    }
  },
  {
    "key": "flash_attn",
    "title": "Flash Attention",
    "description": "Optimizes memory usage and speeds up model inference using an efficient attention implementation.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "caching_enabled",
    "title": "Caching",
    "description": "Stores recent prompts and responses to improve speed when similar questions are asked.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "cache_type",
    "title": "KV Cache Type",
    "description": "Controls memory usage and precision trade-off.",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": "q8_0",
      "options": [
        {
          "value": "q4_0",
          "name": "q4_0"
        },
        {
          "value": "q8_0",
          "name": "q8_0"
        },
        {
          "value": "f16",
          "name": "f16"
        }
      ]
    }
  },
  {
    "key": "use_mmap",
    "title": "mmap",
    "description": "Loads model files more efficiently by mapping them to memory, reducing RAM usage.",
    "controllerType": "checkbox",
    "controllerProps": {
      "value": true
    }
  },
  {
    "key": "hugging-face-access-token",
    "title": "Hugging Face Access Token",
    "description": "Access tokens programmatically authenticate your identity to the Hugging Face Hub, allowing applications to perform specific actions specified by the scope of permissions granted.",
    "controllerType": "input",
    "controllerProps": {
      "value": "",
      "placeholder": "hf_**********************************",
      "type": "password",
      "inputActions": ["unobscure", "copy"]
    }
  }
]
@@ -1,44 +0,0 @@
import { defineConfig } from 'rolldown'
import packageJson from './package.json' with { type: 'json' }
import defaultSettingJson from './resources/default_settings.json' with { type: 'json' }

export default defineConfig([
  {
    input: 'src/index.ts',
    output: {
      format: 'esm',
      file: 'dist/index.js',
    },
    platform: 'browser',
    define: {
      NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
      SETTINGS: JSON.stringify(defaultSettingJson),
      CORTEX_API_URL: JSON.stringify(
        `http://127.0.0.1:${process.env.CORTEX_API_PORT ?? '39291'}`
      ),
      CORTEX_SOCKET_URL: JSON.stringify(
        `ws://127.0.0.1:${process.env.CORTEX_API_PORT ?? '39291'}`
      ),
      CORTEX_ENGINE_VERSION: JSON.stringify('b5509'),
    },
  },
  {
    input: 'src/node/index.ts',
    external: ['@janhq/core/node'],
    output: {
      format: 'cjs',
      file: 'dist/node/index.cjs.js',
      sourcemap: false,
      inlineDynamicImports: true,
    },
    resolve: {
      extensions: ['.js', '.ts', '.json'],
    },
    define: {
      CORTEX_API_URL: JSON.stringify(
        `http://127.0.0.1:${process.env.CORTEX_API_PORT ?? '39291'}`
      ),
    },
    platform: 'node',
  },
])
@@ -1,5 +0,0 @@
declare const NODE: string
declare const CORTEX_API_URL: string
declare const CORTEX_SOCKET_URL: string
declare const CORTEX_ENGINE_VERSION: string
declare const SETTINGS: any
@@ -1,452 +0,0 @@
import { describe, beforeEach, it, expect, vi, afterEach } from 'vitest'

// Must mock before imports
vi.mock('@janhq/core', () => {
  return {
    executeOnMain: vi.fn().mockResolvedValue({}),
    events: {
      emit: vi.fn()
    },
    extractModelLoadParams: vi.fn().mockReturnValue({}),
    ModelEvent: {
      OnModelsUpdate: 'OnModelsUpdate',
      OnModelStopped: 'OnModelStopped'
    },
    EngineEvent: {
      OnEngineUpdate: 'OnEngineUpdate'
    },
    InferenceEngine: {
      cortex: 'cortex',
      nitro: 'nitro',
      cortex_llamacpp: 'cortex_llamacpp'
    },
    LocalOAIEngine: class LocalOAIEngine {
      onLoad() {}
      onUnload() {}
    }
  }
})

import JanInferenceCortexExtension, { Settings } from './index'
import { InferenceEngine, ModelEvent, EngineEvent, executeOnMain, events } from '@janhq/core'
import ky from 'ky'

// Mock global variables
const CORTEX_API_URL = 'http://localhost:3000'
const CORTEX_SOCKET_URL = 'ws://localhost:3000'
const SETTINGS = [
  { id: 'n_parallel', name: 'Parallel Execution', description: 'Number of parallel executions', type: 'number', value: '4' },
  { id: 'cont_batching', name: 'Continuous Batching', description: 'Enable continuous batching', type: 'boolean', value: true },
  { id: 'caching_enabled', name: 'Caching', description: 'Enable caching', type: 'boolean', value: true },
  { id: 'flash_attn', name: 'Flash Attention', description: 'Enable flash attention', type: 'boolean', value: true },
  { id: 'cache_type', name: 'Cache Type', description: 'Type of cache to use', type: 'string', value: 'f16' },
  { id: 'use_mmap', name: 'Use Memory Map', description: 'Use memory mapping', type: 'boolean', value: true },
  { id: 'cpu_threads', name: 'CPU Threads', description: 'Number of CPU threads', type: 'number', value: '' }
]
const NODE = 'node'

// Mock globals
vi.stubGlobal('CORTEX_API_URL', CORTEX_API_URL)
vi.stubGlobal('CORTEX_SOCKET_URL', CORTEX_SOCKET_URL)
vi.stubGlobal('SETTINGS', SETTINGS)
vi.stubGlobal('NODE', NODE)
vi.stubGlobal('window', {
  addEventListener: vi.fn()
})

// Mock WebSocket
class MockWebSocket {
  url: string
  listeners: {}
  onclose: Function

  constructor(url) {
    this.url = url
    this.listeners = {}
  }

  addEventListener(event, listener) {
    this.listeners[event] = listener
  }

  emit(event, data) {
    if (this.listeners[event]) {
      this.listeners[event](data)
    }
  }

  close() {
    if (this.onclose) {
      this.onclose({ code: 1000 })
    }
  }
}

// Mock global WebSocket
// @ts-ignore
global.WebSocket = vi.fn().mockImplementation((url) => new MockWebSocket(url))

describe('JanInferenceCortexExtension', () => {
  let extension

  beforeEach(() => {
    // Reset mocks
    vi.clearAllMocks()

    // Create a new instance for each test
    extension = new JanInferenceCortexExtension()

    // Mock the getSetting method
    extension.getSetting = vi.fn().mockImplementation((key, defaultValue) => {
      switch (key) {
        case Settings.n_parallel:
          return '4'
        case Settings.cont_batching:
          return true
        case Settings.caching_enabled:
          return true
        case Settings.flash_attn:
          return true
        case Settings.cache_type:
          return 'f16'
        case Settings.use_mmap:
          return true
        case Settings.cpu_threads:
          return ''
        default:
          return defaultValue
      }
    })

    // Mock methods
    extension.registerSettings = vi.fn()
    extension.onLoad = vi.fn()
    extension.clean = vi.fn().mockResolvedValue({})
    extension.healthz = vi.fn().mockResolvedValue({})
    extension.subscribeToEvents = vi.fn()
  })

  describe('onSettingUpdate', () => {
    it('should update n_parallel setting correctly', () => {
      extension.onSettingUpdate(Settings.n_parallel, '8')
      expect(extension.n_parallel).toBe(8)
    })

    it('should update cont_batching setting correctly', () => {
      extension.onSettingUpdate(Settings.cont_batching, false)
      expect(extension.cont_batching).toBe(false)
    })

    it('should update caching_enabled setting correctly', () => {
      extension.onSettingUpdate(Settings.caching_enabled, false)
      expect(extension.caching_enabled).toBe(false)
    })

    it('should update flash_attn setting correctly', () => {
      extension.onSettingUpdate(Settings.flash_attn, false)
      expect(extension.flash_attn).toBe(false)
    })

    it('should update cache_type setting correctly', () => {
      extension.onSettingUpdate(Settings.cache_type, 'f32')
      expect(extension.cache_type).toBe('f32')
    })

    it('should update use_mmap setting correctly', () => {
      extension.onSettingUpdate(Settings.use_mmap, false)
      expect(extension.use_mmap).toBe(false)
    })

    it('should update cpu_threads setting correctly', () => {
      extension.onSettingUpdate(Settings.cpu_threads, '4')
      expect(extension.cpu_threads).toBe(4)
    })

    it('should not update cpu_threads when value is not a number', () => {
      extension.cpu_threads = undefined
      extension.onSettingUpdate(Settings.cpu_threads, 'not-a-number')
      expect(extension.cpu_threads).toBeUndefined()
    })
  })

  describe('onUnload', () => {
    it('should clean up resources correctly', async () => {
      extension.shouldReconnect = true

      await extension.onUnload()

      expect(extension.shouldReconnect).toBe(false)
      expect(extension.clean).toHaveBeenCalled()
      expect(executeOnMain).toHaveBeenCalledWith(NODE, 'dispose')
    })
  })

  describe('loadModel', () => {
    it('should remove llama_model_path and mmproj from settings', async () => {
      // Setup
      const model = {
        id: 'test-model',
        settings: {
          llama_model_path: '/path/to/model',
          mmproj: '/path/to/mmproj',
          some_setting: 'value'
        },
        engine: InferenceEngine.cortex_llamacpp
      }

      // Mock ky.post
      vi.spyOn(ky, 'post').mockImplementation(() => ({
        // @ts-ignore
        json: () => Promise.resolve({}),
        catch: () => ({
          finally: () => ({
            // @ts-ignore
            then: () => Promise.resolve({})
          })
        })
      }))

      // Setup queue for testing
      extension.queue = { add: vi.fn(fn => fn()) }

      // Execute
      await extension.loadModel(model)

      // Verify settings were filtered
      expect(model.settings).not.toHaveProperty('llama_model_path')
      expect(model.settings).not.toHaveProperty('mmproj')
      expect(model.settings).toHaveProperty('some_setting')
    })

    it('should convert nitro to cortex_llamacpp engine', async () => {
      // Setup
      const model = {
        id: 'test-model',
        settings: {},
        engine: InferenceEngine.nitro
      }

      // Mock ky.post
      const mockKyPost = vi.spyOn(ky, 'post').mockImplementation(() => ({
        // @ts-ignore
        json: () => Promise.resolve({}),
        catch: () => ({
          finally: () => ({
            // @ts-ignore
            then: () => Promise.resolve({})
          })
        })
      }))

      // Setup queue for testing
      extension.queue = { add: vi.fn(fn => fn()) }

      // Execute
      await extension.loadModel(model)

      // Verify API call
      expect(mockKyPost).toHaveBeenCalledWith(
        `${CORTEX_API_URL}/v1/models/start`,
        expect.objectContaining({
          json: expect.objectContaining({
            engine: InferenceEngine.cortex_llamacpp
          })
        })
      )
    })
  })

  describe('unloadModel', () => {
    it('should call the correct API endpoint and abort loading if in progress', async () => {
      // Setup
      const model = { id: 'test-model' }
      const mockAbort = vi.fn()
      extension.abortControllers.set(model.id, { abort: mockAbort })

      // Mock ky.post
      const mockKyPost = vi.spyOn(ky, 'post').mockImplementation(() => ({
        // @ts-ignore
        json: () => Promise.resolve({}),
        finally: () => ({
          // @ts-ignore
          then: () => Promise.resolve({})
        })
      }))

      // Execute
      await extension.unloadModel(model)

      // Verify API call
      expect(mockKyPost).toHaveBeenCalledWith(
        `${CORTEX_API_URL}/v1/models/stop`,
        expect.objectContaining({
          json: { model: model.id }
        })
      )

      // Verify abort controller was called
      expect(mockAbort).toHaveBeenCalled()
    })
  })

  describe('clean', () => {
    it('should make a DELETE request to destroy process manager', async () => {
      // Mock the ky.delete function directly
      const mockDelete = vi.fn().mockReturnValue({
        catch: vi.fn().mockReturnValue(Promise.resolve({}))
      })

      // Replace the original implementation
      vi.spyOn(ky, 'delete').mockImplementation(mockDelete)

      // Override the clean method to use the real implementation
      // @ts-ignore
      extension.clean = JanInferenceCortexExtension.prototype.clean

      // Call the method
      await extension.clean()

      // Verify the correct API call was made
      expect(mockDelete).toHaveBeenCalledWith(
        `${CORTEX_API_URL}/processmanager/destroy`,
        expect.objectContaining({
          timeout: 2000,
          retry: expect.objectContaining({
            limit: 0
          })
        })
      )
    })
  })

  describe('WebSocket events', () => {
    it('should handle WebSocket events correctly', () => {
      // Create a mock implementation for subscribeToEvents that stores the socket
      let messageHandler;
      let closeHandler;

      // Override the private method
      extension.subscribeToEvents = function() {
        this.socket = new MockWebSocket('ws://localhost:3000/events');
        this.socket.addEventListener('message', (event) => {
          const data = JSON.parse(event.data);

          // Store for testing
          messageHandler = data;

          const transferred = data.task.items.reduce(
            (acc, cur) => acc + cur.downloadedBytes,
            0
          );
          const total = data.task.items.reduce(
            (acc, cur) => acc + cur.bytes,
            0
          );
          const percent = total > 0 ? transferred / total : 0;

          events.emit(
            data.type === 'DownloadUpdated' ? 'onFileDownloadUpdate' :
            data.type === 'DownloadSuccess' ? 'onFileDownloadSuccess' :
            data.type,
            {
              modelId: data.task.id,
              percent: percent,
              size: {
                transferred: transferred,
                total: total,
              },
              downloadType: data.task.type,
            }
          );

          if (data.task.type === 'Engine') {
            events.emit(EngineEvent.OnEngineUpdate, {
              type: data.type,
              percent: percent,
              id: data.task.id,
            });
          }
          else if (data.type === 'DownloadSuccess') {
            setTimeout(() => {
              events.emit(ModelEvent.OnModelsUpdate, {
                fetch: true,
              });
            }, 500);
          }
        });

        this.socket.onclose = (event) => {
          closeHandler = event;
          // Notify app to update model running state
          events.emit(ModelEvent.OnModelStopped, {});
        };
      };

      // Setup queue
      extension.queue = {
        add: vi.fn(fn => fn())
      };

      // Execute the method
      extension.subscribeToEvents();

      // Simulate a message event
      extension.socket.listeners.message({
        data: JSON.stringify({
          type: 'DownloadUpdated',
          task: {
            id: 'test-model',
            type: 'Model',
            items: [
              { downloadedBytes: 50, bytes: 100 }
            ]
          }
        })
      });

      // Verify event emission
      expect(events.emit).toHaveBeenCalledWith(
        'onFileDownloadUpdate',
        expect.objectContaining({
          modelId: 'test-model',
          percent: 0.5
        })
      );

      // Simulate a download success event
      vi.useFakeTimers();
      extension.socket.listeners.message({
        data: JSON.stringify({
          type: 'DownloadSuccess',
          task: {
            id: 'test-model',
            type: 'Model',
            items: [
              { downloadedBytes: 100, bytes: 100 }
            ]
          }
        })
      });

      // Fast-forward time to trigger the timeout
      vi.advanceTimersByTime(500);

      // Verify the ModelEvent.OnModelsUpdate event was emitted
      expect(events.emit).toHaveBeenCalledWith(
        ModelEvent.OnModelsUpdate,
        { fetch: true }
      );

      vi.useRealTimers();

      // Trigger websocket close
      extension.socket.onclose({ code: 1000 });

      // Verify OnModelStopped event was emitted
      expect(events.emit).toHaveBeenCalledWith(
        ModelEvent.OnModelStopped,
        {}
      );
    });
  })
})
@@ -1,435 +0,0 @@
/**
 * @file This file exports a class that implements the InferenceExtension interface from the @janhq/core package.
 * The class provides methods for initializing and stopping a model, and for making inference requests.
 * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
 * @version 1.0.0
 * @module inference-extension/src/index
 */

import {
  Model,
  EngineEvent,
  LocalOAIEngine,
  extractModelLoadParams,
  events,
  ModelEvent,
} from '@janhq/core'
import ky, { KyInstance } from 'ky'

/**
 * Event subscription types of Downloader
 */
enum DownloadTypes {
  DownloadUpdated = 'onFileDownloadUpdate',
  DownloadError = 'onFileDownloadError',
  DownloadSuccess = 'onFileDownloadSuccess',
  DownloadStopped = 'onFileDownloadStopped',
  DownloadStarted = 'onFileDownloadStarted',
}

enum Settings {
  n_parallel = 'n_parallel',
  cont_batching = 'cont_batching',
  caching_enabled = 'caching_enabled',
  flash_attn = 'flash_attn',
  cache_type = 'cache_type',
  use_mmap = 'use_mmap',
  cpu_threads = 'cpu_threads',
  huggingfaceToken = 'hugging-face-access-token',
  auto_unload_models = 'auto_unload_models',
  context_shift = 'context_shift',
}

type LoadedModelResponse = { data: { engine: string; id: string }[] }

/**
 * A class that implements the InferenceExtension interface from the @janhq/core package.
 * The class provides methods for initializing and stopping a model, and for making inference requests.
 * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
 */
export default class JanInferenceCortexExtension extends LocalOAIEngine {
  nodeModule: string = 'node'

  provider: string = 'cortex'

  shouldReconnect = true

  /** Default Engine model load settings */
  n_parallel?: number
  cont_batching: boolean = false
  caching_enabled: boolean = true
  flash_attn: boolean = true
  use_mmap: boolean = true
  cache_type: string = 'q8'
  cpu_threads?: number
  auto_unload_models: boolean = true
  reasoning_budget = -1 // Default reasoning budget in seconds
  context_shift = false
  /**
   * The URL for making inference requests.
   */
  inferenceUrl = `${CORTEX_API_URL}/v1/chat/completions`

  /**
   * Socket instance of events subscription
   */
  socket?: WebSocket = undefined

  abortControllers = new Map<string, AbortController>()

  api?: KyInstance
  /**
   * Get the API instance
   * @returns
   */
  async apiInstance(): Promise<KyInstance> {
    if (this.api) return this.api
    const apiKey = await window.core?.api.appToken()
    this.api = ky.extend({
      prefixUrl: CORTEX_API_URL,
      headers: apiKey
        ? {
            Authorization: `Bearer ${apiKey}`,
          }
        : {},
      retry: 10,
    })
    return this.api
  }

  /**
   * Authorization headers for the API requests.
   * @returns
   */
  headers(): Promise<HeadersInit> {
    return window.core?.api.appToken().then((token: string) => ({
      Authorization: `Bearer ${token}`,
    }))
  }

  /**
   * Called when the extension is loaded.
   */
  async onLoad() {
    super.onLoad()

    // Register Settings
    this.registerSettings(SETTINGS)

    const numParallel = await this.getSetting<string>(Settings.n_parallel, '')
    if (numParallel.length > 0 && parseInt(numParallel) > 0) {
      this.n_parallel = parseInt(numParallel)
    }
    if (this.n_parallel && this.n_parallel > 1)
      this.cont_batching = await this.getSetting<boolean>(
        Settings.cont_batching,
        false
      )
    this.caching_enabled = await this.getSetting<boolean>(
      Settings.caching_enabled,
      true
    )
    this.flash_attn = await this.getSetting<boolean>(Settings.flash_attn, true)
    this.context_shift = await this.getSetting<boolean>(
      Settings.context_shift,
      false
    )
    this.use_mmap = await this.getSetting<boolean>(Settings.use_mmap, true)
    if (this.caching_enabled)
      this.cache_type = await this.getSetting<string>(Settings.cache_type, 'q8')
    this.auto_unload_models = await this.getSetting<boolean>(
      Settings.auto_unload_models,
      true
    )
    const threads_number = Number(
      await this.getSetting<string>(Settings.cpu_threads, '')
    )

    if (!Number.isNaN(threads_number)) this.cpu_threads = threads_number

    const huggingfaceToken = await this.getSetting<string>(
      Settings.huggingfaceToken,
      ''
    )
    if (huggingfaceToken) {
      this.updateCortexConfig({ huggingface_token: huggingfaceToken })
    }
    this.subscribeToEvents()

    window.addEventListener('beforeunload', () => {
      this.clean()
    })

    // Migrate configs
    if (!localStorage.getItem('cortex_migration_completed')) {
      const config = await this.getCortexConfig()
      console.log('Start cortex.cpp migration', config)
      if (config && config.huggingface_token) {
        this.updateSettings([
          {
            key: Settings.huggingfaceToken,
            controllerProps: {
              value: config.huggingface_token,
            },
          },
        ])
        this.updateCortexConfig({
          huggingface_token: config.huggingface_token,
        })
        localStorage.setItem('cortex_migration_completed', 'true')
      }
    }
  }

  async onUnload() {
    console.log('Clean up cortex.cpp services')
    this.shouldReconnect = false
    this.clean()
    super.onUnload()
  }

  /**
   * Subscribe to settings update and make change accordingly
   * @param key
   * @param value
   */
  onSettingUpdate<T>(key: string, value: T): void {
    if (key === Settings.n_parallel && typeof value === 'string') {
      if (value.length > 0 && parseInt(value) > 0) {
        this.n_parallel = parseInt(value)
      }
    } else if (key === Settings.cont_batching && typeof value === 'boolean') {
      this.cont_batching = value as boolean
    } else if (key === Settings.caching_enabled && typeof value === 'boolean') {
      this.caching_enabled = value as boolean
    } else if (key === Settings.flash_attn && typeof value === 'boolean') {
      this.flash_attn = value as boolean
    } else if (key === Settings.cache_type && typeof value === 'string') {
      this.cache_type = value as string
    } else if (key === Settings.use_mmap && typeof value === 'boolean') {
      this.use_mmap = value as boolean
    } else if (key === Settings.cpu_threads && typeof value === 'string') {
      const threads_number = Number(value)
      if (!Number.isNaN(threads_number)) this.cpu_threads = threads_number
    } else if (key === Settings.huggingfaceToken) {
      this.updateCortexConfig({ huggingface_token: value })
    } else if (key === Settings.auto_unload_models) {
      this.auto_unload_models = value as boolean
    } else if (key === Settings.context_shift && typeof value === 'boolean') {
      this.context_shift = value
    }
  }

  override async loadModel(
    model: Partial<Model> & {
      id: string
      settings?: object
      file_path?: string
    },
    abortController: AbortController
  ): Promise<void> {
    // Cortex will handle these settings
    const { llama_model_path, mmproj, ...settings } = model.settings ?? {}
    model.settings = settings

    const controller = abortController ?? new AbortController()
    const { signal } = controller

    this.abortControllers.set(model.id, controller)

    const loadedModels = await this.activeModels()

    // This is to avoid loading the same model multiple times
    if (loadedModels.some((e: { id: string }) => e.id === model.id)) {
      console.log(`Model ${model.id} already loaded`)
      return
    }
    if (this.auto_unload_models) {
      // Unload the last used model if it is not the same as the current one
      for (const lastUsedModel of loadedModels) {
        if (lastUsedModel.id !== model.id) {
          console.log(`Unloading last used model: ${lastUsedModel.id}`)
          await this.unloadModel(lastUsedModel as Model)
        }
      }
    }
    const modelSettings = extractModelLoadParams(model.settings)
    return await this.apiInstance().then((api) =>
      api
        .post('v1/models/start', {
          json: {
            ...modelSettings,
            model: model.id,
            engine:
              model.engine === 'nitro' // Legacy model cache
                ? 'llama-cpp'
                : model.engine,
            ...(this.n_parallel ? { n_parallel: this.n_parallel } : {}),
            ...(this.use_mmap ? { use_mmap: true } : {}),
            ...(this.caching_enabled ? { caching_enabled: true } : {}),
            ...(this.flash_attn ? { flash_attn: true } : {}),
            ...(this.caching_enabled && this.cache_type
              ? { cache_type: this.cache_type }
              : {}),
            ...(this.cpu_threads && this.cpu_threads > 0
              ? { cpu_threads: this.cpu_threads }
              : {}),
            ...(this.cont_batching && this.n_parallel && this.n_parallel > 1
              ? { cont_batching: this.cont_batching }
              : {}),
            ...(model.id.toLowerCase().includes('jan-nano')
              ? { reasoning_budget: 0 }
              : { reasoning_budget: this.reasoning_budget }),
            ...(this.context_shift !== true // explicit true required to enable context shift
              ? { 'no-context-shift': true }
              : {}),
            ...(modelSettings.ngl === -1 || modelSettings.ngl === undefined
              ? { ngl: 100 }
              : {}),
          },
          timeout: false,
          signal,
        })
        .json()
        .catch(async (e) => {
          throw (await e.response?.json()) ?? e
        })
        .finally(() => this.abortControllers.delete(model.id))
        .then()
    )
  }

  override async unloadModel(model: Model): Promise<void> {
    return this.apiInstance().then((api) =>
      api
        .post('v1/models/stop', {
          json: { model: model.id },
          retry: {
            limit: 0,
          },
        })
        .json()
        .finally(() => {
          this.abortControllers.get(model.id)?.abort()
        })
        .then()
    )
  }

  async activeModels(): Promise<(object & { id: string })[]> {
    return await this.apiInstance()
      .then((e) =>
        e.get('inferences/server/models', {
          retry: {
            limit: 0, // Do not retry
          },
        })
      )
      .then((e) => e.json())
      .then((e) => (e as LoadedModelResponse).data ?? [])
      .catch(() => [])
  }

  /**
   * Clean cortex processes
   * @returns
   */
  private async clean(): Promise<any> {
    return this.apiInstance()
      .then((api) =>
        api.delete('processmanager/destroy', {
          timeout: 2000, // maximum 2 seconds
          retry: {
            limit: 0,
          },
        })
      )
      .catch(() => {
        // Do nothing
      })
  }

  /**
   * Update cortex config
   * @param body
   */
  private async updateCortexConfig(body: {
    [key: string]: any
  }): Promise<void> {
    return this.apiInstance()
      .then((api) => api.patch('v1/configs', { json: body }).then(() => {}))
      .catch((e) => console.debug(e))
  }

  /**
   * Get cortex config
   * @param body
   */
  private async getCortexConfig(): Promise<any> {
    return this.apiInstance()
      .then((api) => api.get('v1/configs').json())
      .catch((e) => console.debug(e))
  }

  /**
   * Subscribe to cortex.cpp websocket events
   */
  private subscribeToEvents() {
    this.socket = new WebSocket(`${CORTEX_SOCKET_URL}/events`)

    this.socket.addEventListener('message', (event) => {
      const data = JSON.parse(event.data)

      const transferred = data.task.items.reduce(
        (acc: number, cur: any) => acc + cur.downloadedBytes,
        0
      )
      const total = data.task.items.reduce(
        (acc: number, cur: any) => acc + cur.bytes,
        0
      )
      const percent = total > 0 ? transferred / total : 0
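      // Worked example of the math above (it mirrors the unit test): a task
      // with items [{ downloadedBytes: 50, bytes: 100 }] gives
      // transferred = 50, total = 100, and therefore percent = 0.5.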

      events.emit(DownloadTypes[data.type as keyof typeof DownloadTypes], {
        modelId: data.task.id,
        percent: percent,
        size: {
          transferred: transferred,
          total: total,
        },
        downloadType: data.task.type,
      })

      if (data.task.type === 'Engine') {
        events.emit(EngineEvent.OnEngineUpdate, {
          type: DownloadTypes[data.type as keyof typeof DownloadTypes],
          percent: percent,
          id: data.task.id,
        })
      } else {
        if (data.type === DownloadTypes.DownloadSuccess) {
          // Delay for the state update from cortex.cpp
          // Just to be sure
          setTimeout(() => {
            events.emit(ModelEvent.OnModelsUpdate, {
              fetch: true,
            })
          }, 500)
        }
      }
    })

    /**
     * This is to handle the server segfault issue
     */
    this.socket.onclose = (event) => {
      // Notify app to update model running state
      events.emit(ModelEvent.OnModelStopped, {})

      // Reconnect to the /events websocket
      if (this.shouldReconnect) {
        setTimeout(() => this.subscribeToEvents(), 1000)
      }
    }
  }
}
@@ -1,144 +0,0 @@
import { describe, it, expect, vi } from 'vitest'
// Mocks

const CORTEX_API_URL = 'http://localhost:3000'
vi.stubGlobal('CORTEX_API_URL', CORTEX_API_URL)

vi.mock('@janhq/core/node', (actual) => ({
  ...actual(),
  getJanDataFolderPath: () => '',
  appResourcePath: () => '/mock/path',
  log: vi.fn(),
  getSystemResourceInfo: () => {
    return {
      cpu: {
        cores: 1,
        logicalCores: 1,
        threads: 1,
        model: 'model',
        speed: 1,
      },
      memory: {
        total: 1,
        free: 1,
      },
      gpu: {
        model: 'model',
        memory: 1,
        cuda: {
          version: 'version',
          devices: 'devices',
        },
        vulkan: {
          version: 'version',
          devices: 'devices',
        },
      },
    }
  },
}))

vi.mock('fs', () => ({
  default: {
    readdirSync: () => [],
  },
}))

vi.mock('./watchdog', () => {
  return {
    ProcessWatchdog: vi.fn().mockImplementation(() => {
      return {
        start: vi.fn(),
        terminate: vi.fn(),
      }
    }),
  }
})

vi.mock('child_process', () => ({
  exec: () => {
    return {
      stdout: { on: vi.fn() },
      stderr: { on: vi.fn() },
      on: vi.fn(),
    }
  },
  spawn: () => {
    return {
      stdout: { on: vi.fn() },
      stderr: { on: vi.fn() },
      on: vi.fn(),
      pid: '111',
    }
  },
}))

import index from './index'

describe('Cortex extension node interface', () => {
  describe('run', () => {
    it('should start the cortex subprocess on macOS', async () => {
      Object.defineProperty(process, 'platform', {
        value: 'darwin',
      })

      const result = await index.run()
      expect(result).toBeUndefined()
    })

    it('should start the cortex subprocess on Windows', async () => {
      Object.defineProperty(process, 'platform', {
        value: 'win32',
      })

      const result = await index.run()
      expect(result).toBeUndefined()
    })

    it('should set the proper environment variables based on platform', async () => {
      // Test for Windows
      Object.defineProperty(process, 'platform', {
        value: 'win32',
      })
      process.env.PATH = '/original/path'

      await index.run()
      expect(process.env.PATH).toContain('/original/path')

      // Test for non-Windows (macOS/Linux)
      Object.defineProperty(process, 'platform', {
        value: 'darwin',
      })
      process.env.LD_LIBRARY_PATH = '/original/ld/path'

      await index.run()
      expect(process.env.LD_LIBRARY_PATH).toContain('/original/ld/path')
    })
  })

  describe('dispose', () => {
    it('should dispose a model successfully on Mac', async () => {
      Object.defineProperty(process, 'platform', {
        value: 'darwin',
      })

      // Call the dispose function
      const result = index.dispose()

      // Assert that the result is as expected
      expect(result).toBeUndefined()
    })

    it('should kill the subprocess successfully on Windows', async () => {
      Object.defineProperty(process, 'platform', {
        value: 'win32',
      })

      // Call the dispose function
      const result = index.dispose()

      // Assert that the result is as expected
      expect(result).toBeUndefined()
    })
  })
})
@@ -1,103 +0,0 @@
import path from 'path'
import { appResourcePath, getJanDataFolderPath, log } from '@janhq/core/node'
import { ProcessWatchdog } from './watchdog'

let watchdog: ProcessWatchdog | undefined = undefined

/**
 * Spawns a cortex subprocess.
 * @returns A promise that resolves when the cortex subprocess is started.
 */
function run(): Promise<any> {
  log(`[CORTEX]:: Spawning cortex subprocess...`)

  return new Promise<void>(async (resolve, reject) => {
    // let gpuVisibleDevices = systemInfo?.gpuSetting?.gpus_in_use.join(',') ?? ''
    let binaryName = `cortex-server${
      process.platform === 'win32' ? '.exe' : ''
    }`
    const binPath = path.join(__dirname, '..', 'bin')

    const executablePath = path.join(binPath, binaryName)

    addEnvPaths(binPath)

    const sharedPath = path.join(appResourcePath(), 'shared')
    // Execute the binary
    log(`[CORTEX]:: Spawn cortex at path: ${executablePath}`)

    const dataFolderPath = getJanDataFolderPath()
    if (watchdog) {
      watchdog.terminate()
    }

    // The HOST address to use for the cortex subprocess
    const LOCAL_PORT = CORTEX_API_URL.split(':').pop() ?? '39291'

    watchdog = new ProcessWatchdog(
      executablePath,
      [
        '--start-server',
        '--port',
        LOCAL_PORT.toString(),
        '--config_file_path',
        `${path.join(dataFolderPath, '.janrc')}`,
        '--data_folder_path',
        dataFolderPath,
        'config',
        '--api_keys',
        process.env.appToken ?? 'cortex.cpp',
      ],
      {
        env: {
          ...process.env,
          // CUDA_VISIBLE_DEVICES: gpuVisibleDevices,
          // // Vulkan - Support 1 device at a time for now
          // ...(gpuVisibleDevices?.length > 0 && {
          //   GGML_VK_VISIBLE_DEVICES: gpuVisibleDevices,
          // }),
        },
        cwd: sharedPath,
      }
    )
    watchdog.start()
    resolve()
  })
}

/**
 * Every module should have a dispose function
 * This will be called when the extension is unloaded and should clean up any resources
 * Also called when app is closed
 */
function dispose() {
  watchdog?.terminate()
}

/**
 * Set the environment paths for the cortex subprocess
 * @param dest
 */
function addEnvPaths(dest: string) {
  // Add engine path to the PATH and LD_LIBRARY_PATH
  if (process.platform === 'win32') {
    process.env.PATH = (process.env.PATH || '').concat(path.delimiter, dest)
  } else {
    process.env.LD_LIBRARY_PATH = (process.env.LD_LIBRARY_PATH || '').concat(
      path.delimiter,
      dest
    )
  }
}

/**
 * Cortex process info
 */
export interface CortexProcessInfo {
  isRunning: boolean
}

export default {
  run,
  dispose,
}
@@ -1,84 +0,0 @@
import { log } from '@janhq/core/node'
import { spawn, ChildProcess } from 'child_process'
import { EventEmitter } from 'events'

interface WatchdogOptions {
  cwd?: string
  restartDelay?: number
  maxRestarts?: number
  env?: NodeJS.ProcessEnv
}

export class ProcessWatchdog extends EventEmitter {
  private command: string
  private args: string[]
  private options: WatchdogOptions
  private process: ChildProcess | null
  private restartDelay: number
  private maxRestarts: number
  private restartCount: number
  private isTerminating: boolean

  constructor(command: string, args: string[], options: WatchdogOptions = {}) {
    super()
    this.command = command
    this.args = args
    this.options = options
    this.process = null
    this.restartDelay = options.restartDelay || 5000
    this.maxRestarts = options.maxRestarts || 5
    this.restartCount = 0
    this.isTerminating = false
  }

  start(): void {
    this.spawnProcess()
  }

  private spawnProcess(): void {
    if (this.isTerminating) return

    log(`Starting process: ${this.command} ${this.args.join(' ')}`)
    this.process = spawn(this.command, this.args, this.options)

    this.process.stdout?.on('data', (data: Buffer) => {
      log(`Process output: ${data}`)
      this.emit('output', data.toString())
    })

    this.process.stderr?.on('data', (data: Buffer) => {
      log(`Process error: ${data}`)
      this.emit('error', data.toString())
    })

    this.process.on('close', (code: number | null) => {
      log(`Process exited with code ${code}`)
      this.emit('close', code)
      if (!this.isTerminating) {
        this.restartProcess()
      }
    })
  }

  private restartProcess(): void {
    if (this.restartCount < this.maxRestarts) {
      this.restartCount++
      log(
        `Restarting process in ${this.restartDelay}ms (Attempt ${this.restartCount}/${this.maxRestarts})`
      )
      setTimeout(() => this.spawnProcess(), this.restartDelay)
    } else {
      log('Max restart attempts reached. Exiting watchdog.')
      this.emit('maxRestartsReached')
    }
  }

  terminate(): void {
    this.isTerminating = true
    if (this.process) {
      log('Terminating watched process...')
      this.process.kill()
    }
    this.emit('terminated')
  }
}
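// A minimal usage sketch of the class above (illustrative only; the binary
// path and arguments are hypothetical placeholders):
//
//   const dog = new ProcessWatchdog('./bin/cortex-server', ['--start-server'], {
//     restartDelay: 1000,
//     maxRestarts: 3,
//   })
//   dog.on('close', (code) => log(`watched process exited with code ${code}`))
//   dog.start()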
@@ -1,15 +0,0 @@
{
  "compilerOptions": {
    "moduleResolution": "node",
    "target": "es2016",
    "module": "esnext",
    "strict": true,
    "sourceMap": true,
    "esModuleInterop": true,
    "outDir": "dist",
    "importHelpers": true,
    "typeRoots": ["node_modules/@types"]
  },
  "include": ["src"],
  "exclude": ["src/**/*.test.ts"]
}
@@ -21,6 +21,7 @@
  },
  "dependencies": {
    "@janhq/core": "../../core/package.tgz",
    "@tauri-apps/api": "^1.4.0",
    "fetch-retry": "^5.0.6",
    "ulidx": "^2.3.0"
  },
@ -6,10 +6,55 @@
 * @module llamacpp-extension/src/index
 */

import { RemoteOAIEngine, getJanDataFolderPath, fs, ModelCapability, Model } from '@janhq/core'
import {
  AIEngine,
  localProvider,
  getJanDataFolderPath,
  fs,
  Model,
} from '@janhq/core'

export enum Settings {
  port = 'port',
import { invoke } from '@tauri-apps/api/tauri'
import {
  LocalProvider,
  ModelInfo,
  ListOptions,
  ListResult,
  PullOptions,
  PullResult,
  LoadOptions,
  SessionInfo,
  UnloadOptions,
  UnloadResult,
  ChatOptions,
  ChatCompletion,
  ChatCompletionChunk,
  DeleteOptions,
  DeleteResult,
  ImportOptions,
  ImportResult,
  AbortPullOptions,
  AbortPullResult,
  ChatCompletionRequest,
} from './types'

/**
 * Helper to convert a GGUF model filename to a more structured ID/name.
 * Example: "mistral-7b-instruct-v0.2.Q4_K_M.gguf" -> { baseModelId: "mistral-7b-instruct-v0.2", quant: "Q4_K_M" }
 **/
function parseGGUFFileName(filename: string): {
  baseModelId: string
  quant?: string
} {
  const nameWithoutExt = filename.replace(/\.gguf$/i, '')
  // Try to split by common quantization patterns (e.g., .Q4_K_M, -IQ2_XS)
  const match = nameWithoutExt.match(
    /^(.*?)[-_.]([QqIiFf]\w{1,3}_\w{1,3}|[Qq]\d+_[KkSsMmXxLl\d]+|[IiQq]\d+_[XxSsMm]+|[Qq]\d+)$/
  )
  if (match && match[1] && match[2]) {
    return { baseModelId: match[1], quant: match[2] }
  }
  return { baseModelId: nameWithoutExt }
}
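Concretely, the helper parses typical GGUF filenames like so (note the separator class must include `.` as well as `-`/`_` for the doc comment's own example to hold):

```typescript
parseGGUFFileName('mistral-7b-instruct-v0.2.Q4_K_M.gguf')
// -> { baseModelId: 'mistral-7b-instruct-v0.2', quant: 'Q4_K_M' }

parseGGUFFileName('llama-3-8b-IQ2_XS.gguf')
// -> { baseModelId: 'llama-3-8b', quant: 'IQ2_XS' }

parseGGUFFileName('model.gguf')
// no recognizable quant suffix -> { baseModelId: 'model' }
```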

/**
@ -17,99 +62,246 @@ export enum Settings {
 * The class provides methods for initializing and stopping a model, and for making inference requests.
 * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
 */
export default class LlamacppProvider extends RemoteOAIEngine {
  inferenceUrl: string = ''
  baseURL: string = ''
  provider: string = ENGINE
export default class inference_llamacpp_extension
  extends AIEngine
  implements localProvider
{
  provider: string = 'llamacpp'
  readonly providerId: string = 'llamacpp'

  private activeSessions: Map<string, SessionInfo> = new Map()

  private modelsBasePath!: string

  override async onLoad(): Promise<void> {
    super.onLoad()
    super.onLoad() // Calls registerEngine() from AIEngine
    this.registerSettings(SETTINGS_DEFINITIONS)

    // Register Settings
    this.registerSettings(SETTINGS)

    // register models
    const models = await this.listModels()
    this.registerModels(models)

    // NOTE: port 0 may mean request free port from OS. we may want
    // to take advantage of this. llama-server --port 0 on macOS works.
    const port = await this.getSetting<number>(Settings.port, 0)
    this.updateBaseUrl(port)
  }
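Since the NOTE above relies on the OS assigning a free port, a caller needs some way to discover when the spawned llama-server is actually ready. A minimal readiness probe, assuming llama-server's standard `/health` endpoint (an assumption; it is not referenced in this diff):

```typescript
// Poll the (assumed) llama-server /health endpoint until the model is ready.
async function waitForServer(port: number, timeoutMs = 30_000): Promise<void> {
  const deadline = Date.now() + timeoutMs
  while (Date.now() < deadline) {
    try {
      const res = await fetch(`http://127.0.0.1:${port}/health`)
      if (res.ok) return // 200 = server up and model loaded
    } catch {
      // server not accepting connections yet; keep polling
    }
    await new Promise((r) => setTimeout(r, 250))
  }
  throw new Error(`llama-server on port ${port} did not become ready`)
}
```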

  // onSettingUpdate<T>(key: string, value: T): void {
  //   if (key === Settings.apiKey) {
  //     this.apiKey = value as string
  //   } else if (key === Settings.baseUrl) {
  //     if (typeof value !== 'string') return
  //     this.updateBaseUrl(value)
  //   }
  // }

  updateBaseUrl(value: number): void {
    if (value == 0) {
      // set to default value
      SETTINGS.forEach((setting) => {
        if (setting.key === Settings.port) {
          value = setting.controllerProps.value as number
        }
      })
    const customPath = await this.getSetting<string>(
      LlamaCppSettings.ModelsPath,
      ''
    )
    if (customPath && (await fs.exists(customPath))) {
      this.modelsBasePath = customPath
    } else {
      this.modelsBasePath = await path.join(
        await getJanDataFolderPath(),
        'models',
        ENGINE_ID
      )
    }
    this.baseURL = `http://127.0.0.1:${value}`
    this.inferenceUrl = `${this.baseURL}/chat/completions`
    await fs.createDirAll(this.modelsBasePath)

    console.log(
      `${this.providerId} provider loaded. Models path: ${this.modelsBasePath}`
    )

    // Optionally, list and register models with the core system if AIEngine expects it
    // const models = await this.listModels({ providerId: this.providerId });
    // this.registerModels(this.mapModelInfoToCoreModel(models)); // mapModelInfoToCoreModel would be a helper
  }
  async listModels(): Promise<Model[]> {
    let modelIds = []
  async getModelsPath(): Promise<string> {
    // Ensure modelsBasePath is initialized
    if (!this.modelsBasePath) {
      const customPath = await this.getSetting<string>(
        LlamaCppSettings.ModelsPath,
        ''
      )
      if (customPath && (await fs.exists(customPath))) {
        this.modelsBasePath = customPath
      } else {
        this.modelsBasePath = await path.join(
          await getJanDataFolderPath(),
          'models',
          ENGINE_ID
        )
      }
      await fs.createDirAll(this.modelsBasePath)
    }
    return this.modelsBasePath
  }

    const modelsFolder = `${await getJanDataFolderPath()}/models`
  async listModels(_opts: ListOptions): Promise<ListResult> {
    const modelsDir = await this.getModelsPath()
    const result: ModelInfo[] = []

    // cortexso models
    const cortexsoFolder = `${modelsFolder}/cortex.so`
    const modelDirs = await fs.readdirSync(cortexsoFolder)
    for (const modelDir of modelDirs) {
      const modelName = modelDir.split('/').pop()
    try {
      if (!(await fs.exists(modelsDir))) {
        await fs.createDirAll(modelsDir)
        return []
      }

      // TODO: try removing this check
      // skip files start with . e.g. .DS_Store
      if (!modelName || modelName.startsWith('.')) continue
      const entries = await fs.readDir(modelsDir)
      for (const entry of entries) {
        if (entry.name?.endsWith('.gguf') && entry.isFile) {
          const modelPath = await path.join(modelsDir, entry.name)
          const stats = await fs.stat(modelPath) // Tauri's fs.stat or Node's fs.statSync
          const parsedName = parseGGUFFileName(entry.name)

          const variantDirs = await fs.readdirSync(modelDir)
          for (const variantDir of variantDirs) {
            // NOTE: we can't detect unfinished download here
            const ggufPath = `${variantDir}/model.gguf`

            if (await fs.existsSync(ggufPath)) {
              const variantName = variantDir.split('/').pop()
              modelIds.push(`${modelName}/${variantName}`)
          result.push({
            id: `${parsedName.baseModelId}${parsedName.quant ? `/${parsedName.quant}` : ''}`, // e.g., "mistral-7b/Q4_0"
            name: entry.name.replace('.gguf', ''), // Or a more human-friendly name
            quant_type: parsedName.quant,
            providerId: this.providerId,
            sizeBytes: stats.size,
            path: modelPath,
            tags: [this.providerId, parsedName.quant || 'unknown_quant'].filter(
              Boolean
            ) as string[],
          })
            }
          }
    } catch (error) {
      console.error(`[${this.providerId}] Error listing models:`, error)
      // Depending on desired behavior, either throw or return empty/partial list
    }
    return result
  }
  // pullModel
  async pullModel(opts: PullOptions): Promise<PullResult> {
    // TODO: Implement pullModel
    return { success: false, error: 'not implemented' }
  }

  // abortPull
  async abortPull(opts: AbortPullOptions): Promise<AbortPullResult> {
    // TODO: implement abortPull
    return { success: false, error: 'not implemented' }
  }

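For reference, one way the pull stub could be filled in is to reuse the `invoke` bridge pattern that `loadModel` below already uses. The `plugin:llamacpp|pull` command name and its response shape are assumptions, not part of this diff:

```typescript
// Hypothetical bridge: delegate the download to a Rust command, mirroring
// the 'plugin:llamacpp|load' pattern used by loadModel.
import { invoke } from '@tauri-apps/api/tauri'
import { PullOptions, PullResult } from './types'

async function pullViaRust(opts: PullOptions): Promise<PullResult> {
  try {
    // 'plugin:llamacpp|pull' is an assumed command name.
    const resp: { path: string } = await invoke('plugin:llamacpp|pull', {
      modelId: opts.modelId,
      downloadUrl: opts.downloadUrl,
    })
    return { success: true, path: resp.path }
  } catch (error: any) {
    return { success: false, error: error.message || String(error) }
  }
}
```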
  async loadModel(opts: LoadOptions): Promise<SessionInfo> {
    if (opts.providerId !== this.providerId) {
      throw new Error('Invalid providerId for LlamaCppProvider.loadModel')
    }

    // TODO: list models under huggingface.co
    const sessionId = uuidv4()
    const loadParams = {
      model_path: opts.modelPath,
      session_id: sessionId, // Pass sessionId to Rust for tracking
      // Default llama.cpp server options, can be overridden by opts.options
      port: opts.options?.port ?? 0, // 0 for dynamic port assignment by OS
      n_gpu_layers:
        opts.options?.n_gpu_layers ??
        (await this.getSetting(LlamaCppSettings.DefaultNGpuLayers, -1)),
      n_ctx:
        opts.options?.n_ctx ??
        (await this.getSetting(LlamaCppSettings.DefaultNContext, 2048)),
      // Spread any other options from opts.options
      ...(opts.options || {}),
    }

    const models = modelIds.map((modelId) => {
    try {
      console.log(
        `[${this.providerId}] Requesting to load model: ${opts.modelPath} with options:`,
        loadParams
      )
      // This matches the Rust handler: core::utils::extensions::inference_llamacpp_extension::server::load
      const rustResponse: {
        session_id: string
        port: number
        model_path: string
        settings: Record<string, unknown>
      } = await invoke('plugin:llamacpp|load', { params: loadParams }) // Adjust namespace if needed

      if (!rustResponse || !rustResponse.port) {
        throw new Error(
          'Rust load function did not return expected port or session info.'
        )
      }

      const sessionInfo: SessionInfo = {
        sessionId: rustResponse.session_id, // Use sessionId from Rust if it regenerates/confirms it
        port: rustResponse.port,
        modelPath: rustResponse.model_path,
        providerId: this.providerId,
        settings: rustResponse.settings, // Settings actually used by the server
      }

      this.activeSessions.set(sessionInfo.sessionId, sessionInfo)
      console.log(
        `[${this.providerId}] Model loaded: ${sessionInfo.modelPath} on port ${sessionInfo.port}, session: ${sessionInfo.sessionId}`
      )
      return sessionInfo
    } catch (error) {
      console.error(
        `[${this.providerId}] Error loading model ${opts.modelPath}:`,
        error
      )
      throw error // Re-throw to be handled by the caller
    }
  }
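End to end, a caller would combine `loadModel` with the per-session port it returns. A sketch; the `/v1/chat/completions` route (llama-server's OpenAI-compatible endpoint) is an assumption, and the paths are placeholders:

```typescript
// Sketch: load a model, then send one chat request to the per-session server.
import { LocalProvider } from './types'

async function demo(provider: LocalProvider) {
  const session = await provider.loadModel({
    providerId: 'llamacpp',
    modelPath: '/path/to/model.gguf',
    options: { n_ctx: 4096 },
  })
  const res = await fetch(
    `http://127.0.0.1:${session.port}/v1/chat/completions`,
    {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: session.modelPath,
        messages: [{ role: 'user', content: 'Hello!' }],
      }),
    }
  )
  console.log(await res.json())
}
```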
  async unloadModel(opts: UnloadOptions): Promise<UnloadResult> {
    if (opts.providerId !== this.providerId) {
      return { success: false, error: 'Invalid providerId' }
    }
    const session = this.activeSessions.get(opts.sessionId)
    if (!session) {
      return {
        sources: [],
        object: 'model',
        version: '1.0',
        format: 'api',
        id: modelId,
        name: modelId,
        created: 0,
        description: '',
        settings: {},
        parameters: {},
        metadata: {
          author: '',
          tags: [],
          size: 0,
        },
        engine: this.provider,
        capabilities: [ModelCapability.completion],
        success: false,
        error: `No active session found for id: ${opts.sessionId}`,
      }
    })
    return models
  }

    try {
      console.log(
        `[${this.providerId}] Requesting to unload model for session: ${opts.sessionId}`
      )
      // Matches: core::utils::extensions::inference_llamacpp_extension::server::unload
      const rustResponse: { success: boolean; error?: string } = await invoke(
        'plugin:llamacpp|unload',
        { sessionId: opts.sessionId }
      )

      if (rustResponse.success) {
        this.activeSessions.delete(opts.sessionId)
        console.log(
          `[${this.providerId}] Session ${opts.sessionId} unloaded successfully.`
        )
        return { success: true }
      } else {
        console.error(
          `[${this.providerId}] Failed to unload session ${opts.sessionId}: ${rustResponse.error}`
        )
        return {
          success: false,
          error: rustResponse.error || 'Unknown error during unload',
        }
      }
    } catch (error: any) {
      console.error(
        `[${this.providerId}] Error invoking unload for session ${opts.sessionId}:`,
        error
      )
      return { success: false, error: error.message || String(error) }
    }
  }
  async chat(
    opts: ChatOptions
  ): Promise<ChatCompletion | AsyncIterable<ChatCompletionChunk>> {}

  async deleteModel(opts: DeleteOptions): Promise<DeleteResult> {}

  async importModel(opts: ImportOptions): Promise<ImportResult> {}

  override async loadModel(model: Model): Promise<any> {
    if (model.engine?.toString() !== this.provider) return Promise.resolve()
    console.log(
      `[${this.providerId} AIEngine] Received OnModelInit for:`,
      model.id
    )
    return super.loadModel(model)
  }

  override async unloadModel(model?: Model): Promise<any> {
    if (model?.engine && model.engine.toString() !== this.provider)
      return Promise.resolve()
    console.log(
      `[${this.providerId} AIEngine] Received OnModelStop for:`,
      model?.id || 'all models'
    )
    return super.unloadModel(model)
  }
}
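The `chat`, `deleteModel`, and `importModel` bodies above are still empty stubs. A minimal non-streaming sketch of what `chat` could do, given the session map that `loadModel` populates; the `/v1/chat/completions` route is an assumption, not taken from this diff:

```typescript
// Hypothetical chat(): forward the OpenAI-style payload to the llama-server
// instance that loadModel started for this session.
import { ChatOptions, ChatCompletion, SessionInfo } from './types'

async function chat(
  sessions: Map<string, SessionInfo>,
  opts: ChatOptions
): Promise<ChatCompletion> {
  const session = sessions.get(opts.sessionId)
  if (!session) throw new Error(`No active session: ${opts.sessionId}`)
  const res = await fetch(
    `http://127.0.0.1:${session.port}/v1/chat/completions`,
    {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ ...opts.payload, stream: false }),
    }
  )
  if (!res.ok) throw new Error(`chat failed: HTTP ${res.status}`)
  return (await res.json()) as ChatCompletion
}
```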
extensions/llamacpp-extension/src/types.ts (new file, 199 lines)
@ -0,0 +1,199 @@
// src/providers/local/types.ts

// --- Re-using OpenAI types (minimal definitions for this example) ---
// In a real project, you'd import these from 'openai' or a shared types package.
export interface ChatCompletionRequestMessage {
  role: 'system' | 'user' | 'assistant' | 'tool';
  content: string | null;
  name?: string;
  tool_calls?: any[]; // Simplified
  tool_call_id?: string;
}

export interface ChatCompletionRequest {
  model: string; // Model ID, though for local it might be implicit via sessionId
  messages: ChatCompletionRequestMessage[];
  temperature?: number | null;
  top_p?: number | null;
  n?: number | null;
  stream?: boolean | null;
  stop?: string | string[] | null;
  max_tokens?: number;
  presence_penalty?: number | null;
  frequency_penalty?: number | null;
  logit_bias?: Record<string, number> | null;
  user?: string;
  // ... TODO: other OpenAI params
}

export interface ChatCompletionChunkChoiceDelta {
  content?: string | null;
  role?: 'system' | 'user' | 'assistant' | 'tool';
  tool_calls?: any[]; // Simplified
}

export interface ChatCompletionChunkChoice {
  index: number;
  delta: ChatCompletionChunkChoiceDelta;
  finish_reason?: 'stop' | 'length' | 'tool_calls' | 'content_filter' | 'function_call' | null;
}

export interface ChatCompletionChunk {
  id: string;
  object: 'chat.completion.chunk';
  created: number;
  model: string;
  choices: ChatCompletionChunkChoice[];
  system_fingerprint?: string;
}

export interface ChatCompletionChoice {
  index: number;
  message: ChatCompletionRequestMessage; // Response message
  finish_reason: 'stop' | 'length' | 'tool_calls' | 'content_filter' | 'function_call';
  logprobs?: any; // Simplified
}

export interface ChatCompletion {
  id: string;
  object: 'chat.completion';
  created: number;
  model: string; // Model ID used
  choices: ChatCompletionChoice[];
  usage?: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
  system_fingerprint?: string;
}
// --- End OpenAI types ---

// Shared model metadata
export interface ModelInfo {
  id: string; // e.g. "qwen3-4B" or "org/model/quant"
  name: string; // human-readable, e.g., "Qwen3 4B Q4_0"
  quant_type?: string; // q4_0 (optional as it might be part of ID or name)
  providerId: string; // e.g. "llama.cpp"
  sizeBytes: number;
  tags?: string[];
  path?: string; // Absolute path to the model file, if applicable
  // Additional provider-specific metadata can be added here
  [key: string]: any;
}

// 1. /list
export interface ListOptions {
  providerId: string; // To specify which provider if a central manager calls this
}
export type ListResult = ModelInfo[];

// 2. /pull
export interface PullOptions {
  providerId: string;
  modelId: string; // Identifier for the model to pull (e.g., from a known registry)
  downloadUrl: string; // URL to download the model from
  /** optional callback to receive download progress */
  onProgress?: (progress: { percent: number; downloadedBytes: number; totalBytes?: number; }) => void;
}
export interface PullResult {
  success: boolean;
  path?: string; // local file path to the pulled model
  error?: string;
  modelInfo?: ModelInfo; // Info of the pulled model
}

// 3. /load
export interface LoadOptions {
  providerId: string;
  modelPath: string;
  /** any provider-specific tuning options for llama.cpp server */
  options?: {
    port?: number; // 0 means dynamic port
    n_gpu_layers?: number;
    n_ctx?: number; // context size
    // ... other llama-cpp-python or llama.cpp server flags
    [key: string]: any;
  };
}

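As a small illustration of how the `onProgress` callback on `PullOptions` above might be fed, here is a plain fetch-streaming download loop (a sketch, not code from this diff; the disk write is elided):

```typescript
// Sketch: report download progress through PullOptions.onProgress.
async function downloadWithProgress(opts: PullOptions): Promise<void> {
  const res = await fetch(opts.downloadUrl)
  if (!res.ok || !res.body) throw new Error(`HTTP ${res.status}`)
  const totalBytes = Number(res.headers.get('content-length')) || undefined
  let downloadedBytes = 0
  const reader = res.body.getReader()
  for (;;) {
    const { done, value } = await reader.read()
    if (done) break
    downloadedBytes += value.byteLength
    opts.onProgress?.({
      percent: totalBytes ? (downloadedBytes / totalBytes) * 100 : 0,
      downloadedBytes,
      totalBytes,
    })
    // each chunk (Uint8Array) would be written to disk here
  }
}
```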
export interface SessionInfo {
  sessionId: string; // opaque handle for unload/chat
  port: number; // llama-server output port (corrected from portid)
  modelPath: string; // path of the loaded model
  providerId: string;
  settings: Record<string, unknown>; // The actual settings used to load
}

// 4. /unload
export interface UnloadOptions {
  providerId: string;
  sessionId: string;
}
export interface UnloadResult {
  success: boolean;
  error?: string;
}

// 5. /chat
export interface ChatOptions {
  providerId: string;
  sessionId: string;
  /** Full OpenAI ChatCompletionRequest payload */
  payload: ChatCompletionRequest;
}
// Output for /chat will be Promise<ChatCompletion> for non-streaming
// or Promise<AsyncIterable<ChatCompletionChunk>> for streaming

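For the streaming case described in the comment above, the caller simply iterates the returned `AsyncIterable`. A consumption sketch grounded only in the chunk types defined in this file:

```typescript
// Sketch: drain a streaming chat response chunk by chunk into a string.
async function collectStream(
  stream: AsyncIterable<ChatCompletionChunk>
): Promise<string> {
  let text = ''
  for await (const chunk of stream) {
    const choice = chunk.choices[0]
    if (choice?.delta?.content) text += choice.delta.content
    if (choice?.finish_reason === 'stop') break
  }
  return text
}
```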
// 6. /delete
export interface DeleteOptions {
  providerId: string;
  modelId: string; // The ID of the model to delete (implies finding its path)
  modelPath?: string; // Optionally, direct path can be provided
}
export interface DeleteResult {
  success: boolean;
  error?: string;
}

// 7. /import
export interface ImportOptions {
  providerId: string;
  sourcePath: string; // Path to the local model file to import
  desiredModelId?: string; // Optional: if user wants to name it specifically
}
export interface ImportResult {
  success: boolean;
  modelInfo?: ModelInfo;
  error?: string;
}

// 8. /abortPull
export interface AbortPullOptions {
  providerId: string;
  modelId: string; // The modelId whose download is to be aborted
}
export interface AbortPullResult {
  success: boolean;
  error?: string;
}

// The interface for any local provider
export interface LocalProvider {
  readonly providerId: string;

  listModels(opts: ListOptions): Promise<ListResult>;
  pullModel(opts: PullOptions): Promise<PullResult>;
  loadModel(opts: LoadOptions): Promise<SessionInfo>;
  unloadModel(opts: UnloadOptions): Promise<UnloadResult>;
  chat(opts: ChatOptions): Promise<ChatCompletion | AsyncIterable<ChatCompletionChunk>>;
  deleteModel(opts: DeleteOptions): Promise<DeleteResult>;
  importModel(opts: ImportOptions): Promise<ImportResult>;
  abortPull(opts: AbortPullOptions): Promise<AbortPullResult>;

  // Optional: for direct access to underlying client if needed for specific streaming cases
  getChatClient?(sessionId: string): any; // e.g., an OpenAI client instance configured for the session
}
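To see the whole contract in one place, here is a throwaway in-memory stub that satisfies `LocalProvider` (every value is illustrative; useful only as a compile-time check or test double):

```typescript
// Minimal stub demonstrating that a class satisfies LocalProvider.
class StubProvider implements LocalProvider {
  readonly providerId = 'stub'
  async listModels(): Promise<ListResult> {
    return []
  }
  async pullModel(): Promise<PullResult> {
    return { success: false, error: 'stub' }
  }
  async loadModel(opts: LoadOptions): Promise<SessionInfo> {
    return {
      sessionId: 'stub-session',
      port: 0,
      modelPath: opts.modelPath,
      providerId: this.providerId,
      settings: {},
    }
  }
  async unloadModel(): Promise<UnloadResult> {
    return { success: true }
  }
  async chat(): Promise<ChatCompletion> {
    throw new Error('not implemented')
  }
  async deleteModel(): Promise<DeleteResult> {
    return { success: true }
  }
  async importModel(): Promise<ImportResult> {
    return { success: false, error: 'stub' }
  }
  async abortPull(): Promise<AbortPullResult> {
    return { success: true }
  }
}
```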
extensions/yarn.lock (3700 lines changed; diff suppressed because it is too large)
@ -3,12 +3,9 @@ use std::{
    fs::{self, File},
    io::Read,
    path::PathBuf,
    sync::Arc,
};
use tar::Archive;
use tauri::{App, Emitter, Listener, Manager};
use tauri_plugin_shell::process::{CommandChild, CommandEvent};
use tauri_plugin_shell::ShellExt;
use tauri::{App, Emitter, Manager};
use tauri_plugin_store::StoreExt;
use tokio::sync::Mutex;
use tokio::time::{sleep, Duration}; // Using tokio::sync::Mutex
@ -200,22 +197,18 @@ pub fn setup_mcp(app: &App) {
    let state = app.state::<AppState>();
    let servers = state.mcp_servers.clone();
    let app_handle: tauri::AppHandle = app.handle().clone();

    // Setup kill-mcp-servers event listener (similar to cortex kill-sidecar)
    let app_handle_for_kill = app_handle.clone();
    app_handle.listen("kill-mcp-servers", move |_event| {
        let app_handle = app_handle_for_kill.clone();
        tauri::async_runtime::spawn(async move {
            log::info!("Received kill-mcp-servers event - cleaning up MCP servers");

            let app_state = app_handle.state::<AppState>();

            // Stop all running MCP servers
            if let Err(e) = super::mcp::stop_mcp_servers(app_state.mcp_servers.clone()).await {
                log::error!("Failed to stop MCP servers: {}", e);
                return;
            }

            // Clear active servers and restart counts
            {
                let mut active_servers = app_state.mcp_active_servers.lock().await;
@ -225,11 +218,9 @@ pub fn setup_mcp(app: &App) {
                let mut restart_counts = app_state.mcp_restart_counts.lock().await;
                restart_counts.clear();
            }

            log::info!("MCP servers cleaned up successfully");
        });
    });

    tauri::async_runtime::spawn(async move {
        if let Err(e) = run_mcp_commands(&app_handle, servers).await {
            log::error!("Failed to run mcp commands: {}", e);
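On the TypeScript side, anything with access to the Tauri event API can trigger this cleanup path. A one-function illustration (the event name comes from the listener above; `emit` is the @tauri-apps/api v1 event API):

```typescript
import { emit } from '@tauri-apps/api/event'

// Ask the Rust side to stop and clear all MCP servers.
export async function killMcpServers(): Promise<void> {
  await emit('kill-mcp-servers')
}
```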
@ -471,65 +462,22 @@ pub fn setup_sidecar(app: &App) -> Result<(), String> {

//pub fn setup_engine_binaries(app: &App) -> Result<(), String> {
//    // Copy engine binaries to app_data
//    let app_data_dir = app.handle().path().app_data_dir().unwrap();
//    let binaries_dir = app.handle().path().resource_dir().unwrap().join("binaries");
//    let themes_dir = app
//        .handle()
//        .path()
//        .resource_dir()
//        .unwrap()
//        .join("resources");
//
// Clean up function to kill the sidecar process
//
pub fn clean_up() {
    #[cfg(windows)]
    {
        use std::os::windows::process::CommandExt;
        let _ = std::process::Command::new("taskkill")
            .args(["-f", "-im", "llama-server.exe"])
            .creation_flags(0x08000000)
            .spawn();
        let _ = std::process::Command::new("taskkill")
            .args(["-f", "-im", "cortex-server.exe"])
            .creation_flags(0x08000000)
            .spawn();
    }
    #[cfg(unix)]
    {
        let _ = std::process::Command::new("pkill")
            .args(["-f", "llama-server"])
            .spawn();
        let _ = std::process::Command::new("pkill")
            .args(["-f", "cortex-server"])
            .spawn();
    }
    log::info!("Clean up function executed, sidecar processes killed.");
}
fn copy_dir_all(src: PathBuf, dst: PathBuf) -> Result<(), String> {
    fs::create_dir_all(&dst).map_err(|e| e.to_string())?;
    log::info!("Copying from {:?} to {:?}", src, dst);
    for entry in fs::read_dir(src).map_err(|e| e.to_string())? {
        let entry = entry.map_err(|e| e.to_string())?;
        let ty = entry.file_type().map_err(|e| e.to_string())?;
        if ty.is_dir() {
            copy_dir_all(entry.path(), dst.join(entry.file_name())).map_err(|e| e.to_string())?;
        } else {
            fs::copy(entry.path(), dst.join(entry.file_name())).map_err(|e| e.to_string())?;
        }
    }
    Ok(())
}
pub fn setup_engine_binaries(app: &App) -> Result<(), String> {
    // Copy engine binaries to app_data
    let app_data_dir = get_jan_data_folder_path(app.handle().clone());
    let binaries_dir = app.handle().path().resource_dir().unwrap().join("binaries");
    let resources_dir = app
        .handle()
        .path()
        .resource_dir()
        .unwrap()
        .join("resources");

    if let Err(e) = copy_dir_all(binaries_dir, app_data_dir.clone()) {
        log::error!("Failed to copy binaries: {}", e);
    }
    if let Err(e) = copy_dir_all(resources_dir, app_data_dir.clone()) {
        log::error!("Failed to copy resources: {}", e);
    }
    Ok(())
}
// if let Err(e) = copy_dir_all(binaries_dir, app_data_dir.clone()) {
//     log::error!("Failed to copy binaries: {}", e);
// }
// if let Err(e) = copy_dir_all(themes_dir, app_data_dir.clone()) {
//     log::error!("Failed to copy themes: {}", e);
// }
// Ok(())
//}
@ -10,8 +10,8 @@ use crate::core::state::AppState;
pub enum ServerError {
    #[error("Server is already running")]
    AlreadyRunning,
    #[error("Server is not running")]
    NotRunning,
    // #[error("Server is not running")]
    // NotRunning,
    #[error("Failed to locate server binary: {0}")]
    BinaryNotFound(String),
    #[error("Failed to determine resource path: {0}")]
@ -1,16 +1,14 @@
mod core;
use core::{
    cmd::get_jan_data_folder_path,
    setup::{self, setup_engine_binaries, setup_mcp, setup_sidecar},
    setup::{self, setup_mcp},
    state::{generate_app_token, AppState},
    utils::download::DownloadManagerState,
};
use std::{collections::HashMap, sync::Arc};

use tauri::Emitter;
use tokio::sync::Mutex;

use crate::core::setup::clean_up;

#[cfg_attr(mobile, tauri::mobile_entry_point)]
pub fn run() {
@ -122,17 +120,17 @@ pub fn run() {
            log::error!("Failed to install extensions: {}", e);
        }
        setup_mcp(app);
        setup_sidecar(app).expect("Failed to setup sidecar");
        setup_engine_binaries(app).expect("Failed to setup engine binaries");
        Ok(())
    })
    .on_window_event(|window, event| match event {
        tauri::WindowEvent::CloseRequested { .. } => {
            if window.label() == "main" {
                window.emit("kill-sidecar", ()).unwrap();
                window.emit("kill-mcp-servers", ()).unwrap();
                clean_up();
            }
            let client = Client::new();
            let url = "http://127.0.0.1:39291/processManager/destroy";
            let _ = client.delete(url).send();
        }
        _ => {}
    })