commit f2947c14f5
112  .github/workflows/jan-electron-linter-and-test.yml  vendored
@ -57,19 +57,19 @@ jobs:
|
||||
rm -rf ~/jan
|
||||
make clean
|
||||
|
||||
# - name: Get Commit Message for PR
|
||||
# if : github.event_name == 'pull_request'
|
||||
# run: |
|
||||
# echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}})" >> $GITHUB_ENV
|
||||
- name: Get Commit Message for PR
|
||||
if : github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}})" >> $GITHUB_ENV
|
||||
|
||||
# - name: Get Commit Message for push event
|
||||
# if : github.event_name == 'push'
|
||||
# run: |
|
||||
# echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}})" >> $GITHUB_ENV
|
||||
- name: Get Commit Message for push event
|
||||
if : github.event_name == 'push'
|
||||
run: |
|
||||
echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}})" >> $GITHUB_ENV
|
||||
|
||||
# - name: "Config report portal"
|
||||
# run: |
|
||||
# make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App macos" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
- name: "Config report portal"
|
||||
run: |
|
||||
make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App macos" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
|
||||
- name: Linter and test
|
||||
run: |
|
||||
@ -78,9 +78,9 @@ jobs:
|
||||
make test
|
||||
env:
|
||||
CSC_IDENTITY_AUTO_DISCOVERY: "false"
|
||||
# TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
# TURBO_TEAM: "macos"
|
||||
# TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
TURBO_TEAM: "macos"
|
||||
TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
|
||||
test-on-macos-pr-target:
|
||||
if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
|
||||
@ -141,16 +141,16 @@ jobs:
|
||||
}
|
||||
make clean
|
||||
|
||||
# - name: Get Commit Message for push event
|
||||
# if : github.event_name == 'push'
|
||||
# shell: bash
|
||||
# run: |
|
||||
# echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
|
||||
- name: Get Commit Message for push event
|
||||
if : github.event_name == 'push'
|
||||
shell: bash
|
||||
run: |
|
||||
echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
|
||||
|
||||
# - name: "Config report portal"
|
||||
# shell: bash
|
||||
# run: |
|
||||
# make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows ${{ matrix.antivirus-tools }}" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
- name: "Config report portal"
|
||||
shell: bash
|
||||
run: |
|
||||
make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows ${{ matrix.antivirus-tools }}" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
|
||||
- name: Linter and test
|
||||
shell: powershell
|
||||
@ -158,10 +158,10 @@ jobs:
|
||||
npm config set registry ${{ secrets.NPM_PROXY }} --global
|
||||
yarn config set registry ${{ secrets.NPM_PROXY }} --global
|
||||
make test
|
||||
# env:
|
||||
# TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
# TURBO_TEAM: "windows"
|
||||
# TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
env:
|
||||
TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
TURBO_TEAM: "windows"
|
||||
TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
test-on-windows-pr:
|
||||
if: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
|
||||
runs-on: windows-desktop-default-windows-security
|
||||
@ -189,16 +189,16 @@ jobs:
|
||||
}
|
||||
make clean
|
||||
|
||||
# - name: Get Commit Message for PR
|
||||
# if : github.event_name == 'pull_request'
|
||||
# shell: bash
|
||||
# run: |
|
||||
# echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
|
||||
- name: Get Commit Message for PR
|
||||
if : github.event_name == 'pull_request'
|
||||
shell: bash
|
||||
run: |
|
||||
echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
|
||||
|
||||
# - name: "Config report portal"
|
||||
# shell: bash
|
||||
# run: |
|
||||
# make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
- name: "Config report portal"
|
||||
shell: bash
|
||||
run: |
|
||||
make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
|
||||
- name: Linter and test
|
||||
shell: powershell
|
||||
@ -206,10 +206,10 @@ jobs:
|
||||
npm config set registry ${{ secrets.NPM_PROXY }} --global
|
||||
yarn config set registry ${{ secrets.NPM_PROXY }} --global
|
||||
make test
|
||||
# env:
|
||||
# TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
# TURBO_TEAM: "windows"
|
||||
# TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
env:
|
||||
TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
TURBO_TEAM: "windows"
|
||||
TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
|
||||
test-on-windows-pr-target:
|
||||
if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
|
||||
@ -266,20 +266,20 @@ jobs:
|
||||
rm -rf ~/jan
|
||||
make clean
|
||||
|
||||
# - name: Get Commit Message for PR
|
||||
# if : github.event_name == 'pull_request'
|
||||
# run: |
|
||||
# echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
|
||||
- name: Get Commit Message for PR
|
||||
if : github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
|
||||
|
||||
# - name: Get Commit Message for push event
|
||||
# if : github.event_name == 'push'
|
||||
# run: |
|
||||
# echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
|
||||
- name: Get Commit Message for push event
|
||||
if : github.event_name == 'push'
|
||||
run: |
|
||||
echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
|
||||
|
||||
# - name: "Config report portal"
|
||||
# shell: bash
|
||||
# run: |
|
||||
# make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Linux" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
- name: "Config report portal"
|
||||
shell: bash
|
||||
run: |
|
||||
make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Linux" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
|
||||
|
||||
- name: Linter and test
|
||||
run: |
|
||||
@ -288,10 +288,10 @@ jobs:
|
||||
npm config set registry ${{ secrets.NPM_PROXY }} --global
|
||||
yarn config set registry ${{ secrets.NPM_PROXY }} --global
|
||||
make test
|
||||
# env:
|
||||
# TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
# TURBO_TEAM: "linux"
|
||||
# TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
env:
|
||||
TURBO_API: "${{ secrets.TURBO_API }}"
|
||||
TURBO_TEAM: "linux"
|
||||
TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
|
||||
|
||||
test-on-ubuntu-pr-target:
|
||||
runs-on: [self-hosted, Linux, ubuntu-desktop]
|
||||
|
||||
44  .github/workflows/template-build-macos-arm64.yml  vendored
@ -41,7 +41,7 @@ on:
|
||||
|
||||
jobs:
|
||||
build-macos:
|
||||
runs-on: macos-silicon
|
||||
runs-on: macos-latest
|
||||
environment: production
|
||||
permissions:
|
||||
contents: write
|
||||
@ -55,15 +55,9 @@ jobs:
|
||||
uses: actions/setup-node@v1
|
||||
with:
|
||||
node-version: 20
|
||||
- name: Unblock keychain
|
||||
run: |
|
||||
security unlock-keychain -p ${{ secrets.KEYCHAIN_PASSWORD }} ~/Library/Keychains/login.keychain-db
|
||||
# - uses: actions/setup-python@v5
|
||||
# with:
|
||||
# python-version: '3.11'
|
||||
|
||||
# - name: Install jq
|
||||
# uses: dcarbone/install-jq-action@v2.0.1
|
||||
- name: Install jq
|
||||
uses: dcarbone/install-jq-action@v2.0.1
|
||||
|
||||
- name: Update app version based on latest release tag with build number
|
||||
if: inputs.public_provider != 'github'
|
||||
@ -101,17 +95,17 @@ jobs:
|
||||
env:
|
||||
VERSION_TAG: ${{ inputs.new_version }}
|
||||
|
||||
# - name: Get Cer for code signing
|
||||
# run: base64 -d <<< "$CODE_SIGN_P12_BASE64" > /tmp/codesign.p12
|
||||
# shell: bash
|
||||
# env:
|
||||
# CODE_SIGN_P12_BASE64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
|
||||
- name: Get Cer for code signing
|
||||
run: base64 -d <<< "$CODE_SIGN_P12_BASE64" > /tmp/codesign.p12
|
||||
shell: bash
|
||||
env:
|
||||
CODE_SIGN_P12_BASE64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
|
||||
|
||||
# - uses: apple-actions/import-codesign-certs@v2
|
||||
# continue-on-error: true
|
||||
# with:
|
||||
# p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
|
||||
# p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
|
||||
- uses: apple-actions/import-codesign-certs@v2
|
||||
continue-on-error: true
|
||||
with:
|
||||
p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
|
||||
p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
|
||||
|
||||
- name: Build and publish app to cloudflare r2 or github artifactory
|
||||
if: inputs.public_provider != 'github'
|
||||
@ -125,9 +119,9 @@ jobs:
|
||||
fi
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
# CSC_LINK: "/tmp/codesign.p12"
|
||||
# CSC_KEY_PASSWORD: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
|
||||
# CSC_IDENTITY_AUTO_DISCOVERY: "true"
|
||||
CSC_LINK: "/tmp/codesign.p12"
|
||||
CSC_KEY_PASSWORD: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
|
||||
CSC_IDENTITY_AUTO_DISCOVERY: "true"
|
||||
APPLE_ID: ${{ secrets.APPLE_ID }}
|
||||
APPLE_APP_SPECIFIC_PASSWORD: ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }}
|
||||
APP_PATH: "."
|
||||
@ -143,9 +137,9 @@ jobs:
|
||||
make build-and-publish
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
# CSC_LINK: "/tmp/codesign.p12"
|
||||
# CSC_KEY_PASSWORD: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
|
||||
# CSC_IDENTITY_AUTO_DISCOVERY: "true"
|
||||
CSC_LINK: "/tmp/codesign.p12"
|
||||
CSC_KEY_PASSWORD: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
|
||||
CSC_IDENTITY_AUTO_DISCOVERY: "true"
|
||||
APPLE_ID: ${{ secrets.APPLE_ID }}
|
||||
APPLE_APP_SPECIFIC_PASSWORD: ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }}
|
||||
APP_PATH: "."
|
||||
|
||||
@ -159,4 +159,3 @@ jobs:
|
||||
with:
|
||||
name: latest-mac-x64
|
||||
path: ./electron/dist/latest-mac.yml
|
||||
|
||||
|
||||
@ -68,14 +68,19 @@ export function requestInference(
|
||||
let cachedLines = ''
|
||||
for (const line of lines) {
|
||||
try {
|
||||
const toParse = cachedLines + line
|
||||
if (!line.includes('data: [DONE]')) {
|
||||
const data = JSON.parse(toParse.replace('data: ', ''))
|
||||
content += data.choices[0]?.delta?.content ?? ''
|
||||
if (content.startsWith('assistant: ')) {
|
||||
content = content.replace('assistant: ', '')
|
||||
if (transformResponse) {
|
||||
content += transformResponse(line)
|
||||
subscriber.next(content ?? '')
|
||||
} else {
|
||||
const toParse = cachedLines + line
|
||||
if (!line.includes('data: [DONE]')) {
|
||||
const data = JSON.parse(toParse.replace('data: ', ''))
|
||||
content += data.choices[0]?.delta?.content ?? ''
|
||||
if (content.startsWith('assistant: ')) {
|
||||
content = content.replace('assistant: ', '')
|
||||
}
|
||||
if (content !== '') subscriber.next(content)
|
||||
}
|
||||
if (content !== '') subscriber.next(content)
|
||||
}
|
||||
} catch {
|
||||
cachedLines = line
|
||||
|
||||
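The requestInference hunk above reworks the stream parser: when a provider defines transformResponse the raw line is handed to it, otherwise the line is treated as an OpenAI-style `data:` chunk, and a line that fails to parse is cached so it can be re-joined with the next chunk. A minimal standalone sketch of that line-buffering idea (the names here are illustrative, not the extension's actual API):

```typescript
// Hypothetical helper: feed it raw SSE lines, read back the accumulated content.
function createLineBufferParser() {
  let cachedLines = ''
  let content = ''

  return (line: string): string => {
    try {
      const toParse = cachedLines + line
      if (!line.includes('data: [DONE]')) {
        // OpenAI-style chunk: strip the "data: " prefix and read the delta.
        const data = JSON.parse(toParse.replace('data: ', ''))
        content += data.choices[0]?.delta?.content ?? ''
      }
      cachedLines = ''
    } catch {
      // Incomplete JSON: keep the fragment and retry once the rest arrives.
      cachedLines = line
    }
    return content
  }
}

// A chunk split mid-JSON is completed by the following chunk.
const push = createLineBufferParser()
push('data: {"choices":[{"delta":{"content":"Hel')
console.log(push('lo"}}]}')) // "Hello"
```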
@ -9,11 +9,11 @@ export const SUPPORTED_MODEL_FORMAT = '.gguf'
// The URL for the Nitro subprocess
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${NITRO_DEFAULT_PORT}`
// The URL for the Nitro subprocess to load a model
export const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/loadmodel`
export const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/loadmodel`
// The URL for the Nitro subprocess to validate a model
export const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/modelstatus`
export const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/modelstatus`

// The URL for the Nitro subprocess to kill itself
export const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`

export const DEFAULT_CHAT_COMPLETION_URL = `http://${LOCAL_HOST}:${NITRO_DEFAULT_PORT}/inferences/llamacpp/chat_completion` // default nitro url
export const DEFAULT_CHAT_COMPLETION_URL = `http://${LOCAL_HOST}:${NITRO_DEFAULT_PORT}/inferences/server/chat_completion` // default nitro url
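With the rename above, the local inference server is reached under `/inferences/server/...` instead of `/inferences/llamacpp/...`. As a rough illustration, a load-model request against the renamed route could look like the sketch below; the request body follows Nitro's documented `loadmodel` fields and is an assumption here, not something taken from this diff:

```typescript
// Illustrative only: POST a load-model request to the renamed cortex-cpp route.
const LOCAL_HOST = '127.0.0.1'
const NITRO_DEFAULT_PORT = 3928
const NITRO_HTTP_LOAD_MODEL_URL = `http://${LOCAL_HOST}:${NITRO_DEFAULT_PORT}/inferences/server/loadmodel`

async function loadModel(modelPath: string): Promise<void> {
  const res = await fetch(NITRO_HTTP_LOAD_MODEL_URL, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    // llama_model_path / ctx_len are assumed example fields for this sketch.
    body: JSON.stringify({ llama_model_path: modelPath, ctx_len: 2048 }),
  })
  if (!res.ok) throw new Error(`loadmodel failed with status ${res.status}`)
}
```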
@ -144,12 +144,12 @@ const runNitroAndLoadModel = async (modelId: string, modelSettings: NitroModelSe
|
||||
}
|
||||
|
||||
const spawnNitroProcess = async (): Promise<void> => {
|
||||
log(`[SERVER]::Debug: Spawning Nitro subprocess...`)
|
||||
log(`[SERVER]::Debug: Spawning cortex subprocess...`)
|
||||
|
||||
let binaryFolder = join(
|
||||
getJanExtensionsPath(),
|
||||
'@janhq',
|
||||
'inference-nitro-extension',
|
||||
'inference-cortex-extension',
|
||||
'dist',
|
||||
'bin'
|
||||
)
|
||||
@ -160,7 +160,7 @@ const spawnNitroProcess = async (): Promise<void> => {
|
||||
const args: string[] = ['1', LOCAL_HOST, NITRO_DEFAULT_PORT.toString()]
|
||||
// Execute the binary
|
||||
log(
|
||||
`[SERVER]::Debug: Spawn nitro at path: ${executableOptions.executablePath}, and args: ${args}`
|
||||
`[SERVER]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
|
||||
)
|
||||
subprocess = spawn(
|
||||
executableOptions.executablePath,
|
||||
@ -184,12 +184,12 @@ const spawnNitroProcess = async (): Promise<void> => {
|
||||
})
|
||||
|
||||
subprocess.on('close', (code: any) => {
|
||||
log(`[SERVER]::Debug: Nitro exited with code: ${code}`)
|
||||
log(`[SERVER]::Debug: cortex exited with code: ${code}`)
|
||||
subprocess = undefined
|
||||
})
|
||||
|
||||
tcpPortUsed.waitUntilUsed(NITRO_DEFAULT_PORT, 300, 30000).then(() => {
|
||||
log(`[SERVER]::Debug: Nitro is ready`)
|
||||
log(`[SERVER]::Debug: cortex is ready`)
|
||||
})
|
||||
}
|
||||
|
||||
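The spawn logic above boils down to: launch the cortex-cpp binary, log when it exits, and treat the engine as ready once its TCP port is in use. A condensed sketch of that pattern, with the executable path left as a placeholder and the arguments and timings mirroring the code above:

```typescript
import { spawn } from 'child_process'
import tcpPortUsed from 'tcp-port-used'

// Launch a server binary and resolve once its port starts accepting connections.
async function spawnAndWait(executablePath: string, port: number): Promise<void> {
  const child = spawn(executablePath, ['1', '127.0.0.1', port.toString()])
  child.on('close', (code) => console.log(`server exited with code ${code}`))
  // Poll every 300 ms and give up after 30 s - the same timings used above.
  await tcpPortUsed.waitUntilUsed(port, 300, 30000)
  console.log('server is ready')
}
```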
@ -203,13 +203,13 @@ const executableNitroFile = (): NitroExecutableOptions => {
|
||||
let binaryFolder = join(
|
||||
getJanExtensionsPath(),
|
||||
'@janhq',
|
||||
'inference-nitro-extension',
|
||||
'inference-cortex-extension',
|
||||
'dist',
|
||||
'bin'
|
||||
)
|
||||
|
||||
let cudaVisibleDevices = ''
|
||||
let binaryName = 'nitro'
|
||||
let binaryName = 'cortex-cpp'
|
||||
/**
|
||||
* The binary folder is different for each platform.
|
||||
*/
|
||||
@ -228,12 +228,16 @@ const executableNitroFile = (): NitroExecutableOptions => {
|
||||
}
|
||||
cudaVisibleDevices = nvidiaInfo['gpu_highest_vram']
|
||||
}
|
||||
binaryName = 'nitro.exe'
|
||||
binaryName = 'cortex-cpp.exe'
|
||||
} else if (process.platform === 'darwin') {
/**
* For macOS: mac-arm64 for Apple Silicon, mac-amd64 for Intel
*/
binaryFolder = join(binaryFolder, 'mac-universal')
if (process.arch === 'arm64') {
binaryFolder = join(binaryFolder, 'mac-arm64')
} else {
binaryFolder = join(binaryFolder, 'mac-amd64')
}
} else {
|
||||
/**
|
||||
* For Linux: linux-cpu, linux-cuda-11-7, linux-cuda-12-0
|
||||
@ -300,7 +304,7 @@ const loadLLMModel = async (settings: NitroModelSettings): Promise<Response> =>
|
||||
retryDelay: 500,
|
||||
})
|
||||
.then((res: any) => {
|
||||
log(`[SERVER]::Debug: Load model success with response ${JSON.stringify(res)}`)
|
||||
log(`[SERVER]::Debug: Load model request with response ${JSON.stringify(res)}`)
|
||||
return Promise.resolve(res)
|
||||
})
|
||||
.catch((err: any) => {
|
||||
@ -327,7 +331,7 @@ export const stopModel = async (_modelId: string) => {
|
||||
})
|
||||
}, 5000)
|
||||
const tcpPortUsed = require('tcp-port-used')
|
||||
log(`[SERVER]::Debug: Request to kill Nitro`)
|
||||
log(`[SERVER]::Debug: Request to kill cortex`)
|
||||
|
||||
fetch(NITRO_HTTP_KILL_URL, {
|
||||
method: 'DELETE',
|
||||
|
||||
@ -4,7 +4,7 @@ import { log } from './logger'
|
||||
|
||||
export const getSystemResourceInfo = async (): Promise<SystemResourceInfo> => {
|
||||
const cpu = await physicalCpuCount()
|
||||
log(`[NITRO]::CPU information - ${cpu}`)
|
||||
log(`[CORTEX]::CPU information - ${cpu}`)
|
||||
|
||||
return {
|
||||
numCpuPhysicalCore: cpu,
|
||||
|
||||
@ -19,6 +19,7 @@ export enum NativeRoute {
|
||||
showMainWindow = 'showMainWindow',
|
||||
|
||||
quickAskSizeUpdated = 'quickAskSizeUpdated',
|
||||
ackDeepLink = 'ackDeepLink',
|
||||
}
|
||||
|
||||
/**
|
||||
@ -45,6 +46,8 @@ export enum AppEvent {
|
||||
|
||||
onUserSubmitQuickAsk = 'onUserSubmitQuickAsk',
|
||||
onSelectedText = 'onSelectedText',
|
||||
|
||||
onDeepLink = 'onDeepLink',
|
||||
}
|
||||
|
||||
export enum DownloadRoute {
|
||||
|
||||
@ -151,4 +151,8 @@ export function handleAppIPCs() {
|
||||
async (_event, heightOffset: number): Promise<void> =>
|
||||
windowManager.expandQuickAskWindow(heightOffset)
|
||||
)
|
||||
|
||||
ipcMain.handle(NativeRoute.ackDeepLink, async (_event): Promise<void> => {
|
||||
windowManager.ackDeepLink()
|
||||
})
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import { app, BrowserWindow } from 'electron'
|
||||
|
||||
import { join } from 'path'
|
||||
import { join, resolve } from 'path'
|
||||
/**
|
||||
* Managers
|
||||
**/
|
||||
@ -39,15 +39,44 @@ const quickAskUrl = `${mainUrl}/search`
|
||||
|
||||
const gotTheLock = app.requestSingleInstanceLock()
|
||||
|
||||
if (process.defaultApp) {
|
||||
if (process.argv.length >= 2) {
|
||||
app.setAsDefaultProtocolClient('jan', process.execPath, [
|
||||
resolve(process.argv[1]),
|
||||
])
|
||||
}
|
||||
} else {
|
||||
app.setAsDefaultProtocolClient('jan')
|
||||
}
|
||||
|
||||
const createMainWindow = () => {
|
||||
const startUrl = app.isPackaged ? `file://${mainPath}` : mainUrl
|
||||
windowManager.createMainWindow(preloadPath, startUrl)
|
||||
}
|
||||
|
||||
app
|
||||
.whenReady()
|
||||
.then(() => {
|
||||
if (!gotTheLock) {
|
||||
app.quit()
|
||||
throw new Error('Another instance of the app is already running')
|
||||
} else {
|
||||
app.on(
|
||||
'second-instance',
|
||||
(_event, commandLine, _workingDirectory): void => {
|
||||
if (process.platform === 'win32' || process.platform === 'linux') {
|
||||
// this is for handling deep links on windows and linux
// since those OSes emit second-instance instead of open-url
|
||||
const url = commandLine.pop()
|
||||
if (url) {
|
||||
windowManager.sendMainAppDeepLink(url)
|
||||
}
|
||||
}
|
||||
windowManager.showMainWindow()
|
||||
}
|
||||
)
|
||||
}
|
||||
})
|
||||
.then(setupReactDevTool)
|
||||
.then(setupCore)
|
||||
.then(createUserSpace)
|
||||
.then(migrateExtensions)
|
||||
@ -60,6 +89,7 @@ app
|
||||
.then(registerGlobalShortcuts)
|
||||
.then(() => {
|
||||
if (!app.isPackaged) {
|
||||
setupReactDevTool()
|
||||
windowManager.mainWindow?.webContents.openDevTools()
|
||||
}
|
||||
})
|
||||
@ -75,11 +105,11 @@ app
|
||||
})
|
||||
})
|
||||
|
||||
app.on('second-instance', (_event, _commandLine, _workingDirectory) => {
|
||||
windowManager.showMainWindow()
|
||||
app.on('open-url', (_event, url) => {
|
||||
windowManager.sendMainAppDeepLink(url)
|
||||
})
|
||||
|
||||
app.on('before-quit', function (evt) {
|
||||
app.on('before-quit', function (_event) {
|
||||
trayManager.destroyCurrentTray()
|
||||
})
|
||||
|
||||
@ -104,11 +134,6 @@ function createQuickAskWindow() {
|
||||
windowManager.createQuickAskWindow(preloadPath, startUrl)
|
||||
}
|
||||
|
||||
function createMainWindow() {
|
||||
const startUrl = app.isPackaged ? `file://${mainPath}` : mainUrl
|
||||
windowManager.createMainWindow(preloadPath, startUrl)
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles various IPC messages from the renderer process.
|
||||
*/
|
||||
|
||||
@ -14,9 +14,9 @@ class WindowManager {
|
||||
private _quickAskWindowVisible = false
|
||||
private _mainWindowVisible = false
|
||||
|
||||
private deeplink: string | undefined
|
||||
/**
|
||||
* Creates a new window instance.
|
||||
* @param {Electron.BrowserWindowConstructorOptions} options - The options to create the window with.
|
||||
* @returns The created window instance.
|
||||
*/
|
||||
createMainWindow(preloadPath: string, startUrl: string) {
|
||||
@ -29,6 +29,17 @@ class WindowManager {
|
||||
},
|
||||
})
|
||||
|
||||
if (process.platform === 'win32' || process.platform === 'linux') {
|
||||
/// This is a workaround for deep links on Windows and Linux.
/// The second-instance event is not fired when the app is not yet open,
/// so the app would otherwise not receive the deep link.
|
||||
const commandLine = process.argv.slice(1)
|
||||
if (commandLine.length > 0) {
|
||||
const url = commandLine[0]
|
||||
this.sendMainAppDeepLink(url)
|
||||
}
|
||||
}
|
||||
|
||||
/* Load frontend app to the window */
|
||||
this.mainWindow.loadURL(startUrl)
|
||||
|
||||
@ -123,6 +134,22 @@ class WindowManager {
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to send the deep link to the main app.
|
||||
*/
|
||||
sendMainAppDeepLink(url: string): void {
|
||||
this.deeplink = url
|
||||
const interval = setInterval(() => {
|
||||
if (!this.deeplink) clearInterval(interval)
|
||||
const mainWindow = this.mainWindow
|
||||
if (mainWindow) {
|
||||
mainWindow.webContents.send(AppEvent.onDeepLink, this.deeplink)
|
||||
if (mainWindow.isMinimized()) mainWindow.restore()
|
||||
mainWindow.focus()
|
||||
}
|
||||
}, 500)
|
||||
}
|
||||
|
||||
cleanUp(): void {
|
||||
if (!this.mainWindow?.isDestroyed()) {
|
||||
this.mainWindow?.close()
|
||||
@ -137,6 +164,13 @@ class WindowManager {
|
||||
this._quickAskWindowVisible = false
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Acknowledges that the window has received the deep link, so it can be cleared.
|
||||
*/
|
||||
ackDeepLink() {
|
||||
this.deeplink = undefined
|
||||
}
|
||||
}
|
||||
|
||||
export const windowManager = new WindowManager()
|
||||
|
||||
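Taken together, the deep-link changes form a small handshake: the main process stores the URL, re-sends AppEvent.onDeepLink to the renderer every 500 ms, and only stops once the renderer calls the NativeRoute.ackDeepLink IPC route, which clears the stored URL. A sketch of the renderer side of that handshake, assuming direct access to ipcRenderer (the real UI would go through its preload bridge):

```typescript
import { ipcRenderer } from 'electron'

// Listen for the deep link the main process keeps re-sending...
ipcRenderer.on('onDeepLink', async (_event, url: string) => {
  console.log('received deep link:', url)
  // ...and acknowledge it so the main process stops re-sending and clears it.
  await ipcRenderer.invoke('ackDeepLink')
})
```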
@ -61,6 +61,14 @@
|
||||
"include": "scripts/uninstaller.nsh",
|
||||
"deleteAppDataOnUninstall": true
|
||||
},
|
||||
"protocols": [
|
||||
{
|
||||
"name": "Jan",
|
||||
"schemes": [
|
||||
"jan"
|
||||
]
|
||||
}
|
||||
],
|
||||
"artifactName": "jan-${os}-${arch}-${version}.${ext}"
|
||||
},
|
||||
"scripts": {
|
||||
@ -96,7 +104,7 @@
|
||||
"request": "^2.88.2",
|
||||
"request-progress": "^3.0.0",
|
||||
"ulidx": "^2.3.0",
|
||||
"@nut-tree/nut-js": "^4.0.0"
|
||||
"@kirillvakalov/nut-tree__nut-js": "4.2.1-2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@electron/notarize": "^2.1.0",
|
||||
|
||||
@ -1,17 +1,13 @@
|
||||
import { app } from 'electron'
|
||||
|
||||
export const setupReactDevTool = async () => {
|
||||
if (!app.isPackaged) {
|
||||
// Which means you're running from source code
|
||||
const { default: installExtension, REACT_DEVELOPER_TOOLS } = await import(
|
||||
'electron-devtools-installer'
|
||||
) // Don't use import on top level, since the installer package is dev-only
|
||||
try {
|
||||
const name = await installExtension(REACT_DEVELOPER_TOOLS)
|
||||
console.debug(`Added Extension: ${name}`)
|
||||
} catch (err) {
|
||||
console.error('An error occurred while installing devtools:', err)
|
||||
// Only log the error and don't throw it because it's not critical
|
||||
}
|
||||
// Which means you're running from source code
|
||||
const { default: installExtension, REACT_DEVELOPER_TOOLS } = await import(
|
||||
'electron-devtools-installer'
|
||||
) // Don't use import on top level, since the installer package is dev-only
|
||||
try {
|
||||
const name = await installExtension(REACT_DEVELOPER_TOOLS)
|
||||
console.debug(`Added Extension: ${name}`)
|
||||
} catch (err) {
|
||||
console.error('An error occurred while installing devtools:', err)
|
||||
// Only log the error and don't throw it because it's not critical
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
import { clipboard, globalShortcut } from 'electron'
|
||||
import { keyboard, Key } from '@nut-tree/nut-js'
|
||||
import { keyboard, Key } from "@kirillvakalov/nut-tree__nut-js"
|
||||
|
||||
/**
|
||||
* Gets selected text by synthesizing the keyboard shortcut
|
||||
|
||||
@ -10,11 +10,12 @@ export function toolRetrievalUpdateTextSplitter(
|
||||
}
|
||||
export async function toolRetrievalIngestNewDocument(
|
||||
file: string,
|
||||
model: string,
|
||||
engine: string
|
||||
) {
|
||||
const filePath = path.join(getJanDataFolderPath(), normalizeFilePath(file))
|
||||
const threadPath = path.dirname(filePath.replace('files', ''))
|
||||
retrieval.updateEmbeddingEngine(engine)
|
||||
retrieval.updateEmbeddingEngine(model, engine)
|
||||
return retrieval
|
||||
.ingestAgentKnowledge(filePath, `${threadPath}/memory`)
|
||||
.catch((err) => {
|
||||
|
||||
@ -28,14 +28,14 @@ export class Retrieval {
|
||||
})
|
||||
}
|
||||
|
||||
public updateEmbeddingEngine(engine: string): void {
|
||||
public updateEmbeddingEngine(model: string, engine: string): void {
|
||||
// Engine settings are not compatible with the current embedding model params
|
||||
// Switch case manually for now
|
||||
if (engine === 'nitro') {
|
||||
this.embeddingModel = new OpenAIEmbeddings(
|
||||
{ openAIApiKey: 'nitro-embedding' },
|
||||
{ openAIApiKey: 'nitro-embedding', model },
|
||||
// TODO: Raw settings
|
||||
{ basePath: 'http://127.0.0.1:3928/v1' }
|
||||
{ basePath: 'http://127.0.0.1:3928/v1' },
|
||||
)
|
||||
} else {
|
||||
// Fallback to OpenAI Settings
|
||||
|
||||
@ -36,6 +36,7 @@ export class RetrievalTool extends InferenceTool {
|
||||
NODE,
|
||||
'toolRetrievalIngestNewDocument',
|
||||
docFile,
|
||||
data.model?.id,
|
||||
data.model?.engine
|
||||
)
|
||||
} else {
|
||||
|
||||
79  extensions/inference-anthropic-extension/README.md  Normal file
@ -0,0 +1,79 @@
|
||||
# Anthropic Engine Extension
|
||||
|
||||
Created using Jan extension example
|
||||
|
||||
# Create a Jan Extension using Typescript
|
||||
|
||||
Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀
|
||||
|
||||
## Create Your Own Extension
|
||||
|
||||
To create your own extension, you can use this repository as a template! Just follow the instructions below:
|
||||
|
||||
1. Click the Use this template button at the top of the repository
|
||||
2. Select Create a new repository
|
||||
3. Select an owner and name for your new repository
|
||||
4. Click Create repository
|
||||
5. Clone your new repository
|
||||
|
||||
## Initial Setup
|
||||
|
||||
After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.
|
||||
|
||||
> [!NOTE]
|
||||
>
|
||||
> You'll need to have a reasonably modern version of
|
||||
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
|
||||
> [`nodenv`](https://github.com/nodenv/nodenv) or
|
||||
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
|
||||
> root of your repository to install the version specified in
|
||||
> [`package.json`](./package.json). Otherwise, 20.x or later should work!
|
||||
|
||||
1. :hammer_and_wrench: Install the dependencies
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
1. :building_construction: Package the TypeScript for distribution
|
||||
|
||||
```bash
|
||||
npm run bundle
|
||||
```
|
||||
|
||||
1. :white_check_mark: Check your artifact
|
||||
|
||||
There will be a tgz file in your extension directory now
|
||||
|
||||
## Update the Extension Metadata
|
||||
|
||||
The [`package.json`](package.json) file defines metadata about your extension, such as
|
||||
extension name, main entry, description and version.
|
||||
|
||||
When you copy this repository, update `package.json` with the name and description of your extension.
|
||||
|
||||
## Update the Extension Code
|
||||
|
||||
The [`src/`](./src/) directory is the heart of your extension! This contains the
|
||||
source code that will be run when your extension functions are invoked. You can replace the
|
||||
contents of this directory with your own code.
|
||||
|
||||
There are a few things to keep in mind when writing your extension code:
|
||||
|
||||
- Most Jan Extension functions are processed asynchronously.
|
||||
In `index.ts`, you will see that the extension function will return a `Promise<any>`.
|
||||
|
||||
```typescript
|
||||
import { events, MessageEvent, MessageRequest } from '@janhq/core'
|
||||
|
||||
function onStart(): Promise<any> {
|
||||
return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
|
||||
this.inference(data)
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
For more information about the Jan Extension Core module, see the
|
||||
[documentation](https://github.com/janhq/jan/blob/main/core/README.md).
|
||||
|
||||
So, what are you waiting for? Go ahead and start customizing your extension!
|
||||
43  extensions/inference-anthropic-extension/package.json  Normal file
@ -0,0 +1,43 @@
|
||||
{
|
||||
"name": "@janhq/inference-anthropic-extension",
|
||||
"productName": "Anthropic Inference Engine",
|
||||
"version": "1.0.0",
|
||||
"description": "This extension enables Anthropic chat completion API calls",
|
||||
"main": "dist/index.js",
|
||||
"module": "dist/module.js",
|
||||
"engine": "anthropic",
|
||||
"author": "Jan <service@jan.ai>",
|
||||
"license": "AGPL-3.0",
|
||||
"scripts": {
|
||||
"build": "tsc -b . && webpack --config webpack.config.js",
|
||||
"build:publish": "rimraf *.tgz --glob && yarn build && npm pack && cpx *.tgz ../../pre-install",
|
||||
"sync:core": "cd ../.. && yarn build:core && cd extensions && rm yarn.lock && cd inference-anthropic-extension && yarn && yarn build:publish"
|
||||
},
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
"./main": "./dist/module.js"
|
||||
},
|
||||
"devDependencies": {
|
||||
"cpx": "^1.5.0",
|
||||
"rimraf": "^3.0.2",
|
||||
"webpack": "^5.88.2",
|
||||
"webpack-cli": "^5.1.4",
|
||||
"ts-loader": "^9.5.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@janhq/core": "file:../../core",
|
||||
"fetch-retry": "^5.0.6",
|
||||
"ulidx": "^2.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18.0.0"
|
||||
},
|
||||
"files": [
|
||||
"dist/*",
|
||||
"package.json",
|
||||
"README.md"
|
||||
],
|
||||
"bundleDependencies": [
|
||||
"fetch-retry"
|
||||
]
|
||||
}
|
||||
@ -0,0 +1,83 @@
|
||||
[
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://www.anthropic.com/"
|
||||
}
|
||||
],
|
||||
"id": "claude-3-opus-20240229",
|
||||
"object": "model",
|
||||
"name": "Claude 3 Opus",
|
||||
"version": "1.0",
|
||||
"description": "Claude 3 Opus is a powerful model suitables for highly complex task.",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7,
|
||||
"stream": false
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Anthropic",
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "anthropic"
|
||||
},
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://www.anthropic.com/"
|
||||
}
|
||||
],
|
||||
"id": "claude-3-sonnet-20240229",
|
||||
"object": "model",
|
||||
"name": "Claude 3 Sonnet",
|
||||
"version": "1.0",
|
||||
"description": "Claude 3 Sonnet is an ideal model balance of intelligence and speed for enterprise workloads.",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7,
|
||||
"stream": false
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Anthropic",
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "anthropic"
|
||||
},
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://www.anthropic.com/"
|
||||
}
|
||||
],
|
||||
"id": "claude-3-haiku-20240307",
|
||||
"object": "model",
|
||||
"name": "Claude 3 Haiku",
|
||||
"version": "1.0",
|
||||
"description": "Claude 3 Haiku is the fastest model provides near-instant responsiveness.",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7,
|
||||
"stream": false
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Anthropic",
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "anthropic"
|
||||
}
|
||||
]
|
||||
@ -0,0 +1,23 @@
|
||||
[
|
||||
{
|
||||
"key": "chat-completions-endpoint",
|
||||
"title": "Chat Completions Endpoint",
|
||||
"description": "The endpoint to use for chat completions. See the [Anthropic API documentation](https://docs.anthropic.com/claude/docs/intro-to-claude) for more information.",
|
||||
"controllerType": "input",
|
||||
"controllerProps": {
|
||||
"placeholder": "https://api.anthropic.com/v1/messages",
|
||||
"value": "https://api.anthropic.com/v1/messages"
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "anthropic-api-key",
|
||||
"title": "API Key",
|
||||
"description": "The Anthropic API uses API keys for authentication. Visit your [API Keys](https://console.anthropic.com/settings/keys) page to retrieve the API key you'll use in your requests.",
|
||||
"controllerType": "input",
|
||||
"controllerProps": {
|
||||
"placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||
"value": "",
|
||||
"type": "password"
|
||||
}
|
||||
}
|
||||
]
|
||||
124  extensions/inference-anthropic-extension/src/index.ts  Normal file
@ -0,0 +1,124 @@
|
||||
/**
|
||||
* @file This file exports a class that implements the InferenceExtension interface from the @janhq/core package.
|
||||
* The class provides methods for initializing and stopping a model, and for making inference requests.
|
||||
* It also subscribes to events emitted by the @janhq/core package and handles new message requests.
|
||||
* @version 1.0.0
|
||||
* @module inference-anthropic-extension/src/index
|
||||
*/
|
||||
|
||||
import { RemoteOAIEngine } from '@janhq/core'
|
||||
import { PayloadType } from '@janhq/core'
|
||||
import { ChatCompletionRole } from '@janhq/core'
|
||||
|
||||
declare const SETTINGS: Array<any>
|
||||
declare const MODELS: Array<any>
|
||||
|
||||
enum Settings {
|
||||
apiKey = 'anthropic-api-key',
|
||||
chatCompletionsEndPoint = 'chat-completions-endpoint',
|
||||
}
|
||||
|
||||
type AnthropicPayloadType = {
|
||||
model?: string
|
||||
max_tokens?: number
|
||||
messages?: Array<{ role: string; content: string }>
|
||||
}
|
||||
|
||||
/**
|
||||
* A class that implements the InferenceExtension interface from the @janhq/core package.
|
||||
* The class provides methods for initializing and stopping a model, and for making inference requests.
|
||||
* It also subscribes to events emitted by the @janhq/core package and handles new message requests.
|
||||
*/
|
||||
export default class JanInferenceAnthropicExtension extends RemoteOAIEngine {
|
||||
inferenceUrl: string = ''
|
||||
provider: string = 'anthropic'
|
||||
maxTokens: number = 4096
|
||||
|
||||
override async onLoad(): Promise<void> {
|
||||
super.onLoad()
|
||||
|
||||
// Register Settings
|
||||
this.registerSettings(SETTINGS)
|
||||
this.registerModels(MODELS)
|
||||
|
||||
this.apiKey = await this.getSetting<string>(Settings.apiKey, '')
|
||||
this.inferenceUrl = await this.getSetting<string>(
|
||||
Settings.chatCompletionsEndPoint,
|
||||
''
|
||||
)
|
||||
|
||||
if (this.inferenceUrl.length === 0) {
|
||||
SETTINGS.forEach((setting) => {
|
||||
if (setting.key === Settings.chatCompletionsEndPoint) {
|
||||
this.inferenceUrl = setting.controllerProps.value as string
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Override the headers method to include the x-api-key in the request headers
|
||||
override async headers(): Promise<HeadersInit> {
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'x-api-key': this.apiKey,
|
||||
'anthropic-version': '2023-06-01',
|
||||
}
|
||||
}
|
||||
|
||||
onSettingUpdate<T>(key: string, value: T): void {
|
||||
if (key === Settings.apiKey) {
|
||||
this.apiKey = value as string
|
||||
} else if (key === Settings.chatCompletionsEndPoint) {
|
||||
if (typeof value !== 'string') return
|
||||
|
||||
if (value.trim().length === 0) {
|
||||
SETTINGS.forEach((setting) => {
|
||||
if (setting.key === Settings.chatCompletionsEndPoint) {
|
||||
this.inferenceUrl = setting.controllerProps.value as string
|
||||
}
|
||||
})
|
||||
} else {
|
||||
this.inferenceUrl = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Override the transformPayload method to convert the payload to the required format
|
||||
transformPayload = (payload: PayloadType): AnthropicPayloadType => {
|
||||
if (!payload.messages || payload.messages.length === 0) {
|
||||
return { max_tokens: this.maxTokens, messages: [], model: payload.model }
|
||||
}
|
||||
|
||||
const convertedData: AnthropicPayloadType = {
|
||||
max_tokens: this.maxTokens,
|
||||
messages: [],
|
||||
model: payload.model,
|
||||
}
|
||||
|
||||
payload.messages.forEach((item, index) => {
|
||||
if (item.role === ChatCompletionRole.User) {
|
||||
convertedData.messages.push({
|
||||
role: 'user',
|
||||
content: item.content as string,
|
||||
})
|
||||
} else if (item.role === ChatCompletionRole.Assistant) {
|
||||
convertedData.messages.push({
|
||||
role: 'assistant',
|
||||
content: item.content as string,
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
return convertedData
|
||||
}
|
||||
|
||||
// Override the transformResponse method to convert the response to the required format
|
||||
transformResponse = (data: any): string => {
|
||||
if (data.content && data.content.length > 0 && data.content[0].text) {
|
||||
return data.content[0].text
|
||||
} else {
|
||||
console.error('Invalid response format:', data)
|
||||
return ''
|
||||
}
|
||||
}
|
||||
}
|
||||
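transformPayload above reduces an OpenAI-style message list to the three fields sent to Anthropic's Messages API: model, max_tokens, and user/assistant messages (anything else, such as a system message, is dropped). A standalone illustration of that mapping with sample data, not using the extension's own types:

```typescript
type ChatMessage = { role: string; content: string }
type AnthropicPayload = {
  model?: string
  max_tokens?: number
  messages: Array<{ role: string; content: string }>
}

// Keep only user/assistant turns and cap max_tokens, as transformPayload does.
function toAnthropicPayload(model: string, messages: ChatMessage[]): AnthropicPayload {
  return {
    model,
    max_tokens: 4096,
    messages: messages.filter((m) => m.role === 'user' || m.role === 'assistant'),
  }
}

console.log(
  toAnthropicPayload('claude-3-haiku-20240307', [
    { role: 'system', content: 'You are terse.' }, // dropped by the mapping
    { role: 'user', content: 'Hello!' },
  ])
)
```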
14  extensions/inference-anthropic-extension/tsconfig.json  Normal file
@ -0,0 +1,14 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "es2016",
|
||||
"module": "ES6",
|
||||
"moduleResolution": "node",
|
||||
"outDir": "./dist",
|
||||
"esModuleInterop": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"strict": false,
|
||||
"skipLibCheck": true,
|
||||
"rootDir": "./src"
|
||||
},
|
||||
"include": ["./src"]
|
||||
}
|
||||
37  extensions/inference-anthropic-extension/webpack.config.js  Normal file
@ -0,0 +1,37 @@
|
||||
const webpack = require('webpack')
|
||||
const packageJson = require('./package.json')
|
||||
const settingJson = require('./resources/settings.json')
|
||||
const modelsJson = require('./resources/models.json')
|
||||
|
||||
module.exports = {
|
||||
experiments: { outputModule: true },
|
||||
entry: './src/index.ts', // Adjust the entry point to match your project's main file
|
||||
mode: 'production',
|
||||
module: {
|
||||
rules: [
|
||||
{
|
||||
test: /\.tsx?$/,
|
||||
use: 'ts-loader',
|
||||
exclude: /node_modules/,
|
||||
},
|
||||
],
|
||||
},
|
||||
plugins: [
|
||||
new webpack.DefinePlugin({
|
||||
MODELS: JSON.stringify(modelsJson),
|
||||
SETTINGS: JSON.stringify(settingJson),
|
||||
ENGINE: JSON.stringify(packageJson.engine),
|
||||
}),
|
||||
],
|
||||
output: {
|
||||
filename: 'index.js', // Adjust the output file name as needed
|
||||
library: { type: 'module' }, // Specify ESM output format
|
||||
},
|
||||
resolve: {
|
||||
extensions: ['.ts', '.js'],
|
||||
},
|
||||
optimization: {
|
||||
minimize: false,
|
||||
},
|
||||
// Add loaders and other configuration as needed for your project
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
[
|
||||
[
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
@ -19,7 +19,37 @@
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Cohere",
|
||||
"tags": ["General", "Big Context Length"]
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "cohere"
|
||||
},
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://cohere.com"
|
||||
}
|
||||
],
|
||||
"id": "command-r",
|
||||
"object": "model",
|
||||
"name": "Command R",
|
||||
"version": "1.0",
|
||||
"description": "Command R is an instruction-following conversational model that performs language tasks at a higher quality, more reliably, and with a longer context than previous models. It can be used for complex workflows like code generation, retrieval augmented generation (RAG), tool use, and agents.",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 128000,
|
||||
"temperature": 0.7,
|
||||
"stream": false
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Cohere",
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "cohere"
|
||||
}
|
||||
|
||||
@ -12,7 +12,7 @@
|
||||
{
|
||||
"key": "cohere-api-key",
|
||||
"title": "API Key",
|
||||
"description": "The Cohere API uses API keys for authentication. Visit your [API Keys](https://platform.openai.com/account/api-keys) page to retrieve the API key you'll use in your requests.",
|
||||
"description": "The Cohere API uses API keys for authentication. Visit your [API Keys](https://dashboard.cohere.com/api-keys) page to retrieve the API key you'll use in your requests.",
|
||||
"controllerType": "input",
|
||||
"controllerProps": {
|
||||
"placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
* The class provides methods for initializing and stopping a model, and for making inference requests.
|
||||
* It also subscribes to events emitted by the @janhq/core package and handles new message requests.
|
||||
* @version 1.0.0
|
||||
* @module inference-openai-extension/src/index
|
||||
* @module inference-cohere-extension/src/index
|
||||
*/
|
||||
|
||||
import { RemoteOAIEngine } from '@janhq/core'
|
||||
@ -26,8 +26,8 @@ enum RoleType {
|
||||
|
||||
type CoherePayloadType = {
|
||||
chat_history?: Array<{ role: RoleType; message: string }>
|
||||
message?: string,
|
||||
preamble?: string,
|
||||
message?: string
|
||||
preamble?: string
|
||||
}
|
||||
|
||||
/**
|
||||
@ -82,18 +82,24 @@ export default class JanInferenceCohereExtension extends RemoteOAIEngine {
|
||||
if (payload.messages.length === 0) {
|
||||
return {}
|
||||
}
|
||||
const convertedData:CoherePayloadType = {
|
||||
|
||||
const { messages, ...params } = payload
|
||||
const convertedData: CoherePayloadType = {
|
||||
...params,
|
||||
chat_history: [],
|
||||
message: '',
|
||||
}
|
||||
payload.messages.forEach((item, index) => {
|
||||
messages.forEach((item, index) => {
|
||||
// Assign the message of the last item to the `message` property
|
||||
if (index === payload.messages.length - 1) {
|
||||
if (index === messages.length - 1) {
|
||||
convertedData.message = item.content as string
|
||||
return
|
||||
}
|
||||
if (item.role === ChatCompletionRole.User) {
|
||||
convertedData.chat_history.push({ role: RoleType.user, message: item.content as string})
|
||||
convertedData.chat_history.push({
|
||||
role: RoleType.user,
|
||||
message: item.content as string,
|
||||
})
|
||||
} else if (item.role === ChatCompletionRole.Assistant) {
|
||||
convertedData.chat_history.push({
|
||||
role: RoleType.chatbot,
|
||||
@ -106,5 +112,7 @@ export default class JanInferenceCohereExtension extends RemoteOAIEngine {
|
||||
return convertedData
|
||||
}
|
||||
|
||||
transformResponse = (data: any) => data.text
|
||||
transformResponse = (data: any) => {
|
||||
return typeof data === 'object' ? data.text : JSON.parse(data).text ?? ''
|
||||
}
|
||||
}
|
||||
|
||||
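The refactored transformPayload above spreads the remaining request parameters into the payload, sends the final turn as `message`, and folds every earlier turn into `chat_history` with Cohere's USER/CHATBOT roles. A standalone illustration of the history/message split with sample data (simplified types, without the parameter spread):

```typescript
type Turn = { role: 'user' | 'assistant'; content: string }

// Last turn becomes `message`; earlier turns become Cohere-style chat_history.
function toCoherePayload(turns: Turn[]) {
  const chat_history: Array<{ role: 'USER' | 'CHATBOT'; message: string }> = []
  let message = ''
  turns.forEach((turn, index) => {
    if (index === turns.length - 1) {
      message = turn.content
      return
    }
    chat_history.push({
      role: turn.role === 'user' ? 'USER' : 'CHATBOT',
      message: turn.content,
    })
  })
  return { chat_history, message }
}

console.log(
  toCoherePayload([
    { role: 'user', content: 'Hi there' },
    { role: 'assistant', content: 'Hello! How can I help?' },
    { role: 'user', content: 'Summarize RAG in one line.' },
  ])
)
```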
@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "@janhq/inference-groq-extension",
|
||||
"productName": "Groq Inference Engine",
|
||||
"version": "1.0.0",
|
||||
"version": "1.0.1",
|
||||
"description": "This extension enables fast Groq chat completion API calls",
|
||||
"main": "dist/index.js",
|
||||
"module": "dist/module.js",
|
||||
|
||||
@ -8,22 +8,25 @@
|
||||
"id": "llama3-70b-8192",
|
||||
"object": "model",
|
||||
"name": "Groq Llama 3 70b",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Groq Llama 3 70b with supercharged speed!",
|
||||
"format": "api",
|
||||
"settings": {
|
||||
"text_model": false
|
||||
},
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 8192,
|
||||
"temperature": 0.7,
|
||||
"top_p": 1,
|
||||
"stop": null,
|
||||
"stream": true
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Meta",
|
||||
"tags": ["General", "Big Context Length"]
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "groq"
|
||||
},
|
||||
@ -36,22 +39,25 @@
|
||||
"id": "llama3-8b-8192",
|
||||
"object": "model",
|
||||
"name": "Groq Llama 3 8b",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Groq Llama 3 8b with supercharged speed!",
|
||||
"format": "api",
|
||||
"settings": {
|
||||
"text_model": false
|
||||
},
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 8192,
|
||||
"temperature": 0.7,
|
||||
"top_p": 1,
|
||||
"stop": null,
|
||||
"stream": true
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Meta",
|
||||
"tags": ["General", "Big Context Length"]
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "groq"
|
||||
},
|
||||
@ -64,50 +70,24 @@
|
||||
"id": "gemma-7b-it",
|
||||
"object": "model",
|
||||
"name": "Groq Gemma 7b Instruct",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Groq Gemma 7b Instruct with supercharged speed!",
|
||||
"format": "api",
|
||||
"settings": {
|
||||
"text_model": false
|
||||
},
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"temperature": 0.7,
|
||||
"top_p": 1,
|
||||
"stop": null,
|
||||
"stream": true
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Google",
|
||||
"tags": ["General"]
|
||||
},
|
||||
"engine": "groq"
|
||||
},
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://groq.com"
|
||||
}
|
||||
],
|
||||
"id": "llama2-70b-4096",
|
||||
"object": "model",
|
||||
"name": "Groq Llama 2 70b",
|
||||
"version": "1.0",
|
||||
"description": "Groq Llama 2 70b with supercharged speed!",
|
||||
"format": "api",
|
||||
"settings": {
|
||||
"text_model": false
|
||||
},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7,
|
||||
"top_p": 1,
|
||||
"stop": null,
|
||||
"stream": true
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Meta",
|
||||
"tags": ["General", "Big Context Length"]
|
||||
"tags": [
|
||||
"General"
|
||||
]
|
||||
},
|
||||
"engine": "groq"
|
||||
},
|
||||
@ -120,22 +100,25 @@
|
||||
"id": "mixtral-8x7b-32768",
|
||||
"object": "model",
|
||||
"name": "Groq Mixtral 8x7b Instruct",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Groq Mixtral 8x7b Instruct is Mixtral with supercharged speed!",
|
||||
"format": "api",
|
||||
"settings": {
|
||||
"text_model": false
|
||||
},
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 32768,
|
||||
"temperature": 0.7,
|
||||
"top_p": 1,
|
||||
"stop": null,
|
||||
"stream": true
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Mistral",
|
||||
"tags": ["General", "Big Context Length"]
|
||||
"tags": [
|
||||
"General",
|
||||
"Big Context Length"
|
||||
]
|
||||
},
|
||||
"engine": "groq"
|
||||
}
|
||||
|
||||
79  extensions/inference-martian-extension/README.md  Normal file
@ -0,0 +1,79 @@
|
||||
# Martian Engine Extension
|
||||
|
||||
Created using Jan extension example
|
||||
|
||||
# Create a Jan Extension using Typescript
|
||||
|
||||
Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀
|
||||
|
||||
## Create Your Own Extension
|
||||
|
||||
To create your own extension, you can use this repository as a template! Just follow the instructions below:
|
||||
|
||||
1. Click the Use this template button at the top of the repository
|
||||
2. Select Create a new repository
|
||||
3. Select an owner and name for your new repository
|
||||
4. Click Create repository
|
||||
5. Clone your new repository
|
||||
|
||||
## Initial Setup
|
||||
|
||||
After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.
|
||||
|
||||
> [!NOTE]
|
||||
>
|
||||
> You'll need to have a reasonably modern version of
|
||||
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
|
||||
> [`nodenv`](https://github.com/nodenv/nodenv) or
|
||||
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
|
||||
> root of your repository to install the version specified in
|
||||
> [`package.json`](./package.json). Otherwise, 20.x or later should work!
|
||||
|
||||
1. :hammer_and_wrench: Install the dependencies
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
1. :building_construction: Package the TypeScript for distribution
|
||||
|
||||
```bash
|
||||
npm run bundle
|
||||
```
|
||||
|
||||
1. :white_check_mark: Check your artifact
|
||||
|
||||
There will be a tgz file in your extension directory now
|
||||
|
||||
## Update the Extension Metadata
|
||||
|
||||
The [`package.json`](package.json) file defines metadata about your extension, such as
|
||||
extension name, main entry, description and version.
|
||||
|
||||
When you copy this repository, update `package.json` with the name and description of your extension.
|
||||
|
||||
## Update the Extension Code
|
||||
|
||||
The [`src/`](./src/) directory is the heart of your extension! This contains the
|
||||
source code that will be run when your extension functions are invoked. You can replace the
|
||||
contents of this directory with your own code.
|
||||
|
||||
There are a few things to keep in mind when writing your extension code:
|
||||
|
||||
- Most Jan Extension functions are processed asynchronously.
|
||||
In `index.ts`, you will see that the extension function will return a `Promise<any>`.
|
||||
|
||||
```typescript
|
||||
import { events, MessageEvent, MessageRequest } from '@janhq/core'
|
||||
|
||||
function onStart(): Promise<any> {
|
||||
return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
|
||||
this.inference(data)
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
For more information about the Jan Extension Core module, see the
|
||||
[documentation](https://github.com/janhq/jan/blob/main/core/README.md).
|
||||
|
||||
So, what are you waiting for? Go ahead and start customizing your extension!
|
||||
42  extensions/inference-martian-extension/package.json  Normal file
@ -0,0 +1,42 @@
|
||||
{
|
||||
"name": "@janhq/inference-martian-extension",
|
||||
"productName": "Martian Inference Engine",
|
||||
"version": "1.0.1",
|
||||
"description": "This extension enables Martian chat completion API calls",
|
||||
"main": "dist/index.js",
|
||||
"module": "dist/module.js",
|
||||
"engine": "martian",
|
||||
"author": "Jan <service@jan.ai>",
|
||||
"license": "AGPL-3.0",
|
||||
"scripts": {
|
||||
"build": "tsc -b . && webpack --config webpack.config.js",
|
||||
"build:publish": "rimraf *.tgz --glob && yarn build && npm pack && cpx *.tgz ../../pre-install"
|
||||
},
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
"./main": "./dist/module.js"
|
||||
},
|
||||
"devDependencies": {
|
||||
"cpx": "^1.5.0",
|
||||
"rimraf": "^3.0.2",
|
||||
"webpack": "^5.88.2",
|
||||
"webpack-cli": "^5.1.4",
|
||||
"ts-loader": "^9.5.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@janhq/core": "file:../../core",
|
||||
"fetch-retry": "^5.0.6",
|
||||
"ulidx": "^2.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18.0.0"
|
||||
},
|
||||
"files": [
|
||||
"dist/*",
|
||||
"package.json",
|
||||
"README.md"
|
||||
],
|
||||
"bundleDependencies": [
|
||||
"fetch-retry"
|
||||
]
|
||||
}
|
||||
32  extensions/inference-martian-extension/resources/models.json  Normal file
@ -0,0 +1,32 @@
|
||||
[
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://withmartian.com/"
|
||||
}
|
||||
],
|
||||
"id": "router",
|
||||
"object": "model",
|
||||
"name": "Martian Model Router",
|
||||
"version": "1.0",
|
||||
"description": "Martian Model Router dynamically routes requests to the best LLM in real-time",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Martian",
|
||||
"tags": [
|
||||
"General"
|
||||
]
|
||||
},
|
||||
"engine": "martian"
|
||||
}
|
||||
]
|
||||
@ -0,0 +1,23 @@
|
||||
[
|
||||
{
|
||||
"key": "chat-completions-endpoint",
|
||||
"title": "Chat Completions Endpoint",
|
||||
"description": "The endpoint to use for chat completions. See the [Martian API documentation](https://docs.withmartian.com/martian-model-router/api-reference/get-chat-completions) for more information.",
|
||||
"controllerType": "input",
|
||||
"controllerProps": {
|
||||
"placeholder": "https://withmartian.com/api/openai/v1/chat/completions",
|
||||
"value": "https://withmartian.com/api/openai/v1/chat/completions"
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "martian-api-key",
|
||||
"title": "API Key",
|
||||
"description": "The Martian API uses API keys for authentication. Visit your [API Keys](https://withmartian.com/dashboard) page to retrieve the API key you'll use in your requests.",
|
||||
"controllerType": "input",
|
||||
"controllerProps": {
|
||||
"placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||
"value": "",
|
||||
"type": "password"
|
||||
}
|
||||
}
|
||||
]
|
||||
66  extensions/inference-martian-extension/src/index.ts  Normal file
@ -0,0 +1,66 @@
/**
 * @file This file exports a class that implements the InferenceExtension interface from the @janhq/core package.
 * The class provides methods for initializing and stopping a model, and for making inference requests.
 * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
 * @version 1.0.0
 * @module inference-martian-extension/src/index
 */

import { RemoteOAIEngine, SettingComponentProps } from '@janhq/core'

declare const SETTINGS: Array<any>
declare const MODELS: Array<any>

enum Settings {
  apiKey = 'martian-api-key',
  chatCompletionsEndPoint = 'chat-completions-endpoint',
}

/**
 * A class that implements the InferenceExtension interface from the @janhq/core package.
 * The class provides methods for initializing and stopping a model, and for making inference requests.
 * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
 */
export default class JanInferenceMartianExtension extends RemoteOAIEngine {
  inferenceUrl: string = ''
  provider: string = 'martian'

  override async onLoad(): Promise<void> {
    super.onLoad()

    // Register Settings
    this.registerSettings(SETTINGS)
    this.registerModels(MODELS)

    this.apiKey = await this.getSetting<string>(Settings.apiKey, '')
    this.inferenceUrl = await this.getSetting<string>(
      Settings.chatCompletionsEndPoint,
      ''
    )
    if (this.inferenceUrl.length === 0) {
      SETTINGS.forEach((setting) => {
        if (setting.key === Settings.chatCompletionsEndPoint) {
          this.inferenceUrl = setting.controllerProps.value as string
        }
      })
    }
  }

  onSettingUpdate<T>(key: string, value: T): void {
    if (key === Settings.apiKey) {
      this.apiKey = value as string
    } else if (key === Settings.chatCompletionsEndPoint) {
      if (typeof value !== 'string') return

      if (value.trim().length === 0) {
        SETTINGS.forEach((setting) => {
          if (setting.key === Settings.chatCompletionsEndPoint) {
            this.inferenceUrl = setting.controllerProps.value as string
          }
        })
      } else {
        this.inferenceUrl = value
      }
    }
  }
}

14
extensions/inference-martian-extension/tsconfig.json
Normal file
14
extensions/inference-martian-extension/tsconfig.json
Normal file
@ -0,0 +1,14 @@
{
  "compilerOptions": {
    "target": "es2016",
    "module": "ES6",
    "moduleResolution": "node",
    "outDir": "./dist",
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "strict": false,
    "skipLibCheck": true,
    "rootDir": "./src"
  },
  "include": ["./src"]
}

37
extensions/inference-martian-extension/webpack.config.js
Normal file
37
extensions/inference-martian-extension/webpack.config.js
Normal file
@ -0,0 +1,37 @@
const webpack = require('webpack')
const packageJson = require('./package.json')
const settingJson = require('./resources/settings.json')
const modelsJson = require('./resources/models.json')

module.exports = {
  experiments: { outputModule: true },
  entry: './src/index.ts', // Adjust the entry point to match your project's main file
  mode: 'production',
  module: {
    rules: [
      {
        test: /\.tsx?$/,
        use: 'ts-loader',
        exclude: /node_modules/,
      },
    ],
  },
  plugins: [
    new webpack.DefinePlugin({
      MODELS: JSON.stringify(modelsJson),
      SETTINGS: JSON.stringify(settingJson),
      ENGINE: JSON.stringify(packageJson.engine),
    }),
  ],
  output: {
    filename: 'index.js', // Adjust the output file name as needed
    library: { type: 'module' }, // Specify ESM output format
  },
  resolve: {
    extensions: ['.ts', '.js'],
  },
  optimization: {
    minimize: false,
  },
  // Add loaders and other configuration as needed for your project
}

@ -1,7 +1,7 @@
{
  "name": "@janhq/inference-mistral-extension",
  "productName": "MistralAI Inference Engine",
  "version": "1.0.0",
  "version": "1.0.1",
  "description": "This extension enables Mistral chat completion API calls",
  "main": "dist/index.js",
  "module": "dist/module.js",

@ -8,48 +8,20 @@
|
||||
"id": "mistral-small-latest",
|
||||
"object": "model",
|
||||
"name": "Mistral Small",
|
||||
"version": "1.0",
|
||||
"description": "Mistral Small is the ideal choice for simpe tasks that one can do in builk - like Classification, Customer Support, or Text Generation. It offers excellent performance at an affordable price point.",
|
||||
"version": "1.1",
|
||||
"description": "Mistral Small is the ideal choice for simple tasks (Classification, Customer Support, or Text Generation) at an affordable price.",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7
|
||||
"max_tokens": 32000,
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Mistral",
|
||||
"tags": [
|
||||
"Classification",
|
||||
"Customer Support",
|
||||
"Text Generation"
|
||||
]
|
||||
},
|
||||
"engine": "mistral"
|
||||
},
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://docs.mistral.ai/api/"
|
||||
}
|
||||
],
|
||||
"id": "mistral-medium-latest",
|
||||
"object": "model",
|
||||
"name": "Mistral Medium",
|
||||
"version": "1.0",
|
||||
"description": "Mistral Medium is the ideal for intermediate tasks that require moderate reasoning - like Data extraction, Summarizing a Document, Writing a Job Description, or Writing Product Descriptions. Mistral Medium strikes a balance between performance and capability, making it suitable for a wide range of tasks that only require language transformaion",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Mistral",
|
||||
"tags": [
|
||||
"Data extraction",
|
||||
"Summarizing a Document",
|
||||
"Writing a Job Description",
|
||||
"Writing Product Descriptions"
|
||||
"General"
|
||||
]
|
||||
},
|
||||
"engine": "mistral"
|
||||
@ -63,21 +35,47 @@
|
||||
"id": "mistral-large-latest",
|
||||
"object": "model",
|
||||
"name": "Mistral Large",
|
||||
"version": "1.0",
|
||||
"description": "Mistral Large is ideal for complex tasks that require large reasoning capabilities or are highly specialized - like Synthetic Text Generation, Code Generation, RAG, or Agents.",
|
||||
"version": "1.1",
|
||||
"description": "Mistral Large is ideal for complex tasks (Synthetic Text Generation, Code Generation, RAG, or Agents).",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"temperature": 0.7
|
||||
"max_tokens": 32000,
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Mistral",
|
||||
"tags": [
|
||||
"Text Generation",
|
||||
"Code Generation",
|
||||
"RAG",
|
||||
"Agents"
|
||||
"General"
|
||||
]
|
||||
},
|
||||
"engine": "mistral"
|
||||
},
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://docs.mistral.ai/api/"
|
||||
}
|
||||
],
|
||||
"id": "open-mixtral-8x22b",
|
||||
"object": "model",
|
||||
"name": "Mixtral 8x22B",
|
||||
"version": "1.1",
|
||||
"description": "Mixtral 8x22B is a high-performance, cost-effective model designed for complex tasks.",
|
||||
"format": "api",
|
||||
"settings": {},
|
||||
"parameters": {
|
||||
"max_tokens": 32000,
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Mistral",
|
||||
"tags": [
|
||||
"General"
|
||||
]
|
||||
},
|
||||
"engine": "mistral"
|
||||
|
||||
2
extensions/inference-nitro-extension/.gitignore
vendored
Normal file
2
extensions/inference-nitro-extension/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
bin
!version.txt
@ -1 +1 @@
0.3.22
0.4.4

@ -1,3 +1,3 @@
|
||||
@echo off
|
||||
set /p NITRO_VERSION=<./bin/version.txt
|
||||
.\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64-avx2-cuda-12-0.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0 && .\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64-avx2-cuda-11-7.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7 && .\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64-avx2.tar.gz -e --strip 1 -o ./bin/win-cpu && .\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%NITRO_VERSION%/nitro-%NITRO_VERSION%-win-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/win-vulkan
|
||||
set /p CORTEX_VERSION=<./bin/version.txt
|
||||
.\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-avx2-cuda-12-0.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-avx2-cuda-11-7.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7 && .\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-avx2.tar.gz -e --strip 1 -o ./bin/win-cpu && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/win-vulkan
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
{
|
||||
"name": "@janhq/inference-nitro-extension",
|
||||
"productName": "Nitro Inference Engine",
|
||||
"version": "1.0.4",
|
||||
"description": "This extension embeds Nitro, a lightweight (3mb) inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
|
||||
"name": "@janhq/inference-cortex-extension",
|
||||
"productName": "Cortex Inference Engine",
|
||||
"version": "1.0.7",
|
||||
"description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
|
||||
"main": "dist/index.js",
|
||||
"node": "dist/node/index.cjs.js",
|
||||
"author": "Jan <service@jan.ai>",
|
||||
@ -10,8 +10,8 @@
|
||||
"scripts": {
|
||||
"test": "jest",
|
||||
"build": "tsc --module commonjs && rollup -c rollup.config.ts",
|
||||
"downloadnitro:linux": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-avx2.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/nitro && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-linux-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/linux-vulkan && chmod +x ./bin/linux-vulkan/nitro",
|
||||
"downloadnitro:darwin": "NITRO_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/nitro/releases/download/v${NITRO_VERSION}/nitro-${NITRO_VERSION}-mac-universal.tar.gz -o ./bin/ && mkdir -p ./bin/mac-universal && tar -zxvf ./bin/nitro-${NITRO_VERSION}-mac-universal.tar.gz --strip-components=1 -C ./bin/mac-universal && rm -rf ./bin/nitro-${NITRO_VERSION}-mac-universal.tar.gz && chmod +x ./bin/mac-universal/nitro",
|
||||
"downloadnitro:linux": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-avx2.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/linux-vulkan && chmod +x ./bin/linux-vulkan/cortex-cpp",
|
||||
"downloadnitro:darwin": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-arm64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz --strip-components=1 -C ./bin/mac-arm64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz && chmod +x ./bin/mac-arm64/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-amd64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz --strip-components=1 -C ./bin/mac-amd64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz && chmod +x ./bin/mac-amd64/cortex-cpp",
|
||||
"downloadnitro:win32": "download.bat",
|
||||
"downloadnitro": "run-script-os",
|
||||
"build:publish:darwin": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && ../../.github/scripts/auto-sign.sh && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "codeninja-1.0-7b",
|
||||
"object": "model",
|
||||
"name": "CodeNinja 7B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "CodeNinja is good for coding tasks and can handle various languages including Python, C, C++, Rust, Java, JavaScript, and more.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 8192,
|
||||
"prompt_template": "GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:",
|
||||
"llama_model_path": "codeninja-1.0-openchat-7b.Q4_K_M.gguf"
|
||||
"llama_model_path": "codeninja-1.0-openchat-7b.Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "command-r-34b",
|
||||
"object": "model",
|
||||
"name": "Command-R v01 34B Q4",
|
||||
"version": "1.3",
|
||||
"version": "1.4",
|
||||
"description": "C4AI Command-R developed by CohereAI is optimized for a variety of use cases including reasoning, summarization, and question answering.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 131072,
|
||||
"prompt_template": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
||||
"llama_model_path": "c4ai-command-r-v01-Q4_K_M.gguf"
|
||||
"llama_model_path": "c4ai-command-r-v01-Q4_K_M.gguf",
|
||||
"ngl": 40
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 131072,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "deepseek-coder-1.3b",
|
||||
"object": "model",
|
||||
"name": "Deepseek Coder 1.3B Q8",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Deepseek Coder excelled in project-level code completion with advanced capabilities across multiple programming languages.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 16384,
|
||||
"prompt_template": "### Instruction:\n{prompt}\n### Response:",
|
||||
"llama_model_path": "deepseek-coder-1.3b-instruct.Q8_0.gguf"
|
||||
"llama_model_path": "deepseek-coder-1.3b-instruct.Q8_0.gguf",
|
||||
"ngl": 24
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 16384,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -1,26 +1,27 @@
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"filename": "deepseek-coder-33b-instruct.Q5_K_M.gguf",
|
||||
"url": "https://huggingface.co/TheBloke/deepseek-coder-33B-instruct-GGUF/resolve/main/deepseek-coder-33b-instruct.Q5_K_M.gguf"
|
||||
"filename": "deepseek-coder-33b-instruct.Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/TheBloke/deepseek-coder-33B-instruct-GGUF/resolve/main/deepseek-coder-33b-instruct.Q4_K_M.gguf"
|
||||
}
|
||||
],
|
||||
"id": "deepseek-coder-34b",
|
||||
"object": "model",
|
||||
"name": "Deepseek Coder 33B Q5",
|
||||
"version": "1.0",
|
||||
"name": "Deepseek Coder 33B Q4",
|
||||
"version": "1.1",
|
||||
"description": "Deepseek Coder excelled in project-level code completion with advanced capabilities across multiple programming languages.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 16384,
|
||||
"prompt_template": "### Instruction:\n{prompt}\n### Response:",
|
||||
"llama_model_path": "deepseek-coder-33b-instruct.Q5_K_M.gguf"
|
||||
"llama_model_path": "deepseek-coder-33b-instruct.Q4_K_M.gguf",
|
||||
"ngl": 62
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 16384,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -1,32 +0,0 @@
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"url": "https://huggingface.co/TheBloke/dolphin-2_6-phi-2-GGUF/resolve/main/dolphin-2_6-phi-2.Q8_0.gguf",
|
||||
"filename": "dolphin-2_6-phi-2.Q8_0.gguf"
|
||||
}
|
||||
],
|
||||
"id": "dolphin-phi-2",
|
||||
"object": "model",
|
||||
"name": "Dolphin Phi-2 2.7B Q8",
|
||||
"version": "1.0",
|
||||
"description": "Dolphin Phi-2 is a good alternative for Phi-2 in chatting",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "dolphin-2_6-phi-2.Q8_0.gguf"
|
||||
},
|
||||
"parameters": {
|
||||
"max_tokens": 4096,
|
||||
"stop": ["<|im_end|>"]
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Cognitive Computations, Microsoft",
|
||||
"tags": [
|
||||
"3B",
|
||||
"Finetuned"
|
||||
],
|
||||
"size": 2960000000
|
||||
},
|
||||
"engine": "nitro"
|
||||
}
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "gemma-2b",
|
||||
"object": "model",
|
||||
"name": "Gemma 2B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Gemma is built from the same technology with Google's Gemini.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 8192,
|
||||
"prompt_template": "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model",
|
||||
"llama_model_path": "gemma-2b-it-q4_k_m.gguf"
|
||||
"llama_model_path": "gemma-2b-it-q4_k_m.gguf",
|
||||
"ngl": 18
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "gemma-7b",
|
||||
"object": "model",
|
||||
"name": "Gemma 7B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Google's Gemma is built for multilingual purpose",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 8192,
|
||||
"prompt_template": "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model",
|
||||
"llama_model_path": "gemma-7b-it-q4_K_M.gguf"
|
||||
"llama_model_path": "gemma-7b-it-q4_K_M.gguf",
|
||||
"ngl": 28
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -14,7 +14,8 @@
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "[INST] <<SYS>>\n{system_message}<</SYS>>\n{prompt}[/INST]",
|
||||
"llama_model_path": "llama-2-70b-chat.Q4_K_M.gguf"
|
||||
"llama_model_path": "llama-2-70b-chat.Q4_K_M.gguf",
|
||||
"ngl": 80
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
|
||||
@ -14,7 +14,8 @@
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "[INST] <<SYS>>\n{system_message}<</SYS>>\n{prompt}[/INST]",
|
||||
"llama_model_path": "llama-2-7b-chat.Q4_K_M.gguf"
|
||||
"llama_model_path": "llama-2-7b-chat.Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "llama3-8b-instruct",
|
||||
"object": "model",
|
||||
"name": "Llama 3 8B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Meta's Llama 3 excels at general usage situations, including chat, general world knowledge, and coding.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 8192,
|
||||
"prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||
"llama_model_path": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
|
||||
"llama_model_path": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"stop": ["<|end_of_text|>","<|eot_id|>"],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -1,35 +1,38 @@
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"filename": "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf"
|
||||
"filename": "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf"
|
||||
}
|
||||
],
|
||||
"id": "hermes-pro-7b",
|
||||
"id": "llama3-hermes-8b",
|
||||
"object": "model",
|
||||
"name": "Hermes Pro 7B Q4",
|
||||
"name": "Hermes Pro Llama 3 8B Q4",
|
||||
"version": "1.1",
|
||||
"description": "Hermes Pro is superior in Roleplaying, Reasoning and Explaining problem.",
|
||||
"description": "Hermes Pro is well-designed for General chat and JSON output.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 8192,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf"
|
||||
"llama_model_path": "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "NousResearch",
|
||||
"tags": ["7B", "Finetuned"],
|
||||
"size": 4370000000
|
||||
"tags": [
|
||||
"7B",
|
||||
"Finetuned"
|
||||
],
|
||||
"size": 4920000000
|
||||
},
|
||||
"engine": "nitro"
|
||||
}
|
||||
|
||||
@ -14,7 +14,8 @@
|
||||
"settings": {
|
||||
"ctx_len": 2048,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "llamacorn-1.1b-chat.Q8_0.gguf"
|
||||
"llama_model_path": "llamacorn-1.1b-chat.Q8_0.gguf",
|
||||
"ngl": 22
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"filename": "miqu-1-70b.q4_k_m.gguf",
|
||||
"url": "https://huggingface.co/miqudev/miqu-1-70b/resolve/main/miqu-1-70b.q4_k_m.gguf"
|
||||
}
|
||||
],
|
||||
"id": "miqu-70b",
|
||||
"object": "model",
|
||||
"name": "Mistral 70B Q4",
|
||||
"version": "1.0",
|
||||
"description": "A leak weight of Mistral 70B model.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "[INST] {prompt} [/INST]",
|
||||
"llama_model_path": "miqu-1-70b.q4_k_m.gguf"
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "miqudev",
|
||||
"tags": ["70B", "Foundational Model"],
|
||||
"size": 26440000000
|
||||
},
|
||||
"engine": "nitro"
|
||||
}
|
||||
|
||||
@ -8,20 +8,21 @@
|
||||
"id": "mistral-ins-7b-q4",
|
||||
"object": "model",
|
||||
"name": "Mistral Instruct 7B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Mistral Instruct 7b model, specifically designed for a comprehensive understanding of the world.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 32768,
|
||||
"prompt_template": "[INST] {prompt} [/INST]",
|
||||
"llama_model_path": "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
|
||||
"llama_model_path": "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"stop": [],
|
||||
"max_tokens": 32768,
|
||||
"stop": ["[/INST]"],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "mixtral-8x7b-instruct",
|
||||
"object": "model",
|
||||
"name": "Mixtral 8x7B Instruct Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "The Mixtral-8x7B is a pretrained generative Sparse Mixture of Experts. The Mixtral-8x7B outperforms 70B models on most benchmarks.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 32768,
|
||||
"prompt_template": "[INST] {prompt} [/INST]",
|
||||
"llama_model_path": "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
|
||||
"llama_model_path": "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
|
||||
"ngl": 100
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 32768,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "noromaid-7b",
|
||||
"object": "model",
|
||||
"name": "Noromaid 7B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "The Noromaid 7b model is designed for role-playing with human-like behavior.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 32768,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "Noromaid-7B-0.4-DPO.q4_k_m.gguf"
|
||||
"llama_model_path": "Noromaid-7B-0.4-DPO.q4_k_m.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 32768,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "openchat-3.5-7b",
|
||||
"object": "model",
|
||||
"name": "Openchat-3.5 7B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "The performance of Openchat surpasses ChatGPT-3.5 and Grok-1 across various benchmarks.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 8192,
|
||||
"prompt_template": "GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:",
|
||||
"llama_model_path": "openchat-3.5-0106.Q4_K_M.gguf"
|
||||
"llama_model_path": "openchat-3.5-0106.Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"stop": ["<|end_of_turn|>"],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
{
|
||||
"sources": [
|
||||
{
|
||||
"filename": "openhermes-2.5-neural-chat-v3-3-slerp.Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/janhq/openhermes-2.5-neural-chat-v3-3-slerp-GGUF/resolve/main/openhermes-2.5-neural-chat-v3-3-slerp.Q4_K_M.gguf"
|
||||
}
|
||||
],
|
||||
"id": "openhermes-neural-7b",
|
||||
"object": "model",
|
||||
"name": "OpenHermes Neural 7B Q4",
|
||||
"version": "1.1",
|
||||
"description": "OpenHermes Neural is a merged model using the TIES method. It performs well in various benchmarks.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "openhermes-2.5-neural-chat-v3-3-slerp.Q4_K_M.gguf"
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
"metadata": {
|
||||
"author": "Intel, Jan",
|
||||
"tags": ["7B", "Merged"],
|
||||
"size": 4370000000,
|
||||
"cover": "https://raw.githubusercontent.com/janhq/jan/dev/models/openhermes-neural-7b/cover.png"
|
||||
},
|
||||
"engine": "nitro"
|
||||
}
|
||||
@ -13,7 +13,7 @@
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "<|system|>\n{system_message}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n",
|
||||
"prompt_template": "<|user|>\n{prompt}<|end|>\n<|assistant|>\n",
|
||||
"llama_model_path": "Phi-3-mini-4k-instruct-q4.gguf"
|
||||
},
|
||||
"parameters": {
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "phind-34b",
|
||||
"object": "model",
|
||||
"name": "Phind 34B Q4",
|
||||
"version": "1.1",
|
||||
"version": "1.2",
|
||||
"description": "Phind 34B is the best Open-source coding model.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 16384,
|
||||
"prompt_template": "### System Prompt\n{system_message}\n### User Message\n{prompt}\n### Assistant",
|
||||
"llama_model_path": "phind-codellama-34b-v2.Q4_K_M.gguf"
|
||||
"llama_model_path": "phind-codellama-34b-v2.Q4_K_M.gguf",
|
||||
"ngl": 48
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 16384,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "qwen-7b",
|
||||
"object": "model",
|
||||
"name": "Qwen Chat 7B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Qwen is optimized at Chinese, ideal for everyday tasks.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 32768,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "qwen1_5-7b-chat-q4_k_m.gguf"
|
||||
"llama_model_path": "qwen1_5-7b-chat-q4_k_m.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 32768,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -14,7 +14,8 @@
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "<|user|>\n{prompt}<|endoftext|>\n<|assistant|>",
|
||||
"llama_model_path": "stablelm-zephyr-3b.Q8_0.gguf"
|
||||
"llama_model_path": "stablelm-zephyr-3b.Q8_0.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
|
||||
@ -12,15 +12,16 @@
|
||||
"description": "This is a new experimental family designed to enhance Mathematical and Logical abilities.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 32768,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "stealth-v1.3.Q4_K_M.gguf"
|
||||
"llama_model_path": "stealth-v1.3.Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 32768,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
|
||||
@ -14,7 +14,8 @@
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "<|system|>\n{system_message}<|user|>\n{prompt}<|assistant|>",
|
||||
"llama_model_path": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
|
||||
"llama_model_path": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
"ngl": 22
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
|
||||
@ -12,15 +12,16 @@
|
||||
"description": "Trinity is an experimental model merge using the Slerp method. Recommended for daily assistance purposes.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 32768,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "trinity-v1.2.Q4_K_M.gguf"
|
||||
"llama_model_path": "trinity-v1.2.Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 32768,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
},
|
||||
|
||||
@ -8,19 +8,20 @@
|
||||
"id": "vistral-7b",
|
||||
"object": "model",
|
||||
"name": "Vistral 7B Q4",
|
||||
"version": "1.0",
|
||||
"version": "1.1",
|
||||
"description": "Vistral 7B has a deep understanding of Vietnamese.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 32768,
|
||||
"prompt_template": "[INST] <<SYS>>\n{system_message}\n<</SYS>>\n{prompt} [/INST]",
|
||||
"llama_model_path": "vistral-7b-chat-dpo.Q4_K_M.gguf"
|
||||
"llama_model_path": "vistral-7b-chat-dpo.Q4_K_M.gguf",
|
||||
"ngl": 32
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 32768,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -12,15 +12,16 @@
|
||||
"description": "WizardCoder 13B is a Python coding model. This model demonstrate high proficiency in specific domains like coding and mathematics.",
|
||||
"format": "gguf",
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"ctx_len": 16384,
|
||||
"prompt_template": "### Instruction:\n{prompt}\n### Response:",
|
||||
"llama_model_path": "wizardcoder-python-13b-v1.0.Q4_K_M.gguf"
|
||||
"llama_model_path": "wizardcoder-python-13b-v1.0.Q4_K_M.gguf",
|
||||
"ngl": 40
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.95,
|
||||
"stream": true,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 16384,
|
||||
"stop": [],
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0
|
||||
|
||||
@ -14,7 +14,8 @@
|
||||
"settings": {
|
||||
"ctx_len": 4096,
|
||||
"prompt_template": "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant",
|
||||
"llama_model_path": "yi-34b-chat.Q4_K_M.gguf"
|
||||
"llama_model_path": "yi-34b-chat.Q4_K_M.gguf",
|
||||
"ngl": 60
|
||||
},
|
||||
"parameters": {
|
||||
"temperature": 0.7,
|
||||
|
||||
@ -12,21 +12,17 @@ const codeninja7bJson = require('./resources/models/codeninja-1.0-7b/model.json'
|
||||
const commandr34bJson = require('./resources/models/command-r-34b/model.json')
|
||||
const deepseekCoder13bJson = require('./resources/models/deepseek-coder-1.3b/model.json')
|
||||
const deepseekCoder34bJson = require('./resources/models/deepseek-coder-34b/model.json')
|
||||
const dolphinPhi2Json = require('./resources/models/dolphin-phi-2/model.json')
|
||||
const gemma2bJson = require('./resources/models/gemma-2b/model.json')
|
||||
const gemma7bJson = require('./resources/models/gemma-7b/model.json')
|
||||
const hermesPro7bJson = require('./resources/models/hermes-pro-7b/model.json')
|
||||
const llama2Chat70bJson = require('./resources/models/llama2-chat-70b/model.json')
|
||||
const llama2Chat7bJson = require('./resources/models/llama2-chat-7b/model.json')
|
||||
const llamacorn1bJson = require('./resources/models/llamacorn-1.1b/model.json')
|
||||
const llava13bJson = require('./resources/models/llava-13b/model.json')
|
||||
const llava7bJson = require('./resources/models/llava-7b/model.json')
|
||||
const miqu70bJson = require('./resources/models/miqu-70b/model.json')
|
||||
const mistralIns7bq4Json = require('./resources/models/mistral-ins-7b-q4/model.json')
|
||||
const mixtral8x7bInstructJson = require('./resources/models/mixtral-8x7b-instruct/model.json')
|
||||
const noromaid7bJson = require('./resources/models/noromaid-7b/model.json')
|
||||
const openchat357bJson = require('./resources/models/openchat-3.5-7b/model.json')
|
||||
const openhermesNeural7bJson = require('./resources/models/openhermes-neural-7b/model.json')
|
||||
const phind34bJson = require('./resources/models/phind-34b/model.json')
|
||||
const qwen7bJson = require('./resources/models/qwen-7b/model.json')
|
||||
const stableZephyr3bJson = require('./resources/models/stable-zephyr-3b/model.json')
|
||||
@ -37,6 +33,7 @@ const vistral7bJson = require('./resources/models/vistral-7b/model.json')
|
||||
const wizardcoder13bJson = require('./resources/models/wizardcoder-13b/model.json')
|
||||
const yi34bJson = require('./resources/models/yi-34b/model.json')
|
||||
const llama3Json = require('./resources/models/llama3-8b-instruct/model.json')
|
||||
const llama3Hermes8bJson = require('./resources/models/llama3-hermes-8b/model.json')
|
||||
|
||||
export default [
|
||||
{
|
||||
@ -56,21 +53,17 @@ export default [
|
||||
commandr34bJson,
|
||||
deepseekCoder13bJson,
|
||||
deepseekCoder34bJson,
|
||||
dolphinPhi2Json,
|
||||
gemma2bJson,
|
||||
gemma7bJson,
|
||||
hermesPro7bJson,
|
||||
llama2Chat70bJson,
|
||||
llama2Chat7bJson,
|
||||
llamacorn1bJson,
|
||||
llava13bJson,
|
||||
llava7bJson,
|
||||
miqu70bJson,
|
||||
mistralIns7bq4Json,
|
||||
mixtral8x7bInstructJson,
|
||||
noromaid7bJson,
|
||||
openchat357bJson,
|
||||
openhermesNeural7bJson,
|
||||
phind34bJson,
|
||||
qwen7bJson,
|
||||
stableZephyr3bJson,
|
||||
@ -80,13 +73,14 @@ export default [
|
||||
vistral7bJson,
|
||||
wizardcoder13bJson,
|
||||
yi34bJson,
|
||||
llama3Json
|
||||
llama3Json,
|
||||
llama3Hermes8bJson
|
||||
]),
|
||||
NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
|
||||
DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson),
|
||||
INFERENCE_URL: JSON.stringify(
|
||||
process.env.INFERENCE_URL ||
|
||||
'http://127.0.0.1:3928/inferences/llamacpp/chat_completion'
|
||||
'http://127.0.0.1:3928/inferences/server/chat_completion'
|
||||
),
|
||||
TROUBLESHOOTING_URL: JSON.stringify(
|
||||
'https://jan.ai/guides/troubleshooting'
|
||||
|
||||
@ -130,7 +130,7 @@ export default class JanInferenceNitroExtension extends LocalOAIEngine {
|
||||
const executableFolderPath = await joinPath([
|
||||
janDataFolderPath,
|
||||
'engines',
|
||||
this.name ?? 'nitro',
|
||||
this.name ?? 'cortex-cpp',
|
||||
this.version ?? '1.0.0',
|
||||
])
|
||||
|
||||
@ -179,7 +179,7 @@ export default class JanInferenceNitroExtension extends LocalOAIEngine {
|
||||
const executableFolderPath = await joinPath([
|
||||
janDataFolderPath,
|
||||
'engines',
|
||||
this.name ?? 'nitro',
|
||||
this.name ?? 'cortex-cpp',
|
||||
this.version ?? '1.0.0',
|
||||
])
|
||||
|
||||
|
||||
@ -33,9 +33,22 @@ describe('test executable nitro file', () => {
|
||||
Object.defineProperty(process, 'platform', {
|
||||
value: 'darwin',
|
||||
})
|
||||
Object.defineProperty(process, 'arch', {
|
||||
value: 'arm64',
|
||||
})
|
||||
expect(executableNitroFile(testSettings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`mac-universal${sep}nitro`),
|
||||
executablePath: expect.stringContaining(`mac-arm64${sep}cortex-cpp`),
|
||||
cudaVisibleDevices: '',
|
||||
vkVisibleDevices: '',
|
||||
})
|
||||
)
|
||||
Object.defineProperty(process, 'arch', {
|
||||
value: 'amd64',
|
||||
})
|
||||
expect(executableNitroFile(testSettings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`mac-amd64${sep}cortex-cpp`),
|
||||
cudaVisibleDevices: '',
|
||||
vkVisibleDevices: '',
|
||||
})
|
||||
@ -56,7 +69,7 @@ describe('test executable nitro file', () => {
|
||||
}
|
||||
expect(executableNitroFile(settings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`win-cpu${sep}nitro.exe`),
|
||||
executablePath: expect.stringContaining(`win-cpu${sep}cortex-cpp.exe`),
|
||||
cudaVisibleDevices: '',
|
||||
vkVisibleDevices: '',
|
||||
})
|
||||
@ -89,7 +102,7 @@ describe('test executable nitro file', () => {
|
||||
}
|
||||
expect(executableNitroFile(settings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`win-cuda-11-7${sep}nitro.exe`),
|
||||
executablePath: expect.stringContaining(`win-cuda-11-7${sep}cortex-cpp.exe`),
|
||||
cudaVisibleDevices: '0',
|
||||
vkVisibleDevices: '0',
|
||||
})
|
||||
@ -122,7 +135,7 @@ describe('test executable nitro file', () => {
|
||||
}
|
||||
expect(executableNitroFile(settings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`win-cuda-12-0${sep}nitro.exe`),
|
||||
executablePath: expect.stringContaining(`win-cuda-12-0${sep}cortex-cpp.exe`),
|
||||
cudaVisibleDevices: '0',
|
||||
vkVisibleDevices: '0',
|
||||
})
|
||||
@ -139,7 +152,7 @@ describe('test executable nitro file', () => {
|
||||
}
|
||||
expect(executableNitroFile(settings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`linux-cpu${sep}nitro`),
|
||||
executablePath: expect.stringContaining(`linux-cpu${sep}cortex-cpp`),
|
||||
cudaVisibleDevices: '',
|
||||
vkVisibleDevices: '',
|
||||
})
|
||||
@ -172,7 +185,7 @@ describe('test executable nitro file', () => {
|
||||
}
|
||||
expect(executableNitroFile(settings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`linux-cuda-11-7${sep}nitro`),
|
||||
executablePath: expect.stringContaining(`linux-cuda-11-7${sep}cortex-cpp`),
|
||||
cudaVisibleDevices: '0',
|
||||
vkVisibleDevices: '0',
|
||||
})
|
||||
@ -205,7 +218,7 @@ describe('test executable nitro file', () => {
|
||||
}
|
||||
expect(executableNitroFile(settings)).toEqual(
|
||||
expect.objectContaining({
|
||||
executablePath: expect.stringContaining(`linux-cuda-12-0${sep}nitro`),
|
||||
executablePath: expect.stringContaining(`linux-cuda-12-0${sep}cortex-cpp`),
|
||||
cudaVisibleDevices: '0',
|
||||
vkVisibleDevices: '0',
|
||||
})
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { GpuSetting, SystemInformation } from '@janhq/core'
|
||||
import { GpuSetting } from '@janhq/core'
|
||||
import * as path from 'path'
|
||||
|
||||
export interface NitroExecutableOptions {
|
||||
@ -24,7 +24,7 @@ const os = (): string => {
|
||||
return process.platform === 'win32'
|
||||
? 'win'
|
||||
: process.platform === 'darwin'
|
||||
? 'mac-universal'
|
||||
? process.arch === 'arm64' ? 'mac-arm64' : 'mac-amd64'
|
||||
: 'linux'
|
||||
}
|
||||
|
||||
@ -52,7 +52,7 @@ export const executableNitroFile = (
|
||||
.join('-')
|
||||
let cudaVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
|
||||
let vkVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
|
||||
let binaryName = `nitro${extension()}`
|
||||
let binaryName = `cortex-cpp${extension()}`
|
||||
|
||||
return {
|
||||
executablePath: path.join(__dirname, '..', 'bin', binaryFolder, binaryName),
|
||||
|
||||
@ -34,9 +34,9 @@ const LOCAL_HOST = '127.0.0.1'
|
||||
// The URL for the Nitro subprocess
|
||||
const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${PORT}`
|
||||
// The URL for the Nitro subprocess to load a model
|
||||
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/loadmodel`
|
||||
const NITRO_HTTP_LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/loadmodel`
|
||||
// The URL for the Nitro subprocess to validate a model
|
||||
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/llamacpp/modelstatus`
|
||||
const NITRO_HTTP_VALIDATE_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/modelstatus`
|
||||
// The URL for the Nitro subprocess to kill itself
|
||||
const NITRO_HTTP_KILL_URL = `${NITRO_HTTP_SERVER_URL}/processmanager/destroy`
|
||||
|
||||
@ -50,7 +50,7 @@ const SUPPORTED_MODEL_FORMAT = '.gguf'
|
||||
let subprocess: ChildProcessWithoutNullStreams | undefined = undefined
|
||||
|
||||
// The current model settings
|
||||
let currentSettings: ModelSettingParams | undefined = undefined
|
||||
let currentSettings: ModelSettingParams & { model?: string } | undefined = undefined
|
||||
|
||||
/**
|
||||
* Stops a Nitro subprocess.
|
||||
@ -77,7 +77,7 @@ async function loadModel(
|
||||
}
|
||||
|
||||
if (params.model.engine !== InferenceEngine.nitro) {
|
||||
return Promise.reject('Not a nitro model')
|
||||
return Promise.reject('Not a cortex model')
|
||||
} else {
|
||||
const nitroResourceProbe = await getSystemResourceInfo()
|
||||
// Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
|
||||
@ -135,6 +135,7 @@ async function loadModel(
|
||||
// model.settings can override the default settings
|
||||
...params.model.settings,
|
||||
llama_model_path,
|
||||
model: params.model.id,
|
||||
// This is critical and requires real CPU physical core count (or performance core)
|
||||
...(params.model.settings.mmproj && {
|
||||
mmproj: path.isAbsolute(params.model.settings.mmproj)
|
||||
@ -142,7 +143,7 @@ async function loadModel(
|
||||
: path.join(modelFolder, params.model.settings.mmproj),
|
||||
}),
|
||||
}
|
||||
return runNitroAndLoadModel(systemInfo)
|
||||
return runNitroAndLoadModel(params.model.id, systemInfo)
|
||||
}
|
||||
}
|
||||
|
||||
@ -152,7 +153,7 @@ async function loadModel(
|
||||
* 3. Validate model status
|
||||
* @returns
|
||||
*/
|
||||
async function runNitroAndLoadModel(systemInfo?: SystemInformation) {
|
||||
async function runNitroAndLoadModel(modelId: string, systemInfo?: SystemInformation) {
|
||||
// Gather system information for CPU physical cores and memory
|
||||
return killSubprocess()
|
||||
.then(() =>
|
||||
@ -160,10 +161,10 @@ async function runNitroAndLoadModel(systemInfo?: SystemInformation) {
|
||||
)
|
||||
.then(() => spawnNitroProcess(systemInfo))
|
||||
.then(() => loadLLMModel(currentSettings))
|
||||
.then(validateModelStatus)
|
||||
.then(() => validateModelStatus(modelId))
|
||||
.catch((err) => {
|
||||
// TODO: Broadcast error so app could display proper error message
|
||||
log(`[NITRO]::Error: ${err}`)
|
||||
log(`[CORTEX]::Error: ${err}`)
|
||||
return { error: err }
|
||||
})
|
||||
}
|
||||
@ -222,7 +223,7 @@ function loadLLMModel(settings: any): Promise<Response> {
|
||||
if (!settings?.ngl) {
|
||||
settings.ngl = 100
|
||||
}
|
||||
log(`[NITRO]::Debug: Loading model with params ${JSON.stringify(settings)}`)
|
||||
log(`[CORTEX]::Debug: Loading model with params ${JSON.stringify(settings)}`)
|
||||
return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
@ -234,14 +235,14 @@ function loadLLMModel(settings: any): Promise<Response> {
|
||||
})
|
||||
.then((res) => {
|
||||
log(
|
||||
`[NITRO]::Debug: Load model success with response ${JSON.stringify(
|
||||
`[CORTEX]::Debug: Load model success with response ${JSON.stringify(
|
||||
res
|
||||
)}`
|
||||
)
|
||||
return Promise.resolve(res)
|
||||
})
|
||||
.catch((err) => {
|
||||
log(`[NITRO]::Error: Load model failed with error ${err}`)
|
||||
log(`[CORTEX]::Error: Load model failed with error ${err}`)
|
||||
return Promise.reject(err)
|
||||
})
|
||||
}
|
||||
@ -252,11 +253,12 @@ function loadLLMModel(settings: any): Promise<Response> {
|
||||
* If the model is loaded successfully, the object is empty.
|
||||
* If the model is not loaded successfully, the object contains an error message.
|
||||
*/
|
||||
async function validateModelStatus(): Promise<void> {
|
||||
async function validateModelStatus(modelId: string): Promise<void> {
|
||||
// Send a GET request to the validation URL.
|
||||
// Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
|
||||
return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
|
||||
method: 'GET',
|
||||
method: 'POST',
|
||||
body: JSON.stringify({ model: modelId }),
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
@ -264,7 +266,7 @@ async function validateModelStatus(): Promise<void> {
|
||||
retryDelay: 300,
|
||||
}).then(async (res: Response) => {
|
||||
log(
|
||||
`[NITRO]::Debug: Validate model state with response ${JSON.stringify(
|
||||
`[CORTEX]::Debug: Validate model state with response ${JSON.stringify(
|
||||
res.status
|
||||
)}`
|
||||
)
|
||||
@ -275,7 +277,7 @@ async function validateModelStatus(): Promise<void> {
|
||||
// Otherwise, return an object with an error message.
|
||||
if (body.model_loaded) {
|
||||
log(
|
||||
`[NITRO]::Debug: Validate model state success with response ${JSON.stringify(
|
||||
`[CORTEX]::Debug: Validate model state success with response ${JSON.stringify(
|
||||
body
|
||||
)}`
|
||||
)
|
||||
@ -283,7 +285,7 @@ async function validateModelStatus(): Promise<void> {
|
||||
}
|
||||
}
|
||||
log(
|
||||
`[NITRO]::Debug: Validate model state failed with response ${JSON.stringify(
|
||||
`[CORTEX]::Debug: Validate model state failed with response ${JSON.stringify(
|
||||
res.statusText
|
||||
)}`
|
||||
)
|
||||
@ -298,7 +300,7 @@ async function validateModelStatus(): Promise<void> {
|
||||
async function killSubprocess(): Promise<void> {
|
||||
const controller = new AbortController()
|
||||
setTimeout(() => controller.abort(), 5000)
|
||||
log(`[NITRO]::Debug: Request to kill Nitro`)
|
||||
log(`[CORTEX]::Debug: Request to kill cortex`)
|
||||
|
||||
const killRequest = () => {
|
||||
return fetch(NITRO_HTTP_KILL_URL, {
|
||||
@ -309,17 +311,17 @@ async function killSubprocess(): Promise<void> {
|
||||
.then(() =>
|
||||
tcpPortUsed.waitUntilFree(PORT, NITRO_PORT_FREE_CHECK_INTERVAL, 5000)
|
||||
)
|
||||
.then(() => log(`[NITRO]::Debug: Nitro process is terminated`))
|
||||
.then(() => log(`[CORTEX]::Debug: cortex process is terminated`))
|
||||
.catch((err) => {
|
||||
log(
|
||||
`[NITRO]::Debug: Could not kill running process on port ${PORT}. Might be another process running on the same port? ${err}`
|
||||
`[CORTEX]::Debug: Could not kill running process on port ${PORT}. Might be another process running on the same port? ${err}`
|
||||
)
|
||||
throw 'PORT_NOT_AVAILABLE'
|
||||
})
|
||||
}
|
||||
|
||||
if (subprocess?.pid) {
|
||||
log(`[NITRO]::Debug: Killing PID ${subprocess.pid}`)
|
||||
log(`[CORTEX]::Debug: Killing PID ${subprocess.pid}`)
|
||||
const pid = subprocess.pid
|
||||
return new Promise((resolve, reject) => {
|
||||
terminate(pid, function (err) {
|
||||
@ -329,7 +331,7 @@ async function killSubprocess(): Promise<void> {
|
||||
tcpPortUsed
|
||||
.waitUntilFree(PORT, NITRO_PORT_FREE_CHECK_INTERVAL, 5000)
|
||||
.then(() => resolve())
|
||||
.then(() => log(`[NITRO]::Debug: Nitro process is terminated`))
|
||||
.then(() => log(`[CORTEX]::Debug: cortex process is terminated`))
|
||||
.catch(() => {
|
||||
killRequest().then(resolve).catch(reject)
|
||||
})
|
||||
@ -346,22 +348,24 @@ async function killSubprocess(): Promise<void> {
|
||||
* @returns A promise that resolves when the Nitro subprocess is started.
|
||||
*/
|
||||
function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
|
||||
log(`[NITRO]::Debug: Spawning Nitro subprocess...`)
|
||||
log(`[CORTEX]::Debug: Spawning cortex subprocess...`)
|
||||
|
||||
return new Promise<void>(async (resolve, reject) => {
|
||||
let binaryFolder = path.join(__dirname, '..', 'bin') // Current directory by default
|
||||
let executableOptions = executableNitroFile(systemInfo?.gpuSetting)
|
||||
|
||||
const args: string[] = ['1', LOCAL_HOST, PORT.toString()]
|
||||
// Execute the binary
|
||||
log(
|
||||
`[NITRO]::Debug: Spawn nitro at path: ${executableOptions.executablePath}, and args: ${args}`
|
||||
`[CORTEX]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
|
||||
)
|
||||
log(
|
||||
path.parse(executableOptions.executablePath).dir
|
||||
)
|
||||
subprocess = spawn(
|
||||
executableOptions.executablePath,
|
||||
['1', LOCAL_HOST, PORT.toString()],
|
||||
{
|
||||
cwd: binaryFolder,
|
||||
cwd: path.join(path.parse(executableOptions.executablePath).dir),
|
||||
env: {
|
||||
...process.env,
|
||||
CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
|
||||
@ -375,15 +379,15 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
|
||||
|
||||
// Handle subprocess output
|
||||
subprocess.stdout.on('data', (data: any) => {
|
||||
log(`[NITRO]::Debug: ${data}`)
|
||||
log(`[CORTEX]::Debug: ${data}`)
|
||||
})
|
||||
|
||||
subprocess.stderr.on('data', (data: any) => {
|
||||
log(`[NITRO]::Error: ${data}`)
|
||||
log(`[CORTEX]::Error: ${data}`)
|
||||
})
|
||||
|
||||
subprocess.on('close', (code: any) => {
|
||||
log(`[NITRO]::Debug: Nitro exited with code: ${code}`)
|
||||
log(`[CORTEX]::Debug: cortex exited with code: ${code}`)
|
||||
subprocess = undefined
|
||||
reject(`child process exited with code ${code}`)
|
||||
})
|
||||
@ -391,7 +395,7 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
|
||||
tcpPortUsed
|
||||
.waitUntilUsed(PORT, NITRO_PORT_FREE_CHECK_INTERVAL, 30000)
|
||||
.then(() => {
|
||||
log(`[NITRO]::Debug: Nitro is ready`)
|
||||
log(`[CORTEX]::Debug: cortex is ready`)
|
||||
resolve()
|
||||
})
|
||||
})
|
||||
|
||||
@ -1,7 +1,7 @@
{
  "name": "@janhq/inference-openai-extension",
  "productName": "OpenAI Inference Engine",
  "version": "1.0.0",
  "version": "1.0.2",
  "description": "This extension enables OpenAI chat completion API calls",
  "main": "dist/index.js",
  "module": "dist/module.js",

@ -5,20 +5,27 @@
"url": "https://openai.com"
}
],
"id": "gpt-4",
"id": "gpt-4-turbo",
"object": "model",
"name": "OpenAI GPT 4",
"version": "1.0",
"description": "OpenAI GPT 4 model is extremely good",
"name": "OpenAI GPT 4 Turbo",
"version": "1.2",
"description": "OpenAI GPT 4 Turbo model is extremely good",
"format": "api",
"settings": {},
"parameters": {
"max_tokens": 4096,
"temperature": 0.7
"temperature": 0.7,
"top_p": 0.95,
"stream": true,
"stop": [],
"frequency_penalty": 0,
"presence_penalty": 0
},
"metadata": {
"author": "OpenAI",
"tags": ["General", "Big Context Length"]
"tags": [
"General"
]
},
"engine": "openai"
},
@ -31,8 +38,8 @@
"id": "gpt-4-vision-preview",
"object": "model",
"name": "OpenAI GPT 4 with Vision (Preview)",
"version": "1.0",
"description": "OpenAI GPT 4 with Vision model is extremely good in preview",
"version": "1.1",
"description": "OpenAI GPT-4 Vision model features vision understanding capabilities",
"format": "api",
"settings": {
"vision_model": true,
@ -40,34 +47,16 @@
},
"parameters": {
"max_tokens": 4096,
"temperature": 0.7
"temperature": 0.7,
"top_p": 0.95,
"stream": true
},
"metadata": {
"author": "OpenAI",
"tags": ["General", "Big Context Length", "Vision"]
},
"engine": "openai"
},
{
"sources": [
{
"url": "https://openai.com"
}
],
"id": "gpt-3.5-turbo-16k-0613",
"object": "model",
"name": "OpenAI GPT 3.5 Turbo 16k 0613",
"version": "1.0",
"description": "OpenAI GPT 3.5 Turbo 16k 0613 model is extremely good",
"format": "api",
"settings": {},
"parameters": {
"max_tokens": 4096,
"temperature": 0.7
},
"metadata": {
"author": "OpenAI",
"tags": ["General", "Big Context Length"]
"tags": [
"General",
"Vision"
]
},
"engine": "openai"
},
@ -80,17 +69,54 @@
"id": "gpt-3.5-turbo",
"object": "model",
"name": "OpenAI GPT 3.5 Turbo",
"version": "1.0",
"description": "OpenAI GPT 3.5 Turbo model is extremely good",
"version": "1.1",
"description": "OpenAI GPT 3.5 Turbo model is extremely fast",
"format": "api",
"settings": {},
"parameters": {
"max_tokens": 4096,
"temperature": 0.7
"temperature": 0.7,
"top_p": 0.95,
"stream": true,
"stop": [],
"frequency_penalty": 0,
"presence_penalty": 0
},
"metadata": {
"author": "OpenAI",
"tags": ["General", "Big Context Length"]
"tags": [
"General"
]
},
"engine": "openai"
},
{
"sources": [
{
"url": "https://openai.com"
}
],
"id": "gpt-4o",
"object": "model",
"name": "OpenAI GPT 4o",
"version": "1.1",
"description": "OpenAI GPT 4o is a new flagship model with fast speed and high quality",
"format": "api",
"settings": {},
"parameters": {
"max_tokens": 4096,
"temperature": 0.7,
"top_p": 0.95,
"stream": true,
"stop": [],
"frequency_penalty": 0,
"presence_penalty": 0
},
"metadata": {
"author": "OpenAI",
"tags": [
"General"
]
},
"engine": "openai"
}
79
extensions/inference-openrouter-extension/README.md
Normal file
@ -0,0 +1,79 @@
# Open Router Engine Extension

Created using Jan extension example

# Create a Jan Extension using Typescript

Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀

## Create Your Own Extension

To create your own extension, you can use this repository as a template! Just follow the below instructions:

1. Click the Use this template button at the top of the repository
2. Select Create a new repository
3. Select an owner and name for your new repository
4. Click Create repository
5. Clone your new repository

## Initial Setup

After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.

> [!NOTE]
>
> You'll need to have a reasonably modern version of
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
> [`nodenv`](https://github.com/nodenv/nodenv) or
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
> root of your repository to install the version specified in
> [`package.json`](./package.json). Otherwise, 20.x or later should work!

1. :hammer_and_wrench: Install the dependencies

```bash
npm install
```

1. :building_construction: Package the TypeScript for distribution

```bash
npm run bundle
```

1. :white_check_mark: Check your artifact

There will be a tgz file in your extension directory now

## Update the Extension Metadata

The [`package.json`](package.json) file defines metadata about your extension, such as
extension name, main entry, description and version.

When you copy this repository, update `package.json` with the name, description for your extension.
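
As a concrete, hedged illustration of where that metadata ends up: in the OpenRouter extension added by this commit, fields such as `name`, `version`, and `engine` from `package.json` are read at build time by `webpack.config.js` (shown later in this diff) and injected into the bundle. The surrounding code below is illustrative only; the field values match the package.json added in this commit:

```typescript
// Sketch only: reading the same package.json fields the build consumes.
const packageJson = require('./package.json')

console.log(packageJson.name)    // "@janhq/inference-openrouter-extension"
console.log(packageJson.version) // "1.0.0"
console.log(packageJson.engine)  // "openrouter" – exposed to the bundle as ENGINE via webpack.DefinePlugin
```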

## Update the Extension Code

The [`src/`](./src/) directory is the heart of your extension! This contains the
source code that will be run when your extension functions are invoked. You can replace the
contents of this directory with your own code.

There are a few things to keep in mind when writing your extension code:

- Most Jan Extension functions are processed asynchronously.
In `index.ts`, you will see that the extension function will return a `Promise<any>`.

```typescript
import { events, MessageEvent, MessageRequest } from '@janhq/core'

function onStart(): Promise<any> {
return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
this.inference(data)
)
}
```

For more information about the Jan Extension Core module, see the
[documentation](https://github.com/janhq/jan/blob/main/core/README.md).
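
A slightly fuller, still hedged sketch of the same asynchronous pattern, modelled on the OpenRouter `src/index.ts` added in this commit; the class name and setting key below are illustrative rather than real identifiers:

```typescript
import { RemoteOAIEngine } from '@janhq/core'

// Injected at build time by webpack.DefinePlugin (see webpack.config.js).
declare const SETTINGS: Array<any>
declare const MODELS: Array<any>

export default class ExampleRemoteEngine extends RemoteOAIEngine {
  inferenceUrl: string = ''
  provider: string = 'example' // illustrative provider id

  override async onLoad(): Promise<void> {
    super.onLoad()
    // Registration and setting reads are asynchronous, hence the Promise return type.
    this.registerSettings(SETTINGS)
    this.registerModels(MODELS)
    this.apiKey = await this.getSetting<string>('example-api-key', '')
  }
}
```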

So, what are you waiting for? Go ahead and start customizing your extension!
43
extensions/inference-openrouter-extension/package.json
Normal file
@ -0,0 +1,43 @@
{
"name": "@janhq/inference-openrouter-extension",
"productName": "OpenRouter Inference Engine",
"version": "1.0.0",
"description": "This extension enables Open Router chat completion API calls",
"main": "dist/index.js",
"module": "dist/module.js",
"engine": "openrouter",
"author": "Jan <service@jan.ai>",
"license": "AGPL-3.0",
"scripts": {
"build": "tsc -b . && webpack --config webpack.config.js",
"build:publish": "rimraf *.tgz --glob && yarn build && npm pack && cpx *.tgz ../../pre-install",
"sync:core": "cd ../.. && yarn build:core && cd extensions && rm yarn.lock && cd inference-openrouter-extension && yarn && yarn build:publish"
},
"exports": {
".": "./dist/index.js",
"./main": "./dist/module.js"
},
"devDependencies": {
"cpx": "^1.5.0",
"rimraf": "^3.0.2",
"webpack": "^5.88.2",
"webpack-cli": "^5.1.4",
"ts-loader": "^9.5.0"
},
"dependencies": {
"@janhq/core": "file:../../core",
"fetch-retry": "^5.0.6",
"ulidx": "^2.3.0"
},
"engines": {
"node": ">=18.0.0"
},
"files": [
"dist/*",
"package.json",
"README.md"
],
"bundleDependencies": [
"fetch-retry"
]
}
@ -0,0 +1,28 @@
[
{
"sources": [
{
"url": "https://openrouter.ai"
}
],
"id": "open-router-auto",
"object": "model",
"name": "OpenRouter",
"version": "1.0",
"description": " OpenRouter scouts for the lowest prices and best latencies/throughputs across dozens of providers, and lets you choose how to prioritize them.",
"format": "api",
"settings": {},
"parameters": {
"max_tokens": 1024,
"temperature": 0.7,
"top_p": 0.95,
"frequency_penalty": 0,
"presence_penalty": 0
},
"metadata": {
"author": "OpenRouter",
"tags": ["General", "Big Context Length"]
},
"engine": "openrouter"
}
]
@ -0,0 +1,23 @@
[
{
"key": "chat-completions-endpoint",
"title": "Chat Completions Endpoint",
"description": "The endpoint to use for chat completions. See the [OpenRouter API documentation](https://openrouter.ai/docs) for more information.",
"controllerType": "input",
"controllerProps": {
"placeholder": "https://openrouter.ai/api/v1/chat/completions",
"value": "https://openrouter.ai/api/v1/chat/completions"
}
},
{
"key": "openrouter-api-key",
"title": "API Key",
"description": "The OpenRouter API uses API keys for authentication. Visit your [API Keys](https://openrouter.ai/keys) page to retrieve the API key you'll use in your requests.",
"controllerType": "input",
"controllerProps": {
"placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"value": "",
"type": "password"
}
}
]
76
extensions/inference-openrouter-extension/src/index.ts
Normal file
@ -0,0 +1,76 @@
/**
* @file This file exports a class that implements the InferenceExtension interface from the @janhq/core package.
* The class provides methods for initializing and stopping a model, and for making inference requests.
* It also subscribes to events emitted by the @janhq/core package and handles new message requests.
* @version 1.0.0
* @module inference-openai-extension/src/index
*/

import { RemoteOAIEngine } from '@janhq/core'
import { PayloadType } from '@janhq/core'
import { ChatCompletionRole } from '@janhq/core'

declare const SETTINGS: Array<any>
declare const MODELS: Array<any>

enum Settings {
apiKey = 'openrouter-api-key',
chatCompletionsEndPoint = 'chat-completions-endpoint',
}

enum RoleType {
user = 'USER',
chatbot = 'CHATBOT',
system = 'SYSTEM',
}

/**
* A class that implements the InferenceExtension interface from the @janhq/core package.
* The class provides methods for initializing and stopping a model, and for making inference requests.
* It also subscribes to events emitted by the @janhq/core package and handles new message requests.
*/
export default class JanInferenceOpenRouterExtension extends RemoteOAIEngine {
inferenceUrl: string = ''
provider: string = 'openrouter'

override async onLoad(): Promise<void> {
super.onLoad()

// Register Settings
this.registerSettings(SETTINGS)
this.registerModels(MODELS)

this.apiKey = await this.getSetting<string>(Settings.apiKey, '')
this.inferenceUrl = await this.getSetting<string>(
Settings.chatCompletionsEndPoint,
''
)
if (this.inferenceUrl.length === 0) {
SETTINGS.forEach((setting) => {
if (setting.key === Settings.chatCompletionsEndPoint) {
this.inferenceUrl = setting.controllerProps.value as string
}
})
}
}

onSettingUpdate<T>(key: string, value: T): void {
if (key === Settings.apiKey) {
this.apiKey = value as string
} else if (key === Settings.chatCompletionsEndPoint) {
if (typeof value !== 'string') return

if (value.trim().length === 0) {
SETTINGS.forEach((setting) => {
if (setting.key === Settings.chatCompletionsEndPoint) {
this.inferenceUrl = setting.controllerProps.value as string
}
})
} else {
this.inferenceUrl = value
}
}
}

transformPayload = (payload: PayloadType)=>({...payload,model:"openrouter/auto"})
}
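
The `transformPayload` override above is what pins every outgoing request to OpenRouter's automatic router. A hedged illustration of its effect on a request body follows; the payload fields are made up for the example:

```typescript
// Illustration only: whatever model id the thread carries, the request that
// actually leaves the extension asks OpenRouter to pick the model.
const payload = { model: 'gpt-3.5-turbo', stream: true, messages: [{ role: 'user', content: 'Hi' }] }
const transformed = { ...payload, model: 'openrouter/auto' }
// transformed.model === 'openrouter/auto'
```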
14
extensions/inference-openrouter-extension/tsconfig.json
Normal file
@ -0,0 +1,14 @@
{
"compilerOptions": {
"target": "es2016",
"module": "ES6",
"moduleResolution": "node",
"outDir": "./dist",
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"strict": false,
"skipLibCheck": true,
"rootDir": "./src"
},
"include": ["./src"]
}
37
extensions/inference-openrouter-extension/webpack.config.js
Normal file
@ -0,0 +1,37 @@
const webpack = require('webpack')
const packageJson = require('./package.json')
const settingJson = require('./resources/settings.json')
const modelsJson = require('./resources/models.json')

module.exports = {
experiments: { outputModule: true },
entry: './src/index.ts', // Adjust the entry point to match your project's main file
mode: 'production',
module: {
rules: [
{
test: /\.tsx?$/,
use: 'ts-loader',
exclude: /node_modules/,
},
],
},
plugins: [
new webpack.DefinePlugin({
MODELS: JSON.stringify(modelsJson),
SETTINGS: JSON.stringify(settingJson),
ENGINE: JSON.stringify(packageJson.engine),
}),
],
output: {
filename: 'index.js', // Adjust the output file name as needed
library: { type: 'module' }, // Specify ESM output format
},
resolve: {
extensions: ['.ts', '.js'],
},
optimization: {
minimize: false,
},
// Add loaders and other configuration as needed for your project
}
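
The `DefinePlugin` entries are the build-time counterpart of the bare `declare const` statements in `src/index.ts`: webpack substitutes the JSON contents of `resources/models.json` and `resources/settings.json` (and the `engine` field of `package.json`) wherever those identifiers appear in the bundle. A hedged sketch of the pairing, with the substituted values abbreviated:

```typescript
// In src/index.ts the values are only declared; nothing is imported at runtime.
declare const SETTINGS: Array<any>
declare const MODELS: Array<any>

// After bundling, references to them behave as if the literals had been inlined:
//   this.registerSettings([{ "key": "chat-completions-endpoint", ... }, { "key": "openrouter-api-key", ... }])
//   this.registerModels([{ "id": "open-router-auto", ... }])
// ENGINE (unused in this index.ts) would resolve the same way, to "openrouter".
```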

@ -97,7 +97,7 @@ function unloadModel(): Promise<void> {
}

if (subprocess?.pid) {
log(`[NITRO]::Debug: Killing PID ${subprocess.pid}`)
log(`[CORTEX]::Debug: Killing PID ${subprocess.pid}`)
const pid = subprocess.pid
return new Promise((resolve, reject) => {
terminate(pid, function (err) {
@ -107,7 +107,7 @@ function unloadModel(): Promise<void> {
return tcpPortUsed
.waitUntilFree(parseInt(ENGINE_PORT), PORT_CHECK_INTERVAL, 5000)
.then(() => resolve())
.then(() => log(`[NITRO]::Debug: Nitro process is terminated`))
.then(() => log(`[CORTEX]::Debug: cortex process is terminated`))
.catch(() => {
killRequest()
})

@ -45,6 +45,7 @@
--border: 20 5.9% 90%;
--input: 20 5.9% 90%;
--ring: 20 14.3% 4.1%;
--scroll-bar: 60, 3%, 86%;

.primary-blue {
--primary: 221 83% 53%;

@ -21,3 +21,38 @@
@apply bg-border relative z-50 w-[10px] rounded-full;
}
}

// Customized scroll bar
::-webkit-scrollbar {
width: 7px;
}

::-webkit-scrollbar-thumb {
background-color: hsl(var(--scroll-bar));
border-radius: 4px;
}

::-webkit-scrollbar-track {
background-color: hsl(var(--background));
}

::-webkit-scrollbar-corner {
background-color: hsl(var(--background));
}

::-moz-scrollbar {
width: 7px;
}

::-moz-scrollbar-thumb {
background-color: hsl(var(--scroll-bar));
border-radius: 4px;
}

::-moz-scrollbar-track {
background-color: hsl(var(--background));
}

::-moz-scrollbar-corner {
background-color: hsl(var(--background));
}

@ -25,6 +25,8 @@ import ImportModelOptionModal from '@/screens/Settings/ImportModelOptionModal'
import ImportingModelModal from '@/screens/Settings/ImportingModelModal'
import SelectingModelModal from '@/screens/Settings/SelectingModelModal'

import LoadingModal from '../LoadingModal'

import MainViewContainer from '../MainViewContainer'

import InstallingExtensionModal from './BottomBar/InstallingExtension/InstallingExtensionModal'
@ -69,6 +71,7 @@ const BaseLayout = () => {
<BottomBar />
</div>
</div>
<LoadingModal />
{importModelStage === 'SELECTING_MODEL' && <SelectingModelModal />}
{importModelStage === 'MODEL_SELECTED' && <ImportModelOptionModal />}
{importModelStage === 'IMPORTING_MODEL' && <ImportingModelModal />}

@ -1,4 +1,4 @@
import { ReactNode, useEffect, useRef } from 'react'
import { ReactNode, useCallback, useEffect, useRef } from 'react'

type Props = {
children: ReactNode
@ -6,20 +6,44 @@ type Props = {

const ListContainer: React.FC<Props> = ({ children }) => {
const listRef = useRef<HTMLDivElement>(null)
const prevScrollTop = useRef(0)
const isUserManuallyScrollingUp = useRef(false)

const handleScroll = useCallback((event: React.UIEvent<HTMLElement>) => {
const currentScrollTop = event.currentTarget.scrollTop

if (prevScrollTop.current > currentScrollTop) {
console.debug('User is manually scrolling up')
isUserManuallyScrollingUp.current = true
} else {
const currentScrollTop = event.currentTarget.scrollTop
const scrollHeight = event.currentTarget.scrollHeight
const clientHeight = event.currentTarget.clientHeight

if (currentScrollTop + clientHeight >= scrollHeight) {
console.debug('Scrolled to the bottom')
isUserManuallyScrollingUp.current = false
}
}

prevScrollTop.current = currentScrollTop
}, [])

useEffect(() => {
const scrollHeight = listRef.current?.scrollHeight ?? 0
if (isUserManuallyScrollingUp.current === true) return

const scrollHeight = listRef.current?.scrollHeight ?? 0
listRef.current?.scrollTo({
top: scrollHeight,
behavior: 'smooth',
behavior: 'instant',
})
})
}, [listRef.current?.scrollHeight, isUserManuallyScrollingUp])

return (
<div
ref={listRef}
className="flex h-full w-full flex-col overflow-y-scroll"
onScroll={handleScroll}
>
{children}
</div>
26
web/containers/LoadingModal/index.tsx
Normal file
@ -0,0 +1,26 @@
import { Modal, ModalContent, ModalHeader, ModalTitle } from '@janhq/uikit'
import { atom, useAtomValue } from 'jotai'

export type LoadingInfo = {
title: string
message: string
}

export const loadingModalInfoAtom = atom<LoadingInfo | undefined>(undefined)

const ResettingModal: React.FC = () => {
const loadingInfo = useAtomValue(loadingModalInfoAtom)

return (
<Modal open={loadingInfo != null}>
<ModalContent>
<ModalHeader>
<ModalTitle>{loadingInfo?.title}</ModalTitle>
</ModalHeader>
<p className="text-muted-foreground">{loadingInfo?.message}</p>
</ModalContent>
</Modal>
)
}

export default ResettingModal
101
web/containers/Providers/DeepLinkListener.tsx
Normal file
@ -0,0 +1,101 @@
import { Fragment, ReactNode } from 'react'

import { useSetAtom } from 'jotai'

import { useDebouncedCallback } from 'use-debounce'

import { useGetHFRepoData } from '@/hooks/useGetHFRepoData'

import { loadingModalInfoAtom } from '../LoadingModal'
import { toaster } from '../Toast'

import {
importHuggingFaceModelStageAtom,
importingHuggingFaceRepoDataAtom,
} from '@/helpers/atoms/HuggingFace.atom'
type Props = {
children: ReactNode
}

const DeepLinkListener: React.FC<Props> = ({ children }) => {
const { getHfRepoData } = useGetHFRepoData()
const setLoadingInfo = useSetAtom(loadingModalInfoAtom)
const setImportingHuggingFaceRepoData = useSetAtom(
importingHuggingFaceRepoDataAtom
)
const setImportHuggingFaceModelStage = useSetAtom(
importHuggingFaceModelStageAtom
)

const handleDeepLinkAction = useDebouncedCallback(
async (deepLinkAction: DeepLinkAction) => {
if (
deepLinkAction.action !== 'models' ||
deepLinkAction.provider !== 'huggingface'
) {
console.error(
`Invalid deeplink action (${deepLinkAction.action}) or provider (${deepLinkAction.provider})`
)
return
}

try {
setLoadingInfo({
title: 'Getting Hugging Face models',
message: 'Please wait..',
})
const data = await getHfRepoData(deepLinkAction.resource)
setImportingHuggingFaceRepoData(data)
setImportHuggingFaceModelStage('REPO_DETAIL')
setLoadingInfo(undefined)
} catch (err) {
setLoadingInfo(undefined)
toaster({
title: 'Failed to get Hugging Face models',
description: err instanceof Error ? err.message : 'Unexpected Error',
type: 'error',
})
console.error(err)
}
},
300
)

window.electronAPI?.onDeepLink((_event: string, input: string) => {
window.core?.api?.ackDeepLink()

const action = deeplinkParser(input)
if (!action) return
handleDeepLinkAction(action)
})

return <Fragment>{children}</Fragment>
}

type DeepLinkAction = {
action: string
provider: string
resource: string
}

const deeplinkParser = (
deepLink: string | undefined
): DeepLinkAction | undefined => {
if (!deepLink) return undefined

try {
const url = new URL(deepLink)
const params = url.pathname.split('/').filter((str) => str.length > 0)

if (params.length < 3) return undefined
const action = params[0]
const provider = params[1]
const resource = params.slice(2).join('/')
return { action, provider, resource }
} catch (err) {
console.error(err)
return undefined
}
}

export default DeepLinkListener
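
For context, `deeplinkParser` only looks at the URL's path segments: the first becomes `action`, the second `provider`, and the remainder are joined back into `resource`. A hedged example with a made-up deep link (the actual scheme and host Jan registers are not shown in this diff):

```typescript
// Hypothetical input – only the path segments matter to the parser.
const url = new URL('jan://open/models/huggingface/janhq/example-repo')
const params = url.pathname.split('/').filter((str) => str.length > 0)
// params   => ['models', 'huggingface', 'janhq', 'example-repo']
// action   => 'models'
// provider => 'huggingface'
// resource => 'janhq/example-repo'
```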

@ -22,6 +22,7 @@ import Loader from '../Loader'

import DataLoader from './DataLoader'

import DeepLinkListener from './DeepLinkListener'
import KeyListener from './KeyListener'

import { extensionManager } from '@/extension'
@ -78,7 +79,9 @@ const Providers = ({ children }: PropsWithChildren) => {
<KeyListener>
<EventListenerWrapper>
<TooltipProvider delayDuration={0}>
<DataLoader>{children}</DataLoader>
<DataLoader>
<DeepLinkListener>{children}</DeepLinkListener>
</DataLoader>
</TooltipProvider>
</EventListenerWrapper>
<Toaster />

@ -99,6 +99,11 @@ export const useCreateNewThread = () => {
? { ctx_len: 2048 }
: {}

const overriddenParameters =
defaultModel?.parameters.max_tokens && defaultModel.parameters.max_tokens
? { max_tokens: 2048 }
: {}

const createdAt = Date.now()
const assistantInfo: ThreadAssistantInfo = {
assistant_id: assistant.id,
@ -107,7 +112,8 @@ export const useCreateNewThread = () => {
model: {
id: defaultModel?.id ?? '*',
settings: { ...defaultModel?.settings, ...overriddenSettings } ?? {},
parameters: defaultModel?.parameters ?? {},
parameters:
{ ...defaultModel?.parameters, ...overriddenParameters } ?? {},
engine: defaultModel?.engine,
},
instructions: assistant.instructions,

@ -22,8 +22,8 @@ const ChatBody: React.FC = () => {
const downloadedModels = useAtomValue(downloadedModelsAtom)
const loadModelError = useAtomValue(loadModelErrorAtom)

if (downloadedModels.length === 0) return <EmptyModel />
if (messages.length === 0) return <EmptyThread />
if (!downloadedModels.length) return <EmptyModel />
if (!messages.length) return <EmptyThread />

return (
<ListContainer>

@ -129,12 +129,10 @@ const EditChatInput: React.FC<Props> = ({ message }) => {
}

return (
<div className="mx-auto flex w-full flex-shrink-0 items-end justify-center space-x-4 pb-0 pt-1">
<div className="mx-auto flex w-full flex-shrink-0 flex-col items-start justify-center space-y-4 pb-0 pt-1">
<div className="relative flex w-full flex-col">
<Textarea
className={twMerge(
'max-h-[400px] resize-none overflow-y-hidden pr-20'
)}
className={twMerge('max-h-[400px] resize-none pr-20')}
style={{ height: '40px' }}
ref={textareaRef}
onKeyDown={onKeyDown}

@ -3,12 +3,17 @@ import {
InputComponentProps,
CheckboxComponentProps,
SliderComponentProps,
InferenceEngine,
} from '@janhq/core'

import { useAtomValue } from 'jotai/react'

import Checkbox from '@/containers/Checkbox'
import ModelConfigInput from '@/containers/ModelConfigInput'
import SliderRightPanel from '@/containers/SliderRightPanel'

import { activeThreadAtom } from '@/helpers/atoms/Thread.atom'

type Props = {
componentProps: SettingComponentProps[]
disabled?: boolean
@ -20,6 +25,7 @@ const SettingComponent: React.FC<Props> = ({
disabled = false,
onValueUpdated,
}) => {
const activeThread = useAtomValue(activeThreadAtom)
const components = componentProps.map((data) => {
switch (data.controllerType) {
case 'slider': {
@ -31,7 +37,16 @@ const SettingComponent: React.FC<Props> = ({
title={data.title}
description={data.description}
min={min}
max={max}
max={
data.key === 'max_tokens' &&
activeThread &&
activeThread.assistants[0].model.engine === InferenceEngine.nitro
? Number(
activeThread &&
activeThread.assistants[0].model.settings.ctx_len
)
: max
}
step={step}
value={value}
name={data.key}

@ -33,7 +33,7 @@ export const presetConfiguration: Record<string, SettingComponentProps> = {
'The context length for model operations varies; the maximum depends on the specific model used.',
controllerType: 'slider',
controllerProps: {
min: 0,
min: 128,
max: 4096,
step: 128,
value: 2048,

@ -118,6 +118,32 @@ const Sidebar: React.FC = () => {
updateModelParameter(activeThread, {
params: { [key]: value },
})

if (
activeThread.assistants[0].model.parameters.max_tokens &&
activeThread.assistants[0].model.settings.ctx_len
) {
if (
key === 'max_tokens' &&
Number(value) > activeThread.assistants[0].model.settings.ctx_len
) {
updateModelParameter(activeThread, {
params: {
max_tokens: activeThread.assistants[0].model.settings.ctx_len,
},
})
}
if (
key === 'ctx_len' &&
Number(value) < activeThread.assistants[0].model.parameters.max_tokens
) {
updateModelParameter(activeThread, {
params: {
max_tokens: activeThread.assistants[0].model.settings.ctx_len,
},
})
}
}
},
[activeThread, setEngineParamsUpdate, stopModel, updateModelParameter]
)