.github/workflows/jan-electron-linter-and-test.yml
@@ -57,19 +57,19 @@ jobs:
          rm -rf ~/jan
          make clean

      - name: Get Commit Message for PR
        if: github.event_name == 'pull_request'
        run: |
          echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
      # - name: Get Commit Message for PR
      #   if: github.event_name == 'pull_request'
      #   run: |
      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV

      - name: Get Commit Message for push event
        if: github.event_name == 'push'
        run: |
          echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
      # - name: Get Commit Message for push event
      #   if: github.event_name == 'push'
      #   run: |
      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV

      - name: "Config report portal"
        run: |
          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App macos" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
      # - name: "Config report portal"
      #   run: |
      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App macos" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"

      - name: Linter and test
        run: |
@@ -78,9 +78,9 @@ jobs:
          make test
        env:
          CSC_IDENTITY_AUTO_DISCOVERY: "false"
          TURBO_API: "${{ secrets.TURBO_API }}"
          TURBO_TEAM: "macos"
          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
          # TURBO_API: "${{ secrets.TURBO_API }}"
          # TURBO_TEAM: "macos"
          # TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"

  test-on-macos-pr-target:
    if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
@@ -141,16 +141,16 @@ jobs:
          }
          make clean

      - name: Get Commit Message for push event
        if: github.event_name == 'push'
        shell: bash
        run: |
          echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
      # - name: Get Commit Message for push event
      #   if: github.event_name == 'push'
      #   shell: bash
      #   run: |
      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV

      - name: "Config report portal"
        shell: bash
        run: |
          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows ${{ matrix.antivirus-tools }}" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
      # - name: "Config report portal"
      #   shell: bash
      #   run: |
      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows ${{ matrix.antivirus-tools }}" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"

      - name: Linter and test
        shell: powershell
@@ -158,10 +158,10 @@ jobs:
          npm config set registry ${{ secrets.NPM_PROXY }} --global
          yarn config set registry ${{ secrets.NPM_PROXY }} --global
          make test
        env:
          TURBO_API: "${{ secrets.TURBO_API }}"
          TURBO_TEAM: "windows"
          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
        # env:
        #   TURBO_API: "${{ secrets.TURBO_API }}"
        #   TURBO_TEAM: "windows"
        #   TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"

  test-on-windows-pr:
    if: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
    runs-on: windows-desktop-default-windows-security
@@ -189,16 +189,16 @@ jobs:
          }
          make clean

      - name: Get Commit Message for PR
        if: github.event_name == 'pull_request'
        shell: bash
        run: |
          echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
      # - name: Get Commit Message for PR
      #   if: github.event_name == 'pull_request'
      #   shell: bash
      #   run: |
      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV

      - name: "Config report portal"
        shell: bash
        run: |
          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
      # - name: "Config report portal"
      #   shell: bash
      #   run: |
      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"

      - name: Linter and test
        shell: powershell
@@ -206,10 +206,10 @@ jobs:
          npm config set registry ${{ secrets.NPM_PROXY }} --global
          yarn config set registry ${{ secrets.NPM_PROXY }} --global
          make test
        env:
          TURBO_API: "${{ secrets.TURBO_API }}"
          TURBO_TEAM: "windows"
          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
        # env:
        #   TURBO_API: "${{ secrets.TURBO_API }}"
        #   TURBO_TEAM: "windows"
        #   TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"

  test-on-windows-pr-target:
    if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
@@ -266,20 +266,20 @@ jobs:
          rm -rf ~/jan
          make clean

      - name: Get Commit Message for PR
        if: github.event_name == 'pull_request'
        run: |
          echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
      # - name: Get Commit Message for PR
      #   if: github.event_name == 'pull_request'
      #   run: |
      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV

      - name: Get Commit Message for push event
        if: github.event_name == 'push'
        run: |
          echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
      # - name: Get Commit Message for push event
      #   if: github.event_name == 'push'
      #   run: |
      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV

      - name: "Config report portal"
        shell: bash
        run: |
          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Linux" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
      # - name: "Config report portal"
      #   shell: bash
      #   run: |
      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Linux" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"

      - name: Linter and test
        run: |
@@ -288,10 +288,10 @@ jobs:
          npm config set registry ${{ secrets.NPM_PROXY }} --global
          yarn config set registry ${{ secrets.NPM_PROXY }} --global
          make test
        env:
          TURBO_API: "${{ secrets.TURBO_API }}"
          TURBO_TEAM: "linux"
          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
        # env:
        #   TURBO_API: "${{ secrets.TURBO_API }}"
        #   TURBO_TEAM: "linux"
        #   TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"

  test-on-ubuntu-pr-target:
    runs-on: [self-hosted, Linux, ubuntu-desktop]

@@ -28,6 +28,15 @@ const downloadFile: (downloadRequest: DownloadRequest, network?: NetworkConfig)
  network
) => globalThis.core?.api?.downloadFile(downloadRequest, network)

/**
 * Gets the size in bytes of a remote file.
 *
 * @param url - The url of the file.
 * @returns {Promise<number>} - A promise that resolves with the file size.
 */
const getFileSize: (url: string) => Promise<number> = (url: string) =>
  globalThis.core.api?.getFileSize(url)
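A hypothetical caller-side sketch (illustration only, not part of this diff) of the new `getFileSize` API:

```typescript
import { getFileSize } from '@janhq/core'

// Check a remote file's size before deciding to download it.
async function logRemoteSize(url: string): Promise<void> {
  const size = await getFileSize(url)
  console.log(`Remote file is ${size} bytes`)
}
```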

/**
 * Aborts the download of a specific file.
 * @param {string} fileName - The name of the file whose download is to be aborted.
@@ -122,6 +131,7 @@ const systemInformation: () => Promise<SystemInformation> = () =>
 */
const showToast: (title: string, message: string) => void = (title, message) =>
  globalThis.core.api?.showToast(title, message)

/**
 * Register extension point function type definition
 */
@@ -150,5 +160,6 @@ export {
  getUserHomePath,
  systemInformation,
  showToast,
  getFileSize,
  FileStat,
}

@@ -1,30 +0,0 @@
import { BaseExtension, ExtensionTypeEnum } from '../extension'
import { HuggingFaceInterface, HuggingFaceRepoData, Quantization } from '../../types/huggingface'
import { Model } from '../../types/model'

/**
 * Hugging Face extension for converting HF models to GGUF.
 */
export abstract class HuggingFaceExtension extends BaseExtension implements HuggingFaceInterface {
  interrupted = false
  /**
   * Hugging Face extension type.
   */
  type(): ExtensionTypeEnum | undefined {
    return ExtensionTypeEnum.HuggingFace
  }

  abstract downloadModelFiles(
    repoID: string,
    repoData: HuggingFaceRepoData,
    network?: { ignoreSSL?: boolean; proxy?: string }
  ): Promise<void>
  abstract convert(repoID: string): Promise<void>
  abstract quantize(repoID: string, quantization: Quantization): Promise<void>
  abstract generateMetadata(
    repoID: string,
    repoData: HuggingFaceRepoData,
    quantization: Quantization
  ): Promise<void>
  abstract cancelConvert(repoID: string, repoData: HuggingFaceRepoData): Promise<void>
}
@@ -24,11 +24,6 @@ export { AssistantExtension } from './assistant'
 */
export { ModelExtension } from './model'

/**
 * Hugging Face extension for converting HF models to GGUF.
 */
export { HuggingFaceExtension } from './huggingface'

/**
 * Base AI Engines.
 */

@@ -1,5 +1,12 @@
import { BaseExtension, ExtensionTypeEnum } from '../extension'
import { GpuSetting, ImportingModel, Model, ModelInterface, OptionType } from '../../types'
import {
  GpuSetting,
  HuggingFaceRepoData,
  ImportingModel,
  Model,
  ModelInterface,
  OptionType,
} from '../../types'

/**
 * Model extension for managing models.
@@ -24,4 +31,6 @@ export abstract class ModelExtension extends BaseExtension implements ModelInter
  abstract getConfiguredModels(): Promise<Model[]>
  abstract importModels(models: ImportingModel[], optionType: OptionType): Promise<void>
  abstract updateModelInfo(modelInfo: Partial<Model>): Promise<Model>
  abstract fetchHuggingFaceRepoData(repoId: string): Promise<HuggingFaceRepoData>
  abstract getDefaultModel(): Promise<Model>
}
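The new abstract `fetchHuggingFaceRepoData` member leaves the fetch strategy to subclasses. A minimal sketch of one possible implementation against Hugging Face's public model API (the endpoint choice and error handling are assumptions for illustration, not code from this PR):

```typescript
import { HuggingFaceRepoData } from '../../types'

// Sketch only: fetch repo metadata from the public Hugging Face API.
async function fetchRepoData(repoId: string): Promise<HuggingFaceRepoData> {
  const res = await fetch(`https://huggingface.co/api/models/${repoId}`)
  if (!res.ok) {
    throw new Error(`Failed to fetch repo data for ${repoId}: ${res.status}`)
  }
  return (await res.json()) as HuggingFaceRepoData
}
```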

@@ -66,6 +66,7 @@ export class Downloader implements Processor {
      localPath: normalizedPath,
    }
    DownloadManager.instance.downloadProgressMap[modelId] = initialDownloadState
    DownloadManager.instance.downloadInfo[normalizedPath] = initialDownloadState

    if (downloadRequest.downloadType === 'extension') {
      observer?.(DownloadEvent.onFileDownloadUpdate, initialDownloadState)
@@ -118,19 +119,42 @@ export class Downloader implements Processor {
      if (rq) {
        DownloadManager.instance.networkRequests[fileName] = undefined
        rq?.abort()
      } else {
      }

      const downloadInfo = DownloadManager.instance.downloadInfo[fileName]
      observer?.(DownloadEvent.onFileDownloadError, {
        ...downloadInfo,
        fileName,
        error: 'aborted',
      })
    }
  }

  resumeDownload(observer: any, fileName: any) {
  resumeDownload(_observer: any, fileName: any) {
    DownloadManager.instance.networkRequests[fileName]?.resume()
  }

  pauseDownload(observer: any, fileName: any) {
  pauseDownload(_observer: any, fileName: any) {
    DownloadManager.instance.networkRequests[fileName]?.pause()
  }

  async getFileSize(_observer: any, url: string): Promise<number> {
    return new Promise((resolve, reject) => {
      const request = require('request')
      request(
        {
          url,
          method: 'HEAD',
        },
        function (err: any, response: any) {
          if (err) {
            console.error('Getting file size failed:', err)
            reject(err)
          } else {
            const size: number = Number(response.headers['content-length'] ?? -1)
            resolve(size)
          }
        }
      )
    })
  }
}
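Note that `getFileSize` above issues its HEAD request through the long-deprecated `request` package, and `content-length` arrives as a string, so it needs the explicit `Number()` conversion. A dependency-free sketch using Node's built-in `https` module (an alternative shown for illustration, not what this diff ships) could look like:

```typescript
import https from 'node:https'

// Sketch: resolve a remote file's Content-Length via a HEAD request.
function headFileSize(url: string): Promise<number> {
  return new Promise((resolve, reject) => {
    const req = https.request(url, { method: 'HEAD' }, (res) => {
      res.resume() // no body expected for HEAD; release the socket
      resolve(Number(res.headers['content-length'] ?? -1))
    })
    req.on('error', reject)
    req.end()
  })
}
```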

@@ -8,8 +8,12 @@ export class DownloadManager {

  public static instance: DownloadManager = new DownloadManager()

  // store the download information, keyed by model id
  public downloadProgressMap: Record<string, DownloadState> = {}

  // store the download information, keyed by normalized file path
  public downloadInfo: Record<string, DownloadState> = {}

  constructor() {
    if (DownloadManager.instance) {
      return DownloadManager.instance

@@ -32,7 +32,6 @@ export enum AppRoute {
  startServer = 'startServer',
  stopServer = 'stopServer',
  log = 'log',
  logServer = 'logServer',
  systemInformation = 'systemInformation',
  showToast = 'showToast',
}
@@ -52,6 +51,7 @@ export enum DownloadRoute {
  pauseDownload = 'pauseDownload',
  resumeDownload = 'resumeDownload',
  getDownloadProgress = 'getDownloadProgress',
  getFileSize = 'getFileSize',
}

export enum DownloadEvent {

@@ -1,34 +1,65 @@
export interface HuggingFaceRepoData {
  id: string
  modelId: string
  modelUrl?: string
  author: string
  sha: string
  downloads: number
  lastModified: string
  private: boolean
  disabled: boolean
  gated: boolean
  pipeline_tag: 'text-generation'
  tags: Array<'transformers' | 'pytorch' | 'safetensors' | string>
  cardData: Record<CardDataKeys | string, unknown>
  siblings: {
    rfilename: string
    downloadUrl?: string
    fileSize?: number
    quantization?: Quantization
  }[]
  createdAt: string // ISO 8601 timestamp
  createdAt: string
}

/* eslint-disable @typescript-eslint/naming-convention */
export enum Quantization {
  Q3_K_S = 'Q3_K_S',
  Q3_K_M = 'Q3_K_M', // eslint-disable-line @typescript-eslint/no-duplicate-enum-values
  Q3_K_L = 'Q3_K_L',
  Q4_K_S = 'Q4_K_S',
  Q4_K_M = 'Q4_K_M', // eslint-disable-line @typescript-eslint/no-duplicate-enum-values
  Q5_K_S = 'Q5_K_S',
  Q5_K_M = 'Q5_K_M', // eslint-disable-line @typescript-eslint/no-duplicate-enum-values
  Q4_0 = 'Q4_0',
  Q4_1 = 'Q4_1',
  Q5_0 = 'Q5_0',
  Q5_1 = 'Q5_1',
  IQ2_XXS = 'IQ2_XXS',
  IQ2_XS = 'IQ2_XS',
  Q2_K = 'Q2_K',
  Q2_K_S = 'Q2_K_S',
  Q6_K = 'Q6_K',
  Q8_0 = 'Q8_0',
  F16 = 'F16',
  F32 = 'F32',
  COPY = 'COPY',
}
/* eslint-enable @typescript-eslint/naming-convention */
const CardDataKeys = [
  'base_model',
  'datasets',
  'inference',
  'language',
  'library_name',
  'license',
  'model_creator',
  'model_name',
  'model_type',
  'pipeline_tag',
  'prompt_template',
  'quantized_by',
  'tags',
] as const
export type CardDataKeysTuple = typeof CardDataKeys
export type CardDataKeys = CardDataKeysTuple[number]

export const AllQuantizations = [
  'Q3_K_S',
  'Q3_K_M',
  'Q3_K_L',
  'Q4_K_S',
  'Q4_K_M',
  'Q5_K_S',
  'Q5_K_M',
  'Q4_0',
  'Q4_1',
  'Q5_0',
  'Q5_1',
  'IQ2_XXS',
  'IQ2_XS',
  'Q2_K',
  'Q2_K_S',
  'Q6_K',
  'Q8_0',
  'F16',
  'F32',
  'COPY',
]
export type QuantizationsTuple = typeof AllQuantizations
export type Quantization = QuantizationsTuple[number]
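One caveat worth noting: without a `const` assertion, `typeof AllQuantizations` is just `string[]`, so the `Quantization` alias above widens to plain `string` rather than a union of the literal names. If a narrow literal union were intended, a sketch along these lines (an assumption, not what the diff ships) would preserve it:

```typescript
export const AllQuantizations = ['Q4_K_M', 'Q5_K_M', 'Q8_0' /* ... */] as const

// With `as const`, the indexed access yields the literal union
// 'Q4_K_M' | 'Q5_K_M' | 'Q8_0' | ...
export type Quantization = (typeof AllQuantizations)[number]
```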

@@ -1,58 +0,0 @@
import { Model } from '../model'
import { HuggingFaceRepoData, Quantization } from './huggingfaceEntity'

/**
 * Hugging Face extension for converting HF models to GGUF.
 * @extends BaseExtension
 */
export interface HuggingFaceInterface {
  interrupted: boolean
  /**
   * Downloads a Hugging Face model.
   * @param repoID - The repo ID of the model to convert.
   * @param repoData - The repo data of the model to convert.
   * @param network - Optional object to specify proxy/whether to ignore SSL certificates.
   * @returns A promise that resolves when the download is complete.
   */
  downloadModelFiles(
    repoID: string,
    repoData: HuggingFaceRepoData,
    network?: { ignoreSSL?: boolean; proxy?: string }
  ): Promise<void>

  /**
   * Converts a Hugging Face model to GGUF.
   * @param repoID - The repo ID of the model to convert.
   * @returns A promise that resolves when the conversion is complete.
   */
  convert(repoID: string): Promise<void>

  /**
   * Quantizes a GGUF model.
   * @param repoID - The repo ID of the model to quantize.
   * @param quantization - The quantization to use.
   * @returns A promise that resolves when the quantization is complete.
   */
  quantize(repoID: string, quantization: Quantization): Promise<void>

  /**
   * Generates Jan model metadata from a Hugging Face model.
   * @param repoID - The repo ID of the model to generate metadata for.
   * @param repoData - The repo data of the model to generate metadata for.
   * @param quantization - The quantization of the model.
   * @returns A promise that resolves when the model metadata generation is complete.
   */
  generateMetadata(
    repoID: string,
    repoData: HuggingFaceRepoData,
    quantization: Quantization
  ): Promise<void>

  /**
   * Cancels the conversion of the current Hugging Face model.
   * @param repoID - The repository ID to cancel.
   * @param repoData - The repository data to cancel.
   * @returns {Promise<void>} A promise that resolves when the download has been cancelled.
   */
  cancelConvert(repoID: string, repoData: HuggingFaceRepoData): Promise<void>
}
@@ -1,2 +1 @@
export * from './huggingfaceInterface'
export * from './huggingfaceEntity'

electron/icons/icon.ico (binary, new file, 21 KiB)

@@ -54,6 +54,13 @@
      "nsis"
    ]
  },
  "nsis": {
    "oneClick": true,
    "installerIcon": "icons/icon.ico",
    "uninstallerIcon": "icons/icon.ico",
    "include": "scripts/uninstaller.nsh",
    "deleteAppDataOnUninstall": true
  },
  "artifactName": "jan-${os}-${arch}-${version}.${ext}"
},
"scripts": {

electron/scripts/uninstaller.nsh (new file)
@@ -0,0 +1,18 @@
!include nsDialogs.nsh

XPStyle on

!macro customUnInstall
  ; Uninstall process execution
  ${ifNot} ${isUpdated}
    # Ask whether the default Jan data folder should be deleted as well
    MessageBox MB_OKCANCEL "Do you also want to delete the DEFAULT Jan data folder at $PROFILE\jan?" IDOK label_ok IDCANCEL label_cancel
    label_ok:
      # Delete the user data folder
      RMDir /r $PROFILE\jan
      Goto end
    label_cancel:
      Goto end
    end:
  ${endIf}
!macroend

@@ -35,7 +35,6 @@
  "@langchain/community": "0.0.13",
  "hnswlib-node": "^1.4.2",
  "langchain": "^0.0.214",
  "path-browserify": "^1.0.1",
  "pdf-parse": "^1.1.1",
  "ts-loader": "^9.5.0"
},

@@ -22,8 +22,7 @@
    "ts-loader": "^9.5.0"
  },
  "dependencies": {
    "@janhq/core": "file:../../core",
    "path-browserify": "^1.0.1"
    "@janhq/core": "file:../../core"
  },
  "engines": {
    "node": ">=18.0.0"

@@ -1,4 +1,3 @@
const path = require('path')
const webpack = require('webpack')

module.exports = {
@@ -16,15 +15,11 @@ module.exports = {
  },
  output: {
    filename: 'index.js', // Adjust the output file name as needed
    path: path.resolve(__dirname, 'dist'),
    library: { type: 'module' }, // Specify ESM output format
  },
  plugins: [new webpack.DefinePlugin({})],
  resolve: {
    extensions: ['.ts', '.js'],
    fallback: {
      path: require.resolve('path-browserify'),
    }
  },
  // Do not minify the output, otherwise it breaks the class registration
  optimization: {

extensions/huggingface-extension/.gitignore
@@ -1,3 +0,0 @@
bin
scripts/convert*
scripts/gguf-py
@@ -1,8 +0,0 @@
{
  "semi": false,
  "singleQuote": true,
  "quoteProps": "consistent",
  "trailingComma": "es5",
  "endOfLine": "auto",
  "plugins": ["prettier-plugin-tailwindcss"]
}

@@ -1,75 +0,0 @@
# Create a Jan Extension using Typescript

Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀

## Create Your Own Extension

To create your own extension, you can use this repository as a template! Just follow the instructions below:

1. Click the Use this template button at the top of the repository
2. Select Create a new repository
3. Select an owner and name for your new repository
4. Click Create repository
5. Clone your new repository

## Initial Setup

After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.

> [!NOTE]
>
> You'll need to have a reasonably modern version of
> [Node.js](https://nodejs.org) handy. If you are using a version manager like
> [`nodenv`](https://github.com/nodenv/nodenv) or
> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
> root of your repository to install the version specified in
> [`package.json`](./package.json). Otherwise, 20.x or later should work!

1. :hammer_and_wrench: Install the dependencies

   ```bash
   npm install
   ```

1. :building_construction: Package the TypeScript for distribution

   ```bash
   npm run bundle
   ```

1. :white_check_mark: Check your artifact

   There will now be a `.tgz` file in your extension directory

## Update the Extension Metadata

The [`package.json`](package.json) file defines metadata about your extension, such as
extension name, main entry, description and version.

When you copy this repository, update `package.json` with the name and description for your extension.

## Update the Extension Code

The [`src/`](./src/) directory is the heart of your extension! This contains the
source code that will be run when your extension functions are invoked. You can replace the
contents of this directory with your own code.

There are a few things to keep in mind when writing your extension code:

- Most Jan Extension functions are processed asynchronously.
  In `index.ts`, you will see that the extension function will return a `Promise<any>`.

  ```typescript
  import { events, MessageEvent, MessageRequest } from '@janhq/core'

  function onStart(): Promise<any> {
    return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
      this.inference(data)
    )
  }
  ```

  For more information about the Jan Extension Core module, see the
  [documentation](https://github.com/janhq/jan/blob/main/core/README.md).

So, what are you waiting for? Go ahead and start customizing your extension!
@@ -1,58 +0,0 @@
{
  "name": "@janhq/huggingface-extension",
  "productName": "HuggingFace",
  "version": "1.0.0",
  "description": "Hugging Face extension for converting HF models to GGUF",
  "main": "dist/index.js",
  "node": "dist/node/index.cjs.js",
  "author": "Jan <service@jan.ai>",
  "license": "AGPL-3.0",
  "scripts": {
    "build": "tsc --module commonjs && rollup -c rollup.config.ts --configPlugin @rollup/plugin-typescript --bundleConfigAsCjs",
    "download:llama": "run-script-os",
    "download:llama:linux": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz --wildcards '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
    "download:llama:darwin": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
    "download:llama:win32": "download.bat",
    "build:publish:linux": "rimraf *.tgz --glob && npm run build && npm run download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish:darwin": "rimraf *.tgz --glob && npm run build && npm run download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && ../../.github/scripts/auto-sign.sh && npm pack && cpx *.tgz ../../pre-install",
    "build:publish:win32": "rimraf *.tgz --glob && npm run build && npm run download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish": "run-script-os"
  },
  "exports": {
    ".": "./dist/index.js",
    "./main": "./dist/node/index.cjs.js"
  },
  "devDependencies": {
    "@rollup/plugin-commonjs": "^25.0.7",
    "@rollup/plugin-json": "^6.1.0",
    "@rollup/plugin-node-resolve": "^15.2.3",
    "@rollup/plugin-replace": "^5.0.5",
    "@rollup/plugin-typescript": "^11.1.6",
    "@types/node": "^20.11.16",
    "cpx": "^1.5.0",
    "download-cli": "^1.1.1",
    "rimraf": "^5.0.5",
    "rollup": "^4.9.6",
    "rollup-plugin-sourcemaps": "^0.6.3",
    "rollup-plugin-typescript2": "^0.36.0",
    "run-script-os": "^1.1.6",
    "typescript": "^5.3.3"
  },
  "dependencies": {
    "@janhq/core": "file:../../core",
    "hyllama": "^0.1.2",
    "python-shell": "^5.0.0",
    "ts-loader": "^9.5.0"
  },
  "bundledDependencies": [
    "python-shell"
  ],
  "engines": {
    "node": ">=18.0.0"
  },
  "files": [
    "dist/*",
    "package.json",
    "README.md"
  ]
}
@@ -1,72 +0,0 @@
import resolve from '@rollup/plugin-node-resolve'
import commonjs from '@rollup/plugin-commonjs'
import sourceMaps from 'rollup-plugin-sourcemaps'
import typescript from 'rollup-plugin-typescript2'
import json from '@rollup/plugin-json'
import replace from '@rollup/plugin-replace'

const packageJson = require('./package.json')

export default [
  {
    input: `src/index.ts`,
    output: [{ file: packageJson.main, format: 'es', sourcemap: true }],
    // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash')
    external: [],
    watch: {
      include: 'src/**',
    },
    plugins: [
      replace({
        preventAssignment: true,
        NODE_MODULE_PATH: JSON.stringify(
          `${packageJson.name}/${packageJson.node}`
        ),
      }),
      // Allow json resolution
      json(),
      // Compile TypeScript files
      typescript({ useTsconfigDeclarationDir: true }),
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
      // Allow node_modules resolution, so you can use 'external' to control
      // which external modules to include in the bundle
      // https://github.com/rollup/rollup-plugin-node-resolve#usage
      resolve({
        extensions: ['.js', '.ts'],
      }),

      // Resolve source maps to the original source
      sourceMaps(),
    ],
  },
  {
    input: `src/node/index.ts`,
    output: [
      { file: 'dist/node/index.cjs.js', format: 'cjs', sourcemap: true },
    ],
    // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash')
    external: [],
    watch: {
      include: 'src/node/**',
    },
    plugins: [
      // Allow json resolution
      json(),
      // Compile TypeScript files
      typescript({ useTsconfigDeclarationDir: true }),
      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
      commonjs(),
      // Allow node_modules resolution, so you can use 'external' to control
      // which external modules to include in the bundle
      // https://github.com/rollup/rollup-plugin-node-resolve#usage
      resolve({
        extensions: ['.ts', '.js', '.json'],
      }),

      // Resolve source maps to the original source
      sourceMaps(),
    ],
  },
]
@@ -1 +0,0 @@
declare const NODE_MODULE_PATH: string
@@ -1,399 +0,0 @@
import {
  fs,
  downloadFile,
  abortDownload,
  joinPath,
  HuggingFaceExtension,
  HuggingFaceRepoData,
  executeOnMain,
  Quantization,
  Model,
  InferenceEngine,
  getJanDataFolderPath,
  events,
  DownloadEvent,
  log,
  DownloadRequest,
} from '@janhq/core'

declare global {
  interface Window {
    electronAPI?: any
  }
}

/**
 * An extension for models
 */
export default class JanHuggingFaceExtension extends HuggingFaceExtension {
  private static readonly _safetensorsRegexs = [
    /model\.safetensors$/,
    /model-[0-9]+-of-[0-9]+\.safetensors$/,
  ]
  private static readonly _pytorchRegexs = [
    /pytorch_model\.bin$/,
    /consolidated\.[0-9]+\.pth$/,
    /pytorch_model-[0-9]+-of-[0-9]+\.bin$/,
    /.*\.pt$/,
  ]
  interrupted = false

  /**
   * Called when the extension is loaded.
   * @override
   */
  onLoad() {}

  /**
   * Called when the extension is unloaded.
   * @override
   */
  onUnload(): void {}

  private getFileList(repoData: HuggingFaceRepoData): string[] {
    // SafeTensors first, if not, then PyTorch
    const modelFiles = repoData.siblings
      .map((file) => file.rfilename)
      .filter((file) =>
        JanHuggingFaceExtension._safetensorsRegexs.some((regex) =>
          regex.test(file)
        )
      )
    if (modelFiles.length === 0) {
      repoData.siblings.forEach((file) => {
        if (
          JanHuggingFaceExtension._pytorchRegexs.some((regex) =>
            regex.test(file.rfilename)
          )
        ) {
          modelFiles.push(file.rfilename)
        }
      })
    }

    const vocabFiles = [
      'tokenizer.model',
      'vocab.json',
      'tokenizer.json',
    ].filter((file) =>
      repoData.siblings.some((sibling) => sibling.rfilename === file)
    )

    const etcFiles = repoData.siblings
      .map((file) => file.rfilename)
      .filter(
        (file) =>
          (file.endsWith('.json') && !vocabFiles.includes(file)) ||
          file.endsWith('.txt') ||
          file.endsWith('.py') ||
          file.endsWith('.tiktoken')
      )

    return [...modelFiles, ...vocabFiles, ...etcFiles]
  }

  private async getModelDirPath(repoID: string): Promise<string> {
    const modelName = repoID.split('/').slice(1).join('/')
    return joinPath([await getJanDataFolderPath(), 'models', modelName])
  }
  private async getConvertedModelPath(repoID: string): Promise<string> {
    const modelName = repoID.split('/').slice(1).join('/')
    const modelDirPath = await this.getModelDirPath(repoID)
    return joinPath([modelDirPath, modelName + '.gguf'])
  }
  private async getQuantizedModelPath(
    repoID: string,
    quantization: Quantization
  ): Promise<string> {
    const modelName = repoID.split('/').slice(1).join('/')
    const modelDirPath = await this.getModelDirPath(repoID)
    return joinPath([
      modelDirPath,
      modelName + `-${quantization.toLowerCase()}.gguf`,
    ])
  }
  private getCtxLength(config: {
    max_sequence_length?: number
    max_position_embeddings?: number
    n_ctx?: number
  }): number {
    if (config.max_sequence_length) return config.max_sequence_length
    if (config.max_position_embeddings) return config.max_position_embeddings
    if (config.n_ctx) return config.n_ctx
    return 4096
  }

  /**
   * Downloads a Hugging Face model.
   * @param repoID - The repo ID of the model to convert.
   * @param repoData - The repo data of the model to convert.
   * @param network - Optional object to specify proxy/whether to ignore SSL certificates.
   * @returns A promise that resolves when the download is complete.
   */
  async downloadModelFiles(
    repoID: string,
    repoData: HuggingFaceRepoData,
    network?: { ignoreSSL?: boolean; proxy?: string }
  ): Promise<void> {
    if (this.interrupted) return
    const modelDirPath = await this.getModelDirPath(repoID)
    if (!(await fs.existsSync(modelDirPath))) await fs.mkdir(modelDirPath)
    const files = this.getFileList(repoData)
    const filePaths: string[] = []

    for (const file of files) {
      const filePath = file
      const localPath = await joinPath([modelDirPath, filePath])
      const url = `https://huggingface.co/${repoID}/resolve/main/${filePath}`

      if (this.interrupted) return
      if (!(await fs.existsSync(localPath))) {
        const downloadRequest: DownloadRequest = {
          url,
          localPath,
        }
        downloadFile(downloadRequest, network)
        filePaths.push(filePath)
      }
    }

    await new Promise<void>((resolve, reject) => {
      if (filePaths.length === 0) resolve()
      const onDownloadSuccess = async ({ fileName }: { fileName: string }) => {
        if (filePaths.includes(fileName)) {
          filePaths.splice(filePaths.indexOf(fileName), 1)
          if (filePaths.length === 0) {
            events.off(DownloadEvent.onFileDownloadSuccess, onDownloadSuccess)
            events.off(DownloadEvent.onFileDownloadError, onDownloadError)
            resolve()
          }
        }
      }

      const onDownloadError = async ({
        fileName,
        error,
      }: {
        fileName: string
        error: Error
      }) => {
        if (filePaths.includes(fileName)) {
          this.cancelConvert(repoID, repoData)
          events.off(DownloadEvent.onFileDownloadSuccess, onDownloadSuccess)
          events.off(DownloadEvent.onFileDownloadError, onDownloadError)
          reject(error)
        }
      }

      events.on(DownloadEvent.onFileDownloadSuccess, onDownloadSuccess)
      events.on(DownloadEvent.onFileDownloadError, onDownloadError)
    })
  }

  /**
   * Converts a Hugging Face model to GGUF.
   * @param repoID - The repo ID of the model to convert.
   * @returns A promise that resolves when the conversion is complete.
   */
  async convert(repoID: string): Promise<void> {
    if (this.interrupted) return
    const modelDirPath = await this.getModelDirPath(repoID)
    const modelOutPath = await this.getConvertedModelPath(repoID)
    if (!(await fs.existsSync(modelDirPath))) {
      throw new Error('Model dir not found')
    }
    if (await fs.existsSync(modelOutPath)) return

    await executeOnMain(NODE_MODULE_PATH, 'installDeps')
    if (this.interrupted) return

    try {
      await executeOnMain(
        NODE_MODULE_PATH,
        'convertHf',
        modelDirPath,
        modelOutPath + '.temp'
      )
    } catch (err) {
      log(`[Conversion]::Debug: Error using hf-to-gguf.py, trying convert.py`)

      let ctx = 4096
      try {
        const config = await fs.readFileSync(
          await joinPath([modelDirPath, 'config.json']),
          'utf8'
        )
        const configParsed = JSON.parse(config)
        ctx = this.getCtxLength(configParsed)
        configParsed.max_sequence_length = ctx
        await fs.writeFileSync(
          await joinPath([modelDirPath, 'config.json']),
          JSON.stringify(configParsed, null, 2)
        )
      } catch (err) {
        log(`${err}`)
        // ignore missing config.json
      }

      const bpe = await fs.existsSync(
        await joinPath([modelDirPath, 'vocab.json'])
      )

      await executeOnMain(
        NODE_MODULE_PATH,
        'convert',
        modelDirPath,
        modelOutPath + '.temp',
        {
          ctx,
          bpe,
        }
      )
    }
    await executeOnMain(
      NODE_MODULE_PATH,
      'renameSync',
      modelOutPath + '.temp',
      modelOutPath
    )

    for (const file of await fs.readdirSync(modelDirPath)) {
      if (
        modelOutPath.endsWith(file) ||
        (file.endsWith('config.json') && !file.endsWith('_config.json'))
      )
        continue
      await fs.unlinkSync(await joinPath([modelDirPath, file]))
    }
  }

  /**
   * Quantizes a GGUF model.
   * @param repoID - The repo ID of the model to quantize.
   * @param quantization - The quantization to use.
   * @returns A promise that resolves when the quantization is complete.
   */
  async quantize(repoID: string, quantization: Quantization): Promise<void> {
    if (this.interrupted) return
    const modelDirPath = await this.getModelDirPath(repoID)
    const modelOutPath = await this.getQuantizedModelPath(repoID, quantization)
    if (!(await fs.existsSync(modelDirPath))) {
      throw new Error('Model dir not found')
    }
    if (await fs.existsSync(modelOutPath)) return

    await executeOnMain(
      NODE_MODULE_PATH,
      'quantize',
      await this.getConvertedModelPath(repoID),
      modelOutPath + '.temp',
      quantization
    )
    await executeOnMain(
      NODE_MODULE_PATH,
      'renameSync',
      modelOutPath + '.temp',
      modelOutPath
    )

    await fs.unlinkSync(await this.getConvertedModelPath(repoID))
  }

  /**
   * Generates Jan model metadata from a Hugging Face model.
   * @param repoID - The repo ID of the model to generate metadata for.
   * @param repoData - The repo data of the model to generate metadata for.
   * @param quantization - The quantization of the model.
   * @returns A promise that resolves when the model metadata generation is complete.
   */
  async generateMetadata(
    repoID: string,
    repoData: HuggingFaceRepoData,
    quantization: Quantization
  ): Promise<void> {
    const modelName = repoID.split('/').slice(1).join('/')
    const filename = `${modelName}-${quantization.toLowerCase()}.gguf`
    const modelDirPath = await this.getModelDirPath(repoID)
    const modelPath = await this.getQuantizedModelPath(repoID, quantization)
    const modelConfigPath = await joinPath([modelDirPath, 'model.json'])
    if (!(await fs.existsSync(modelPath))) {
      throw new Error('Model not found')
    }

    const size = await executeOnMain(NODE_MODULE_PATH, 'getSize', modelPath)
    let ctx = 4096
    try {
      const config = await fs.readFileSync(
        await joinPath([modelDirPath, 'config.json']),
        'utf8'
      )
      ctx = this.getCtxLength(JSON.parse(config))
      fs.unlinkSync(await joinPath([modelDirPath, 'config.json']))
    } catch (err) {
      // ignore missing config.json
    }
    // maybe later, currently it's gonna use too much memory
    // const buffer = await fs.readFileSync(quantizedModelPath)
    // const ggufData = ggufMetadata(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))

    const metadata: Model = {
      object: 'model',
      version: '1.0',
      format: 'gguf',
      sources: [
        {
          url: `https://huggingface.co/${repoID}`, // i think this is just for download but not sure,
          filename,
        },
      ],
      id: modelName,
      name: modelName,
      created: Date.now(),
      description: `Auto converted from Hugging Face model: ${repoID}`,
      settings: {
        ctx_len: ctx,
        prompt_template: '',
        llama_model_path: modelName,
      },
      parameters: {
        temperature: 0.7,
        top_p: 0.95,
        stream: true,
        max_tokens: 4096,
        // stop: [''], seems like we dont really need this..?
        frequency_penalty: 0,
        presence_penalty: 0,
      },
      metadata: {
        author: repoData.author,
        tags: repoData.tags,
        size,
      },
      engine: InferenceEngine.nitro,
    }

    await fs.writeFileSync(modelConfigPath, JSON.stringify(metadata, null, 2))
  }

  /**
   * Cancels the conversion of the current Hugging Face model.
   * @param repoID - The repository ID to cancel.
   * @param repoData - The repository data to cancel.
   * @returns {Promise<void>} A promise that resolves when the download has been cancelled.
   */
  async cancelConvert(
    repoID: string,
    repoData: HuggingFaceRepoData
  ): Promise<void> {
    this.interrupted = true
    const modelDirPath = await this.getModelDirPath(repoID)
    const files = this.getFileList(repoData)
    for (const file of files) {
      const filePath = file
      const localPath = await joinPath([modelDirPath, filePath])
      await abortDownload(localPath)
    }

    executeOnMain(NODE_MODULE_PATH, 'killProcesses')
  }
}

@@ -1,20 +0,0 @@
{
  "compilerOptions": {
    "moduleResolution": "node",
    "target": "es2020",
    "module": "ES2020",
    "lib": ["es2015", "es2016", "es2017", "dom"],
    "strict": true,
    "sourceMap": true,
    "declaration": true,
    "allowSyntheticDefaultImports": true,
    "experimentalDecorators": true,
    "emitDecoratorMetadata": true,
    "declarationDir": "dist/types",
    "outDir": "dist",
    "importHelpers": true,
    "typeRoots": ["node_modules/@types"],
    "resolveJsonModule": true,
  },
  "include": ["src"],
}
@@ -25,7 +25,6 @@
  "dependencies": {
    "@janhq/core": "file:../../core",
    "fetch-retry": "^5.0.6",
    "path-browserify": "^1.0.1",
    "ulidx": "^2.3.0"
  },
  "engines": {

@@ -1,4 +1,88 @@
[
  {
    "sources": [
      {
        "url": "https://groq.com"
      }
    ],
    "id": "llama3-70b-8192",
    "object": "model",
    "name": "Groq Llama 3 70b",
    "version": "1.0",
    "description": "Groq Llama 3 70b with supercharged speed!",
    "format": "api",
    "settings": {
      "text_model": false
    },
    "parameters": {
      "max_tokens": 8192,
      "temperature": 0.7,
      "top_p": 1,
      "stop": null,
      "stream": true
    },
    "metadata": {
      "author": "Meta",
      "tags": ["General", "Big Context Length"]
    },
    "engine": "groq"
  },
  {
    "sources": [
      {
        "url": "https://groq.com"
      }
    ],
    "id": "llama3-8b-8192",
    "object": "model",
    "name": "Groq Llama 3 8b",
    "version": "1.0",
    "description": "Groq Llama 3 8b with supercharged speed!",
    "format": "api",
    "settings": {
      "text_model": false
    },
    "parameters": {
      "max_tokens": 8192,
      "temperature": 0.7,
      "top_p": 1,
      "stop": null,
      "stream": true
    },
    "metadata": {
      "author": "Meta",
      "tags": ["General", "Big Context Length"]
    },
    "engine": "groq"
  },
  {
    "sources": [
      {
        "url": "https://groq.com"
      }
    ],
    "id": "gemma-7b-it",
    "object": "model",
    "name": "Groq Gemma 7b Instruct",
    "version": "1.0",
    "description": "Groq Gemma 7b Instruct with supercharged speed!",
    "format": "api",
    "settings": {
      "text_model": false
    },
    "parameters": {
      "max_tokens": 4096,
      "temperature": 0.7,
      "top_p": 1,
      "stop": null,
      "stream": true
    },
    "metadata": {
      "author": "Google",
      "tags": ["General"]
    },
    "engine": "groq"
  },
  {
    "sources": [
      {
@@ -1,4 +1,3 @@
const path = require('path')
const webpack = require('webpack')
const packageJson = require('./package.json')
const settingJson = require('./resources/settings.json')
@@ -26,14 +25,10 @@ module.exports = {
  ],
  output: {
    filename: 'index.js', // Adjust the output file name as needed
    path: path.resolve(__dirname, 'dist'),
    library: { type: 'module' }, // Specify ESM output format
  },
  resolve: {
    extensions: ['.ts', '.js'],
    fallback: {
      path: require.resolve('path-browserify'),
    },
  },
  optimization: {
    minimize: false,

@@ -1 +1 @@
0.3.16-hotfix
0.3.22

@@ -1,7 +1,7 @@
{
  "name": "@janhq/inference-nitro-extension",
  "productName": "Nitro Inference Engine",
  "version": "1.0.1",
  "version": "1.0.2",
  "description": "This extension embeds Nitro, a lightweight (3mb) inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
  "main": "dist/index.js",
  "node": "dist/node/index.cjs.js",
@@ -51,7 +51,6 @@
  "@janhq/core": "file:../../core",
  "decompress": "^4.2.1",
  "fetch-retry": "^5.0.6",
  "path-browserify": "^1.0.1",
  "rxjs": "^7.8.1",
  "tcp-port-used": "^1.0.2",
  "terminate": "^2.6.1",

@@ -27,7 +27,7 @@
    "min": 0,
    "max": 4096,
    "step": 128,
    "value": 4096
    "value": 2048
    }
  }
]

@@ -8,7 +8,7 @@
  "id": "command-r-34b",
  "object": "model",
  "name": "Command-R v01 34B Q4",
  "version": "1.1",
  "version": "1.2",
  "description": "C4AI Command-R developed by CohereAI is optimized for a variety of use cases including reasoning, summarization, and question answering.",
  "format": "gguf",
  "settings": {
@@ -27,7 +27,7 @@
  },
  "metadata": {
    "author": "CohereAI",
    "tags": ["34B", "Finetuned", "Coming Soon", "Unavailable"],
    "tags": ["34B", "Finetuned"],
    "size": 21500000000
  },
  "engine": "nitro"

@@ -0,0 +1,34 @@
{
  "sources": [
    {
      "filename": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
      "url": "https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
    }
  ],
  "id": "llama3-8b-instruct",
  "object": "model",
  "name": "Llama 3 8B Q4",
  "version": "1.0",
  "description": "Meta's Llama 3 excels at general usage situations, including chat, general world knowledge, and coding.",
  "format": "gguf",
  "settings": {
    "ctx_len": 8192,
    "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
    "llama_model_path": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
  },
  "parameters": {
    "temperature": 0.7,
    "top_p": 0.95,
    "stream": true,
    "max_tokens": 4096,
    "stop": ["<|end_of_text|>", "<|eot_id|>"],
    "frequency_penalty": 0,
    "presence_penalty": 0
  },
  "metadata": {
    "author": "MetaAI",
    "tags": ["7B", "Featured"],
    "size": 4920000000
  },
  "engine": "nitro"
}
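For context, engines that consume this manifest substitute `{system_message}` and `{prompt}` into `prompt_template` before inference; a minimal illustration of that substitution (a hypothetical helper, not part of this PR):

```typescript
// Sketch: fill a Jan-style prompt template with the system message and user prompt.
function applyPromptTemplate(
  template: string,
  systemMessage: string,
  prompt: string
): string {
  return template
    .replace('{system_message}', systemMessage)
    .replace('{prompt}', prompt)
}
```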

@@ -1,14 +1,14 @@
{
  "sources": [
    {
      "filename": "phind-codellama-34b-v2.Q4_K_M.gguf",
      "filename": "phind-codellama-34b-v2.Q5_K_M.gguf",
      "url": "https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v2-GGUF/resolve/main/phind-codellama-34b-v2.Q5_K_M.gguf"
    }
  ],
  "id": "phind-34b",
  "object": "model",
  "name": "Phind 34B Q4",
  "version": "1.0",
  "version": "1.1",
  "description": "Phind 34B is the best Open-source coding model.",
  "format": "gguf",
  "settings": {

@@ -36,6 +36,7 @@ const trinityv127bJson = require('./resources/models/trinity-v1.2-7b/model.json'
const vistral7bJson = require('./resources/models/vistral-7b/model.json')
const wizardcoder13bJson = require('./resources/models/wizardcoder-13b/model.json')
const yi34bJson = require('./resources/models/yi-34b/model.json')
const llama3Json = require('./resources/models/llama3-8b-instruct/model.json')

export default [
  {
@@ -79,6 +80,7 @@ export default [
      vistral7bJson,
      wizardcoder13bJson,
      yi34bJson,
      llama3Json
    ]),
    NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
    DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson),

@@ -26,7 +26,6 @@
  "dependencies": {
    "@janhq/core": "file:../../core",
    "fetch-retry": "^5.0.6",
    "path-browserify": "^1.0.1",
    "ulidx": "^2.3.0"
  },
  "engines": {

@@ -1,4 +1,3 @@
const path = require('path')
const webpack = require('webpack')
const packageJson = require('./package.json')
const settingJson = require('./resources/settings.json')
@@ -26,14 +25,10 @@ module.exports = {
  ],
  output: {
    filename: 'index.js', // Adjust the output file name as needed
    path: path.resolve(__dirname, 'dist'),
    library: { type: 'module' }, // Specify ESM output format
  },
  resolve: {
    extensions: ['.ts', '.js'],
    fallback: {
      path: require.resolve('path-browserify'),
    },
  },
  optimization: {
    minimize: false,

@@ -26,7 +26,6 @@
  "dependencies": {
    "@janhq/core": "file:../../core",
    "fetch-retry": "^5.0.6",
    "path-browserify": "^1.0.1",
    "rxjs": "^7.8.1",
    "ulidx": "^2.3.0"
  },

@@ -1,4 +1,3 @@
const path = require('path')
const webpack = require('webpack')
const packageJson = require('./package.json')
const settingJson = require('./resources/settings.json')
@@ -24,14 +23,10 @@ module.exports = {
  ],
  output: {
    filename: 'index.js', // Adjust the output file name as needed
    path: path.resolve(__dirname, 'dist'),
    library: { type: 'module' }, // Specify ESM output format
  },
  resolve: {
    extensions: ['.ts', '.js'],
    fallback: {
      path: require.resolve('path-browserify'),
    },
  },
  optimization: {
    minimize: false,
@ -4,15 +4,23 @@
  "version": "1.0.30",
  "description": "Model Management Extension provides model exploration and seamless downloads",
  "main": "dist/index.js",
  "module": "dist/module.js",
  "node": "dist/node/index.cjs.js",
  "author": "Jan <service@jan.ai>",
  "license": "AGPL-3.0",
  "scripts": {
    "build": "rollup -c rollup.config.ts",
    "build:publish": "rimraf *.tgz --glob && yarn build && npm pack && cpx *.tgz ../../pre-install"
    "build": "tsc --module commonjs && rollup -c rollup.config.ts --configPlugin @rollup/plugin-typescript --bundleConfigAsCjs",
    "download:llama": "run-script-os",
    "download:llama:linux": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz --wildcards '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
    "download:llama:darwin": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
    "download:llama:win32": "download.bat",
    "build:publish:linux": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish:darwin": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && ../../.github/scripts/auto-sign.sh && npm pack && cpx *.tgz ../../pre-install",
    "build:publish:win32": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
    "build:publish": "run-script-os"
  },
  "devDependencies": {
    "cpx": "^1.5.0",
    "download-cli": "^1.1.1",
    "rimraf": "^3.0.2",
    "ts-loader": "^9.5.0",
    "typescript": "5.3.3",
@ -20,6 +28,7 @@
    "@rollup/plugin-json": "^6.1.0",
    "@rollup/plugin-node-resolve": "^15.2.3",
    "@rollup/plugin-replace": "^5.0.5",
    "@rollup/plugin-typescript": "^11.1.6",
    "@types/pdf-parse": "^1.1.4",
    "rollup": "^2.38.5",
    "rollup-plugin-define": "^1.0.1",
@ -33,6 +42,7 @@
  ],
  "dependencies": {
    "@janhq/core": "file:../../core",
    "path-browserify": "^1.0.1"
    "@huggingface/gguf": "^0.0.11",
    "python-shell": "^5.0.0"
  }
}
@ -13,7 +13,7 @@
  "created": 0,
  "description": "User self import model",
  "settings": {
    "ctx_len": 4096,
    "ctx_len": 2048,
    "embedding": false,
    "prompt_template": "{system_message}\n### Instruction: {prompt}\n### Response:",
    "llama_model_path": "N/A"
@ -20,10 +20,7 @@ export default [
    replace({
      preventAssignment: true,
      DEFAULT_MODEL: JSON.stringify(defaultModelJson),
      MODULE_PATH: JSON.stringify(
        `${packageJson.name}/${packageJson.module}`
      ),
      VERSION: JSON.stringify(packageJson.version),
      NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
    }),
    // Allow json resolution
    json(),
1720
extensions/model-extension/scripts/convert-hf-to-gguf.py
Executable file
1478
extensions/model-extension/scripts/convert.py
Executable file
21
extensions/model-extension/scripts/gguf-py/LICENSE
Normal file
@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Georgi Gerganov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
81
extensions/model-extension/scripts/gguf-py/README.md
Normal file
@ -0,0 +1,81 @@
## gguf

This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
(GGML Universal File) format.

See [convert-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py)
as an example for its usage.

## Installation
```sh
pip install gguf
```

## API Examples/Simple Tools

[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.

[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console.

[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key.

[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files.

## Development
Maintainers who participate in development of this package are advised to install it in editable mode:

```sh
cd /path/to/llama.cpp/gguf-py

pip install --editable .
```

**Note**: This may require upgrading your Pip installation, with a message saying that editable installation currently requires `setup.py`.
In this case, upgrade Pip to the latest:

```sh
pip install --upgrade pip
```

## Automatic publishing with CI

There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.

1. Bump the version in `pyproject.toml`.
2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.

```sh
git tag -a gguf-v1.0.0 -m "Version 1.0 release"
```

3. Push the tags.

```sh
git push origin --tags
```

## Manual publishing
If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:

```sh
pip install build twine
```

Then, follow these steps to release a new version:

1. Bump the version in `pyproject.toml`.
2. Build the package:

```sh
python -m build
```

3. Upload the generated distribution archives:

```sh
python -m twine upload dist/*
```

## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package.
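For orientation, here is a hedged Python sketch of what the `gguf-dump.py` flow described above roughly boils down to, using the reader API added later in this diff. The file name `model.gguf` is a placeholder, and arrays are abbreviated to their element count; this is a sketch, not the script's actual implementation.

```python
# Rough metadata dump using the GGUFReader added in this diff.
# 'model.gguf' is a placeholder path for any existing GGUF file.
from gguf import GGUFReader, GGUFValueType

reader = GGUFReader('model.gguf')
for name, field in reader.fields.items():
    if field.types[:1] == [GGUFValueType.STRING]:
        # String fields store the raw bytes in the part indexed by data[0].
        value = bytes(field.parts[field.data[0]]).decode('utf-8')
    elif field.types[:1] == [GGUFValueType.ARRAY]:
        # Arrays carry one data index per element; just report the count here.
        value = f'[{len(field.data)} x {field.types[-1].name}]'
    else:
        # Scalars keep a one-element numpy view at data[0].
        value = field.parts[field.data[0]][0]
    print(f'{name}: {value}')

for tensor in reader.tensors:
    print(tensor.name, tensor.tensor_type.name, tensor.shape, tensor.n_bytes)
```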
40
extensions/model-extension/scripts/gguf-py/examples/writer.py
Executable file
@ -0,0 +1,40 @@
#!/usr/bin/env python3
import sys
from pathlib import Path

import numpy as np

# Necessary to load the local gguf package
sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf import GGUFWriter  # noqa: E402


# Example usage:
def writer_example() -> None:
    # Example usage with a file
    gguf_writer = GGUFWriter("example.gguf", "llama")

    gguf_writer.add_architecture()
    gguf_writer.add_block_count(12)
    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
    gguf_writer.add_custom_alignment(64)

    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
    tensor3 = np.ones((96,), dtype=np.float32) * 102.0

    gguf_writer.add_tensor("tensor1", tensor1)
    gguf_writer.add_tensor("tensor2", tensor2)
    gguf_writer.add_tensor("tensor3", tensor3)

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()


if __name__ == '__main__':
    writer_example()
@ -0,0 +1,5 @@
from .constants import *
from .gguf_reader import *
from .gguf_writer import *
from .tensor_mapping import *
from .vocab import *
665
extensions/model-extension/scripts/gguf-py/gguf/constants.py
Normal file
@ -0,0 +1,665 @@
from __future__ import annotations

import sys
from enum import Enum, IntEnum, auto
from typing import Any

#
# constants
#

GGUF_MAGIC = 0x46554747  # "GGUF"
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32

#
# metadata keys
#


class Keys:
    class General:
        ARCHITECTURE = "general.architecture"
        QUANTIZATION_VERSION = "general.quantization_version"
        ALIGNMENT = "general.alignment"
        NAME = "general.name"
        AUTHOR = "general.author"
        URL = "general.url"
        DESCRIPTION = "general.description"
        LICENSE = "general.license"
        SOURCE_URL = "general.source.url"
        SOURCE_HF_REPO = "general.source.huggingface.repository"
        FILE_TYPE = "general.file_type"

    class LLM:
        CONTEXT_LENGTH = "{arch}.context_length"
        EMBEDDING_LENGTH = "{arch}.embedding_length"
        BLOCK_COUNT = "{arch}.block_count"
        FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
        USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
        TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
        EXPERT_COUNT = "{arch}.expert_count"
        EXPERT_USED_COUNT = "{arch}.expert_used_count"

    class Attention:
        HEAD_COUNT = "{arch}.attention.head_count"
        HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
        MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
        CLAMP_KQV = "{arch}.attention.clamp_kqv"
        KEY_LENGTH = "{arch}.attention.key_length"
        VALUE_LENGTH = "{arch}.attention.value_length"
        LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
        LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

    class Rope:
        DIMENSION_COUNT = "{arch}.rope.dimension_count"
        FREQ_BASE = "{arch}.rope.freq_base"
        SCALING_TYPE = "{arch}.rope.scaling.type"
        SCALING_FACTOR = "{arch}.rope.scaling.factor"
        SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
        SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"

    class Tokenizer:
        MODEL = "tokenizer.ggml.model"
        LIST = "tokenizer.ggml.tokens"
        TOKEN_TYPE = "tokenizer.ggml.token_type"
        SCORES = "tokenizer.ggml.scores"
        MERGES = "tokenizer.ggml.merges"
        BOS_ID = "tokenizer.ggml.bos_token_id"
        EOS_ID = "tokenizer.ggml.eos_token_id"
        UNK_ID = "tokenizer.ggml.unknown_token_id"
        SEP_ID = "tokenizer.ggml.seperator_token_id"
        PAD_ID = "tokenizer.ggml.padding_token_id"
        ADD_BOS = "tokenizer.ggml.add_bos_token"
        ADD_EOS = "tokenizer.ggml.add_eos_token"
        ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
        HF_JSON = "tokenizer.huggingface.json"
        RWKV = "tokenizer.rwkv.world"
        CHAT_TEMPLATE = "tokenizer.chat_template"


#
# recommended mapping of model tensor names for storage in gguf
#


class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    FALCON = auto()
    BAICHUAN = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()
    STARCODER = auto()
    PERSIMMON = auto()
    REFACT = auto()
    BERT = auto()
    BLOOM = auto()
    STABLELM = auto()
    QWEN = auto()
    QWEN2 = auto()
    PHI2 = auto()
    PLAMO = auto()
    CODESHELL = auto()
    ORION = auto()
    INTERNLM2 = auto()
    MINICPM = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    TOKEN_EMBD_NORM = auto()
    TOKEN_TYPES = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE_INP = auto()
    FFN_NORM = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_ACT = auto()
    FFN_GATE_EXP = auto()
    FFN_DOWN_EXP = auto()
    FFN_UP_EXP = auto()
    ATTN_Q_NORM = auto()
    ATTN_K_NORM = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLAMA: "llama",
    MODEL_ARCH.FALCON: "falcon",
    MODEL_ARCH.BAICHUAN: "baichuan",
    MODEL_ARCH.GPT2: "gpt2",
    MODEL_ARCH.GPTJ: "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT: "mpt",
    MODEL_ARCH.STARCODER: "starcoder",
    MODEL_ARCH.PERSIMMON: "persimmon",
    MODEL_ARCH.REFACT: "refact",
    MODEL_ARCH.BERT: "bert",
    MODEL_ARCH.BLOOM: "bloom",
    MODEL_ARCH.STABLELM: "stablelm",
    MODEL_ARCH.QWEN: "qwen",
    MODEL_ARCH.QWEN2: "qwen2",
    MODEL_ARCH.PHI2: "phi2",
    MODEL_ARCH.PLAMO: "plamo",
    MODEL_ARCH.CODESHELL: "codeshell",
    MODEL_ARCH.ORION: "orion",
    MODEL_ARCH.INTERNLM2: "internlm2",
    MODEL_ARCH.MINICPM: "minicpm",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
    MODEL_TENSOR.TOKEN_TYPES: "token_types",
    MODEL_TENSOR.POS_EMBD: "position_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT: "output",
    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
    MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
    MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
    MODEL_ARCH.GPTNEOX: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.FALCON: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_NORM_2,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BAICHUAN: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.STARCODER: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BERT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_TYPES,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.MPT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_ACT,
    ],
    MODEL_ARCH.GPTJ: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.PERSIMMON: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.ATTN_Q_NORM,
        MODEL_TENSOR.ATTN_K_NORM,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.REFACT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BLOOM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_EMBD_NORM,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.STABLELM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.QWEN: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.QWEN2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.PLAMO: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.GPT2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.PHI2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.ORION: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.INTERNLM2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.MINICPM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
    # TODO
}

# tensors that will not be serialized
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.BAICHUAN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.PERSIMMON: [
        MODEL_TENSOR.ROPE_FREQS,
    ],
    MODEL_ARCH.QWEN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.ORION: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}

#
# types
#


class TokenType(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


class RopeScalingType(Enum):
    NONE = 'none'
    LINEAR = 'linear'
    YARN = 'yarn'


class GGMLQuantizationType(IntEnum):
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15


class GGUFEndian(IntEnum):
    LITTLE = 0
    BIG = 1


class GGUFValueType(IntEnum):
    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9
    UINT64 = 10
    INT64 = 11
    FLOAT64 = 12

    @staticmethod
    def get_type(val: Any) -> GGUFValueType:
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        elif isinstance(val, list):
            return GGUFValueType.ARRAY
        elif isinstance(val, float):
            return GGUFValueType.FLOAT32
        elif isinstance(val, bool):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        # TODO: need help with 64-bit types in Python
        else:
            print("Unknown type:", type(val))
            sys.exit()


# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
    GGMLQuantizationType.F32: (1, 4),
    GGMLQuantizationType.F16: (1, 2),
    GGMLQuantizationType.Q4_0: (32, 2 + 16),
    GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
    GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
    GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
    GGMLQuantizationType.Q8_0: (32, 2 + 32),
    GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
    GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
    GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
}


# Aliases for backward compatibility.

# general
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
KEY_GENERAL_NAME = Keys.General.NAME
KEY_GENERAL_AUTHOR = Keys.General.AUTHOR
KEY_GENERAL_URL = Keys.General.URL
KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
KEY_GENERAL_LICENSE = Keys.General.LICENSE
KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE

# LLM
KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT

# attention
KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV
KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS

# RoPE
KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED

# tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
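The `GGML_QUANT_SIZES` table above is what makes tensor sizes computable without parsing the data itself: each entry is `(block size, type size)`, meaning every block of `block size` elements packs into `type size` bytes. A quick hedged sanity check of the arithmetic that `gguf_reader.py` below applies per tensor; the 4096x4096 shape is an arbitrary example, not taken from any real model:

```python
# Minimal sketch using the constants above; shape is an arbitrary example.
from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

n_elems = 4096 * 4096
block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q4_K]  # (256, 144)
n_bytes = n_elems * type_size // block_size  # each 256-element block packs into 144 bytes
print(n_bytes)  # 9437184 bytes, i.e. 144*8/256 = 4.5 bits per weight on average
```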
15
extensions/model-extension/scripts/gguf-py/gguf/gguf.py
Normal file
@ -0,0 +1,15 @@
# This file left for compatibility. If you want to use the GGUF API from Python
# then don't import gguf/gguf.py directly. If you're looking for examples, see the
# examples/ directory for gguf-py

import importlib
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

# Compatibility for people trying to import gguf/gguf.py directly instead of as a package.
importlib.invalidate_caches()
import gguf  # noqa: E402

importlib.reload(gguf)
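The shim above exists only so that legacy `gguf/gguf.py` imports keep working; new code should import the package itself, which re-exports the reader, writer, and constants via `gguf/__init__.py`. A minimal sketch of the intended style (the output path is a placeholder):

```python
# Preferred import style: the package, not gguf/gguf.py directly.
import gguf

writer = gguf.GGUFWriter('out.gguf', 'llama')  # 'out.gguf' is a placeholder path
writer.close()
```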
264
extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py
Normal file
@ -0,0 +1,264 @@
#
# GGUF file reading/modification support. For API usage information,
# please see the files scripts/ for some fairly simple examples.
#
from __future__ import annotations

import os
from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union

import numpy as np
import numpy.typing as npt

if __name__ == "__main__":
    import sys
    from pathlib import Path

    # Allow running file in package as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf.constants import (
    GGML_QUANT_SIZES,
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,
    GGMLQuantizationType,
    GGUFValueType,
)


READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]


class ReaderField(NamedTuple):
    # Offset to start of this field.
    offset: int

    # Name of the field (not necessarily from file data).
    name: str

    # Data parts. Some types have multiple components, such as strings
    # that consist of a length followed by the string data.
    parts: list[npt.NDArray[Any]] = []

    # Indexes into parts that we can call the actual data. For example
    # an array of strings will be populated with indexes to the actual
    # string data.
    data: list[int] = [-1]

    types: list[GGUFValueType] = []


class ReaderTensor(NamedTuple):
    name: str
    tensor_type: GGMLQuantizationType
    shape: npt.NDArray[np.uint32]
    n_elements: int
    n_bytes: int
    data_offset: int
    data: npt.NDArray[Any]
    field: ReaderField


class GGUFReader:
    # I - same as host, S - swapped
    byte_order: Literal['I' | 'S'] = 'I'
    alignment: int = GGUF_DEFAULT_ALIGNMENT

    # Note: Internal helper, API may change.
    gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
        GGUFValueType.UINT8: np.uint8,
        GGUFValueType.INT8: np.int8,
        GGUFValueType.UINT16: np.uint16,
        GGUFValueType.INT16: np.int16,
        GGUFValueType.UINT32: np.uint32,
        GGUFValueType.INT32: np.int32,
        GGUFValueType.FLOAT32: np.float32,
        GGUFValueType.UINT64: np.uint64,
        GGUFValueType.INT64: np.int64,
        GGUFValueType.FLOAT64: np.float64,
        GGUFValueType.BOOL: np.bool_,
    }

    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
        self.data = np.memmap(path, mode = mode)
        offs = 0
        if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
            raise ValueError('GGUF magic invalid')
        offs += 4
        temp_version = self._get(offs, np.uint32)
        if temp_version[0] & 65535 == 0:
            # If we get 0 here that means it's (probably) a GGUF file created for
            # the opposite byte order of the machine this script is running on.
            self.byte_order = 'S'
            temp_version = temp_version.newbyteorder(self.byte_order)
        version = temp_version[0]
        if version not in READER_SUPPORTED_VERSIONS:
            raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle')
        self.fields: OrderedDict[str, ReaderField] = OrderedDict()
        self.tensors: list[ReaderTensor] = []
        offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
        temp_counts = self._get(offs, np.uint64, 2)
        offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
        offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
        tensor_count, kv_count = temp_counts
        offs = self._build_fields(offs, kv_count)
        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
        new_align = self.fields.get('general.alignment')
        if new_align is not None:
            if new_align.types != [GGUFValueType.UINT32]:
                raise ValueError('Bad type for general.alignment field')
            self.alignment = new_align.parts[-1][0]
        padding = offs % self.alignment
        if padding != 0:
            offs += self.alignment - padding
        self._build_tensors(offs, tensors_fields)

    _DT = TypeVar('_DT', bound = npt.DTypeLike)

    # Fetch a key/value metadata field by key.
    def get_field(self, key: str) -> Union[ReaderField, None]:
        return self.fields.get(key, None)

    # Fetch a tensor from the list by index.
    def get_tensor(self, idx: int) -> ReaderTensor:
        return self.tensors[idx]

    def _get(
        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
    ) -> npt.NDArray[Any]:
        count = int(count)
        itemsize = int(np.empty([], dtype = dtype).itemsize)
        end_offs = offset + itemsize * count
        return (
            self.data[offset:end_offs]
            .view(dtype = dtype)[:count]
            .newbyteorder(override_order or self.byte_order)
        )

    def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
        if field.name in self.fields:
            raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
        self.fields[field.name] = field
        return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)

    def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
        slen = self._get(offset, np.uint64)
        return slen, self._get(offset + 8, np.uint8, slen[0])

    def _get_field_parts(
        self, orig_offs: int, raw_type: int,
    ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]:
        offs = orig_offs
        types: list[GGUFValueType] = []
        gtype = GGUFValueType(raw_type)
        types.append(gtype)
        # Handle strings.
        if gtype == GGUFValueType.STRING:
            sparts: list[npt.NDArray[Any]] = list(self._get_str(offs))
            size = sum(int(part.nbytes) for part in sparts)
            return size, sparts, [1], types
        # Check if it's a simple scalar type.
        nptype = self.gguf_scalar_to_np.get(gtype)
        if nptype is not None:
            val = self._get(offs, nptype)
            return int(val.nbytes), [val], [0], types
        # Handle arrays.
        if gtype == GGUFValueType.ARRAY:
            raw_itype = self._get(offs, np.uint32)
            offs += int(raw_itype.nbytes)
            alen = self._get(offs, np.uint64)
            offs += int(alen.nbytes)
            aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
            data_idxs: list[int] = []
            for idx in range(alen[0]):
                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0])
                if idx == 0:
                    types += curr_types
                idxs_offs = len(aparts)
                aparts += curr_parts
                data_idxs += (idx + idxs_offs for idx in curr_idxs)
                offs += curr_size
            return offs - orig_offs, aparts, data_idxs, types
        # We can't deal with this one.
        raise ValueError(f'Unknown/unhandled field type {gtype}')

    def _get_tensor(self, orig_offs: int) -> ReaderField:
        offs = orig_offs
        name_len, name_data = self._get_str(offs)
        offs += int(name_len.nbytes + name_data.nbytes)
        n_dims = self._get(offs, np.uint32)
        offs += int(n_dims.nbytes)
        dims = self._get(offs, np.uint64, n_dims[0])
        offs += int(dims.nbytes)
        raw_dtype = self._get(offs, np.uint32)
        offs += int(raw_dtype.nbytes)
        offset_tensor = self._get(offs, np.uint64)
        offs += int(offset_tensor.nbytes)
        return ReaderField(
            orig_offs,
            str(bytes(name_data), encoding = 'utf-8'),
            [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor],
            [1, 3, 4, 5],
        )

    def _build_fields(self, offs: int, count: int) -> int:
        for _ in range(count):
            orig_offs = offs
            kv_klen, kv_kdata = self._get_str(offs)
            offs += int(kv_klen.nbytes + kv_kdata.nbytes)
            raw_kv_type = self._get(offs, np.uint32)
            offs += int(raw_kv_type.nbytes)
            parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type]
            idxs_offs = len(parts)
            field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0])
            parts += field_parts
            self._push_field(ReaderField(
                orig_offs,
                str(bytes(kv_kdata), encoding = 'utf-8'),
                parts,
                [idx + idxs_offs for idx in field_idxs],
                field_types,
            ), skip_sum = True)
            offs += field_size
        return offs

    def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
        tensor_fields = []
        for _ in range(count):
            field = self._get_tensor(offs)
            offs += sum(int(part.nbytes) for part in field.parts)
            tensor_fields.append(field)
        return offs, tensor_fields

    def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
        tensors = []
        for field in fields:
            _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
            ggml_type = GGMLQuantizationType(raw_dtype[0])
            n_elems = np.prod(dims)
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            data_offs = int(start_offs + offset_tensor[0])
            item_type: npt.DTypeLike
            if ggml_type == GGMLQuantizationType.F32:
                item_count = n_elems
                item_type = np.float32
            elif ggml_type == GGMLQuantizationType.F16:
                item_count = n_elems
                item_type = np.float16
            else:
                item_count = n_bytes
                item_type = np.uint8
            tensors.append(ReaderTensor(
                name = str(bytes(name_data), encoding = 'utf-8'),
                tensor_type = ggml_type,
                shape = dims,
                n_elements = n_elems,
                n_bytes = n_bytes,
                data_offset = data_offs,
                data = self._get(data_offs, item_type, item_count),
                field = field,
            ))
        self.tensors = tensors
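Because the reader memory-maps the file, opening it with `mode='r+'` makes simple in-place metadata edits possible; this is roughly how the `gguf-set-metadata.py` script mentioned earlier works. A hedged sketch, where the file name, field key, and new value are all just examples:

```python
# Sketch of an in-place scalar edit; assumes model.gguf exists and carries a
# llama.context_length field. Mirrors the idea behind scripts/gguf-set-metadata.py.
from gguf import GGUFReader

reader = GGUFReader('model.gguf', 'r+')  # 'r+' maps the file writably
field = reader.get_field('llama.context_length')
if field is None:
    raise SystemExit('no such field')
# For a scalar field, data[0] indexes the one-element numpy view into the
# mapped file, so assigning through it writes straight back to disk.
field.parts[field.data[0]][0] = 2048
reader.data.flush()  # flush the memmap explicitly
```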
427
extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py
Normal file
@ -0,0 +1,427 @@
from __future__ import annotations

import os
import shutil
import struct
import tempfile
from enum import Enum, auto
from io import BufferedWriter
from typing import IO, Any, Sequence

import numpy as np

from .constants import (
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,
    GGMLQuantizationType,
    GGUFEndian,
    GGUFValueType,
    Keys,
    RopeScalingType,
    TokenType,
)


class WriterState(Enum):
    EMPTY = auto()
    HEADER = auto()
    KV_DATA = auto()
    TI_DATA = auto()


class GGUFWriter:
    fout: BufferedWriter
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
    tensors: list[np.ndarray[Any, Any]]
    _simple_value_packing = {
        GGUFValueType.UINT8: "B",
        GGUFValueType.INT8: "b",
        GGUFValueType.UINT16: "H",
        GGUFValueType.INT16: "h",
        GGUFValueType.UINT32: "I",
        GGUFValueType.INT32: "i",
        GGUFValueType.FLOAT32: "f",
        GGUFValueType.UINT64: "Q",
        GGUFValueType.INT64: "q",
        GGUFValueType.FLOAT64: "d",
        GGUFValueType.BOOL: "?",
    }

    def __init__(
        self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True,
        endianess: GGUFEndian = GGUFEndian.LITTLE,
    ):
        self.fout = open(path, "wb")
        self.arch = arch
        self.endianess = endianess
        self.offset_tensor = 0
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = bytearray()
        self.kv_data_count = 0
        self.ti_data = bytearray()
        self.ti_data_count = 0
        self.use_temp_file = use_temp_file
        self.temp_file = None
        self.tensors = []
        print("gguf: This GGUF file is for {0} Endian only".format(
            "Big" if self.endianess == GGUFEndian.BIG else "Little",
        ))
        self.state = WriterState.EMPTY

        self.add_architecture()

    def write_header_to_file(self) -> None:
        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

        self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
        self._write_packed("I", GGUF_VERSION)
        self._write_packed("Q", self.ti_data_count)
        self._write_packed("Q", self.kv_data_count)
        self.flush()
        self.state = WriterState.HEADER

    def write_kv_data_to_file(self) -> None:
        if self.state is not WriterState.HEADER:
            raise ValueError(f'Expected output file to contain the header, got {self.state}')

        self.fout.write(self.kv_data)
        self.flush()
        self.state = WriterState.KV_DATA

    def write_ti_data_to_file(self) -> None:
        if self.state is not WriterState.KV_DATA:
            raise ValueError(f'Expected output file to contain KV data, got {self.state}')

        self.fout.write(self.ti_data)
        self.flush()
        self.state = WriterState.TI_DATA

    def add_key(self, key: str) -> None:
        self.add_val(key, GGUFValueType.STRING, add_vtype=False)

    def add_uint8(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT8)

    def add_int8(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT8)

    def add_uint16(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT16)

    def add_int16(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT16)

    def add_uint32(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT32)

    def add_int32(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT32)

    def add_float32(self, key: str, val: float) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT32)

    def add_uint64(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT64)

    def add_int64(self, key: str, val: int) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT64)

    def add_float64(self, key: str, val: float) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT64)

    def add_bool(self, key: str, val: bool) -> None:
        self.add_key(key)
        self.add_val(val, GGUFValueType.BOOL)

    def add_string(self, key: str, val: str) -> None:
        if not val:
            return
        self.add_key(key)
        self.add_val(val, GGUFValueType.STRING)

    def add_array(self, key: str, val: Sequence[Any]) -> None:
        if not isinstance(val, Sequence):
            raise ValueError("Value must be a sequence for array type")

        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)

    def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None:
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

        if add_vtype:
            self.kv_data += self._pack("I", vtype)
            self.kv_data_count += 1

        pack_fmt = self._simple_value_packing.get(vtype)
        if pack_fmt is not None:
            self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += self._pack("Q", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
            ltype = GGUFValueType.get_type(val[0])
            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                raise ValueError("All items in a GGUF array should be of the same type")
            self.kv_data += self._pack("I", ltype)
            self.kv_data += self._pack("Q", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type or value")

    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n

    def add_tensor_info(
        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32],
        tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
            raise ValueError("Only F32 and F16 tensors are supported for now")

        encoded_name = name.encode("utf8")
        self.ti_data += self._pack("Q", len(encoded_name))
        self.ti_data += encoded_name
        n_dims = len(tensor_shape)
        self.ti_data += self._pack("I", n_dims)
        for i in range(n_dims):
            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
            dtype = raw_dtype
        self.ti_data += self._pack("I", dtype)
        self.ti_data += self._pack("Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1

    def add_tensor(
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
        raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        if self.use_temp_file and self.temp_file is None:
            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
            fp.seek(0)
            self.temp_file = fp

        shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)

        if self.temp_file is None:
            self.tensors.append(tensor)
            return

        tensor.tofile(self.temp_file)
        self.write_padding(self.temp_file, tensor.nbytes)

    def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None:
        pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
        if pad != 0:
            fp.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
        if self.state is not WriterState.TI_DATA:
            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')

        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        self.write_padding(self.fout, self.fout.tell())
        tensor.tofile(self.fout)
        self.write_padding(self.fout, tensor.nbytes)

    def write_tensors_to_file(self) -> None:
        self.write_ti_data_to_file()

        self.write_padding(self.fout, self.fout.tell())

        if self.temp_file is None:
            while True:
                try:
                    tensor = self.tensors.pop(0)
                except IndexError:
                    break
                tensor.tofile(self.fout)
                self.write_padding(self.fout, tensor.nbytes)
            return

        self.temp_file.seek(0)

        shutil.copyfileobj(self.temp_file, self.fout)
        self.flush()
        self.temp_file.close()

    def flush(self) -> None:
        self.fout.flush()

    def close(self) -> None:
        self.fout.close()

    def add_architecture(self) -> None:
        self.add_string(Keys.General.ARCHITECTURE, self.arch)

    def add_author(self, author: str) -> None:
        self.add_string(Keys.General.AUTHOR, author)

    def add_tensor_data_layout(self, layout: str) -> None:
        self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_url(self, url: str) -> None:
        self.add_string(Keys.General.URL, url)

    def add_description(self, description: str) -> None:
        self.add_string(Keys.General.DESCRIPTION, description)

    def add_source_url(self, url: str) -> None:
        self.add_string(Keys.General.SOURCE_URL, url)

    def add_source_hf_repo(self, repo: str) -> None:
        self.add_string(Keys.General.SOURCE_HF_REPO, repo)

    def add_file_type(self, ftype: int) -> None:
        self.add_uint32(Keys.General.FILE_TYPE, ftype)

    def add_name(self, name: str) -> None:
        self.add_string(Keys.General.NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
        self.add_uint32(
            Keys.General.QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int) -> None:
        self.data_alignment = alignment
        self.add_uint32(Keys.General.ALIGNMENT, alignment)

    def add_context_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int) -> None:
        self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_head_count(self, count: int) -> None:
        self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int) -> None:
        self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_key_length(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)

    def add_value_length(self, length: int) -> None:
        self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)

    def add_max_alibi_bias(self, bias: float) -> None:
        self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float) -> None:
        self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

    def add_expert_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)

    def add_expert_used_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)

    def add_layer_norm_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_rope_dimension_count(self, count: int) -> None:
        self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_freq_base(self, value: float) -> None:
        self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)

    def add_rope_scaling_type(self, value: RopeScalingType) -> None:
        self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value)

    def add_rope_scaling_factor(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)

    def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
        self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)

    def add_rope_scaling_finetuned(self, value: bool) -> None:
        self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str) -> None:
        self.add_string(Keys.Tokenizer.MODEL, model)

    def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
        self.add_array(Keys.Tokenizer.LIST, tokens)

    def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
        self.add_array(Keys.Tokenizer.MERGES, merges)

    def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
        self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)

    def add_token_scores(self, scores: Sequence[float]) -> None:
        self.add_array(Keys.Tokenizer.SCORES, scores)

    def add_bos_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.BOS_ID, id)

    def add_eos_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.EOS_ID, id)

    def add_unk_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.UNK_ID, id)

    def add_sep_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.SEP_ID, id)

    def add_pad_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.PAD_ID, id)

    def add_add_bos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_BOS, value)

    def add_add_eos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_EOS, value)

    def add_add_space_prefix(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

    def add_chat_template(self, value: str) -> None:
        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)

    def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
        pack_prefix = ''
        if not skip_pack_prefix:
            pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
        return struct.pack(f'{pack_prefix}{fmt}', value)

    def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
        self.fout.write(self._pack(fmt, value, skip_pack_prefix))
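Beyond the scalar helpers exercised in `examples/writer.py`, the writer also packs arrays: `add_array` writes the element type and count, then re-encodes each item through `add_val` with `add_vtype=False`. A hedged sketch of writing tokenizer metadata with the array helpers; the three-token vocabulary, tensor name, and output path are toy examples, not from any real model:

```python
# Sketch only: a toy vocabulary to show the array helpers; a real file would
# carry a full tokenizer. Note the required write order enforced by WriterState:
# header, then KV data, then tensor info and data.
import numpy as np
from gguf import GGUFWriter

w = GGUFWriter('toy.gguf', 'llama')            # 'toy.gguf' is a placeholder path
w.add_tokenizer_model('llama')
w.add_token_list(['<s>', '</s>', 'hello'])     # GGUF array of strings
w.add_token_scores([0.0, 0.0, -1.0])           # GGUF array of float32
w.add_bos_token_id(0)
w.add_eos_token_id(1)
w.add_tensor('token_embd.weight', np.zeros((3, 4), dtype=np.float32))  # hypothetical tensor
w.write_header_to_file()   # header must precede KV and tensor data
w.write_kv_data_to_file()
w.write_tensors_to_file()
w.close()
```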
@ -0,0 +1,332 @@
from __future__ import annotations

from typing import Sequence

from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES


class TensorNameMap:
    mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Token embeddings
        MODEL_TENSOR.TOKEN_EMBD: (
            "gpt_neox.embed_in",                         # gptneox
            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen
            "transformer.word_embeddings",               # falcon
            "word_embeddings",                           # bloom
            "model.embed_tokens",                        # llama-hf
            "tok_embeddings",                            # llama-pth
            "embeddings.word_embeddings",                # bert
            "language_model.embedding.word_embeddings",  # persimmon
            "wte",                                       # gpt2
            "transformer.embd.wte",                      # phi2
            "model.tok_embeddings",                      # internlm2
        ),

        # Token type embeddings
        MODEL_TENSOR.TOKEN_TYPES: (
            "embeddings.token_type_embeddings",  # bert
        ),

        # Normalization of token embeddings
        MODEL_TENSOR.TOKEN_EMBD_NORM: (
            "word_embeddings_layernorm",  # bloom
        ),

        # Position embeddings
        MODEL_TENSOR.POS_EMBD: (
            "transformer.wpe",                 # gpt2
            "embeddings.position_embeddings",  # bert
            "wpe",                             # gpt2
        ),

        # Output
        MODEL_TENSOR.OUTPUT: (
            "embed_out",                 # gptneox
            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
            "output",                    # llama-pth bloom internlm2
            "word_embeddings_for_head",  # persimmon
            "lm_head.linear",            # phi2
        ),

        # Output norm
        MODEL_TENSOR.OUTPUT_NORM: (
            "gpt_neox.final_layer_norm",               # gptneox
            "transformer.ln_f",                        # gpt2 gpt-j falcon
            "model.norm",                              # llama-hf baichuan internlm2
            "norm",                                    # llama-pth
            "embeddings.LayerNorm",                    # bert
            "transformer.norm_f",                      # mpt
            "ln_f",                                    # refact bloom qwen gpt2
            "language_model.encoder.final_layernorm",  # persimmon
            "model.final_layernorm",                   # persimmon
            "lm_head.ln",                              # phi2
        ),

        # Rope frequencies
        MODEL_TENSOR.ROPE_FREQS: (
            "rope.freqs",  # llama-pth
        ),
    }

    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Attention norm
        MODEL_TENSOR.ATTN_NORM: (
            "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen
            "transformer.blocks.{bid}.norm_1",                      # mpt
            "transformer.h.{bid}.input_layernorm",                  # falcon7b
            "h.{bid}.input_layernorm",                              # bloom
            "transformer.h.{bid}.ln_mlp",                           # falcon40b
            "model.layers.{bid}.input_layernorm",                   # llama-hf
            "layers.{bid}.attention_norm",                          # llama-pth
            "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
            "model.layers.{bid}.ln1",                               # yi
            "h.{bid}.ln_1",                                         # gpt2
            "transformer.h.{bid}.ln",                               # phi2
            "model.layers.layers.{bid}.norm",                       # plamo
            "model.layers.{bid}.attention_norm",                    # internlm2
        ),

        # Attention norm 2
        MODEL_TENSOR.ATTN_NORM_2: (
            "transformer.h.{bid}.ln_attn",  # falcon40b
        ),

        # Attention query-key-value
        MODEL_TENSOR.ATTN_QKV: (
            "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
            "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen
            "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
            "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
            "h.{bid}.self_attention.query_key_value",                              # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
            "model.layers.{bid}.self_attn.query_key_value",                        # persimmon
            "h.{bid}.attn.c_attn",                                                 # gpt2
            "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
        ),

        # Attention query
        MODEL_TENSOR.ATTN_Q: (
            "model.layers.{bid}.self_attn.q_proj",         # llama-hf
            "layers.{bid}.attention.wq",                   # llama-pth
            "encoder.layer.{bid}.attention.self.query",    # bert
            "transformer.h.{bid}.attn.q_proj",             # gpt-j
            "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
            "model.layers.{bid}.attention.wq"              # internlm2
        ),

        # Attention key
        MODEL_TENSOR.ATTN_K: (
            "model.layers.{bid}.self_attn.k_proj",         # llama-hf
            "layers.{bid}.attention.wk",                   # llama-pth
            "encoder.layer.{bid}.attention.self.key",      # bert
            "transformer.h.{bid}.attn.k_proj",             # gpt-j
            "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
            "model.layers.{bid}.attention.wk"              # internlm2
        ),

        # Attention value
        MODEL_TENSOR.ATTN_V: (
            "model.layers.{bid}.self_attn.v_proj",         # llama-hf
            "layers.{bid}.attention.wv",                   # llama-pth
            "encoder.layer.{bid}.attention.self.value",    # bert
            "transformer.h.{bid}.attn.v_proj",             # gpt-j
            "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
            "model.layers.{bid}.attention.wv"              # internlm2
        ),

        # Attention output
        MODEL_TENSOR.ATTN_OUT: (
            "gpt_neox.layers.{bid}.attention.dense",                     # gptneox
            "transformer.h.{bid}.attn.c_proj",                           # gpt2 refact qwen
            "transformer.blocks.{bid}.attn.out_proj",                    # mpt
            "transformer.h.{bid}.self_attention.dense",                  # falcon
            "h.{bid}.self_attention.dense",                              # bloom
            "model.layers.{bid}.self_attn.o_proj",                       # llama-hf
            "layers.{bid}.attention.wo",                                 # llama-pth
            "encoder.layer.{bid}.attention.output.dense",                # bert
            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
            "model.layers.{bid}.self_attn.dense",                        # persimmon
            "h.{bid}.attn.c_proj",                                       # gpt2
            "transformer.h.{bid}.mixer.out_proj",                        # phi2
            "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
            "model.layers.{bid}.attention.wo",                           # internlm2
        ),

        # Rotary embeddings
        MODEL_TENSOR.ATTN_ROT_EMBD: (
            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",         # llama-hf
            "layers.{bid}.attention.inner_attention.rope.freqs",        # llama-pth
            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq",  # plamo
            "transformer.h.{bid}.attn.rotary_emb.inv_freq",             # codeshell
        ),

        # Feed-forward norm
        MODEL_TENSOR.FFN_NORM: (
            "gpt_neox.layers.{bid}.post_attention_layernorm",                # gptneox
            "transformer.h.{bid}.ln_2",                                      # gpt2 refact qwen
            "h.{bid}.post_attention_layernorm",                              # bloom
            "transformer.blocks.{bid}.norm_2",                               # mpt
            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf
            "layers.{bid}.ffn_norm",                                         # llama-pth
            "encoder.layer.{bid}.output.LayerNorm",                          # bert
            "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
            "model.layers.{bid}.ln2",                                        # yi
            "h.{bid}.ln_2",                                                  # gpt2
            "model.layers.{bid}.ffn_norm",                                   # internlm2
        ),

        MODEL_TENSOR.FFN_GATE_INP: (
            "layers.{bid}.feed_forward.gate",            # mixtral
            "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
        ),

        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",                # gptneox
            "transformer.h.{bid}.mlp.c_fc",                           # gpt2
            "transformer.blocks.{bid}.ffn.up_proj",                   # mpt
            "transformer.h.{bid}.mlp.dense_h_to_4h",                  # falcon
            "h.{bid}.mlp.dense_h_to_4h",                              # bloom
            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact
            "layers.{bid}.feed_forward.w3",                           # llama-pth
            "encoder.layer.{bid}.intermediate.dense",                 # bert
            "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
            "model.layers.{bid}.mlp.dense_h_to_4h",                   # persimmon
            "transformer.h.{bid}.mlp.w1",                             # qwen
            "h.{bid}.mlp.c_fc",                                       # gpt2
            "transformer.h.{bid}.mlp.fc1",                            # phi2
            "model.layers.{bid}.mlp.fc1",                             # phi2
            "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
            "model.layers.{bid}.feed_forward.w3",                     # internlm2
        ),

        MODEL_TENSOR.FFN_UP_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w3",            # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3",  # mixtral
        ),

        # AWQ-activation gate
        MODEL_TENSOR.FFN_ACT: (
            "transformer.blocks.{bid}.ffn.act",  # mpt
        ),

        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
            "model.layers.{bid}.mlp.gate_proj",         # llama-hf refact
            "layers.{bid}.feed_forward.w1",             # llama-pth
            "transformer.h.{bid}.mlp.w2",               # qwen
            "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
            "model.layers.{bid}.feed_forward.w1",       # internlm2
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w1",            # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1",  # mixtral
        ),

        # Feed-forward down
        MODEL_TENSOR.FFN_DOWN: (
            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",                # gptneox
            "transformer.h.{bid}.mlp.c_proj",                         # gpt2 refact qwen
            "transformer.blocks.{bid}.ffn.down_proj",                 # mpt
            "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
            "h.{bid}.mlp.dense_4h_to_h",                              # bloom
            "model.layers.{bid}.mlp.down_proj",                       # llama-hf
            "layers.{bid}.feed_forward.w2",                           # llama-pth
            "encoder.layer.{bid}.output.dense",                       # bert
            "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
            "model.layers.{bid}.mlp.dense_4h_to_h",                   # persimmon
            "h.{bid}.mlp.c_proj",                                     # gpt2
            "transformer.h.{bid}.mlp.fc2",                            # phi2
            "model.layers.{bid}.mlp.fc2",                             # phi2
            "model.layers.layers.{bid}.mlp.down_proj",                # plamo
            "model.layers.{bid}.feed_forward.w2",                     # internlm2
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w2",            # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2",  # mixtral
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
            "model.layers.{bid}.self_attn.q_layernorm",  # persimmon
        ),

        MODEL_TENSOR.ATTN_K_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
            "model.layers.{bid}.self_attn.k_layernorm",  # persimmon
        ),

        MODEL_TENSOR.ROPE_FREQS: (
            "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq",  # persimmon
        ),
    }

    mapping: dict[str, tuple[MODEL_TENSOR, str]]

    def __init__(self, arch: MODEL_ARCH, n_blocks: int):
        self.mapping = {}
        for tensor, keys in self.mappings_cfg.items():
            if tensor not in MODEL_TENSORS[arch]:
                continue
            tensor_name = TENSOR_NAMES[tensor]
            self.mapping[tensor_name] = (tensor, tensor_name)
            for key in keys:
                self.mapping[key] = (tensor, tensor_name)
        for bid in range(n_blocks):
            for tensor, keys in self.block_mappings_cfg.items():
                if tensor not in MODEL_TENSORS[arch]:
                    continue
                # TODO: make this configurable
                n_experts = 8
                for xid in range(n_experts):
                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                    self.mapping[tensor_name] = (tensor, tensor_name)
                    for key in keys:
                        key = key.format(bid = bid, xid = xid)
                        self.mapping[key] = (tensor, tensor_name)

    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
        if result is not None:
            return result
        for suffix in try_suffixes:
            if key.endswith(suffix):
                result = self.mapping.get(key[:-len(suffix)])
                if result is not None:
                    return result[0], result[1] + suffix
        return None

    def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
        return result[1]

    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
        return result[0]

    def __getitem__(self, key: str) -> str:
        try:
            return self.mapping[key][1]
        except KeyError:
            raise KeyError(key)

    def __contains__(self, key: str) -> bool:
        return key in self.mapping

    def __repr__(self) -> str:
        return repr(self.mapping)


def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
    return TensorNameMap(arch, n_blocks)
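Because every architecture-specific name (with {bid} expanded per block) is pre-registered in one flat dict, a lookup is a single dict hit; get_type_and_name only falls back to retrying with a caller-supplied suffix stripped and re-appended. A hedged lookup sketch using the module's own entry point:

# Hedged sketch: resolving a llama-hf checkpoint name to its GGUF name.
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, n_blocks=32)
# ".weight" is not part of the mapped keys, so get_name() retries with
# the suffix stripped, then re-appends it to the resolved name.
name = tmap.get_name(
    "model.layers.0.self_attn.q_proj.weight",
    try_suffixes=(".weight", ".bias"),
)
print(name)  # expected: "blk.0.attn_q.weight"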
185
extensions/model-extension/scripts/gguf-py/gguf/vocab.py
Normal file
@ -0,0 +1,185 @@
from __future__ import annotations

import json
import os
import sys
from pathlib import Path
from typing import Any, Callable

from .gguf_writer import GGUFWriter


class SpecialVocab:
    merges: list[str]
    add_special_token: dict[str, bool]
    special_token_ids: dict[str, int]
    chat_template: str | None

    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
        special_token_types: tuple[str, ...] | None = None,
        n_vocab: int | None = None,
    ):
        self.special_token_ids = {}
        self.add_special_token = {}
        self.n_vocab = n_vocab
        self.load_merges = load_merges
        self.merges = []
        self.chat_template = None
        if special_token_types is not None:
            self.special_token_types = special_token_types
        else:
            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
        self._load(Path(path))

    def __repr__(self) -> str:
        return '<SpecialVocab with {} merges, special tokens {}, add special tokens {}>'.format(
            len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset",
        )

    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
        if self.merges:
            if not quiet:
                print(f'gguf: Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
        elif self.load_merges:
            print(
                'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
                file = sys.stderr,
            )
        for typ, tokid in self.special_token_ids.items():
            id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
            if id_handler is None:
                print(
                    f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
                    file = sys.stderr,
                )
                continue
            if not quiet:
                print(f'gguf: Setting special token type {typ} to {tokid}')
            id_handler(tokid)
        for typ, value in self.add_special_token.items():
            add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
            if add_handler is None:
                print(
                    f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
                    file = sys.stderr,
                )
                continue
            if not quiet:
                print(f'gguf: Setting add_{typ}_token to {value}')
            add_handler(value)
        if self.chat_template is not None:
            if not quiet:
                print(f'gguf: Setting chat_template to {self.chat_template}')
            gw.add_chat_template(self.chat_template)

    def _load(self, path: Path) -> None:
        self._try_load_from_tokenizer_json(path)
        self._try_load_from_config_json(path)
        if self.load_merges and not self.merges:
            self._try_load_merges_txt(path)

    def _try_load_merges_txt(self, path: Path) -> bool:
        merges_file = path / 'merges.txt'
        if not merges_file.is_file():
            return False
        with open(merges_file, 'r', encoding = 'utf-8') as fp:
            first_line = next(fp, '').strip()
            if not first_line.startswith('#'):
                fp.seek(0)
                line_num = 0
            else:
                line_num = 1
            merges = []
            for line in fp:
                line_num += 1
                line = line.strip()
                if not line:
                    continue
                parts = line.split(None, 3)
                if len(parts) != 2:
                    print(
                        f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
                        file = sys.stderr,
                    )
                    continue
                merges.append(f'{parts[0]} {parts[1]}')
        self.merges = merges
        return True

    def _set_special_token(self, typ: str, tid: Any) -> None:
        if not isinstance(tid, int):
            return
        if tid < 0:
            raise ValueError(f'invalid value for special token type {typ}: {tid}')
        if self.n_vocab is None or tid < self.n_vocab:
            if typ in self.special_token_ids:
                return
            self.special_token_ids[typ] = tid
            return
        print(
            f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
            file = sys.stderr,
        )

    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
        if tokenizer_file.is_file():
            with open(tokenizer_file, encoding = 'utf-8') as f:
                tokenizer = json.load(f)
            if self.load_merges:
                merges = tokenizer.get('model', {}).get('merges')
                if isinstance(merges, list) and merges and isinstance(merges[0], str):
                    self.merges = merges
            added_tokens = tokenizer.get('added_tokens', {})
        else:
            added_tokens = {}
        tokenizer_config_file = path / 'tokenizer_config.json'
        if not tokenizer_config_file.is_file():
            return True
        with open(tokenizer_config_file, encoding = 'utf-8') as f:
            tokenizer_config = json.load(f)
        chat_template = tokenizer_config.get('chat_template')
        if chat_template is None or isinstance(chat_template, str):
            self.chat_template = chat_template
        else:
            print(
                f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
                file = sys.stderr
            )
        for typ in self.special_token_types:
            add_entry = tokenizer_config.get(f'add_{typ}_token')
            if isinstance(add_entry, bool):
                self.add_special_token[typ] = add_entry
            if not added_tokens:
                # We will need this to get the content for the token, so if it's empty
                # may as well just give up.
                continue
            entry = tokenizer_config.get(f'{typ}_token')
            if isinstance(entry, str):
                tc_content = entry
            elif isinstance(entry, dict):
                entry_content = entry.get('content')
                if not isinstance(entry_content, str):
                    continue
                tc_content = entry_content
            else:
                continue
            # We only need the first match here.
            maybe_token_id = next(
                (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content),
                None,
            )
            self._set_special_token(typ, maybe_token_id)
        return True

    def _try_load_from_config_json(self, path: Path) -> bool:
        config_file = path / 'config.json'
        if not config_file.is_file():
            return False
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            self._set_special_token(typ, config.get(f'{typ}_token_id'))
        return True
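The intended flow: point SpecialVocab at a Hugging Face model folder, let it pick up tokenizer.json, tokenizer_config.json, and config.json (with merges.txt as a fallback), then hand everything to a writer. A hedged sketch; the folder path, output path, and vocab size are placeholders:

# Hedged sketch: loading special-token metadata and writing it to GGUF.
from gguf import GGUFWriter, SpecialVocab

special_vocab = SpecialVocab("./my-model", load_merges=True, n_vocab=32000)
print(special_vocab)  # repr lists the merges and special token ids found

writer = GGUFWriter("example.gguf", "llama")
special_vocab.add_to_gguf(writer)  # emits merges, token ids, add_*_token flags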
35
extensions/model-extension/scripts/gguf-py/pyproject.toml
Normal file
@ -0,0 +1,35 @@
[tool.poetry]
name = "gguf"
version = "0.7.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
    {include = "gguf"},
    {include = "gguf/py.typed"},
    {include = "scripts"},
]
readme = "README.md"
homepage = "https://ggml.ai"
repository = "https://github.com/ggerganov/llama.cpp"
keywords = ["ggml", "gguf", "llama.cpp"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

[tool.poetry.dependencies]
python = ">=3.8"
numpy = ">=1.17"

[tool.poetry.dev-dependencies]
pytest = "^5.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
gguf-dump = "scripts:gguf_dump_entrypoint"
gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
@ -0,0 +1,12 @@
import os

from importlib import import_module


os.environ["NO_LOCAL_GGUF"] = "TRUE"

gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main
gguf_dump_entrypoint = import_module("scripts.gguf-dump").main
gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main

del import_module, os
112
extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import os
import sys
from pathlib import Path

import numpy as np

# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
    sys.path.insert(0, str(Path(__file__).parent.parent))

import gguf


def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # Host is little endian
        host_endian = "little"
        swapped_endian = "big"
    else:
        # Sorry PDP or other weird systems that don't use BE or LE.
        host_endian = "big"
        swapped_endian = "little"
    if reader.byte_order == "S":
        file_endian = swapped_endian
    else:
        file_endian = host_endian
    order = host_endian if args.order == "native" else args.order
    print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
    if file_endian == order:
        print(f"* File is already {order.upper()} endian. Nothing to do.")
        sys.exit(0)
    print("* Checking tensors for conversion compatibility")
    for tensor in reader.tensors:
        if tensor.tensor_type not in (
            gguf.GGMLQuantizationType.F32,
            gguf.GGMLQuantizationType.F16,
            gguf.GGMLQuantizationType.Q8_0,
        ):
            raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
    print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
    if args.dry_run:
        return
    print("\n*** Warning *** Warning *** Warning **")
    print("* This conversion process may damage the file. Ensure you have a backup.")
    if order != host_endian:
        print("* Requested endian differs from host, you will not be able to load the model on this machine.")
    print("* The file will be modified immediately, so if conversion fails or is interrupted")
    print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
    response = input("YES, I am sure> ")
    if response != "YES":
        print("You didn't enter YES. Okay then, see ya!")
        sys.exit(0)
    print(f"\n* Converting fields ({len(reader.fields)})")
    for idx, field in enumerate(reader.fields.values()):
        print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
        for part in field.parts:
            part.byteswap(inplace=True)
    print(f"\n* Converting tensors ({len(reader.tensors)})")
    for idx, tensor in enumerate(reader.tensors):
        print(
            f"  - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, "
            f"elements={tensor.n_elements}... ",
            end="",
        )
        tensor_type = tensor.tensor_type
        for part in tensor.field.parts:
            part.byteswap(inplace=True)
        if tensor_type != gguf.GGMLQuantizationType.Q8_0:
            tensor.data.byteswap(inplace=True)
            print()
            continue
        # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes
        block_size = 34
        n_blocks = len(tensor.data) // block_size
        for block_num in range(n_blocks):
            block_offs = block_num * block_size
            # I know I said f16, but it doesn't matter here - any simple 16 bit type works.
            delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
            delta.byteswap(inplace=True)
            if block_num % 100000 == 0:
                print(f"[{(n_blocks - block_num) // 1000}K]", end="")
                sys.stdout.flush()
        print()
    print("* Completion")


def main() -> None:
    parser = argparse.ArgumentParser(description="Convert GGUF file byte order")
    parser.add_argument(
        "model", type=str,
        help="GGUF format model filename",
    )
    parser.add_argument(
        "order", type=str, choices=['big', 'little', 'native'],
        help="Requested byte order",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Don't actually change anything",
    )
    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
    print(f'* Loading: {args.model}')
    reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+')
    convert_byteorder(reader, args)


if __name__ == "__main__":
    main()
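The Q8_0 branch depends entirely on the block layout: each block is a 2-byte float16 delta followed by 32 one-byte quants, hence block_size = 34, and only the multi-byte delta needs swapping. The same arithmetic in isolation, on synthetic data:

# Hedged sketch of the Q8_0 byteswap arithmetic above, on fake data.
import numpy as np

BLOCK_SIZE = 34  # 2-byte f16 delta + 32 int8 quants
data = np.zeros(BLOCK_SIZE * 3, dtype=np.uint8)  # three synthetic blocks

for block_num in range(len(data) // BLOCK_SIZE):
    offs = block_num * BLOCK_SIZE
    # The int8 quants are endian-free; only the 16-bit delta is swapped.
    delta = data[offs:offs + 2].view(dtype=np.uint16)
    delta.byteswap(inplace=True)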
117
extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py
Executable file
@ -0,0 +1,117 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import os
import sys
from pathlib import Path
from typing import Any

import numpy as np

# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
    sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf import GGUFReader, GGUFValueType  # noqa: E402


def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
    host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
    if reader.byte_order == 'S':
        file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE'
    else:
        file_endian = host_endian
    return (host_endian, file_endian)


# For more information about what field.parts and field.data represent,
# please see the comments in the modify_gguf.py example.
def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
    host_endian, file_endian = get_file_host_endian(reader)
    print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')
    print(f'\n* Dumping {len(reader.fields)} key/value pair(s)')
    for n, field in enumerate(reader.fields.values(), 1):
        if not field.types:
            pretty_type = 'N/A'
        elif field.types[0] == GGUFValueType.ARRAY:
            nest_count = len(field.types) - 1
            pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
        else:
            pretty_type = str(field.types[-1].name)
        print(f'  {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '')
        if len(field.types) == 1:
            curr_type = field.types[0]
            if curr_type == GGUFValueType.STRING:
                print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
            elif field.types[0] in reader.gguf_scalar_to_np:
                print(' = {0}'.format(field.parts[-1][0]), end = '')
        print()
    if args.no_tensors:
        return
    print(f'\n* Dumping {len(reader.tensors)} tensor(s)')
    for n, tensor in enumerate(reader.tensors, 1):
        prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))
        print(f'  {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')


def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
    import json
    host_endian, file_endian = get_file_host_endian(reader)
    metadata: dict[str, Any] = {}
    tensors: dict[str, Any] = {}
    result = {
        "filename": args.model,
        "endian": file_endian,
        "metadata": metadata,
        "tensors": tensors,
    }
    for idx, field in enumerate(reader.fields.values()):
        curr: dict[str, Any] = {
            "index": idx,
            "type": field.types[0].name if field.types else 'UNKNOWN',
            "offset": field.offset,
        }
        metadata[field.name] = curr
        if field.types[:1] == [GGUFValueType.ARRAY]:
            curr["array_types"] = [t.name for t in field.types][1:]
            if not args.json_array:
                continue
            itype = field.types[-1]
            if itype == GGUFValueType.STRING:
                curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data]
            else:
                curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()]
        elif field.types[0] == GGUFValueType.STRING:
            curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
        else:
            curr["value"] = field.parts[-1].tolist()[0]
    if not args.no_tensors:
        for idx, tensor in enumerate(reader.tensors):
            tensors[tensor.name] = {
                "index": idx,
                "shape": tensor.shape.tolist(),
                "type": tensor.tensor_type.name,
                "offset": tensor.field.offset,
            }
    json.dump(result, sys.stdout)


def main() -> None:
    parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
    parser.add_argument("model", type=str, help="GGUF format model filename")
    parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
    parser.add_argument("--json", action="store_true", help="Produce JSON output")
    parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
    if not args.json:
        print(f'* Loading: {args.model}')
    reader = GGUFReader(args.model, 'r')
    if args.json:
        dump_metadata_json(reader, args)
    else:
        dump_metadata(reader, args)


if __name__ == '__main__':
    main()
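For quick programmatic access, the same walk the dumper performs reduces to a few lines. A hedged sketch against a placeholder file path:

# Hedged sketch: iterating GGUF metadata and tensors with GGUFReader.
from gguf import GGUFReader, GGUFValueType

reader = GGUFReader("model.gguf", "r")  # placeholder path
for field in reader.fields.values():
    if field.types and field.types[0] == GGUFValueType.STRING:
        # String values live in the last part, as raw bytes.
        print(field.name, "=", str(bytes(field.parts[-1]), encoding="utf-8"))
for tensor in reader.tensors:
    print(tensor.name, tensor.tensor_type.name, tensor.shape)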
90
extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py
Executable file
@ -0,0 +1,90 @@
#!/usr/bin/env python3
import argparse
import os
import sys
from pathlib import Path

# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
    sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf import GGUFReader  # noqa: E402


def minimal_example(filename: str) -> None:
    reader = GGUFReader(filename, 'r+')
    field = reader.fields['tokenizer.ggml.bos_token_id']
    if field is None:
        return
    part_index = field.data[0]
    field.parts[part_index][0] = 2  # Set tokenizer.ggml.bos_token_id to 2
    #
    # So what's this field.data thing? It's helpful because field.parts contains
    # _every_ part of the GGUF field. For example, tokenizer.ggml.bos_token_id consists
    # of:
    #
    #  Part index 0: Key length (27)
    #  Part index 1: Key data ("tokenizer.ggml.bos_token_id")
    #  Part index 2: Field type (4, the id for GGUFValueType.UINT32)
    #  Part index 3: Field value
    #
    # Note also that each part is an NDArray slice, so even a part that
    # is only a single value like the key length will be a NDArray of
    # the key length type (numpy.uint32).
    #
    # The .data attribute in the Field is a list of relevant part indexes
    # and doesn't contain internal GGUF details like the key length part.
    # In this case, .data will be [3] - just the part index of the
    # field value itself.


def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
    field = reader.get_field(args.key)
    if field is None:
        print(f'! Field {repr(args.key)} not found', file = sys.stderr)
        sys.exit(1)
    # Note that field.types is a list of types. This is because the GGUF
    # format supports arrays. For example, an array of UINT32 would
    # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32]
    handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None
    if handler is None:
        print(
            f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}',
            file = sys.stderr,
        )
        sys.exit(1)
    current_value = field.parts[field.data[0]][0]
    new_value = handler(args.value)
    print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
    if current_value == new_value:
        print(f'- Key {repr(args.key)} already set to requested value {current_value}')
        sys.exit(0)
    if args.dry_run:
        sys.exit(0)
    if not args.force:
        print('*** Warning *** Warning *** Warning **')
        print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
        print('* Enter exactly YES if you are positive you want to proceed:')
        response = input('YES, I am sure> ')
        if response != 'YES':
            print("You didn't enter YES. Okay then, see ya!")
            sys.exit(0)
    field.parts[field.data[0]][0] = new_value
    print('* Field changed. Successful completion.')


def main() -> None:
    parser = argparse.ArgumentParser(description="Set a simple value in GGUF file metadata")
    parser.add_argument("model", type=str, help="GGUF format model filename")
    parser.add_argument("key", type=str, help="Metadata key to set")
    parser.add_argument("value", type=str, help="Metadata value to set")
    parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything")
    parser.add_argument("--force", action="store_true", help="Change the field without confirmation")
    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
    print(f'* Loading: {args.model}')
    reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+')
    set_metadata(reader, args)


if __name__ == '__main__':
    main()
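The part-index layout described in minimal_example's comment is easiest to see in a read-only round trip. A hedged sketch, with a placeholder file path:

# Hedged sketch: using field.data to index straight to a field's value part.
from gguf import GGUFReader

reader = GGUFReader("model.gguf", "r")  # placeholder path
field = reader.get_field("tokenizer.ggml.bos_token_id")
if field is not None:
    # parts 0-2 hold the key length, key bytes, and field type;
    # field.data points at the value part only, e.g. [3].
    value_part = field.parts[field.data[0]]
    print("bos_token_id =", value_part[0])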
@ -0,0 +1,7 @@
import gguf  # noqa: F401

# TODO: add tests


def test_write_gguf() -> None:
    pass
@ -0,0 +1,6 @@
export class InvalidHostError extends Error {
  constructor(message: string) {
    super(message)
    this.name = 'InvalidHostError'
  }
}

@ -0,0 +1,6 @@
export class NotSupportedModelError extends Error {
  constructor(message: string) {
    super(message)
    this.name = 'NotSupportedModelError'
  }
}
@ -1,8 +1,7 @@
export {}
declare global {
declare const DEFAULT_MODEL: object
declare const MODULE_PATH: string
declare const VERSION: string
declare const NODE: string

interface Core {
  api: APIFunctions
@ -17,9 +17,19 @@ import {
  baseName,
  GpuSetting,
  DownloadRequest,
  executeOnMain,
  HuggingFaceRepoData,
  Quantization,
  log,
  getFileSize,
  AllQuantizations,
  ModelEvent,
} from '@janhq/core'

import { extractFileName } from './helpers/path'
import { GGUFMetadata, gguf } from '@huggingface/gguf'
import { NotSupportedModelError } from './@types/NotSupportModelError'
import { InvalidHostError } from './@types/InvalidHostError'

/**
 * A extension for models
@ -35,6 +45,17 @@ export default class JanModelExtension extends ModelExtension {
  ]
  private static readonly _tensorRtEngineFormat = '.engine'
  private static readonly _supportedGpuArch = ['ampere', 'ada']
  private static readonly _safetensorsRegexs = [
    /model\.safetensors$/,
    /model-[0-9]+-of-[0-9]+\.safetensors$/,
  ]
  private static readonly _pytorchRegexs = [
    /pytorch_model\.bin$/,
    /consolidated\.[0-9]+\.pth$/,
    /pytorch_model-[0-9]+-of-[0-9]+\.bin$/,
    /.*\.pt$/,
  ]
  interrupted = false

  /**
   * Called when the extension is loaded.
@ -49,7 +70,7 @@ export default class JanModelExtension extends ModelExtension {
   * Called when the extension is unloaded.
   * @override
   */
  onUnload(): void {}
  async onUnload() {}

  /**
   * Downloads a machine learning model.
@ -65,7 +86,11 @@ export default class JanModelExtension extends ModelExtension {
    // create corresponding directory
    const modelDirPath = await joinPath([JanModelExtension._homeDir, model.id])
    if (!(await fs.existsSync(modelDirPath))) await fs.mkdir(modelDirPath)

    const modelJsonPath = await joinPath([modelDirPath, 'model.json'])
    if (!(await fs.existsSync(modelJsonPath))) {
      await fs.writeFileSync(modelJsonPath, JSON.stringify(model, null, 2))
      events.emit(ModelEvent.OnModelsUpdate, {})
    }
    if (model.engine === InferenceEngine.nitro_tensorrt_llm) {
      if (!gpuSettings || gpuSettings.gpus.length === 0) {
        console.error('No GPU found. Please check your GPU setting.')
@ -140,6 +165,84 @@ export default class JanModelExtension extends ModelExtension {
    }
  }

  private toHuggingFaceUrl(repoId: string): string {
    try {
      const url = new URL(repoId)
      if (url.host !== 'huggingface.co') {
        throw new InvalidHostError(`Invalid Hugging Face repo URL: ${repoId}`)
      }

      const paths = url.pathname.split('/').filter((e) => e.trim().length > 0)
      if (paths.length < 2) {
        throw new InvalidHostError(`Invalid Hugging Face repo URL: ${repoId}`)
      }

      return `${url.origin}/api/models/${paths[0]}/${paths[1]}`
    } catch (err) {
      if (err instanceof InvalidHostError) {
        throw err
      }

      if (repoId.startsWith('https')) {
        throw new Error(`Cannot parse url: ${repoId}`)
      }

      return `https://huggingface.co/api/models/${repoId}`
    }
  }
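toHuggingFaceUrl accepts either a bare repo id or a full URL, rejects anything on a non-huggingface.co host, and funnels both shapes onto the Hub's /api/models endpoint. The same mapping, sketched in Python for brevity; it mirrors the TypeScript above and is not a second implementation shipped in this PR:

# Hedged Python sketch of the repo-id -> API URL mapping above.
from urllib.parse import urlparse

def to_huggingface_url(repo_id: str) -> str:
    parsed = urlparse(repo_id)
    if parsed.scheme in ("http", "https"):
        if parsed.netloc != "huggingface.co":
            raise ValueError(f"Invalid Hugging Face repo URL: {repo_id}")
        parts = [p for p in parsed.path.split("/") if p.strip()]
        if len(parts) < 2:
            raise ValueError(f"Invalid Hugging Face repo URL: {repo_id}")
        return f"https://huggingface.co/api/models/{parts[0]}/{parts[1]}"
    return f"https://huggingface.co/api/models/{repo_id}"

# "owner/repo" is an illustrative id, not a reference to a real repository.
assert to_huggingface_url("owner/repo") == \
    "https://huggingface.co/api/models/owner/repo"
assert to_huggingface_url("https://huggingface.co/owner/repo") == \
    "https://huggingface.co/api/models/owner/repo"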
  async fetchHuggingFaceRepoData(repoId: string): Promise<HuggingFaceRepoData> {
    const sanitizedUrl = this.toHuggingFaceUrl(repoId)
    console.debug('sanitizedUrl', sanitizedUrl)

    const res = await fetch(sanitizedUrl)
    const response = await res.json()
    if (response['error'] != null) {
      throw new Error(response['error'])
    }

    const data = response as HuggingFaceRepoData

    if (data.tags.indexOf('gguf') === -1) {
      throw new NotSupportedModelError(
        `${repoId} is not supported. Only GGUF models are supported.`
      )
    }

    const promises: Promise<number>[] = []

    // fetching file sizes
    const url = new URL(sanitizedUrl)
    const paths = url.pathname.split('/').filter((e) => e.trim().length > 0)

    for (const sibling of data.siblings) {
      const downloadUrl = `https://huggingface.co/${paths[2]}/${paths[3]}/resolve/main/${sibling.rfilename}`
      sibling.downloadUrl = downloadUrl
      promises.push(getFileSize(downloadUrl))
    }

    const result = await Promise.all(promises)
    for (let i = 0; i < data.siblings.length; i++) {
      data.siblings[i].fileSize = result[i]
    }

    AllQuantizations.forEach((quantization) => {
      data.siblings.forEach((sibling) => {
        if (!sibling.quantization && sibling.rfilename.includes(quantization)) {
          sibling.quantization = quantization
        }
      })
    })

    data.modelUrl = `https://huggingface.co/${paths[2]}/${paths[3]}`
    return data
  }

  async fetchModelMetadata(url: string): Promise<GGUFMetadata> {
    const { metadata } = await gguf(url)
    return metadata
  }
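Download URLs are then rebuilt from the sanitized API URL: path segments 2 and 3 (the ones after api/models) are the owner and repo, and each sibling is tagged with the first quantization string its filename contains. Sketched in Python with placeholder names:

# Hedged sketch of the sibling -> download URL and quantization tagging.
from urllib.parse import urlparse

api_url = "https://huggingface.co/api/models/owner/repo"  # placeholder
paths = [p for p in urlparse(api_url).path.split("/") if p.strip()]
owner, repo = paths[2], paths[3]  # paths == ["api", "models", owner, repo]

rfilename = "model.Q4_K_M.gguf"  # illustrative sibling filename
download_url = f"https://huggingface.co/{owner}/{repo}/resolve/main/{rfilename}"

quantizations = ["Q4_K_M", "Q5_K_M", "Q8_0"]  # stand-in for AllQuantizations
tag = next((q for q in quantizations if q in rfilename), None)
print(download_url, tag)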
  /**
   * Specifically for Jan server.
   */
@ -453,7 +556,7 @@ export default class JanModelExtension extends ModelExtension {
    return model
  }

  private async getDefaultModel(): Promise<Model> {
  override async getDefaultModel(): Promise<Model> {
    const defaultModel = DEFAULT_MODEL as Model
    return defaultModel
  }
@ -674,4 +777,218 @@ export default class JanModelExtension extends ModelExtension {
      importedModels
    )
  }

  private getGgufFileList(
    repoData: HuggingFaceRepoData,
    selectedQuantization: Quantization
  ): string[] {
    return repoData.siblings
      .map((file) => file.rfilename)
      .filter((file) => file.indexOf(selectedQuantization) !== -1)
      .filter((file) => file.endsWith('.gguf'))
  }

  private getFileList(repoData: HuggingFaceRepoData): string[] {
    // SafeTensors first, if not, then PyTorch
    const modelFiles = repoData.siblings
      .map((file) => file.rfilename)
      .filter((file) =>
        JanModelExtension._safetensorsRegexs.some((regex) => regex.test(file))
      )
    if (modelFiles.length === 0) {
      repoData.siblings.forEach((file) => {
        if (
          JanModelExtension._pytorchRegexs.some((regex) =>
            regex.test(file.rfilename)
          )
        ) {
          modelFiles.push(file.rfilename)
        }
      })
    }

    const vocabFiles = [
      'tokenizer.model',
      'vocab.json',
      'tokenizer.json',
    ].filter((file) =>
      repoData.siblings.some((sibling) => sibling.rfilename === file)
    )

    const etcFiles = repoData.siblings
      .map((file) => file.rfilename)
      .filter(
        (file) =>
          (file.endsWith('.json') && !vocabFiles.includes(file)) ||
          file.endsWith('.txt') ||
          file.endsWith('.py') ||
          file.endsWith('.tiktoken')
      )

    return [...modelFiles, ...vocabFiles, ...etcFiles]
  }

  private async getModelDirPath(repoID: string): Promise<string> {
    const modelName = repoID.split('/').slice(1).join('/')
    return joinPath([await getJanDataFolderPath(), 'models', modelName])
  }

  private async getConvertedModelPath(repoID: string): Promise<string> {
    const modelName = repoID.split('/').slice(1).join('/')
    const modelDirPath = await this.getModelDirPath(repoID)
    return joinPath([modelDirPath, modelName + '.gguf'])
  }

  private async getQuantizedModelPath(
    repoID: string,
    quantization: Quantization
  ): Promise<string> {
    const modelName = repoID.split('/').slice(1).join('/')
    const modelDirPath = await this.getModelDirPath(repoID)
    return joinPath([
      modelDirPath,
      modelName + `-${quantization.toLowerCase()}.gguf`,
    ])
  }
  private getCtxLength(config: {
    max_sequence_length?: number
    max_position_embeddings?: number
    n_ctx?: number
  }): number {
    if (config.max_sequence_length) return config.max_sequence_length
    if (config.max_position_embeddings) return config.max_position_embeddings
    if (config.n_ctx) return config.n_ctx
    return 2048
  }
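getCtxLength is a precedence chain over the three spellings of context length found in different HF configs, defaulting to 2048. The equivalent, sketched in Python:

# Hedged Python sketch of the context-length fallback chain above.
def get_ctx_length(config: dict) -> int:
    for key in ("max_sequence_length", "max_position_embeddings", "n_ctx"):
        if config.get(key):  # first truthy key wins
            return config[key]
    return 2048  # default when no key is present

assert get_ctx_length({"max_position_embeddings": 4096}) == 4096
assert get_ctx_length({}) == 2048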
  /**
   * Converts a Hugging Face model to GGUF.
   * @param repoID - The repo ID of the model to convert.
   * @returns A promise that resolves when the conversion is complete.
   */
  async convert(repoID: string): Promise<void> {
    if (this.interrupted) return
    const modelDirPath = await this.getModelDirPath(repoID)
    const modelOutPath = await this.getConvertedModelPath(repoID)
    if (!(await fs.existsSync(modelDirPath))) {
      throw new Error('Model dir not found')
    }
    if (await fs.existsSync(modelOutPath)) return

    await executeOnMain(NODE, 'installDeps')
    if (this.interrupted) return

    try {
      await executeOnMain(
        NODE,
        'convertHf',
        modelDirPath,
        modelOutPath + '.temp'
      )
    } catch (err) {
      log(`[Conversion]::Debug: Error using hf-to-gguf.py, trying convert.py`)

      let ctx = 2048
      try {
        const config = await fs.readFileSync(
          await joinPath([modelDirPath, 'config.json']),
          'utf8'
        )
        const configParsed = JSON.parse(config)
        ctx = this.getCtxLength(configParsed)
        configParsed.max_sequence_length = ctx
        await fs.writeFileSync(
          await joinPath([modelDirPath, 'config.json']),
          JSON.stringify(configParsed, null, 2)
        )
      } catch (err) {
        log(`${err}`)
        // ignore missing config.json
      }

      const bpe = await fs.existsSync(
        await joinPath([modelDirPath, 'vocab.json'])
      )

      await executeOnMain(
        NODE,
        'convert',
        modelDirPath,
        modelOutPath + '.temp',
        {
          ctx,
          bpe,
        }
      )
    }
    await executeOnMain(
      NODE,
      'renameSync',
      modelOutPath + '.temp',
      modelOutPath
    )

    for (const file of await fs.readdirSync(modelDirPath)) {
      if (
        modelOutPath.endsWith(file) ||
        (file.endsWith('config.json') && !file.endsWith('_config.json'))
      )
        continue
      await fs.unlinkSync(await joinPath([modelDirPath, file]))
    }
  }

  /**
   * Quantizes a GGUF model.
   * @param repoID - The repo ID of the model to quantize.
   * @param quantization - The quantization to use.
   * @returns A promise that resolves when the quantization is complete.
   */
  async quantize(repoID: string, quantization: Quantization): Promise<void> {
    if (this.interrupted) return
    const modelDirPath = await this.getModelDirPath(repoID)
    const modelOutPath = await this.getQuantizedModelPath(repoID, quantization)
    if (!(await fs.existsSync(modelDirPath))) {
      throw new Error('Model dir not found')
    }
    if (await fs.existsSync(modelOutPath)) return

    await executeOnMain(
      NODE,
      'quantize',
      await this.getConvertedModelPath(repoID),
      modelOutPath + '.temp',
      quantization
    )
    await executeOnMain(
      NODE,
      'renameSync',
      modelOutPath + '.temp',
      modelOutPath
    )

    await fs.unlinkSync(await this.getConvertedModelPath(repoID))
  }

  /**
   * Cancels the convert of current Hugging Face model.
   * @param repoID - The repository ID to cancel.
   * @param repoData - The repository data to cancel.
   * @returns {Promise<void>} A promise that resolves when the download has been cancelled.
   */
  async cancelConvert(
    repoID: string,
    repoData: HuggingFaceRepoData
  ): Promise<void> {
    this.interrupted = true
    const modelDirPath = await this.getModelDirPath(repoID)
    const files = this.getFileList(repoData)
    for (const file of files) {
      const filePath = file
      const localPath = await joinPath([modelDirPath, filePath])
      await abortDownload(localPath)
    }

    executeOnMain(NODE, 'killProcesses')
  }
}
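Both convert() and quantize() write to a '.temp' path and only rename once the tool exits cleanly, so an interrupted run cannot leave a plausible-looking but truncated .gguf at the final path. The pattern in isolation, sketched in Python (the rename is atomic only within one filesystem, which holds here since both paths share a directory):

# Hedged sketch of the write-to-temp-then-rename pattern used above.
import os

def produce_atomically(out_path: str, produce) -> None:
    tmp_path = out_path + ".temp"
    produce(tmp_path)              # the long-running step writes here
    os.rename(tmp_path, out_path)  # the final path appears only on success

# usage sketch: produce_atomically("model.gguf", lambda p: open(p, "wb").close())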
@ -1,7 +1,6 @@
import {
  GpuSetting,
  MonitoringExtension,
  MonitoringInterface,
  OperatingSystemInfo,
  executeOnMain,
} from '@janhq/core'

@ -335,7 +335,7 @@ const updateCudaExistence = async (

  // Attempt to query CUDA using NVIDIA SMI
  if (!cudaExists) {
    await new Promise<void>((resolve, reject) => {
    await new Promise<void>((resolve) => {
      exec('nvidia-smi', (error, stdout) => {
        if (!error) {
          const regex = /CUDA\s*Version:\s*(\d+\.\d+)/g

@ -126,6 +126,10 @@ export class FileLogger extends Logger {

const writeLog = (message: string, logPath: string) => {
  if (!fs.existsSync(logPath)) {
    const logDirectory = path.join(getJanDataFolderPath(), 'logs')
    if (!fs.existsSync(logDirectory)) {
      fs.mkdirSync(logDirectory)
    }
    fs.writeFileSync(logPath, message)
  } else {
    const logFile = fs.createWriteStream(logPath, {

@ -55,7 +55,6 @@
    "@janhq/core": "file:../../core",
    "decompress": "^4.2.1",
    "fetch-retry": "^5.0.6",
    "path-browserify": "^1.0.1",
    "rxjs": "^7.8.1",
    "tcp-port-used": "^1.0.2",
    "terminate": "^2.6.1",

@ -14,22 +14,45 @@ export class Logger implements FastifyBaseLogger {

  silent = () => {}

  info = function (msg: any) {
    log(msg)
  }
  error = function (msg: any) {
    log(msg)
  }
  debug = function (msg: any) {
    log(msg)
  }
  fatal = function (msg: any) {
    log(msg)
  }
  warn = function (msg: any) {
    log(msg)
  }
  trace = function (msg: any) {
    log(msg)
  info = (obj?: any, msg?: string, ...args: any[]) => {
    if (obj?.res?.raw?.statusCode || obj?.req?.url) {
      log(
        `[SERVER]::${JSON.stringify({
          level: obj?.level,
          time: obj?.time,
          hostname: obj?.hostname,
          reqId: obj?.req?.id ?? obj?.res?.request?.id,
          res: {
            statusCode: obj?.res?.raw?.statusCode,
          },
          req: {
            method: obj?.req?.method,
            url: obj?.req?.url,
            path: obj?.req?.path,
            hostname: obj?.req?.hostname,
            remoteAddress: obj?.req?.remoteAddress,
            remotePort: obj?.req?.remotePort,
          },
          msg,
          responseTime: obj?.responseTime,
          ...args,
        })}`
      )
    }
  }
  error = function (message: any) {
    log(`[SERVER]::${JSON.stringify(message)}`)
  }
  debug = function (message: any) {
    log(`[SERVER]::${JSON.stringify(message)}`)
  }
  fatal = function (message: any) {
    log(`[SERVER]::${JSON.stringify(message)}`)
  }
  warn = function (message: any) {
    log(`[SERVER]::${JSON.stringify(message)}`)
  }
  trace = function (message: any) {
    log(`[SERVER]::${JSON.stringify(message)}`)
  }
}
@ -266,12 +266,21 @@ const DropdownListSidebar = ({
              value={x.id}
              className={twMerge(
                x.id === selectedModel?.id && 'bg-secondary',
                'my-0 pb-8 pt-4'
                'my-0 py-2'
              )}
            >
              <div className="relative flex w-full justify-between">
                <span className="line-clamp-1 block">{x.name}</span>
                <div className="absolute right-0 top-2 space-x-2">
              <div className="flex w-full items-center justify-between gap-x-4">
                <div className="max-w-[200px]">
                  <p className="line-clamp-2">{x.name}</p>
                  <div
                    className={twMerge(
                      'mt-2 inline-flex items-center space-x-2 text-muted-foreground'
                    )}
                  >
                    <p className="line-clamp-2 text-xs">{x.id}</p>
                  </div>
                </div>
                <div className="flex-shrink-0 space-x-2">
                  <span className="font-bold text-muted-foreground">
                    {toGibibytes(x.metadata.size)}
                  </span>
@ -283,10 +292,12 @@ const DropdownListSidebar = ({
            </SelectItem>
            <div
              className={twMerge(
                'absolute -mt-6 inline-flex items-center space-x-2 px-4 pb-2 text-muted-foreground'
                'absolute -mt-6 ml-4 flex max-w-[200px] items-center space-x-2 text-muted-foreground'
              )}
            >
              <span className="text-xs">{x.id}</span>
              <p className="line-clamp-1 flex-1 text-xs text-transparent">
                {x.id}
              </p>
              {clipboard.copied && copyId === x.id ? (
                <CheckIcon size={16} className="text-green-600" />
              ) : (

@ -34,7 +34,7 @@ const TableActiveModel = () => {
    return (
      <th
        key={i}
        className="px-6 py-2 text-left font-normal last:text-center"
        className="px-4 py-2 text-left font-normal last:text-center"
      >
        {col}
      </th>
@ -46,17 +46,27 @@ const TableActiveModel = () => {
    <Fragment>
      <tbody>
        <tr>
          <td className="px-6 py-2 font-bold">{activeModel.name}</td>
          <td className="px-6 py-2 font-bold">{activeModel.id}</td>
          <td className="px-6 py-2">
          <td
            className="max-w-[200px] px-4 py-2 font-bold"
            title={activeModel.name}
          >
            <p className="line-clamp-2">{activeModel.name}</p>
          </td>
          <td
            className="max-w-[200px] px-4 py-2 font-bold"
            title={activeModel.id}
          >
            <p className="line-clamp-2">{activeModel.id}</p>
          </td>
          <td className="px-4 py-2">
            <Badge themes="secondary">
              {toGibibytes(activeModel.metadata.size)}
            </Badge>
          </td>
          <td className="px-6 py-2">
          <td className="px-4 py-2">
            <Badge themes="secondary">v{activeModel.version}</Badge>
          </td>
          <td className="px-6 py-2 text-center">
          <td className="px-4 py-2 text-center">
            <Tooltip>
              <TooltipTrigger className="w-full">
                <Button
@ -20,6 +20,7 @@ import { SUCCESS_SET_NEW_DESTINATION } from '@/screens/Settings/Advanced/DataFol
import CancelModelImportModal from '@/screens/Settings/CancelModelImportModal'
import ChooseWhatToImportModal from '@/screens/Settings/ChooseWhatToImportModal'
import EditModelInfoModal from '@/screens/Settings/EditModelInfoModal'
import HuggingFaceRepoDetailModal from '@/screens/Settings/HuggingFaceRepoDetailModal'
import ImportModelOptionModal from '@/screens/Settings/ImportModelOptionModal'
import ImportingModelModal from '@/screens/Settings/ImportingModelModal'
import SelectingModelModal from '@/screens/Settings/SelectingModelModal'
@ -75,6 +76,7 @@ const BaseLayout = () => {
      {importModelStage === 'CONFIRM_CANCEL' && <CancelModelImportModal />}
      <ChooseWhatToImportModal />
      <InstallingExtensionModal />
      <HuggingFaceRepoDetailModal />
    </div>
  )
}
29
web/containers/ListContainer/index.tsx
Normal file
@ -0,0 +1,29 @@
import { ReactNode, useEffect, useRef } from 'react'

type Props = {
  children: ReactNode
}

const ListContainer: React.FC<Props> = ({ children }) => {
  const listRef = useRef<HTMLDivElement>(null)

  useEffect(() => {
    const scrollHeight = listRef.current?.scrollHeight ?? 0

    listRef.current?.scrollTo({
      top: scrollHeight,
      behavior: 'smooth',
    })
  })

  return (
    <div
      ref={listRef}
      className="flex h-full w-full flex-col overflow-y-scroll"
    >
      {children}
    </div>
  )
}

export default ListContainer
@ -34,6 +34,9 @@ import {
|
||||
updateThreadAtom,
|
||||
} from '@/helpers/atoms/Thread.atom'
|
||||
|
||||
const maxWordForThreadTitle = 10
|
||||
const defaultThreadTitle = 'New Thread'
|
||||
|
||||
export default function EventHandler({ children }: { children: ReactNode }) {
|
||||
const messages = useAtomValue(getCurrentChatMessagesAtom)
|
||||
const addNewMessage = useSetAtom(addNewMessageAtom)
|
||||
@ -90,34 +93,64 @@ export default function EventHandler({ children }: { children: ReactNode }) {
      }

      const thread = threadsRef.current?.find((e) => e.id == message.thread_id)
      if (!thread) {
        console.warn(
          `Failed to update title for thread ${message.thread_id}: Thread not found!`
        )
        return
      }

      const messageContent = message.content[0]?.text?.value
      if (!messageContent) {
        console.warn(
          `Failed to update title for thread ${message.thread_id}: Responded content is null!`
        )
        return
      }

      // The thread title should only be updated if the message is shorter than 10 words,
      // no new line character is present,
      // and non-alphanumeric characters are removed
      if (thread && messageContent && !messageContent.includes('\n')) {
      if (messageContent.includes('\n')) {
        console.warn(
          `Failed to update title for thread ${message.thread_id}: Title can't contain new line character!`
        )
        return
      }

      // Remove non-alphanumeric characters
      const cleanedMessageContent = messageContent
        .replace(/[^a-z0-9\s]/gi, '')
        .trim()

      // Split the message into words
      const words = cleanedMessageContent.split(' ')
      // Check if the message is less than 10 words
      if (words.length < 10) {
      // Update the Thread title with the response of the inference on the 1st prompt
      updateThread({

      if (words.length >= maxWordForThreadTitle) {
        console.warn(
          `Failed to update title for thread ${message.thread_id}: Title can't be greater than ${maxWordForThreadTitle} words!`
        )
        return
      }

      const updatedThread: Thread = {
        ...thread,

        title: cleanedMessageContent,
        metadata: thread.metadata,
      })
      }

      extensionManager
        .get<ConversationalExtension>(ExtensionTypeEnum.Conversational)
        ?.saveThread({
          ...thread,
          ...updatedThread,
        })
        .then(() => {
          // Update the Thread title with the response of the inference on the 1st prompt
          updateThread({
            ...updatedThread,
          })
        })
      }
      }
    },
    [updateThread]
  )
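The block above replaces a single nested `if` with early-return guard clauses, each logging why a title update was skipped. The same rules, condensed into a standalone helper, might look like this (a sketch for illustration only; `deriveThreadTitle` is not part of this PR):

const maxWordForThreadTitle = 10

// Returns the cleaned title, or undefined when the response should not
// become the thread title (contains a newline, or is 10+ words long).
function deriveThreadTitle(messageContent: string): string | undefined {
  if (messageContent.includes('\n')) return undefined
  // Strip non-alphanumeric characters, keeping whitespace
  const cleaned = messageContent.replace(/[^a-z0-9\s]/gi, '').trim()
  if (cleaned.split(' ').length >= maxWordForThreadTitle) return undefined
  return cleaned
}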
@ -142,7 +175,7 @@ export default function EventHandler({ children }: { children: ReactNode }) {
      setIsGeneratingResponse(false)

      const thread = threadsRef.current?.find((e) => e.id == message.thread_id)
      if (thread) {
      if (!thread) return
      const messageContent = message.content[0]?.text?.value
      const metadata = {
        ...thread.metadata,
@ -168,7 +201,6 @@ export default function EventHandler({ children }: { children: ReactNode }) {

      // Attempt to generate the title of the Thread when needed
      generateThreadTitle(message, thread)
      }
    },
    [setIsGeneratingResponse, updateMessage, updateThread, updateThreadWaiting]
  )
@ -181,6 +213,7 @@ export default function EventHandler({ children }: { children: ReactNode }) {
          break
        default:
          updateThreadMessage(message)
          break
      }
    },
    [updateThreadMessage, updateThreadTitle]
@ -188,11 +221,14 @@ export default function EventHandler({ children }: { children: ReactNode }) {

  const generateThreadTitle = (message: ThreadMessage, thread: Thread) => {
    // If this is the first ever prompt in the thread
    if (
      thread &&
      thread.title?.trim() === 'New Thread' &&
      activeModelRef.current
    ) {
    if (thread.title?.trim() !== defaultThreadTitle) {
      return
    }

    if (!activeModelRef.current) {
      return
    }

    // This is the first time message comes in on a new thread
    // Summarize the first message, and make that the title of the Thread
    // 1. Get the summary of the first prompt using whatever engine user is currently using
@ -200,15 +236,11 @@ export default function EventHandler({ children }: { children: ReactNode }) {

    if (!threadMessages || threadMessages.length === 0) return

    const summarizeFirstPrompt = `Summarize in a 5-word Title. Give the title only. "${threadMessages[0].content[0].text.value}"`
    // Prompt: Given this query from user {query}, return to me the summary in 5 words as the title
    const summarizeFirstPrompt = `Summarize in a ${maxWordForThreadTitle}-word Title. Give the title only. "${threadMessages[0].content[0].text.value}"`

    // Prompt: Given this query from user {query}, return to me the summary in 10 words as the title
    const msgId = ulid()
    const messages: ChatCompletionMessage[] = [
      {
        role: ChatCompletionRole.System,
        content:
          'The conversation below is for a text summarization, user asks assistant to summarize a text and assistant should respond in just less than 10 words',
      },
      {
        role: ChatCompletionRole.User,
        content: summarizeFirstPrompt,
@ -236,7 +268,6 @@ export default function EventHandler({ children }: { children: ReactNode }) {
      engine?.inference(messageRequest)
    }, 1000)
  }
  }

  useEffect(() => {
    if (window.core?.events) {
@ -244,14 +275,13 @@
      events.on(MessageEvent.OnMessageUpdate, onMessageResponseUpdate)
      events.on(ModelEvent.OnModelStopped, onModelStopped)
    }
  }, [onNewMessageResponse, onMessageResponseUpdate, onModelStopped])

  useEffect(() => {
    return () => {
      events.off(MessageEvent.OnMessageResponse, onNewMessageResponse)
      events.off(MessageEvent.OnMessageUpdate, onMessageResponseUpdate)
      events.off(ModelEvent.OnModelStopped, onModelStopped)
    }
  }, [onNewMessageResponse, onMessageResponseUpdate, onModelStopped])

  return <Fragment>{children}</Fragment>
}

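The two effects above split subscription and cleanup: the first registers the handlers once `window.core?.events` exists, the second removes them on unmount. The same behavior is often written as one effect whose return value is the cleanup; a sketch of that equivalent shape, using the same handlers and events shown in the diff:

useEffect(() => {
  if (!window.core?.events) return
  events.on(MessageEvent.OnMessageResponse, onNewMessageResponse)
  events.on(MessageEvent.OnMessageUpdate, onMessageResponseUpdate)
  events.on(ModelEvent.OnModelStopped, onModelStopped)
  // Cleanup mirrors every subscription made above
  return () => {
    events.off(MessageEvent.OnMessageResponse, onNewMessageResponse)
    events.off(MessageEvent.OnMessageUpdate, onMessageResponseUpdate)
    events.off(ModelEvent.OnModelStopped, onModelStopped)
  }
}, [onNewMessageResponse, onMessageResponseUpdate, onModelStopped])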
@ -2,7 +2,7 @@ import { PropsWithChildren, useCallback, useEffect } from 'react'

import React from 'react'

import { DownloadEvent, events, DownloadState } from '@janhq/core'
import { DownloadEvent, events, DownloadState, ModelEvent } from '@janhq/core'
import { useSetAtom } from 'jotai'

import { setDownloadStateAtom } from '@/hooks/useDownloadState'
@ -64,6 +64,7 @@ const EventListenerWrapper = ({ children }: PropsWithChildren) => {
      if (state.downloadType !== 'extension') {
        setDownloadState(state)
      }
      events.emit(ModelEvent.OnModelsUpdate, {})
    },
    [setDownloadState]
  )

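The newly added `events.emit(ModelEvent.OnModelsUpdate, {})` announces that a completed download may have changed the installed model list. A sketch of how a consumer could react to that signal (the hook and its `refreshModels` parameter are hypothetical, not code from this PR):

import { useEffect } from 'react'

import { events, ModelEvent } from '@janhq/core'

// Hypothetical hook: re-fetch the model list whenever a download completes.
const useRefreshOnModelsUpdate = (refreshModels: () => void) => {
  useEffect(() => {
    events.on(ModelEvent.OnModelsUpdate, refreshModels)
    return () => events.off(ModelEvent.OnModelsUpdate, refreshModels)
  }, [refreshModels])
}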
@ -1,44 +0,0 @@
import { HuggingFaceRepoData } from '@janhq/core'
import { atom } from 'jotai'

export const repoIDAtom = atom<string | null>(null)
export const loadingAtom = atom<boolean>(false)
export const fetchErrorAtom = atom<Error | null>(null)
export const conversionStatusAtom = atom<
  | 'downloading'
  | 'converting'
  | 'quantizing'
  | 'done'
  | 'stopping'
  | 'generating'
  | null
>(null)
export const conversionErrorAtom = atom<Error | null>(null)
const _repoDataAtom = atom<HuggingFaceRepoData | null>(null)
const _unsupportedAtom = atom<boolean>(false)

export const resetAtom = atom(null, (_get, set) => {
  set(repoIDAtom, null)
  set(loadingAtom, false)
  set(fetchErrorAtom, null)
  set(conversionStatusAtom, null)
  set(conversionErrorAtom, null)
  set(_repoDataAtom, null)
  set(_unsupportedAtom, false)
})

export const repoDataAtom = atom(
  (get) => get(_repoDataAtom),
  (_get, set, repoData: HuggingFaceRepoData) => {
    set(_repoDataAtom, repoData)
    if (
      !repoData.tags.includes('transformers') ||
      (!repoData.tags.includes('pytorch') &&
        !repoData.tags.includes('safetensors'))
    ) {
      set(_unsupportedAtom, true)
    }
  }
)

export const unsupportedAtom = atom((get) => get(_unsupportedAtom))
12
web/helpers/atoms/HuggingFace.atom.ts
Normal file
@ -0,0 +1,12 @@
import { HuggingFaceRepoData } from '@janhq/core/.'
import { atom } from 'jotai'

// modals
export type ImportHuggingFaceModelStage = 'NONE' | 'REPO_DETAIL'

export const importingHuggingFaceRepoDataAtom = atom<
  HuggingFaceRepoData | undefined
>(undefined)

export const importHuggingFaceModelStageAtom =
  atom<ImportHuggingFaceModelStage>('NONE')
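These two atoms drive the new Hugging Face import flow: the stage atom gates which modal is visible, and the repo-data atom carries what it displays. A minimal consumer sketch (the component is hypothetical, and it assumes `HuggingFaceRepoData` exposes an `id` field):

import React from 'react'

import { useAtomValue } from 'jotai'

import {
  importHuggingFaceModelStageAtom,
  importingHuggingFaceRepoDataAtom,
} from '@/helpers/atoms/HuggingFace.atom'

// Hypothetical consumer: only render repo details in the REPO_DETAIL stage.
const RepoSummary: React.FC = () => {
  const stage = useAtomValue(importHuggingFaceModelStageAtom)
  const repoData = useAtomValue(importingHuggingFaceRepoDataAtom)

  if (stage !== 'REPO_DETAIL' || !repoData) return null
  return <div>{repoData.id}</div> // assumes an `id` field on the repo data
}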
@ -46,6 +46,8 @@ export const removeDownloadedModelAtom = atom(

export const configuredModelsAtom = atom<Model[]>([])

export const defaultModelAtom = atom<Model | undefined>(undefined)

/// TODO: move this part to another atom
// store the paths of the models that are being imported
export const importingModelsAtom = atom<ImportingModel[]>([])

@ -1,79 +0,0 @@
import {
  ExtensionTypeEnum,
  HuggingFaceExtension,
  HuggingFaceRepoData,
  Quantization,
} from '@janhq/core'

import { useAtomValue, useSetAtom } from 'jotai'

import { extensionManager } from '@/extension/ExtensionManager'
import { ignoreSslAtom, proxyAtom } from '@/helpers/atoms/AppConfig.atom'
import {
  conversionStatusAtom,
  conversionErrorAtom,
} from '@/helpers/atoms/HFConverter.atom'

export const useConvertHuggingFaceModel = () => {
  const proxy = useAtomValue(proxyAtom)
  const ignoreSSL = useAtomValue(ignoreSslAtom)
  const setConversionStatus = useSetAtom(conversionStatusAtom)
  const setConversionError = useSetAtom(conversionErrorAtom)

  const convertHuggingFaceModel = async (
    repoID: string,
    repoData: HuggingFaceRepoData,
    quantization: Quantization
  ) => {
    const extension = await extensionManager.get<HuggingFaceExtension>(
      ExtensionTypeEnum.HuggingFace
    )
    try {
      if (extension) {
        extension.interrupted = false
      }
      setConversionStatus('downloading')
      await extension?.downloadModelFiles(repoID, repoData, {
        ignoreSSL,
        proxy,
      })
      if (extension?.interrupted) return
      setConversionStatus('converting')
      await extension?.convert(repoID)
      if (extension?.interrupted) return
      setConversionStatus('quantizing')
      await extension?.quantize(repoID, quantization)
      if (extension?.interrupted) return
      setConversionStatus('generating')
      await extension?.generateMetadata(repoID, repoData, quantization)
      setConversionStatus('done')
    } catch (err) {
      if (extension?.interrupted) return
      extension?.cancelConvert(repoID, repoData)
      if (typeof err === 'number') {
        setConversionError(new Error(`exit code: ${err}`))
      } else {
        setConversionError(err as Error)
      }
      console.error(err)
    }
  }

  const cancelConvertHuggingFaceModel = async (
    repoID: string,
    repoData: HuggingFaceRepoData
  ) => {
    const extension = await extensionManager.get<HuggingFaceExtension>(
      ExtensionTypeEnum.HuggingFace
    )

    setConversionStatus('stopping')
    await extension?.cancelConvert(repoID, repoData)
    setConversionStatus(null)
  }

  return {
    convertHuggingFaceModel,
    cancelConvertHuggingFaceModel,
  }
}
@ -1,31 +1,41 @@
import { useAtomValue, useSetAtom } from 'jotai'
import { useCallback, useState } from 'react'

import {
  repoDataAtom,
  repoIDAtom,
  loadingAtom,
  fetchErrorAtom,
} from '@/helpers/atoms/HFConverter.atom'
  ExtensionTypeEnum,
  HuggingFaceRepoData,
  ModelExtension,
} from '@janhq/core'

import { extensionManager } from '@/extension'

export const useGetHFRepoData = () => {
  const repoID = useAtomValue(repoIDAtom)
  const setRepoData = useSetAtom(repoDataAtom)
  const setLoading = useSetAtom(loadingAtom)
  const setFetchError = useSetAtom(fetchErrorAtom)
  const [error, setError] = useState<string | undefined>(undefined)
  const [loading, setLoading] = useState(false)

  const getRepoData = async () => {
    setLoading(true)
  const getHfRepoData = useCallback(async (repoId: string) => {
    try {
      const res = await fetch(`https://huggingface.co/api/models/${repoID}`)
      const data = await res.json()
      setRepoData(data)
      setError(undefined)
      setLoading(true)
      const data = await extensionGetHfRepoData(repoId)
      return data
    } catch (err) {
      setFetchError(
        Error("The repo does not exist or you don't have access to it.")
      )
      console.error(err)
      if (err instanceof Error) {
        setError(err.message)
      }
      throw err
    } finally {
      setLoading(false)
    }
  }, [])

  return getRepoData
  return { loading, error, getHfRepoData }
}

const extensionGetHfRepoData = async (
  repoId: string
): Promise<HuggingFaceRepoData | undefined> => {
  return extensionManager
    .get<ModelExtension>(ExtensionTypeEnum.Model)
    ?.fetchHuggingFaceRepoData(repoId)
}

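The rewritten hook now delegates the fetch to the Model extension and tracks its own `loading`/`error` state instead of shared atoms. A usage sketch (component, import path, and repo id are assumptions):

import React from 'react'

import { useGetHFRepoData } from '@/hooks/useGetHFRepoData'

// Hypothetical consumer of the rewritten hook.
const FetchRepoButton: React.FC = () => {
  const { loading, error, getHfRepoData } = useGetHFRepoData()

  const onClick = async () => {
    try {
      const data = await getHfRepoData('janhq/some-repo') // hypothetical repo id
      console.log('repo tags:', data?.tags)
    } catch {
      // the hook has already recorded the failure in its error state
    }
  }

  return (
    <button onClick={onClick} disabled={loading}>
      {error ?? 'Fetch repo data'}
    </button>
  )
}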
@ -13,25 +13,37 @@ import { useSetAtom } from 'jotai'
import { extensionManager } from '@/extension'
import {
  configuredModelsAtom,
  defaultModelAtom,
  downloadedModelsAtom,
} from '@/helpers/atoms/Model.atom'

const useModels = () => {
  const setDownloadedModels = useSetAtom(downloadedModelsAtom)
  const setConfiguredModels = useSetAtom(configuredModelsAtom)
  const setDefaultModel = useSetAtom(defaultModelAtom)

  const getData = useCallback(() => {
    const getDownloadedModels = async () => {
      const models = await getLocalDownloadedModels()
      setDownloadedModels(models)
    }

    const getConfiguredModels = async () => {
      const models = await getLocalConfiguredModels()
      setConfiguredModels(models)
    }
    getDownloadedModels()
    getConfiguredModels()
  }, [setDownloadedModels, setConfiguredModels])

    const getDefaultModel = async () => {
      const defaultModel = await getLocalDefaultModel()
      setDefaultModel(defaultModel)
    }

    Promise.all([
      getDownloadedModels(),
      getConfiguredModels(),
      getDefaultModel(),
    ])
  }, [setDownloadedModels, setConfiguredModels, setDefaultModel])

  useEffect(() => {
    // Try get data on mount
@ -46,6 +58,11 @@ const useModels = () => {
  }, [getData])
}

const getLocalDefaultModel = async (): Promise<Model | undefined> =>
  extensionManager
    .get<ModelExtension>(ExtensionTypeEnum.Model)
    ?.getDefaultModel()

const getLocalConfiguredModels = async (): Promise<Model[]> =>
  extensionManager
    .get<ModelExtension>(ExtensionTypeEnum.Model)

@ -34,13 +34,10 @@ export default function useUpdateModelParameters() {

  const updateModelParameter = useCallback(
    async (thread: Thread, settings: UpdateModelParameter) => {
      const params = settings.modelId
        ? settings.params
        : { ...activeModelParams, ...settings.params }

      const updatedModelParams: ModelParams = {
        ...params,
      }
      const toUpdateSettings = processStopWords(settings.params ?? {})
      const updatedModelParams = settings.modelId
        ? toUpdateSettings
        : { ...activeModelParams, ...toUpdateSettings }

      // update the state
      setThreadModelParams(thread.id, updatedModelParams)
@ -73,5 +70,13 @@ export default function useUpdateModelParameters() {
    [activeModelParams, selectedModel, setThreadModelParams]
  )

  const processStopWords = (params: ModelParams): ModelParams => {
    if ('stop' in params && typeof params['stop'] === 'string') {
      // The input arrives as a single string, but stop words are stored as an array of strings (space-separated)
      params['stop'] = (params['stop'] as string).split(' ')
    }
    return params
  }

  return { updateModelParameter }
}

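`processStopWords` normalizes the `stop` parameter before it is persisted: a space-separated string typed into the settings UI becomes the array of stop sequences the engine expects. A small sketch of the behavior (assuming the helper were exported; values are hypothetical):

// Hypothetical input as typed in the settings UI: one space-separated string.
const params: ModelParams = { stop: '</s> [INST]' }

// After processing, each token is its own stop sequence:
processStopWords(params) // => { stop: ['</s>', '[INST]'] }

One consequence of using a space as the separator is that an individual stop sequence can never itself contain a space.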
@ -23,10 +23,12 @@
    "framer-motion": "^10.16.4",
    "highlight.js": "^11.9.0",
    "jotai": "^2.6.0",
    "katex": "^0.16.10",
    "lodash": "^4.17.21",
    "lucide-react": "^0.291.0",
    "marked": "^9.1.2",
    "marked-highlight": "^2.0.6",
    "marked-katex-extension": "^5.0.1",
    "next": "14.0.1",
    "next-themes": "^0.2.1",
    "postcss": "8.4.31",

@ -1,15 +0,0 @@
<svg width="68" height="28" viewBox="0 0 68 28" fill="none" xmlns="http://www.w3.org/2000/svg"> … </svg>
(deleted logo asset: 13 path elements of inline vector data omitted)
Before Width: | Height: | Size: 10 KiB |
@ -1,6 +0,0 @@
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M4 6C4 4.89543 4.89543 4 6 4H8C9.10457 4 10 4.89543 10 6V8C10 9.10457 9.10457 10 8 10H6C4.89543 10 4 9.10457 4 8V6Z" stroke="#9CA3AF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M14 6C14 4.89543 14.8954 4 16 4H18C19.1046 4 20 4.89543 20 6V8C20 9.10457 19.1046 10 18 10H16C14.8954 10 14 9.10457 14 8V6Z" stroke="#9CA3AF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M4 16C4 14.8954 4.89543 14 6 14H8C9.10457 14 10 14.8954 10 16V18C10 19.1046 9.10457 20 8 20H6C4.89543 20 4 19.1046 4 18V16Z" stroke="#9CA3AF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M14 16C14 14.8954 14.8954 14 16 14H18C19.1046 14 20 14.8954 20 16V18C20 19.1046 19.1046 20 18 20H16C14.8954 20 14 19.1046 14 18V16Z" stroke="#9CA3AF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
Before Width: | Height: | Size: 971 B |
@ -1 +0,0 @@
<svg version="1.1" baseProfile="full" xmlns="http://www.w3.org/2000/svg" viewBox="0, 0, 100, 100" class="la-all letterAvatar2 li-la2-Landon"><circle cx="50" cy="50" r="49" fill="#a12e9d" stroke="#ffffff" stroke-width="1" class="la2-circle"></circle><text x="50" y="71.5" font-family="Arial" font-size="60" font-weight="700" text-anchor="middle" fill="#ffffff" class="la2-text">U</text></svg>
Before Width: | Height: | Size: 391 B |
@ -1,9 +0,0 @@
<svg width="22" height="22" viewBox="0 0 22 22" fill="none" xmlns="http://www.w3.org/2000/svg"> … </svg>
(deleted icon asset: 7 path elements of inline vector data omitted)
Before Width: | Height: | Size: 2.0 KiB |
@ -1,9 +0,0 @@
<svg width="22" height="22" viewBox="0 0 22 22" fill="none" xmlns="http://www.w3.org/2000/svg"> … </svg>
(deleted icon asset, near-duplicate of the previous one: 7 path elements of inline vector data omitted)
Before Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 303 KiB |
Before Width: | Height: | Size: 339 KiB |
Before Width: | Height: | Size: 163 KiB |
@ -1,8 +1,8 @@
import ScrollToBottom from 'react-scroll-to-bottom'

import { MessageStatus } from '@janhq/core'
import { useAtomValue } from 'jotai'

import ListContainer from '@/containers/ListContainer'

import { loadModelErrorAtom } from '@/hooks/useActiveModel'

import ChatItem from '../ChatItem'
@ -26,7 +26,7 @@ const ChatBody: React.FC = () => {
  if (messages.length === 0) return <EmptyThread />

  return (
    <ScrollToBottom className="flex h-full w-full flex-col">
    <ListContainer>
      {messages.map((message, index) => (
        <div key={message.id}>
          {message.status !== MessageStatus.Error &&
@ -43,7 +43,7 @@ const ChatBody: React.FC = () => {
        </div>
      ))}
      {loadModelError && <LoadModelError />}
    </ScrollToBottom>
    </ListContainer>
  )
}

@ -1,4 +1,4 @@
import { SettingComponentProps } from '@janhq/core/.'
import { SettingComponentProps } from '@janhq/core'

import SettingComponentBuilder from '../../Chat/ModelSetting/SettingComponent'

@ -1,6 +1,6 @@
import React from 'react'

import { SettingComponentProps } from '@janhq/core/.'
import { SettingComponentProps } from '@janhq/core'

import SettingComponentBuilder from './SettingComponent'