diff --git a/.github/workflows/jan-electron-linter-and-test.yml b/.github/workflows/jan-electron-linter-and-test.yml
index a9f934cfd..3221a63ec 100644
--- a/.github/workflows/jan-electron-linter-and-test.yml
+++ b/.github/workflows/jan-electron-linter-and-test.yml
@@ -57,19 +57,19 @@ jobs:
          rm -rf ~/jan
          make clean
 
-      - name: Get Commit Message for PR
-        if : github.event_name == 'pull_request'
-        run: |
-          echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}})" >> $GITHUB_ENV
+      # - name: Get Commit Message for PR
+      #   if : github.event_name == 'pull_request'
+      #   run: |
+      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}})" >> $GITHUB_ENV
 
-      - name: Get Commit Message for push event
-        if : github.event_name == 'push'
-        run: |
-          echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}})" >> $GITHUB_ENV
+      # - name: Get Commit Message for push event
+      #   if : github.event_name == 'push'
+      #   run: |
+      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}})" >> $GITHUB_ENV
 
-      - name: "Config report portal"
-        run: |
-          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App macos" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
+      # - name: "Config report portal"
+      #   run: |
+      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App macos" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
 
       - name: Linter and test
         run: |
@@ -78,9 +78,9 @@ jobs:
           make test
         env:
           CSC_IDENTITY_AUTO_DISCOVERY: "false"
-          TURBO_API: "${{ secrets.TURBO_API }}"
-          TURBO_TEAM: "macos"
-          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
+          # TURBO_API: "${{ secrets.TURBO_API }}"
+          # TURBO_TEAM: "macos"
+          # TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
 
   test-on-macos-pr-target:
     if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
@@ -141,16 +141,16 @@ jobs:
         }
         make clean
 
-      - name: Get Commit Message for push event
-        if : github.event_name == 'push'
-        shell: bash
-        run: |
-          echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
+      # - name: Get Commit Message for push event
+      #   if : github.event_name == 'push'
+      #   shell: bash
+      #   run: |
+      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
 
-      - name: "Config report portal"
-        shell: bash
-        run: |
-          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows ${{ matrix.antivirus-tools }}" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
+      # - name: "Config report portal"
+      #   shell: bash
+      #   run: |
+      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows ${{ matrix.antivirus-tools }}" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
 
       - name: Linter and test
         shell: powershell
@@ -158,10 +158,10 @@ jobs:
           npm config set registry ${{ secrets.NPM_PROXY }} --global
           yarn config set registry ${{ secrets.NPM_PROXY }} --global
           make test
-        env:
-          TURBO_API: "${{ secrets.TURBO_API }}"
-          TURBO_TEAM: "windows"
-          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
+        # env:
+        #   TURBO_API: "${{ secrets.TURBO_API }}"
+        #   TURBO_TEAM: "windows"
+        #   TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
 
   test-on-windows-pr:
     if: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
     runs-on: windows-desktop-default-windows-security
@@ -189,16 +189,16 @@ jobs:
         }
         make clean
 
-      - name: Get Commit Message for PR
-        if : github.event_name == 'pull_request'
-        shell: bash
-        run: |
-          echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
+      # - name: Get Commit Message for PR
+      #   if : github.event_name == 'pull_request'
+      #   shell: bash
+      #   run: |
+      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
 
-      - name: "Config report portal"
-        shell: bash
-        run: |
-          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
+      # - name: "Config report portal"
+      #   shell: bash
+      #   run: |
+      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Windows" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
 
       - name: Linter and test
         shell: powershell
@@ -206,10 +206,10 @@ jobs:
           npm config set registry ${{ secrets.NPM_PROXY }} --global
           yarn config set registry ${{ secrets.NPM_PROXY }} --global
           make test
-        env:
-          TURBO_API: "${{ secrets.TURBO_API }}"
-          TURBO_TEAM: "windows"
-          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
+        # env:
+        #   TURBO_API: "${{ secrets.TURBO_API }}"
+        #   TURBO_TEAM: "windows"
+        #   TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
 
   test-on-windows-pr-target:
     if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
@@ -266,20 +266,20 @@ jobs:
         rm -rf ~/jan
         make clean
 
-      - name: Get Commit Message for PR
-        if : github.event_name == 'pull_request'
-        run: |
-          echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
+      # - name: Get Commit Message for PR
+      #   if : github.event_name == 'pull_request'
+      #   run: |
+      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.event.after}}" >> $GITHUB_ENV
 
-      - name: Get Commit Message for push event
-        if : github.event_name == 'push'
-        run: |
-          echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
+      # - name: Get Commit Message for push event
+      #   if : github.event_name == 'push'
+      #   run: |
+      #     echo "REPORT_PORTAL_DESCRIPTION=${{github.sha}}" >> $GITHUB_ENV
 
-      - name: "Config report portal"
-        shell: bash
-        run: |
-          make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Linux" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
+      # - name: "Config report portal"
+      #   shell: bash
+      #   run: |
+      #     make update-playwright-config REPORT_PORTAL_URL=${{ secrets.REPORT_PORTAL_URL }} REPORT_PORTAL_API_KEY=${{ secrets.REPORT_PORTAL_API_KEY }} REPORT_PORTAL_PROJECT_NAME=${{ secrets.REPORT_PORTAL_PROJECT_NAME }} REPORT_PORTAL_LAUNCH_NAME="Jan App Linux" REPORT_PORTAL_DESCRIPTION="${{env.REPORT_PORTAL_DESCRIPTION}}"
 
       - name: Linter and test
         run: |
@@ -288,10 +288,10 @@ jobs:
           npm config set registry ${{ secrets.NPM_PROXY }} --global
           yarn config set registry ${{ secrets.NPM_PROXY }} --global
           make test
-        env:
-          TURBO_API: "${{ secrets.TURBO_API }}"
-          TURBO_TEAM: "linux"
-          TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
+        # env:
+        #   TURBO_API: "${{ secrets.TURBO_API }}"
+        #   TURBO_TEAM: "linux"
+        #   TURBO_TOKEN: "${{ secrets.TURBO_TOKEN }}"
 
   test-on-ubuntu-pr-target:
     runs-on: [self-hosted, Linux, ubuntu-desktop]
diff --git a/core/src/browser/core.ts b/core/src/browser/core.ts
index 088c32e4e..fdbceb06b 100644
--- a/core/src/browser/core.ts
+++ b/core/src/browser/core.ts
@@ -28,6 +28,15 @@ const downloadFile: (downloadRequest: DownloadRequest, network?: NetworkConfig)
   network
 ) => globalThis.core?.api?.downloadFile(downloadRequest, network)
 
+/**
+ * Gets the size in bytes of a remote file.
+ *
+ * @param url - The url of the file.
+ * @returns {Promise<number>} - A promise that resolves with the file size.
+ */
+const getFileSize: (url: string) => Promise<number> = (url: string) =>
+  globalThis.core.api?.getFileSize(url)
+
 /**
  * Aborts the download of a specific file.
  * @param {string} fileName - The name of the file whose download is to be aborted.
@@ -122,6 +131,7 @@ const systemInformation: () => Promise<SystemInformation> = () =>
  */
 const showToast: (title: string, message: string) => void = (title, message) =>
   globalThis.core.api?.showToast(title, message)
+
 /**
  * Register extension point function type definition
  */
@@ -150,5 +160,6 @@ export {
   getUserHomePath,
   systemInformation,
   showToast,
+  getFileSize,
   FileStat,
 }
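The new `getFileSize` bridge slots in next to `downloadFile`, so a caller can size a remote artifact before committing to a download. A minimal usage sketch follows; the wrapper and its 20 GB cutoff are illustrative, not part of this diff.

```ts
import { getFileSize, downloadFile, DownloadRequest } from '@janhq/core'

// Sketch only: refuse to queue downloads whose reported size exceeds a cutoff.
// getFileSize resolves the Content-Length reported by a HEAD request (-1 if absent).
async function downloadWithSizeCheck(url: string, localPath: string): Promise<void> {
  const size = await getFileSize(url)
  if (size > 20 * 1024 ** 3) {
    throw new Error(`Refusing to download ${url}: ${size} bytes reported`)
  }
  const request: DownloadRequest = { url, localPath }
  downloadFile(request)
}
```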
diff --git a/core/src/browser/extensions/huggingface.ts b/core/src/browser/extensions/huggingface.ts
deleted file mode 100644
index b9c9626a0..000000000
--- a/core/src/browser/extensions/huggingface.ts
+++ /dev/null
@@ -1,30 +0,0 @@
-import { BaseExtension, ExtensionTypeEnum } from '../extension'
-import { HuggingFaceInterface, HuggingFaceRepoData, Quantization } from '../../types/huggingface'
-import { Model } from '../../types/model'
-
-/**
- * Hugging Face extension for converting HF models to GGUF.
- */
-export abstract class HuggingFaceExtension extends BaseExtension implements HuggingFaceInterface {
-  interrupted = false
-  /**
-   * Hugging Face extension type.
-   */
-  type(): ExtensionTypeEnum | undefined {
-    return ExtensionTypeEnum.HuggingFace
-  }
-
-  abstract downloadModelFiles(
-    repoID: string,
-    repoData: HuggingFaceRepoData,
-    network?: { ignoreSSL?: boolean; proxy?: string }
-  ): Promise<void>
-  abstract convert(repoID: string): Promise<void>
-  abstract quantize(repoID: string, quantization: Quantization): Promise<void>
-  abstract generateMetadata(
-    repoID: string,
-    repoData: HuggingFaceRepoData,
-    quantization: Quantization
-  ): Promise<void>
-  abstract cancelConvert(repoID: string, repoData: HuggingFaceRepoData): Promise<void>
-}
diff --git a/core/src/browser/extensions/index.ts b/core/src/browser/extensions/index.ts
index 768886d49..85d5a8583 100644
--- a/core/src/browser/extensions/index.ts
+++ b/core/src/browser/extensions/index.ts
@@ -24,11 +24,6 @@ export { AssistantExtension } from './assistant'
  */
 export { ModelExtension } from './model'
 
-/**
- * Hugging Face extension for converting HF models to GGUF.
- */
-export { HuggingFaceExtension } from './huggingface'
-
 /**
  * Base AI Engines.
  */
diff --git a/core/src/browser/extensions/model.ts b/core/src/browser/extensions/model.ts
index 6dd52f192..5b3089403 100644
--- a/core/src/browser/extensions/model.ts
+++ b/core/src/browser/extensions/model.ts
@@ -1,5 +1,12 @@
 import { BaseExtension, ExtensionTypeEnum } from '../extension'
-import { GpuSetting, ImportingModel, Model, ModelInterface, OptionType } from '../../types'
+import {
+  GpuSetting,
+  HuggingFaceRepoData,
+  ImportingModel,
+  Model,
+  ModelInterface,
+  OptionType,
+} from '../../types'
 
 /**
  * Model extension for managing models.
@@ -24,4 +31,6 @@ export abstract class ModelExtension extends BaseExtension implements ModelInter
   abstract getConfiguredModels(): Promise<Model[]>
   abstract importModels(models: ImportingModel[], optionType: OptionType): Promise<void>
   abstract updateModelInfo(modelInfo: Partial<Model>): Promise<Model>
+  abstract fetchHuggingFaceRepoData(repoId: string): Promise<HuggingFaceRepoData>
+  abstract getDefaultModel(): Promise<Model>
 }
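`ModelExtension` now absorbs the Hugging Face surface removed above. Below is a hedged sketch of what a conforming `fetchHuggingFaceRepoData` could look like, assuming the public hub endpoint `https://huggingface.co/api/models/<repoId>`; the real implementation ships in the model extension, outside this diff.

```ts
import { HuggingFaceRepoData } from '@janhq/core'

// Sketch only: fetch repo metadata from the public Hugging Face hub API.
// The response shape (id, siblings, tags, ...) lines up with HuggingFaceRepoData.
async function fetchHuggingFaceRepoData(repoId: string): Promise<HuggingFaceRepoData> {
  const res = await fetch(`https://huggingface.co/api/models/${repoId}`)
  if (!res.ok) {
    throw new Error(`Failed to fetch repo data for ${repoId}: HTTP ${res.status}`)
  }
  return (await res.json()) as HuggingFaceRepoData
}
```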
diff --git a/core/src/node/api/processors/download.ts b/core/src/node/api/processors/download.ts
index 98464dd52..a4af47400 100644
--- a/core/src/node/api/processors/download.ts
+++ b/core/src/node/api/processors/download.ts
@@ -66,6 +66,7 @@ export class Downloader implements Processor {
       localPath: normalizedPath,
     }
     DownloadManager.instance.downloadProgressMap[modelId] = initialDownloadState
+    DownloadManager.instance.downloadInfo[normalizedPath] = initialDownloadState
 
     if (downloadRequest.downloadType === 'extension') {
       observer?.(DownloadEvent.onFileDownloadUpdate, initialDownloadState)
@@ -118,19 +119,42 @@ export class Downloader implements Processor {
     if (rq) {
       DownloadManager.instance.networkRequests[fileName] = undefined
       rq?.abort()
-    } else {
-      observer?.(DownloadEvent.onFileDownloadError, {
-        fileName,
-        error: 'aborted',
-      })
     }
+
+    const downloadInfo = DownloadManager.instance.downloadInfo[fileName]
+    observer?.(DownloadEvent.onFileDownloadError, {
+      ...downloadInfo,
+      fileName,
+      error: 'aborted',
+    })
   }
 
-  resumeDownload(observer: any, fileName: any) {
+  resumeDownload(_observer: any, fileName: any) {
     DownloadManager.instance.networkRequests[fileName]?.resume()
   }
 
-  pauseDownload(observer: any, fileName: any) {
+  pauseDownload(_observer: any, fileName: any) {
     DownloadManager.instance.networkRequests[fileName]?.pause()
   }
+
+  async getFileSize(_observer: any, url: string): Promise<number> {
+    return new Promise((resolve, reject) => {
+      const request = require('request')
+      request(
+        {
+          url,
+          method: 'HEAD',
+        },
+        function (err: any, response: any) {
+          if (err) {
+            console.error('Getting file size failed:', err)
+            reject(err)
+          } else {
+            const size: number = Number(response.headers['content-length'] ?? -1)
+            resolve(size)
+          }
+        }
+      )
+    })
+  }
 }
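The size probe above rides on the long-deprecated `request` package, and HTTP headers arrive as strings, hence the `Number(...)` coercion around `content-length`. For comparison, a dependency-free sketch of the same HEAD probe with Node's built-in `https` module (redirects are not followed here):

```ts
import https from 'node:https'

// Sketch: HEAD request for Content-Length using only the standard library.
// Resolves -1 when the server does not report a size.
function getRemoteFileSize(url: string): Promise<number> {
  return new Promise((resolve, reject) => {
    const req = https.request(url, { method: 'HEAD' }, (res) => {
      res.resume() // drain the (empty) body so the socket is freed
      const len = res.headers['content-length']
      resolve(len !== undefined ? Number(len) : -1)
    })
    req.on('error', reject)
    req.end()
  })
}
```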
diff --git a/core/src/node/helper/download.ts b/core/src/node/helper/download.ts
index b9fb88bb5..b7560d100 100644
--- a/core/src/node/helper/download.ts
+++ b/core/src/node/helper/download.ts
@@ -8,8 +8,12 @@ export class DownloadManager {
 
   public static instance: DownloadManager = new DownloadManager()
 
+  // store the download information, keyed by model id
   public downloadProgressMap: Record<string, DownloadState> = {}
+  // store the download information, keyed by normalized file path
+  public downloadInfo: Record<string, DownloadState> = {}
+
   constructor() {
     if (DownloadManager.instance) {
       return DownloadManager.instance
diff --git a/core/src/types/api/index.ts b/core/src/types/api/index.ts
index d95d0474e..fb0167a04 100644
--- a/core/src/types/api/index.ts
+++ b/core/src/types/api/index.ts
@@ -32,7 +32,6 @@ export enum AppRoute {
   startServer = 'startServer',
   stopServer = 'stopServer',
   log = 'log',
-  logServer = 'logServer',
   systemInformation = 'systemInformation',
   showToast = 'showToast',
 }
@@ -52,6 +51,7 @@ export enum DownloadRoute {
   pauseDownload = 'pauseDownload',
   resumeDownload = 'resumeDownload',
   getDownloadProgress = 'getDownloadProgress',
+  getFileSize = 'getFileSize',
 }
 
 export enum DownloadEvent {
diff --git a/core/src/types/huggingface/huggingfaceEntity.ts b/core/src/types/huggingface/huggingfaceEntity.ts
index c3c320354..da846900b 100644
--- a/core/src/types/huggingface/huggingfaceEntity.ts
+++ b/core/src/types/huggingface/huggingfaceEntity.ts
@@ -1,34 +1,65 @@
 export interface HuggingFaceRepoData {
   id: string
+  modelId: string
+  modelUrl?: string
   author: string
+  sha: string
+  downloads: number
+  lastModified: string
+  private: boolean
+  disabled: boolean
+  gated: boolean
+  pipeline_tag: 'text-generation'
   tags: Array<'transformers' | 'pytorch' | 'safetensors' | string>
+  cardData: Record<CardDataKeys | string, unknown>
   siblings: {
     rfilename: string
+    downloadUrl?: string
+    fileSize?: number
+    quantization?: Quantization
   }[]
-  createdAt: string // ISO 8601 timestamp
+  createdAt: string
 }
 
-/* eslint-disable @typescript-eslint/naming-convention */
-export enum Quantization {
-  Q3_K_S = 'Q3_K_S',
-  Q3_K_M = 'Q3_K_M', // eslint-disable-line @typescript-eslint/no-duplicate-enum-values
-  Q3_K_L = 'Q3_K_L',
-  Q4_K_S = 'Q4_K_S',
-  Q4_K_M = 'Q4_K_M', // eslint-disable-line @typescript-eslint/no-duplicate-enum-values
-  Q5_K_S = 'Q5_K_S',
-  Q5_K_M = 'Q5_K_M', // eslint-disable-line @typescript-eslint/no-duplicate-enum-values
-  Q4_0 = 'Q4_0',
-  Q4_1 = 'Q4_1',
-  Q5_0 = 'Q5_0',
-  Q5_1 = 'Q5_1',
-  IQ2_XXS = 'IQ2_XXS',
-  IQ2_XS = 'IQ2_XS',
-  Q2_K = 'Q2_K',
-  Q2_K_S = 'Q2_K_S',
-  Q6_K = 'Q6_K',
-  Q8_0 = 'Q8_0',
-  F16 = 'F16',
-  F32 = 'F32',
-  COPY = 'COPY',
-}
-/* eslint-enable @typescript-eslint/naming-convention */
+const CardDataKeys = [
+  'base_model',
+  'datasets',
+  'inference',
+  'language',
+  'library_name',
+  'license',
+  'model_creator',
+  'model_name',
+  'model_type',
+  'pipeline_tag',
+  'prompt_template',
+  'quantized_by',
+  'tags',
+] as const
+export type CardDataKeysTuple = typeof CardDataKeys
+export type CardDataKeys = CardDataKeysTuple[number]
+
+export const AllQuantizations = [
+  'Q3_K_S',
+  'Q3_K_M',
+  'Q3_K_L',
+  'Q4_K_S',
+  'Q4_K_M',
+  'Q5_K_S',
+  'Q5_K_M',
+  'Q4_0',
+  'Q4_1',
+  'Q5_0',
+  'Q5_1',
+  'IQ2_XXS',
+  'IQ2_XS',
+  'Q2_K',
+  'Q2_K_S',
+  'Q6_K',
+  'Q8_0',
+  'F16',
+  'F32',
+  'COPY',
+]
+export type QuantizationsTuple = typeof AllQuantizations
+export type Quantization = QuantizationsTuple[number]
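One subtlety in the rewritten entity file: `CardDataKeys` is declared `as const`, so its member type stays a union of string literals, but `AllQuantizations` is not, which makes `QuantizationsTuple` plain `string[]` and the derived `Quantization` type collapse to `string`. A const assertion would preserve the literal union:

```ts
// Without `as const`, the array widens and the member type is just `string`:
const Widened = ['Q4_0', 'Q8_0']
type WidenedMember = (typeof Widened)[number] // string

// With `as const`, the tuple keeps its literal element types:
const Narrowed = ['Q4_0', 'Q8_0'] as const
type NarrowedMember = (typeof Narrowed)[number] // 'Q4_0' | 'Q8_0'
```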
diff --git a/core/src/types/huggingface/huggingfaceInterface.ts b/core/src/types/huggingface/huggingfaceInterface.ts
deleted file mode 100644
index c99b2177d..000000000
--- a/core/src/types/huggingface/huggingfaceInterface.ts
+++ /dev/null
@@ -1,58 +0,0 @@
-import { Model } from '../model'
-import { HuggingFaceRepoData, Quantization } from './huggingfaceEntity'
-
-/**
- * Hugging Face extension for converting HF models to GGUF.
- * @extends BaseExtension
- */
-export interface HuggingFaceInterface {
-  interrupted: boolean
-  /**
-   * Downloads a Hugging Face model.
-   * @param repoID - The repo ID of the model to convert.
-   * @param repoData - The repo data of the model to convert.
-   * @param network - Optional object to specify proxy/whether to ignore SSL certificates.
-   * @returns A promise that resolves when the download is complete.
-   */
-  downloadModelFiles(
-    repoID: string,
-    repoData: HuggingFaceRepoData,
-    network?: { ignoreSSL?: boolean; proxy?: string }
-  ): Promise<void>
-
-  /**
-   * Converts a Hugging Face model to GGUF.
-   * @param repoID - The repo ID of the model to convert.
-   * @returns A promise that resolves when the conversion is complete.
-   */
-  convert(repoID: string): Promise<void>
-
-  /**
-   * Quantizes a GGUF model.
-   * @param repoID - The repo ID of the model to quantize.
-   * @param quantization - The quantization to use.
-   * @returns A promise that resolves when the quantization is complete.
-   */
-  quantize(repoID: string, quantization: Quantization): Promise<void>
-
-  /**
-   * Generates Jan model metadata from a Hugging Face model.
-   * @param repoID - The repo ID of the model to generate metadata for.
-   * @param repoData - The repo data of the model to generate metadata for.
-   * @param quantization - The quantization of the model.
-   * @returns A promise that resolves when the model metadata generation is complete.
-   */
-  generateMetadata(
-    repoID: string,
-    repoData: HuggingFaceRepoData,
-    quantization: Quantization
-  ): Promise<void>
-
-  /**
-   * Cancels the convert of current Hugging Face model.
-   * @param repoID - The repository ID to cancel.
-   * @param repoData - The repository data to cancel.
-   * @returns {Promise<void>} A promise that resolves when the download has been cancelled.
-   */
-  cancelConvert(repoID: string, repoData: HuggingFaceRepoData): Promise<void>
-}
diff --git a/core/src/types/huggingface/index.ts b/core/src/types/huggingface/index.ts
index c108c55e2..a32e4a171 100644
--- a/core/src/types/huggingface/index.ts
+++ b/core/src/types/huggingface/index.ts
@@ -1,2 +1 @@
-export * from './huggingfaceInterface'
 export * from './huggingfaceEntity'
diff --git a/electron/icons/icon.ico b/electron/icons/icon.ico
new file mode 100644
index 000000000..40c76171d
Binary files /dev/null and b/electron/icons/icon.ico differ
diff --git a/electron/package.json b/electron/package.json
index e76b9172a..7f1c978e4 100644
--- a/electron/package.json
+++ b/electron/package.json
@@ -54,6 +54,13 @@
         "nsis"
       ]
     },
+    "nsis": {
+      "oneClick": true,
+      "installerIcon": "icons/icon.ico",
+      "uninstallerIcon": "icons/icon.ico",
+      "include": "scripts/uninstaller.nsh",
+      "deleteAppDataOnUninstall": true
+    },
     "artifactName": "jan-${os}-${arch}-${version}.${ext}"
   },
   "scripts": {
diff --git a/electron/scripts/uninstaller.nsh b/electron/scripts/uninstaller.nsh
new file mode 100644
index 000000000..ad6d91591
--- /dev/null
+++ b/electron/scripts/uninstaller.nsh
@@ -0,0 +1,18 @@
+!include nsDialogs.nsh
+
+XPStyle on
+
+!macro customUnInstall
+; Uninstall process execution
+  ${ifNot} ${isUpdated}
+    # Ask whether the default Jan data folder should be deleted as well
+    MessageBox MB_OKCANCEL "Do you also want to delete the DEFAULT Jan data folder at $PROFILE\jan?" IDOK label_ok IDCANCEL label_cancel
+    label_ok:
+      # Delete user data folder
+      RMDir /r $PROFILE\jan
+      Goto end
+    label_cancel:
+      Goto end
+    end:
+  ${endIf}
+!macroend
\ No newline at end of file
diff --git a/extensions/assistant-extension/package.json b/extensions/assistant-extension/package.json
index 094f9820c..aa5dba692 100644
--- a/extensions/assistant-extension/package.json
+++ b/extensions/assistant-extension/package.json
@@ -35,7 +35,6 @@
     "@langchain/community": "0.0.13",
     "hnswlib-node": "^1.4.2",
     "langchain": "^0.0.214",
-    "path-browserify": "^1.0.1",
     "pdf-parse": "^1.1.1",
     "ts-loader": "^9.5.0"
   },
diff --git a/extensions/conversational-extension/package.json b/extensions/conversational-extension/package.json
index a29967da4..d062ce9c3 100644
--- a/extensions/conversational-extension/package.json
+++ b/extensions/conversational-extension/package.json
@@ -22,8 +22,7 @@
     "ts-loader": "^9.5.0"
   },
   "dependencies": {
-    "@janhq/core": "file:../../core",
-    "path-browserify": "^1.0.1"
+    "@janhq/core": "file:../../core"
   },
   "engines": {
     "node": ">=18.0.0"
diff --git a/extensions/conversational-extension/webpack.config.js b/extensions/conversational-extension/webpack.config.js
index b56a8f264..e4a0b2179 100644
--- a/extensions/conversational-extension/webpack.config.js
+++ b/extensions/conversational-extension/webpack.config.js
@@ -1,4 +1,3 @@
-const path = require('path')
 const webpack = require('webpack')
 
 module.exports = {
@@ -16,15 +15,11 @@ module.exports = {
   },
   output: {
     filename: 'index.js', // Adjust the output file name as needed
-    path: path.resolve(__dirname, 'dist'),
     library: { type: 'module' }, // Specify ESM output format
   },
   plugins: [new webpack.DefinePlugin({})],
   resolve: {
     extensions: ['.ts', '.js'],
-    fallback: {
-      path: require.resolve('path-browserify'),
-    }
   },
   // Do not minify the output, otherwise it breaks the class registration
   optimization: {
diff --git a/extensions/huggingface-extension/.gitignore b/extensions/huggingface-extension/.gitignore
deleted file mode 100644
index bdf39cc7f..000000000
--- a/extensions/huggingface-extension/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-bin
-scripts/convert*
-scripts/gguf-py
diff --git a/extensions/huggingface-extension/.prettierrc b/extensions/huggingface-extension/.prettierrc
deleted file mode 100644
index 46f1abcb0..000000000
--- a/extensions/huggingface-extension/.prettierrc
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "semi": false,
-  "singleQuote": true,
-  "quoteProps": "consistent",
-  "trailingComma": "es5",
-  "endOfLine": "auto",
-  "plugins": ["prettier-plugin-tailwindcss"]
-}
diff --git a/extensions/huggingface-extension/README.md b/extensions/huggingface-extension/README.md
deleted file mode 100644
index f9690da09..000000000
--- a/extensions/huggingface-extension/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Create a Jan Extension using Typescript
-
-Use this template to bootstrap the creation of a TypeScript Jan extension. 🚀
-
-## Create Your Own Extension
-
-To create your own extension, you can use this repository as a template! Just follow the below instructions:
-
-1. Click the Use this template button at the top of the repository
-2. Select Create a new repository
-3. Select an owner and name for your new repository
-4. Click Create repository
-5. Clone your new repository
-
-## Initial Setup
-
-After you've cloned the repository to your local machine or codespace, you'll need to perform some initial setup steps before you can develop your extension.
-
-> [!NOTE]
->
-> You'll need to have a reasonably modern version of
-> [Node.js](https://nodejs.org) handy. If you are using a version manager like
-> [`nodenv`](https://github.com/nodenv/nodenv) or
-> [`nvm`](https://github.com/nvm-sh/nvm), you can run `nodenv install` in the
-> root of your repository to install the version specified in
-> [`package.json`](./package.json). Otherwise, 20.x or later should work!
-
-1. :hammer_and_wrench: Install the dependencies
-
-   ```bash
-   npm install
-   ```
-
-1. :building_construction: Package the TypeScript for distribution
-
-   ```bash
-   npm run bundle
-   ```
-
-1. :white_check_mark: Check your artifact
-
-   There will be a tgz file in your extension directory now
-
-## Update the Extension Metadata
-
-The [`package.json`](package.json) file defines metadata about your extension, such as
-extension name, main entry, description and version.
-
-When you copy this repository, update `package.json` with the name, description for your extension.
-
-## Update the Extension Code
-
-The [`src/`](./src/) directory is the heart of your extension! This contains the
-source code that will be run when your extension functions are invoked. You can replace the
-contents of this directory with your own code.
-
-There are a few things to keep in mind when writing your extension code:
-
-- Most Jan Extension functions are processed asynchronously.
-  In `index.ts`, you will see that the extension function will return a `Promise<any>`.
-
-  ```typescript
-  import { events, MessageEvent, MessageRequest } from '@janhq/core'
-
-  function onStart(): Promise<any> {
-    return events.on(MessageEvent.OnMessageSent, (data: MessageRequest) =>
-      this.inference(data)
-    )
-  }
-  ```
-
-  For more information about the Jan Extension Core module, see the
-  [documentation](https://github.com/janhq/jan/blob/main/core/README.md).
-
-So, what are you waiting for? Go ahead and start customizing your extension!
diff --git a/extensions/huggingface-extension/bin/mac-arm64/quantize b/extensions/huggingface-extension/bin/mac-arm64/quantize
deleted file mode 100755
index f8a149b10..000000000
Binary files a/extensions/huggingface-extension/bin/mac-arm64/quantize and /dev/null differ
diff --git a/extensions/huggingface-extension/package.json b/extensions/huggingface-extension/package.json
deleted file mode 100644
index c0c18c5eb..000000000
--- a/extensions/huggingface-extension/package.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "name": "@janhq/huggingface-extension",
-  "productName": "HuggingFace",
-  "version": "1.0.0",
-  "description": "Hugging Face extension for converting HF models to GGUF",
-  "main": "dist/index.js",
-  "node": "dist/node/index.cjs.js",
-  "author": "Jan ",
-  "license": "AGPL-3.0",
-  "scripts": {
-    "build": "tsc --module commonjs && rollup -c rollup.config.ts --configPlugin @rollup/plugin-typescript --bundleConfigAsCjs",
-    "download:llama": "run-script-os",
-    "download:llama:linux": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz --wildcards '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
-    "download:llama:darwin": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
-    "download:llama:win32": "download.bat",
-    "build:publish:linux": "rimraf *.tgz --glob && npm run build && npm run download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
-    "build:publish:darwin": "rimraf *.tgz --glob && npm run build && npm run download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && ../../.github/scripts/auto-sign.sh && npm pack && cpx *.tgz ../../pre-install",
-    "build:publish:win32": "rimraf *.tgz --glob && npm run build && npm run download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
-    "build:publish": "run-script-os"
-  },
-  "exports": {
-    ".": "./dist/index.js",
-    "./main": "./dist/node/index.cjs.js"
-  },
-  "devDependencies": {
-    "@rollup/plugin-commonjs": "^25.0.7",
-    "@rollup/plugin-json": "^6.1.0",
-    "@rollup/plugin-node-resolve": "^15.2.3",
-    "@rollup/plugin-replace": "^5.0.5",
-    "@rollup/plugin-typescript": "^11.1.6",
-    "@types/node": "^20.11.16",
-    "cpx": "^1.5.0",
-    "download-cli": "^1.1.1",
-    "rimraf": "^5.0.5",
-    "rollup": "^4.9.6",
-    "rollup-plugin-sourcemaps": "^0.6.3",
-    "rollup-plugin-typescript2": "^0.36.0",
-    "run-script-os": "^1.1.6",
-    "typescript": "^5.3.3"
-  },
-  "dependencies": {
-    "@janhq/core": "file:../../core",
-    "hyllama": "^0.1.2",
-    "python-shell": "^5.0.0",
-    "ts-loader": "^9.5.0"
-  },
-  "bundledDependencies": [
-    "python-shell"
-  ],
-  "engines": {
-    "node": ">=18.0.0"
-  },
-  "files": [
-    "dist/*",
-    "package.json",
-    "README.md"
-  ]
-}
diff --git a/extensions/huggingface-extension/rollup.config.ts b/extensions/huggingface-extension/rollup.config.ts
deleted file mode 100644
index 16cf3c46d..000000000
--- a/extensions/huggingface-extension/rollup.config.ts
+++ /dev/null
@@ -1,72 +0,0 @@
-import resolve from '@rollup/plugin-node-resolve'
-import commonjs from '@rollup/plugin-commonjs'
-import sourceMaps from 'rollup-plugin-sourcemaps'
-import typescript from 'rollup-plugin-typescript2'
-import json from '@rollup/plugin-json'
-import replace from '@rollup/plugin-replace'
-
-const packageJson = require('./package.json')
-
-export default [
-  {
-    input: `src/index.ts`,
-    output: [{ file: packageJson.main, format: 'es', sourcemap: true }],
-    // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash')
-    external: [],
-    watch: {
-      include: 'src/**',
-    },
-    plugins: [
-      replace({
-        preventAssignment: true,
-        NODE_MODULE_PATH: JSON.stringify(
-          `${packageJson.name}/${packageJson.node}`
-        ),
-      }),
-      // Allow json resolution
-      json(),
-      // Compile TypeScript files
-      typescript({ useTsconfigDeclarationDir: true }),
-      // Compile TypeScript files
-      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
-      commonjs(),
-      // Allow node_modules resolution, so you can use 'external' to control
-      // which external modules to include in the bundle
-      // https://github.com/rollup/rollup-plugin-node-resolve#usage
-      resolve({
-        extensions: ['.js', '.ts'],
-      }),
-
-      // Resolve source maps to the original source
-      sourceMaps(),
-    ],
-  },
-  {
-    input: `src/node/index.ts`,
-    output: [
-      { file: 'dist/node/index.cjs.js', format: 'cjs', sourcemap: true },
-    ],
-    // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash')
-    external: [],
-    watch: {
-      include: 'src/node/**',
-    },
-    plugins: [
-      // Allow json resolution
-      json(),
-      // Compile TypeScript files
-      typescript({ useTsconfigDeclarationDir: true }),
-      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
-      commonjs(),
-      // Allow node_modules resolution, so you can use 'external' to control
-      // which external modules to include in the bundle
-      // https://github.com/rollup/rollup-plugin-node-resolve#usage
-      resolve({
-        extensions: ['.ts', '.js', '.json'],
-      }),
-
-      // Resolve source maps to the original source
-      sourceMaps(),
-    ],
-  },
-]
diff --git a/extensions/huggingface-extension/src/@types/global.d.ts b/extensions/huggingface-extension/src/@types/global.d.ts
deleted file mode 100644
index b30fe9d69..000000000
--- a/extensions/huggingface-extension/src/@types/global.d.ts
+++ /dev/null
@@ -1 +0,0 @@
-declare const NODE_MODULE_PATH: string
diff --git a/extensions/huggingface-extension/src/index.ts b/extensions/huggingface-extension/src/index.ts
deleted file mode 100644
index 0425b9f88..000000000
--- a/extensions/huggingface-extension/src/index.ts
+++ /dev/null
@@ -1,399 +0,0 @@
-import {
-  fs,
-  downloadFile,
-  abortDownload,
-  joinPath,
-  HuggingFaceExtension,
-  HuggingFaceRepoData,
-  executeOnMain,
-  Quantization,
-  Model,
-  InferenceEngine,
-  getJanDataFolderPath,
-  events,
-  DownloadEvent,
-  log,
-  DownloadRequest,
-} from '@janhq/core'
-
-declare global {
-  interface Window {
-    electronAPI?: any
-  }
-}
-
-/**
- * A extension for models
- */
-export default class JanHuggingFaceExtension extends HuggingFaceExtension {
-  private static readonly _safetensorsRegexs = [
-    /model\.safetensors$/,
-    /model-[0-9]+-of-[0-9]+\.safetensors$/,
-  ]
-  private static readonly _pytorchRegexs = [
-    /pytorch_model\.bin$/,
-    /consolidated\.[0-9]+\.pth$/,
-    /pytorch_model-[0-9]+-of-[0-9]+\.bin$/,
-    /.*\.pt$/,
-  ]
-  interrupted = false
-
-  /**
-   * Called when the extension is loaded.
-   * @override
-   */
-  onLoad() {}
-
-  /**
-   * Called when the extension is unloaded.
-   * @override
-   */
-  onUnload(): void {}
-
-  private getFileList(repoData: HuggingFaceRepoData): string[] {
-    // SafeTensors first, if not, then PyTorch
-    const modelFiles = repoData.siblings
-      .map((file) => file.rfilename)
-      .filter((file) =>
-        JanHuggingFaceExtension._safetensorsRegexs.some((regex) =>
-          regex.test(file)
-        )
-      )
-    if (modelFiles.length === 0) {
-      repoData.siblings.forEach((file) => {
-        if (
-          JanHuggingFaceExtension._pytorchRegexs.some((regex) =>
-            regex.test(file.rfilename)
-          )
-        ) {
-          modelFiles.push(file.rfilename)
-        }
-      })
-    }
-
-    const vocabFiles = [
-      'tokenizer.model',
-      'vocab.json',
-      'tokenizer.json',
-    ].filter((file) =>
-      repoData.siblings.some((sibling) => sibling.rfilename === file)
-    )
-
-    const etcFiles = repoData.siblings
-      .map((file) => file.rfilename)
-      .filter(
-        (file) =>
-          (file.endsWith('.json') && !vocabFiles.includes(file)) ||
-          file.endsWith('.txt') ||
-          file.endsWith('.py') ||
-          file.endsWith('.tiktoken')
-      )
-
-    return [...modelFiles, ...vocabFiles, ...etcFiles]
-  }
-
-  private async getModelDirPath(repoID: string): Promise<string> {
-    const modelName = repoID.split('/').slice(1).join('/')
-    return joinPath([await getJanDataFolderPath(), 'models', modelName])
-  }
-  private async getConvertedModelPath(repoID: string): Promise<string> {
-    const modelName = repoID.split('/').slice(1).join('/')
-    const modelDirPath = await this.getModelDirPath(repoID)
-    return joinPath([modelDirPath, modelName + '.gguf'])
-  }
-  private async getQuantizedModelPath(
-    repoID: string,
-    quantization: Quantization
-  ): Promise<string> {
-    const modelName = repoID.split('/').slice(1).join('/')
-    const modelDirPath = await this.getModelDirPath(repoID)
-    return joinPath([
-      modelDirPath,
-      modelName + `-${quantization.toLowerCase()}.gguf`,
-    ])
-  }
-  private getCtxLength(config: {
-    max_sequence_length?: number
-    max_position_embeddings?: number
-    n_ctx?: number
-  }): number {
-    if (config.max_sequence_length) return config.max_sequence_length
-    if (config.max_position_embeddings) return config.max_position_embeddings
-    if (config.n_ctx) return config.n_ctx
-    return 4096
-  }
-
-  /**
-   * Downloads a Hugging Face model.
-   * @param repoID - The repo ID of the model to convert.
-   * @param repoData - The repo data of the model to convert.
-   * @param network - Optional object to specify proxy/whether to ignore SSL certificates.
-   * @returns A promise that resolves when the download is complete.
-   */
-  async downloadModelFiles(
-    repoID: string,
-    repoData: HuggingFaceRepoData,
-    network?: { ignoreSSL?: boolean; proxy?: string }
-  ): Promise<void> {
-    if (this.interrupted) return
-    const modelDirPath = await this.getModelDirPath(repoID)
-    if (!(await fs.existsSync(modelDirPath))) await fs.mkdir(modelDirPath)
-    const files = this.getFileList(repoData)
-    const filePaths: string[] = []
-
-    for (const file of files) {
-      const filePath = file
-      const localPath = await joinPath([modelDirPath, filePath])
-      const url = `https://huggingface.co/${repoID}/resolve/main/${filePath}`
-
-      if (this.interrupted) return
-      if (!(await fs.existsSync(localPath))) {
-        const downloadRequest: DownloadRequest = {
-          url,
-          localPath,
-        }
-        downloadFile(downloadRequest, network)
-        filePaths.push(filePath)
-      }
-    }
-
-    await new Promise<void>((resolve, reject) => {
-      if (filePaths.length === 0) resolve()
-      const onDownloadSuccess = async ({ fileName }: { fileName: string }) => {
-        if (filePaths.includes(fileName)) {
-          filePaths.splice(filePaths.indexOf(fileName), 1)
-          if (filePaths.length === 0) {
-            events.off(DownloadEvent.onFileDownloadSuccess, onDownloadSuccess)
-            events.off(DownloadEvent.onFileDownloadError, onDownloadError)
-            resolve()
-          }
-        }
-      }
-
-      const onDownloadError = async ({
-        fileName,
-        error,
-      }: {
-        fileName: string
-        error: Error
-      }) => {
-        if (filePaths.includes(fileName)) {
-          this.cancelConvert(repoID, repoData)
-          events.off(DownloadEvent.onFileDownloadSuccess, onDownloadSuccess)
-          events.off(DownloadEvent.onFileDownloadError, onDownloadError)
-          reject(error)
-        }
-      }
-
-      events.on(DownloadEvent.onFileDownloadSuccess, onDownloadSuccess)
-      events.on(DownloadEvent.onFileDownloadError, onDownloadError)
-    })
-  }
-
-  /**
-   * Converts a Hugging Face model to GGUF.
-   * @param repoID - The repo ID of the model to convert.
-   * @returns A promise that resolves when the conversion is complete.
-   */
-  async convert(repoID: string): Promise<void> {
-    if (this.interrupted) return
-    const modelDirPath = await this.getModelDirPath(repoID)
-    const modelOutPath = await this.getConvertedModelPath(repoID)
-    if (!(await fs.existsSync(modelDirPath))) {
-      throw new Error('Model dir not found')
-    }
-    if (await fs.existsSync(modelOutPath)) return
-
-    await executeOnMain(NODE_MODULE_PATH, 'installDeps')
-    if (this.interrupted) return
-
-    try {
-      await executeOnMain(
-        NODE_MODULE_PATH,
-        'convertHf',
-        modelDirPath,
-        modelOutPath + '.temp'
-      )
-    } catch (err) {
-      log(`[Conversion]::Debug: Error using hf-to-gguf.py, trying convert.py`)
-
-      let ctx = 4096
-      try {
-        const config = await fs.readFileSync(
-          await joinPath([modelDirPath, 'config.json']),
-          'utf8'
-        )
-        const configParsed = JSON.parse(config)
-        ctx = this.getCtxLength(configParsed)
-        configParsed.max_sequence_length = ctx
-        await fs.writeFileSync(
-          await joinPath([modelDirPath, 'config.json']),
-          JSON.stringify(configParsed, null, 2)
-        )
-      } catch (err) {
-        log(`${err}`)
-        // ignore missing config.json
-      }
-
-      const bpe = await fs.existsSync(
-        await joinPath([modelDirPath, 'vocab.json'])
-      )
-
-      await executeOnMain(
-        NODE_MODULE_PATH,
-        'convert',
-        modelDirPath,
-        modelOutPath + '.temp',
-        {
-          ctx,
-          bpe,
-        }
-      )
-    }
-    await executeOnMain(
-      NODE_MODULE_PATH,
-      'renameSync',
-      modelOutPath + '.temp',
-      modelOutPath
-    )
-
-    for (const file of await fs.readdirSync(modelDirPath)) {
-      if (
-        modelOutPath.endsWith(file) ||
-        (file.endsWith('config.json') && !file.endsWith('_config.json'))
-      )
-        continue
-      await fs.unlinkSync(await joinPath([modelDirPath, file]))
-    }
-  }
-
-  /**
-   * Quantizes a GGUF model.
-   * @param repoID - The repo ID of the model to quantize.
-   * @param quantization - The quantization to use.
-   * @returns A promise that resolves when the quantization is complete.
-   */
-  async quantize(repoID: string, quantization: Quantization): Promise<void> {
-    if (this.interrupted) return
-    const modelDirPath = await this.getModelDirPath(repoID)
-    const modelOutPath = await this.getQuantizedModelPath(repoID, quantization)
-    if (!(await fs.existsSync(modelDirPath))) {
-      throw new Error('Model dir not found')
-    }
-    if (await fs.existsSync(modelOutPath)) return
-
-    await executeOnMain(
-      NODE_MODULE_PATH,
-      'quantize',
-      await this.getConvertedModelPath(repoID),
-      modelOutPath + '.temp',
-      quantization
-    )
-    await executeOnMain(
-      NODE_MODULE_PATH,
-      'renameSync',
-      modelOutPath + '.temp',
-      modelOutPath
-    )
-
-    await fs.unlinkSync(await this.getConvertedModelPath(repoID))
-  }
-
-  /**
-   * Generates Jan model metadata from a Hugging Face model.
-   * @param repoID - The repo ID of the model to generate metadata for.
-   * @param repoData - The repo data of the model to generate metadata for.
-   * @param quantization - The quantization of the model.
-   * @returns A promise that resolves when the model metadata generation is complete.
-   */
-  async generateMetadata(
-    repoID: string,
-    repoData: HuggingFaceRepoData,
-    quantization: Quantization
-  ): Promise<void> {
-    const modelName = repoID.split('/').slice(1).join('/')
-    const filename = `${modelName}-${quantization.toLowerCase()}.gguf`
-    const modelDirPath = await this.getModelDirPath(repoID)
-    const modelPath = await this.getQuantizedModelPath(repoID, quantization)
-    const modelConfigPath = await joinPath([modelDirPath, 'model.json'])
-    if (!(await fs.existsSync(modelPath))) {
-      throw new Error('Model not found')
-    }
-
-    const size = await executeOnMain(NODE_MODULE_PATH, 'getSize', modelPath)
-    let ctx = 4096
-    try {
-      const config = await fs.readFileSync(
-        await joinPath([modelDirPath, 'config.json']),
-        'utf8'
-      )
-      ctx = this.getCtxLength(JSON.parse(config))
-      fs.unlinkSync(await joinPath([modelDirPath, 'config.json']))
-    } catch (err) {
-      // ignore missing config.json
-    }
-    // maybe later, currently it's gonna use too much memory
-    // const buffer = await fs.readFileSync(quantizedModelPath)
-    // const ggufData = ggufMetadata(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))
-
-    const metadata: Model = {
-      object: 'model',
-      version: '1.0',
-      format: 'gguf',
-      sources: [
-        {
-          url: `https://huggingface.co/${repoID}`, // i think this is just for download but not sure,
-          filename,
-        },
-      ],
-      id: modelName,
-      name: modelName,
-      created: Date.now(),
-      description: `Auto converted from Hugging Face model: ${repoID}`,
-      settings: {
-        ctx_len: ctx,
-        prompt_template: '',
-        llama_model_path: modelName,
-      },
-      parameters: {
-        temperature: 0.7,
-        top_p: 0.95,
-        stream: true,
-        max_tokens: 4096,
-        // stop: ['</s>'], seems like we dont really need this..?
-        frequency_penalty: 0,
-        presence_penalty: 0,
-      },
-      metadata: {
-        author: repoData.author,
-        tags: repoData.tags,
-        size,
-      },
-      engine: InferenceEngine.nitro,
-    }
-
-    await fs.writeFileSync(modelConfigPath, JSON.stringify(metadata, null, 2))
-  }
-
-  /**
-   * Cancels the convert of current Hugging Face model.
-   * @param repoID - The repository ID to cancel.
-   * @param repoData - The repository data to cancel.
-   * @returns {Promise<void>} A promise that resolves when the download has been cancelled.
-   */
-  async cancelConvert(
-    repoID: string,
-    repoData: HuggingFaceRepoData
-  ): Promise<void> {
-    this.interrupted = true
-    const modelDirPath = await this.getModelDirPath(repoID)
-    const files = this.getFileList(repoData)
-    for (const file of files) {
-      const filePath = file
-      const localPath = await joinPath([modelDirPath, filePath])
-      await abortDownload(localPath)
-    }
-
-    executeOnMain(NODE_MODULE_PATH, 'killProcesses')
-  }
-}
diff --git a/extensions/huggingface-extension/tsconfig.json b/extensions/huggingface-extension/tsconfig.json
deleted file mode 100644
index a42f31602..000000000
--- a/extensions/huggingface-extension/tsconfig.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-  "compilerOptions": {
-    "moduleResolution": "node",
-    "target": "es2020",
-    "module": "ES2020",
-    "lib": ["es2015", "es2016", "es2017", "dom"],
-    "strict": true,
-    "sourceMap": true,
-    "declaration": true,
-    "allowSyntheticDefaultImports": true,
-    "experimentalDecorators": true,
-    "emitDecoratorMetadata": true,
-    "declarationDir": "dist/types",
-    "outDir": "dist",
-    "importHelpers": true,
-    "typeRoots": ["node_modules/@types"],
-    "resolveJsonModule": true,
-  },
-  "include": ["src"],
-}
diff --git a/extensions/inference-groq-extension/package.json b/extensions/inference-groq-extension/package.json
index faf1b4a98..4fc08927b 100644
--- a/extensions/inference-groq-extension/package.json
+++ b/extensions/inference-groq-extension/package.json
@@ -25,7 +25,6 @@
   "dependencies": {
     "@janhq/core": "file:../../core",
     "fetch-retry": "^5.0.6",
-    "path-browserify": "^1.0.1",
     "ulidx": "^2.3.0"
   },
   "engines": {
diff --git a/extensions/inference-groq-extension/resources/models.json b/extensions/inference-groq-extension/resources/models.json
index b2775e2be..ad721ba62 100644
--- a/extensions/inference-groq-extension/resources/models.json
+++ b/extensions/inference-groq-extension/resources/models.json
@@ -1,4 +1,88 @@
 [
+  {
+    "sources": [
+      {
+        "url": "https://groq.com"
+      }
+    ],
+    "id": "llama3-70b-8192",
+    "object": "model",
+    "name": "Groq Llama 3 70b",
+    "version": "1.0",
+    "description": "Groq Llama 3 70b with supercharged speed!",
+    "format": "api",
+    "settings": {
+      "text_model": false
+    },
+    "parameters": {
+      "max_tokens": 8192,
+      "temperature": 0.7,
+      "top_p": 1,
+      "stop": null,
+      "stream": true
+    },
+    "metadata": {
+      "author": "Meta",
+      "tags": ["General", "Big Context Length"]
+    },
+    "engine": "groq"
+  },
+  {
+    "sources": [
+      {
+        "url": "https://groq.com"
+      }
+    ],
+    "id": "llama3-8b-8192",
+    "object": "model",
+    "name": "Groq Llama 3 8b",
+    "version": "1.0",
+    "description": "Groq Llama 3 8b with supercharged speed!",
+    "format": "api",
+    "settings": {
+      "text_model": false
+    },
+    "parameters": {
+      "max_tokens": 8192,
+      "temperature": 0.7,
+      "top_p": 1,
+      "stop": null,
+      "stream": true
+    },
+    "metadata": {
+      "author": "Meta",
+      "tags": ["General", "Big Context Length"]
+    },
+    "engine": "groq"
+  },
+  {
+    "sources": [
+      {
+        "url": "https://groq.com"
+      }
+    ],
+    "id": "gemma-7b-it",
+    "object": "model",
+    "name": "Groq Gemma 7b Instruct",
+    "version": "1.0",
+    "description": "Groq Gemma 7b Instruct with supercharged speed!",
+    "format": "api",
+    "settings": {
+      "text_model": false
+    },
+    "parameters": {
+      "max_tokens": 4096,
+      "temperature": 0.7,
+      "top_p": 1,
+      "stop": null,
+      "stream": true
+    },
+    "metadata": {
+      "author": "Google",
+      "tags": ["General"]
+    },
+    "engine": "groq"
+  },
   {
     "sources": [
       {
diff --git a/extensions/inference-groq-extension/webpack.config.js b/extensions/inference-groq-extension/webpack.config.js
index 58ade764b..199dee42c 100644
--- a/extensions/inference-groq-extension/webpack.config.js
+++ b/extensions/inference-groq-extension/webpack.config.js
@@ -1,4 +1,3 @@
-const path = require('path')
 const webpack = require('webpack')
 const packageJson = require('./package.json')
 const settingJson = require('./resources/settings.json')
@@ -26,14 +25,10 @@ module.exports = {
   ],
   output: {
     filename: 'index.js', // Adjust the output file name as needed
-    path: path.resolve(__dirname, 'dist'),
     library: { type: 'module' }, // Specify ESM output format
   },
   resolve: {
     extensions: ['.ts', '.js'],
-    fallback: {
-      path: require.resolve('path-browserify'),
-    },
   },
   optimization: {
     minimize: false,
diff --git a/extensions/inference-nitro-extension/bin/version.txt b/extensions/inference-nitro-extension/bin/version.txt
index 6a5f415df..0c4b45492 100644
--- a/extensions/inference-nitro-extension/bin/version.txt
+++ b/extensions/inference-nitro-extension/bin/version.txt
@@ -1 +1 @@
-0.3.16-hotfix
+0.3.22
diff --git a/extensions/inference-nitro-extension/package.json b/extensions/inference-nitro-extension/package.json
index 1916da4b6..dabda9aec 100644
--- a/extensions/inference-nitro-extension/package.json
+++ b/extensions/inference-nitro-extension/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@janhq/inference-nitro-extension",
   "productName": "Nitro Inference Engine",
-  "version": "1.0.1",
+  "version": "1.0.2",
   "description": "This extension embeds Nitro, a lightweight (3mb) inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
   "main": "dist/index.js",
   "node": "dist/node/index.cjs.js",
@@ -51,7 +51,6 @@
     "@janhq/core": "file:../../core",
     "decompress": "^4.2.1",
     "fetch-retry": "^5.0.6",
-    "path-browserify": "^1.0.1",
     "rxjs": "^7.8.1",
     "tcp-port-used": "^1.0.2",
    "terminate": "^2.6.1",
diff --git a/extensions/inference-nitro-extension/resources/default_settings.json b/extensions/inference-nitro-extension/resources/default_settings.json
index 39f0880b0..09d014a12 100644
--- a/extensions/inference-nitro-extension/resources/default_settings.json
+++ b/extensions/inference-nitro-extension/resources/default_settings.json
@@ -27,7 +27,7 @@
       "min": 0,
       "max": 4096,
       "step": 128,
-      "value": 4096
+      "value": 2048
     }
   }
 ]
diff --git a/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json b/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json
index 850429376..a6827b391 100644
--- a/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/command-r-34b/model.json
@@ -8,7 +8,7 @@
   "id": "command-r-34b",
   "object": "model",
   "name": "Command-R v01 34B Q4",
-  "version": "1.1",
+  "version": "1.2",
   "description": "C4AI Command-R developed by CohereAI is optimized for a variety of use cases including reasoning, summarization, and question answering.",
   "format": "gguf",
   "settings": {
@@ -27,9 +27,9 @@
   },
   "metadata": {
     "author": "CohereAI",
-    "tags": ["34B", "Finetuned", "Coming Soon", "Unavailable"],
+    "tags": ["34B", "Finetuned"],
     "size": 21500000000
   },
   "engine": "nitro"
 }
- 
\ No newline at end of file
+
diff --git a/extensions/inference-nitro-extension/resources/models/llama3-8b-instruct/model.json b/extensions/inference-nitro-extension/resources/models/llama3-8b-instruct/model.json
new file mode 100644
index 000000000..4dbb941ef
--- /dev/null
+++ b/extensions/inference-nitro-extension/resources/models/llama3-8b-instruct/model.json
@@ -0,0 +1,34 @@
+{
+  "sources": [
+    {
+      "filename": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
+      "url": "https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
+    }
+  ],
+  "id": "llama3-8b-instruct",
+  "object": "model",
+  "name": "Llama 3 8B Q4",
+  "version": "1.0",
+  "description": "Meta's Llama 3 excels at general usage situations, including chat, general world knowledge, and coding.",
+  "format": "gguf",
+  "settings": {
+    "ctx_len": 8192,
+    "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+    "llama_model_path": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
+  },
+  "parameters": {
+    "temperature": 0.7,
+    "top_p": 0.95,
+    "stream": true,
+    "max_tokens": 4096,
+    "stop": ["<|end_of_text|>","<|eot_id|>"],
+    "frequency_penalty": 0,
+    "presence_penalty": 0
+  },
+  "metadata": {
+    "author": "MetaAI",
+    "tags": ["7B", "Featured"],
+    "size": 4920000000
+  },
+  "engine": "nitro"
+}
diff --git a/extensions/inference-nitro-extension/resources/models/phind-34b/model.json b/extensions/inference-nitro-extension/resources/models/phind-34b/model.json
index 4adecb017..6b0abe2a1 100644
--- a/extensions/inference-nitro-extension/resources/models/phind-34b/model.json
+++ b/extensions/inference-nitro-extension/resources/models/phind-34b/model.json
@@ -1,14 +1,14 @@
 {
   "sources": [
     {
-      "filename": "phind-codellama-34b-v2.Q4_K_M.gguf",
+      "filename": "phind-codellama-34b-v2.Q5_K_M.gguf",
       "url": "https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v2-GGUF/resolve/main/phind-codellama-34b-v2.Q5_K_M.gguf"
     }
   ],
   "id": "phind-34b",
   "object": "model",
   "name": "Phind 34B Q4",
-  "version": "1.0",
+  "version": "1.1",
   "description": "Phind 34B is the best Open-source coding model.",
   "format": "gguf",
   "settings": {
diff --git a/extensions/inference-nitro-extension/rollup.config.ts b/extensions/inference-nitro-extension/rollup.config.ts
index 7b2758881..497bb6466 100644
--- a/extensions/inference-nitro-extension/rollup.config.ts
+++ b/extensions/inference-nitro-extension/rollup.config.ts
@@ -36,6 +36,7 @@ const trinityv127bJson = require('./resources/models/trinity-v1.2-7b/model.json'
 const vistral7bJson = require('./resources/models/vistral-7b/model.json')
 const wizardcoder13bJson = require('./resources/models/wizardcoder-13b/model.json')
 const yi34bJson = require('./resources/models/yi-34b/model.json')
+const llama3Json = require('./resources/models/llama3-8b-instruct/model.json')
 
 export default [
   {
@@ -79,6 +80,7 @@
           vistral7bJson,
           wizardcoder13bJson,
           yi34bJson,
+          llama3Json
         ]),
         NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
         DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson),
diff --git a/extensions/inference-openai-extension/package.json b/extensions/inference-openai-extension/package.json
index 2dd75d300..d1a751069 100644
--- a/extensions/inference-openai-extension/package.json
+++ b/extensions/inference-openai-extension/package.json
@@ -26,7 +26,6 @@
   "dependencies": {
     "@janhq/core": "file:../../core",
     "fetch-retry": "^5.0.6",
-    "path-browserify": "^1.0.1",
     "ulidx": "^2.3.0"
   },
   "engines": {
diff --git a/extensions/inference-openai-extension/webpack.config.js b/extensions/inference-openai-extension/webpack.config.js
index f3d0db183..cd5e65c72 100644
--- a/extensions/inference-openai-extension/webpack.config.js
+++ b/extensions/inference-openai-extension/webpack.config.js
@@ -1,4 +1,3 @@
-const path = require('path')
 const webpack = require('webpack')
 const packageJson = require('./package.json')
 const settingJson = require('./resources/settings.json')
@@ -26,14 +25,10 @@ module.exports = {
   ],
   output: {
     filename: 'index.js', // Adjust the output file name as needed
-    path: path.resolve(__dirname, 'dist'),
     library: { type: 'module' }, // Specify ESM output format
   },
   resolve: {
     extensions: ['.ts', '.js'],
-    fallback: {
-      path: require.resolve('path-browserify'),
-    },
   },
   optimization: {
     minimize: false,
diff --git a/extensions/inference-triton-trtllm-extension/package.json b/extensions/inference-triton-trtllm-extension/package.json
index 06c4976e1..6612dc191 100644
--- a/extensions/inference-triton-trtllm-extension/package.json
+++ b/extensions/inference-triton-trtllm-extension/package.json
@@ -26,7 +26,6 @@
   "dependencies": {
     "@janhq/core": "file:../../core",
     "fetch-retry": "^5.0.6",
-    "path-browserify": "^1.0.1",
     "rxjs": "^7.8.1",
     "ulidx": "^2.3.0"
   },
diff --git a/extensions/inference-triton-trtllm-extension/webpack.config.js b/extensions/inference-triton-trtllm-extension/webpack.config.js
index 13d32c52d..6486d5efc 100644
--- a/extensions/inference-triton-trtllm-extension/webpack.config.js
+++ b/extensions/inference-triton-trtllm-extension/webpack.config.js
@@ -1,4 +1,3 @@
-const path = require('path')
 const webpack = require('webpack')
 const packageJson = require('./package.json')
 const settingJson = require('./resources/settings.json')
@@ -24,14 +23,10 @@ module.exports = {
   ],
   output: {
     filename: 'index.js', // Adjust the output file name as needed
-    path: path.resolve(__dirname, 'dist'),
     library: { type: 'module' }, // Specify ESM output format
   },
   resolve: {
     extensions: ['.ts', '.js'],
-    fallback: {
-      path: require.resolve('path-browserify'),
-    },
   },
   optimization: {
     minimize: false,
diff --git a/extensions/huggingface-extension/download.bat b/extensions/model-extension/download.bat
similarity index 100%
rename from extensions/huggingface-extension/download.bat
rename to extensions/model-extension/download.bat
diff --git a/extensions/model-extension/package.json b/extensions/model-extension/package.json
index 0967e1632..0a3b57262 100644
--- a/extensions/model-extension/package.json
+++ b/extensions/model-extension/package.json
@@ -4,15 +4,23 @@
   "version": "1.0.30",
   "description": "Model Management Extension provides model exploration and seamless downloads",
   "main": "dist/index.js",
-  "module": "dist/module.js",
+  "node": "dist/node/index.cjs.js",
   "author": "Jan ",
   "license": "AGPL-3.0",
   "scripts": {
-    "build": "rollup -c rollup.config.ts",
-    "build:publish": "rimraf *.tgz --glob && yarn build && npm pack && cpx *.tgz ../../pre-install"
+    "build": "tsc --module commonjs && rollup -c rollup.config.ts --configPlugin @rollup/plugin-typescript --bundleConfigAsCjs",
+    "download:llama": "run-script-os",
+    "download:llama:linux": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz --wildcards '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
+    "download:llama:darwin": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
+    "download:llama:win32": "download.bat",
+    "build:publish:linux": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
+    "build:publish:darwin": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && ../../.github/scripts/auto-sign.sh && npm pack && cpx *.tgz ../../pre-install",
+    "build:publish:win32": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
+    "build:publish": "run-script-os"
   },
   "devDependencies": {
     "cpx": "^1.5.0",
+    "download-cli": "^1.1.1",
     "rimraf": "^3.0.2",
     "ts-loader": "^9.5.0",
     "typescript": "5.3.3",
@@ -20,6 +28,7 @@
     "@rollup/plugin-json": "^6.1.0",
     "@rollup/plugin-node-resolve": "^15.2.3",
     "@rollup/plugin-replace": "^5.0.5",
+    "@rollup/plugin-typescript": "^11.1.6",
     "@types/pdf-parse": "^1.1.4",
     "rollup": "^2.38.5",
     "rollup-plugin-define": "^1.0.1",
@@ -33,6 +42,7 @@
   ],
   "dependencies": {
     "@janhq/core": "file:../../core",
-    "path-browserify": "^1.0.1"
+    "@huggingface/gguf": "^0.0.11",
+    "python-shell": "^5.0.0"
   }
 }
diff --git a/extensions/model-extension/resources/default-model.json b/extensions/model-extension/resources/default-model.json
index f887a2b7a..f2e15d2c9 100644
--- a/extensions/model-extension/resources/default-model.json
+++ b/extensions/model-extension/resources/default-model.json
@@ -13,7 +13,7 @@
   "created": 0,
   "description": "User self import model",
   "settings": {
-    "ctx_len": 4096,
+    "ctx_len": 2048,
     "embedding": false,
     "prompt_template": "{system_message}\n### Instruction: {prompt}\n### Response:",
     "llama_model_path": "N/A"
diff --git a/extensions/model-extension/rollup.config.ts b/extensions/model-extension/rollup.config.ts
index 256b33add..abd12890e 100644
--- a/extensions/model-extension/rollup.config.ts
+++ b/extensions/model-extension/rollup.config.ts
@@ -20,10 +20,7 @@ export default [
       replace({
         preventAssignment: true,
         DEFAULT_MODEL: JSON.stringify(defaultModelJson),
-        MODULE_PATH: JSON.stringify(
-          `${packageJson.name}/${packageJson.module}`
-        ),
-        VERSION: JSON.stringify(packageJson.version),
+        NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`),
       }),
       // Allow json resolution
       json(),
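The `NODE` constant injected by the replace plugin mirrors the `NODE_MODULE_PATH` pattern of the deleted Hugging Face extension: renderer code passes the bundled node entry's module path to `executeOnMain`, which loads `dist/node/index.cjs.js` in the main process and invokes one of its exports. A rough sketch; the `convertHf` export name is carried over from the deleted extension and is illustrative here:

```ts
import { executeOnMain } from '@janhq/core'

// Injected at build time by @rollup/plugin-replace; resolves to
// '@janhq/model-extension/dist/node/index.cjs.js' per the "node" field above.
declare const NODE: string

// Sketch: run a conversion step in the Electron main process and await it.
async function convertOnMain(modelDirPath: string, outPath: string): Promise<void> {
  await executeOnMain(NODE, 'convertHf', modelDirPath, outPath)
}
```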
sys +from enum import IntEnum +from pathlib import Path +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast + +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +from convert import HfVocab + + +# check for any of the given keys in the dictionary and return the value of the first key found +def get_key_opts(d, keys): + for k in keys: + if k in d: + return d[k] + print(f"Could not find any of {keys}") + sys.exit() + + +###### MODEL DEFINITIONS ###### + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class Model: + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): + self.dir_model = dir_model + self.ftype = ftype + self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.is_safetensors = self._is_model_safetensors() + self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") + self.part_names = self._get_part_names() + self.hparams = Model.load_hparams(self.dir_model) + self.model_arch = self._get_model_architecture() + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + + def set_vocab(self): + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for part_name in self.part_names: + print(f"gguf: loading model part '{part_name}'") + ctx: ContextManager[Any] + if self.is_safetensors: + from safetensors import safe_open + ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + else: + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + + with ctx as model_part: + for name in model_part.keys(): + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + yield name, data + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams.get( + "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + )) + if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: + self.gguf_writer.add_context_length(n_ctx) + if (n_embd := self.hparams.get("hidden_size")) is not None: + self.gguf_writer.add_embedding_length(n_embd) + if (n_ff := self.hparams.get("intermediate_size")) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + if (n_head := self.hparams.get("num_attention_heads")) is not None: + self.gguf_writer.add_head_count(n_head) + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + + if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) + if (n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", 
self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + def write(self): + self.write_tensors() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file() + self.gguf_writer.close() + + def write_vocab(self): + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + @staticmethod + def count_model_parts(dir_model: Path, prefix: str) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.endswith(prefix): + num_parts += 1 + + return num_parts + + @staticmethod + def load_hparams(dir_model): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @staticmethod + def from_model_architecture(model_architecture): + if model_architecture == "GPTNeoXForCausalLM": + return GPTNeoXModel + if model_architecture == "BloomForCausalLM": + return BloomModel + if model_architecture == "MPTForCausalLM": + return MPTModel + if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return BaichuanModel + if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): + return FalconModel + if model_architecture == "GPTBigCodeForCausalLM": + return StarCoderModel + if model_architecture == "GPTRefactForCausalLM": + return RefactModel + if model_architecture == "PersimmonForCausalLM": + return PersimmonModel + if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + return StableLMModel + if model_architecture == "QWenLMHeadModel": + return QwenModel + if model_architecture == "Qwen2ForCausalLM": + return Model + if model_architecture == "MixtralForCausalLM": + return MixtralModel + if model_architecture == "GPT2LMHeadModel": + return GPT2Model + if model_architecture == "PhiForCausalLM": + return Phi2Model + if model_architecture == "PlamoForCausalLM": + return PlamoModel + if model_architecture == "CodeShellForCausalLM": + return CodeShellModel + if model_architecture == "OrionForCausalLM": + return OrionModel + if model_architecture == "InternLM2ForCausalLM": + return InternLM2Model + if model_architecture 
== "MiniCPMForCausalLM": + return MiniCPMModel + return Model + + def _is_model_safetensors(self) -> bool: + return Model.count_model_parts(self.dir_model, ".safetensors") > 0 + + def _get_part_names(self): + if self.is_safetensors: + if self.num_parts == 1: # there's only one .safetensors file + return ("model.safetensors",) + return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) + + if self.num_parts == 1: # there's only one .bin file + return ("pytorch_model.bin",) + return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) + + def _get_model_architecture(self) -> gguf.MODEL_ARCH: + arch = self.hparams["architectures"][0] + if arch == "GPTNeoXForCausalLM": + return gguf.MODEL_ARCH.GPTNEOX + if arch == "BloomForCausalLM": + return gguf.MODEL_ARCH.BLOOM + if arch == "MPTForCausalLM": + return gguf.MODEL_ARCH.MPT + if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return gguf.MODEL_ARCH.BAICHUAN + if arch in ("FalconForCausalLM", "RWForCausalLM"): + return gguf.MODEL_ARCH.FALCON + if arch == "GPTBigCodeForCausalLM": + return gguf.MODEL_ARCH.STARCODER + if arch == "GPTRefactForCausalLM": + return gguf.MODEL_ARCH.REFACT + if arch == "PersimmonForCausalLM": + return gguf.MODEL_ARCH.PERSIMMON + if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + return gguf.MODEL_ARCH.STABLELM + if arch == "QWenLMHeadModel": + return gguf.MODEL_ARCH.QWEN + if arch == "Qwen2ForCausalLM": + return gguf.MODEL_ARCH.QWEN2 + if arch == "MixtralForCausalLM": + return gguf.MODEL_ARCH.LLAMA + if arch == "GPT2LMHeadModel": + return gguf.MODEL_ARCH.GPT2 + if arch == "PhiForCausalLM": + return gguf.MODEL_ARCH.PHI2 + if arch == "PlamoForCausalLM": + return gguf.MODEL_ARCH.PLAMO + if arch == "CodeShellForCausalLM": + return gguf.MODEL_ARCH.CODESHELL + if arch == "OrionForCausalLM": + return gguf.MODEL_ARCH.ORION + if arch == "InternLM2ForCausalLM": + return gguf.MODEL_ARCH.INTERNLM2 + if arch == "MiniCPMForCausalLM": + return gguf.MODEL_ARCH.MINICPM + + raise NotImplementedError(f'Architecture "{arch}" not supported!') + + def _set_vocab_gpt2(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + pad_token = f"[PAD{i}]".encode('utf-8') + tokens.append(bytearray(pad_token)) + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_qwen(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + 
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + pad_token = f"[PAD{i}]".encode("utf-8") + tokens.append(bytearray(pad_token)) + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + 
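+        # (Editor's note, illustrative:) SpecialVocab inspects files in the
+        # model directory such as tokenizer_config.json and config.json to
+        # recover BOS/EOS/UNK/PAD ids; add_to_gguf() below then emits them as
+        # GGUF KV metadata, roughly equivalent to:
+        #     self.gguf_writer.add_bos_token_id(bos_id)  # ids are placeholders
+        #     self.gguf_writer.add_eos_token_id(eos_id)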
special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_hf(self): + path = self.dir_model + added_tokens_path = self.dir_model + vocab = HfVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +class GPTNeoXModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + +class BloomModel(Model): + def set_gguf_parameters(self): + self.gguf_writer.add_name("Bloom") + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams["n_layer"] + tensors = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + has_lm_head = True + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + for name, data_torch in tensors.items(): + if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): + has_lm_head = False + + name = re.sub(r'transformer\.', '', name) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) + data = np.concatenate( + ( + qkv_weights[:, 0, :, 
:].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + axis=0, + ) + print("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) + data = np.concatenate( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + axis=0, + ) + print("re-format attention.linear_qkv.bias") + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "word_embeddings.weight": + self.gguf_writer.add_tensor("output.weight", data) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + +class MPTModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layers"] + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + if "scales" in name: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) + if new_name is not None: + new_name = new_name.replace("scales", "act.scales") + else: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to 
float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + # note: MPT output is tied to (same as) wte in original model; + # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ + if new_name == "token_embd.weight": + self.gguf_writer.add_tensor("output.weight", data) + + +class OrionModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + +class BaichuanModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + for i in range(block_count): + if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: + print(f"Unpacking and permuting layer {i}") + model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ + self._reverse_hf_permute_part(w, 0, head_count, head_count) + model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ + self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) + model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ + self._reverse_hf_part(w, 2) + del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, 
try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] + + +class FalconModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_name("Falcon") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + head_dim = self.hparams["hidden_size"] // n_head + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value 
weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class StarCoderModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("StarCoder") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +class RefactModel(Model): + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("Refact") + # refact uses Alibi. So this is from config.json which might be used by training. 
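+        # (Editor's note, worked example:) the ff_dim arithmetic above is the
+        # LLaMA-style SwiGLU sizing: take 2/3 of 4*n_embd, then round up to a
+        # multiple of 256. For n_embd = 4096:
+        #     inner_dim  = 4 * 4096                       # 16384
+        #     hidden_dim = int(2 * 16384 / 3)             # 10922
+        #     ff_dim     = 256 * ((10922 + 255) // 256)   # 256 * 43 = 11008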
+ self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + block_count = self.hparams["n_layer"] + + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + tensors = dict(self.get_tensors()) + for i in range(block_count): + if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] + del tensors[f"transformer.h.{i}.attn.kv.weight"] + if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w + del tensors[f"transformer.h.{i}.attn.q.weight"] + if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: + tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] + tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] + del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + + for name, data_torch in tensors.items(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class PersimmonModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + head_count = self.hparams["num_attention_heads"] + head_count_kv = head_count + hidden_size = self.hparams["hidden_size"] + + self.gguf_writer.add_name('persimmon-8b-chat') + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + + # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller + # than the head size? + # ref: https://github.com/ggerganov/llama.cpp/pull/4889 + # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) + + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + def set_vocab(self): + self._set_vocab_sentencepiece() + # self.gguf_writer.add_bos_token_id(71013) + # self.gguf_writer.add_eos_token_id(71013) + + def write_tensors(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + if name.endswith(".self_attention.rotary_emb.inv_freq"): + continue + old_dtype = data_torch.dtype + # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) 
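+            # (Editor's note:) as a consequence of the TODO above, every
+            # Persimmon tensor is cast to float32 here no matter which
+            # --outtype was requested, e.g.:
+            #     data_torch.to(torch.float32).squeeze().numpy().dtype  # float32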
+ data = data_torch.to(torch.float32).squeeze().numpy() + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + n_dims = len(data.shape) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + +class StableLMModel(Model): + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(1e-5) + + +class MixtralModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + +class MiniCPMModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_name("MiniCPM") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def set_vocab(self): + self._set_vocab_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_head = self.hparams.get("num_attention_heads") + n_kv_head = self.hparams.get("num_key_value_heads") + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = 
self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class QwenModel(Model): + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + self._set_vocab_qwen() + + def set_gguf_parameters(self): + self.gguf_writer.add_name("Qwen") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + + def write_tensors(self): + block_count = self.hparams["num_hidden_layers"] + model_kv = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we 
use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + +class GPT2Model(Model): + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): + continue + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + # note: GPT2 output is tied to (same as) wte in original model + if new_name == "token_embd.weight": + print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor("output.weight", data) + + +class Phi2Model(Model): + def set_gguf_parameters(self): + block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"]) + + rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"]) + n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"]) + n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"]) + + self.gguf_writer.add_name("Phi2") + self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + +class PlamoModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name("PLaMo") + self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + + def shuffle_attn_q_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(8, 5, 128, 5120) + data_torch = torch.permute(data_torch, (1, 0, 2, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def shuffle_attn_output_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(5120, 8, 5, 128) + data_torch = torch.permute(data_torch, (0, 2, 1, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def write_tensors(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + if "self_attn.rotary_emb.inv_freq" in name: + continue + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = 
self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class CodeShellModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("CodeShell") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + tensors = dict(self.get_tensors()) + has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() + for name, data_torch in tensors.items(): + # we don't need these + if name.endswith((".attn.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "transformer.wte.weight": + self.gguf_writer.add_tensor("output.weight", data) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + +class InternLM2Model(Model): + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + print(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉" + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if "chat" in os.path.basename(self.dir_model.absolute()): + # For the chat model, we replace the eos with '<|im_end|>'. 
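+            # (Editor's note, illustrative:) _try_get_sft_eos() below probes
+            # the tokenizer for whichever end-of-turn token is registered:
+            #     tokenizer.encode('<|im_end|>')          # one id if registered
+            #     tokenizer.encode('[UNUSED_TOKEN_145]')  # several pieces otherwise
+            # Exactly one of the two must be a single token (the assert), and
+            # that id becomes the chat model's EOS.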
+ special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) + print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally.") + + special_vocab.add_to_gguf(self.gguf_writer) + + def _try_get_sft_eos(self, tokenizer): + unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.encode('<|im_end|>') + assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) + if len(unused_145_list) == 1: + eos_token = unused_145_list[0] + if len(im_end_list) == 1: + eos_token = im_end_list[0] + return eos_token + + def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("InternLM2") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + + def post_write_tensors(self, tensor_map, name, data_torch): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def write_tensors(self): + from einops import rearrange + + num_heads = self.hparams.get("num_attention_heads") + num_kv_heads = self.hparams.get("num_key_value_heads") + hidden_size = self.hparams.get("hidden_size") + q_per_kv = num_heads // num_kv_heads + head_dim = hidden_size // num_heads + num_groups = num_heads // q_per_kv + + block_count = self.hparams["num_hidden_layers"] + model_kv = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + if re.match(qkv_pattern, name): + bid = re.findall(qkv_pattern, name)[0] + qkv = data_torch + qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) + q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + # The model weights of q and k equire additional reshape. + q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) + v = rearrange(v, " o g n i -> o (g n i)").T + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) + else: + self.post_write_tensors(tensor_map, name, data_torch) + + +###### CONVERSION LOGIC ###### + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file") + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16"], default="f16", + help="output format - use f32 for float32, f16 for float16", + ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + parser.add_argument( + "model", type=Path, + help="directory containing model file", + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + dir_model = args.model + + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at 
{tmp_model_path}.") + + if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file=sys.stderr) + sys.exit(1) + + ftype_map = { + "f32": gguf.GGMLQuantizationType.F32, + "f16": gguf.GGMLQuantizationType.F16, + } + + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' + + print(f"Loading model: {dir_model.name}") + + hparams = Model.load_hparams(dir_model) + + with torch.inference_mode(): + model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) + + print("Set model parameters") + model_instance.set_gguf_parameters() + + print("Set model tokenizer") + model_instance.set_vocab() + + if args.vocab_only: + print(f"Exporting model vocab to '{fname_out}'") + model_instance.write_vocab() + else: + print(f"Exporting model to '{fname_out}'") + model_instance.write() + + print(f"Model successfully exported to '{fname_out}'") + + +if __name__ == '__main__': + main() diff --git a/extensions/model-extension/scripts/convert.py b/extensions/model-extension/scripts/convert.py new file mode 100755 index 000000000..323e8058d --- /dev/null +++ b/extensions/model-extension/scripts/convert.py @@ -0,0 +1,1478 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import concurrent.futures +import enum +import faulthandler +import functools +import itertools +import json +import math +import mmap +import os +import pickle +import re +import signal +import struct +import sys +import time +import zipfile +from abc import ABCMeta, abstractmethod +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from dataclasses import dataclass +from pathlib import Path +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar + +import numpy as np +from sentencepiece import SentencePieceProcessor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +if TYPE_CHECKING: + from typing import TypeAlias + +if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): + faulthandler.register(signal.SIGUSR1) + +NDArray: TypeAlias = 'np.ndarray[Any, Any]' + +ARCH = gguf.MODEL_ARCH.LLAMA + +DEFAULT_CONCURRENCY = 8 + +# +# data types +# + + +@dataclass(frozen=True) +class DataType: + name: str + dtype: np.dtype[Any] + valid_conversions: list[str] + + def elements_to_bytes(self, n_elements: int) -> int: + return n_elements * self.dtype.itemsize + + +@dataclass(frozen=True) +class UnquantizedDataType(DataType): + pass + + +DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) +DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) +DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) +DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) + + +@dataclass(frozen=True) +class QuantizedDataType(DataType): + block_size: int + quantized_dtype: np.dtype[Any] + ggml_type: gguf.GGMLQuantizationType + + def quantize(self, arr: NDArray) -> NDArray: + raise NotImplementedError(f'Quantization for {self.name} not implemented') + + def elements_to_bytes(self, n_elements: int) -> int: + assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for 
{self.name} with block size {self.block_size}'
+        return self.quantized_dtype.itemsize * (n_elements // self.block_size)
+
+
+@dataclass(frozen=True)
+class Q8_0QuantizedDataType(QuantizedDataType):
+    # Mini Q8_0 quantization in Python!
+    def quantize(self, arr: NDArray) -> NDArray:
+        assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
+        assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
+        n_blocks = arr.size // self.block_size
+        blocks = arr.reshape((n_blocks, self.block_size))
+        # Much faster implementation of block quantization contributed by @Cebtenzzre
+
+        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
+            d = abs(blocks).max(axis = 1) / np.float32(127)
+            with np.errstate(divide = 'ignore'):
+                qs = (blocks / d[:, None]).round()
+                qs[d == 0] = 0
+            yield from zip(d, qs)
+        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
+
+
+DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
+                                dtype = np.dtype(np.float32), valid_conversions = [],
+                                ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
+                                quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
+
+# Quantized types skipped here because they have no direct numpy dtype.
+NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
+for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
+    if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
+        raise ValueError(f'Invalid duplicate data type {dt}')
+    NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
+
+SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
+    'BF16': DT_BF16,
+    'F16': DT_F16,
+    'F32': DT_F32,
+    'I32': DT_I32,
+}
+
+
+class GGMLFileType(enum.IntEnum):
+    AllF32     = 0
+    MostlyF16  = 1  # except 1d tensors
+    MostlyQ8_0 = 7  # except 1d tensors
+
+    def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
+        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
+        if dt is None:
+            raise ValueError(self)
+        # 1D tensors are always F32.
+        return dt if len(tensor.shape) > 1 else DT_F32
+
+
+GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
+    GGMLFileType.AllF32    : DT_F32,
+    GGMLFileType.MostlyF16 : DT_F16,
+    GGMLFileType.MostlyQ8_0: DT_Q8_0,
+}
+
+#
+# hparams loading
+#
+
+
+@dataclass
+class Params:
+    n_vocab:    int
+    n_embd:     int
+    n_layer:    int
+    n_ctx:      int
+    n_ff:       int
+    n_head:     int
+    n_head_kv:  int
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None
+
+    rope_scaling_type: gguf.RopeScalingType | None = None
+    f_rope_freq_base: float | None = None
+    f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None
+
+    ftype: GGMLFileType | None = None
+
+    # path to the directory containing the model files
+    path_model: Path | None = None
+
+    @staticmethod
+    def guessed(model: LazyModel) -> Params:
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+        else:
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. 
This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + + n_head = n_embd // 128 # guessed + n_mult = 256 # guessed + + # TODO: verify this + n_ff = int(2 * (4 * n_embd) / 3) + n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) + + return Params( + n_vocab = n_vocab, + n_embd = n_embd, + n_layer = n_layer, + n_ctx = -1, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head, + f_norm_eps = 1e-5, + ) + + @staticmethod + def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: + config = json.load(open(config_path)) + + rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None + rope_scaling = config.get("rope_scaling") + + if rope_scaling is not None and (typ := rope_scaling.get("type")): + rope_factor = rope_scaling.get("factor") + f_rope_scale = rope_factor + if typ == "linear": + rope_scaling_type = gguf.RopeScalingType.LINEAR + elif typ == "yarn": + rope_scaling_type = gguf.RopeScalingType.YARN + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = rope_scaling['finetuned'] + else: + raise NotImplementedError(f'Unknown rope scaling type: {typ}') + + if "max_sequence_length" in config: + n_ctx = config["max_sequence_length"] + elif "max_position_embeddings" in config: + n_ctx = config["max_position_embeddings"] + else: + raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + + n_experts = None + n_experts_used = None + + if "num_local_experts" in config: + n_experts = config["num_local_experts"] + n_experts_used = config["num_experts_per_tok"] + + return Params( + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, + ) + + # LLaMA v2 70B params.json + # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} + @staticmethod + def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: + config = json.load(open(config_path)) + + n_experts = None + n_experts_used = None + f_rope_freq_base = None + + # hack to determine LLaMA v1 vs v2 vs CodeLlama + if config.get("moe"): + # Mixtral + n_ctx = 32768 + elif config.get("rope_theta") == 1000000: + # CodeLlama + n_ctx = 16384 + elif config["norm_eps"] == 1e-05: + # LLaMA v2 + n_ctx = 4096 + else: + # LLaMA v1 + n_ctx = 2048 + + if "layers.0.feed_forward.w1.weight" in model: + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] + + if config.get("moe"): + n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] + n_experts = config["moe"]["num_experts"] + n_experts_used = config["moe"]["num_experts_per_tok"] + f_rope_freq_base = 1e6 + + return Params( + n_vocab = model["tok_embeddings.weight"].shape[0], + n_embd = config["dim"], + n_layer = config["n_layers"], + n_ctx = n_ctx, + n_ff = n_ff, + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", 
n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), + ) + + @staticmethod + def load(model_plus: ModelPlus) -> Params: + hf_config_path = model_plus.paths[0].parent / "config.json" + orig_config_path = model_plus.paths[0].parent / "params.json" + + if hf_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) + elif orig_config_path.exists(): + params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) + elif model_plus.format != 'none': + params = Params.guessed(model_plus.model) + else: + raise ValueError('Cannot guess params when model format is none') + + params.path_model = model_plus.paths[0].parent + + return params + + +# +# vocab +# + +class BpeVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) + if isinstance(self.bpe_tokenizer.get('model'), dict): + self.vocab = self.bpe_tokenizer["model"]["vocab"] + else: + self.vocab = self.bpe_tokenizer + added_tokens: dict[str, int] + if fname_added_tokens is not None: + # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. + added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) + else: + # Fall back to trying to find the added tokens in tokenizer.json + tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' + if not tokenizer_json_file.is_file(): + added_tokens = {} + else: + tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) + added_tokens = dict( + (item['content'], item['id']) + for item in tokenizer_json.get('added_tokens', []) + # Added tokens here can be duplicates of the main vocabulary. 
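+                    # (such duplicates keep their base-vocabulary IDs, which would
+                    # otherwise break the sequential-ID check below)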
+                    if item['content'] not in self.bpe_tokenizer)
+
+        vocab_size: int = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
+
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict = added_tokens
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
+
+        for i, _ in enumerate(self.vocab):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            added_tokens = {}
+
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+
+        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict = added_tokens
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.is_unknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.is_control(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.is_unused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.is_byte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class HfVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use HfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        print("fname_tokenizer:", fname_tokenizer)
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            fname_tokenizer,
+            cache_dir=fname_tokenizer,
+            local_files_only=True,
+        )
+
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        self.specials = {
+            tok: self.tokenizer.get_vocab()[tok]
+            for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids = set(self.tokenizer.all_special_ids)
+
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
+
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
+
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
+            )
+
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
+        # Determine token type based on whether it's a special token
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+
+    def get_token_score(self, token_id: int) -> float:
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            if text in self.specials:
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
+                score = self.get_token_score(self.specials[text])
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+
+
+#
+# data loading
+# TODO: reuse (probably move to gguf.py?)
+#
+
+
+def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+    if n_head_kv is not None and n_head != n_head_kv:
+        n_head = n_head_kv
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+class Tensor(metaclass=ABCMeta):
+    data_type: DataType
+
+    @abstractmethod
+    def astype(self, data_type: DataType) -> Tensor: ...
+    @abstractmethod
+    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+    @abstractmethod
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+    @abstractmethod
+    def part(self, n_part: int) -> UnquantizedTensor: ...
+    @abstractmethod
+    def to_ggml(self) -> GGMLCompatibleTensor: ...
+
+
+def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    fp32_arr = bf16_arr.astype(np.uint32) << 16
+    return fp32_arr.view(np.float32)
+
+
+class UnquantizedTensor(Tensor):
+    def __init__(self, ndarray: NDArray) -> None:
+        assert isinstance(ndarray, np.ndarray)
+        self.ndarray = ndarray
+        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
+
+    def astype(self, data_type: DataType) -> Tensor:
+        dtype = data_type.dtype
+        if self.data_type == DT_BF16:
+            self.ndarray = bf16_to_fp32(self.ndarray)
+        return UnquantizedTensor(self.ndarray.astype(dtype))
+
+    def to_ggml(self) -> UnquantizedTensor:
+        return self
+
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
+
+    def part(self, n_part: int) -> UnquantizedTensor:
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
+
+    def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
+
+
+def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
+    tensor = lazy_tensor.load()
+    assert isinstance(tensor, UnquantizedTensor)
+
+    # double-check:
+    actual_shape = list(tensor.ndarray.shape)
+    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
+    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
+        if convert:
+            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
+        else:
+            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
+
+    return tensor.ndarray
+
+
+GGMLCompatibleTensor = UnquantizedTensor
+
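+
+# Example (a sketch) of the bf16 widening above: bfloat16 keeps the high 16
+# bits of an IEEE-754 float32, so shifting left by 16 restores the float32 bits:
+#   bf16_to_fp32(np.array([0x3F80], dtype=np.uint16))  # -> array([1.], dtype=float32)
+# (0x3F800000 is the float32 bit pattern for 1.0.)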
+ +@dataclass +class LazyTensor: + _load: Callable[[], Tensor] + shape: list[int] + data_type: DataType + description: str + + def load(self) -> Tensor: + ret = self._load() + # Should be okay if it maps to the same numpy type? + assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ + (self.data_type, ret.data_type, self.description) + return ret + + def astype(self, data_type: DataType) -> LazyTensor: + self.validate_conversion_to(data_type) + + def load() -> Tensor: + return self.load().astype(data_type) + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') + + def validate_conversion_to(self, data_type: DataType) -> None: + if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions: + raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') + + +LazyModel: TypeAlias = 'dict[str, LazyTensor]' + + +@dataclass +class ModelPlus: + model: LazyModel + paths: list[Path] # Where this was read from. + format: Literal['ggml', 'torch', 'safetensors', 'none'] + vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. + + +def merge_sharded(models: list[LazyModel]) -> LazyModel: + # Original LLaMA models have each file contain one part of each tensor. + # Use a dict instead of a set to preserve order. + names = {name: None for model in models for name in model} + + def convert(name: str) -> LazyTensor: + lazy_tensors: list[LazyTensor] = [model[name] for model in models] + if len(lazy_tensors) == 1: + # only one file; don't go through this procedure since there might + # be quantized tensors + return lazy_tensors[0] + if len(lazy_tensors[0].shape) == 1: + # the tensor is just duplicated in every file + return lazy_tensors[0] + if name.startswith('tok_embeddings.') or \ + name.endswith('.attention.wo.weight') or \ + name.endswith('.feed_forward.w2.weight'): + # split by columns + axis = 1 + else: + # split by rows + axis = 0 + concatenated_shape = list(lazy_tensors[0].shape) + concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) + + def load() -> UnquantizedTensor: + ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] + concatenated: NDArray = np.concatenate(ndarrays, axis=axis) + return UnquantizedTensor(concatenated) + description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' + return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) + return {name: convert(name) for name in names} + + +def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: + formats = set(mp.format for mp in models_plus) + assert len(formats) == 1, "different formats?" + format = formats.pop() + paths = [path for mp in models_plus for path in mp.paths] + # Use the first non-None vocab, if any. + try: + vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None) + except StopIteration: + vocab = None + + if any("model.embed_tokens.weight" in mp.model for mp in models_plus): + # Transformers models put different tensors in different files, but + # don't split individual tensors between files. 
+ model: LazyModel = {} + for mp in models_plus: + model.update(mp.model) + else: + model = merge_sharded([mp.model for mp in models_plus]) + + return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types + + +def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().permute(n_head, n_head_kv) + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + + +def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) + s = lazy_tensor.shape.copy() + s[0] = s[0] // 3 + return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + + +def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().part(n_part) + s = lazy_tensor.shape.copy() + s[0] = s[0] // 3 + return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) + + +# Functionality that simulates `torch.load` but where individual tensors are +# only loaded into memory on demand, not all at once. +# PyTorch can't do this natively as of time of writing: +# - https://github.com/pytorch/pytorch/issues/64327 +# This allows us to de-shard without multiplying RAM usage, and also +# conveniently drops the PyTorch dependency (though we still need numpy). + + +@dataclass +class LazyStorageKind: + data_type: DataType + + +@dataclass +class LazyStorage: + load: Callable[[int, int], NDArray] + kind: LazyStorageKind + description: str + + +class LazyUnpickler(pickle.Unpickler): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): + super().__init__(fp) + self.data_base_path = data_base_path + self.zip_file = zip_file + + def persistent_load(self, pid: Any) -> Any: + assert pid[0] == 'storage' + assert isinstance(pid[1], LazyStorageKind) + data_type = pid[1].data_type + filename_stem = pid[2] + filename = f'{self.data_base_path}/{filename_stem}' + info = self.zip_file.getinfo(filename) + + def load(offset: int, elm_count: int) -> NDArray: + dtype = data_type.dtype + fp = self.zip_file.open(info) + fp.seek(offset * dtype.itemsize) + size = elm_count * dtype.itemsize + data = fp.read(size) + assert len(data) == size + return np.frombuffer(data, dtype) + description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' + return LazyStorage(load=load, kind=pid[1], description=description) + + @staticmethod + def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, + requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: + assert isinstance(storage, LazyStorage) + + def load() -> UnquantizedTensor: + elm_count = stride[0] * size[0] + return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) + description = f'pickled storage_offset={storage_offset} in {storage.description}' + return LazyTensor(load, list(size), storage.kind.data_type, description) + + @staticmethod + def rebuild_from_type_v2(func, new_type, args, state): + return func(*args) + + CLASSES: dict[tuple[str, str], Any] = { + # getattr used here as a workaround for mypy not being smart enough to determine + # the staticmethods have a __func__ attribute. 
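+        # Maps (module, name) pairs found in the pickle stream to lazy stand-ins,
+        # so tensor storages are never materialized while unpickling.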
+        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
+        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
+    }
+
+    def find_class(self, module: str, name: str) -> Any:
+        if not module.startswith('torch'):
+            return super().find_class(module, name)
+        return self.CLASSES[(module, name)]
+
+
+def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
+    zf = zipfile.ZipFile(outer_fp)
+    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
+    assert len(pickle_paths) == 1, pickle_paths
+    pickle_fp = zf.open(pickle_paths[0], 'r')
+    unpickler = LazyUnpickler(pickle_fp,
+                              data_base_path=pickle_paths[0][:-4],
+                              zip_file=zf)
+    model = unpickler.load()
+    if 'model' in model: model = model['model']
+    as_dict = dict(model.items())
+    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
+
+
+def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
+    header_size, = struct.unpack('<Q', must_read(fp, 8))
+    header: dict[str, dict[str, Any]] = json.loads(must_read(fp, header_size))
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    byte_buf = mapped[8 + header_size:]
+
+    def convert(info: dict[str, Any]) -> LazyTensor:
+        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
+        numpy_dtype = data_type.dtype
+        shape: list[int] = info['shape']
+        begin, end = info['data_offsets']
+        assert 0 <= begin <= end <= len(byte_buf)
+        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
+        buf = byte_buf[begin:end]
+
+        def load() -> UnquantizedTensor:
+            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
+        return LazyTensor(load, shape, data_type, description)
+    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
+    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
+
+
+def must_read(fp: IO[bytes], length: int) -> bytes:
+    ret = fp.read(length)
+    if len(ret) < length:
+        raise Exception("unexpectedly reached end of file")
+    return ret
+
+
+@functools.lru_cache(maxsize=None)
+def lazy_load_file(path: Path) -> ModelPlus:
+    fp = open(path, 'rb')
+    first8 = fp.read(8)
+    fp.seek(0)
+    if first8[:2] == b'PK':
+        # A zip file, i.e. PyTorch format
+        return lazy_load_torch_file(fp, path)
+    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
+        # Most likely a safetensors file
+        return lazy_load_safetensors_file(fp, path)
+    else:
+        raise ValueError(f"unknown format: {path}")
+
+
+In = TypeVar('In')
+Out = TypeVar('Out')
+
+
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int,
+                         max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
+    '''Parallel map, but with backpressure. If the caller doesn't call `next`
+    fast enough, this will stop calling `func` at some point rather than
+    letting results pile up in memory. Specifically, there is a max of one
+    output value buffered per thread.'''
+    if concurrency < 2:
+        yield from map(func, iterable)
+        # Not reached.
+    iterable = iter(iterable)
+    executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
+    if use_processpool_executor:
+        executor_class = ProcessPoolExecutor
+    else:
+        executor_class = ThreadPoolExecutor
+    with executor_class(max_workers=max_workers) as executor:
+        futures: list[concurrent.futures.Future[Out]] = []
+        done = False
+        for _ in range(concurrency):
+            try:
+                futures.append(executor.submit(func, next(iterable)))
+            except StopIteration:
+                done = True
+                break
+
+        while futures:
+            result = futures.pop(0).result()
+            while not done and len(futures) < concurrency:
+                try:
+                    futures.append(executor.submit(func, next(iterable)))
+                except StopIteration:
+                    done = True
+                    break
+            yield result
+
+
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
+    # Handle special case where the model's vocab size is not set
+    if params.n_vocab == -1:
+        raise ValueError(
+            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+        )
+
+    # Check for a vocab size mismatch
+    if params.n_vocab == vocab.vocab_size:
+        print("Ignoring added_tokens.json since model matches vocab size without it.")
+        return
+
+    if pad_vocab and params.n_vocab > vocab.vocab_size:
+        pad_count = params.n_vocab - vocab.vocab_size
+        print(
+            f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
+        )
+        for i in range(1, pad_count + 1):
+            vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
+            vocab.added_tokens_list.append(f"<dummy{i:05}>")
+        vocab.vocab_size = params.n_vocab
+        return
+
+    msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
+    if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
+        msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+    if vocab.vocab_size < params.n_vocab:
+        msg += " Add the --pad-vocab option and try again."
+ + raise Exception(msg) + + +class OutputFile: + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) + + def add_meta_arch(self, params: Params) -> None: + name = "LLaMA" + + # TODO: better logic to determine model name + if params.n_ctx == 4096: + name = "LLaMA v2" + elif params.path_model is not None: + name = str(params.path_model.parent).split('/')[-1] + + self.gguf.add_name (name) + self.gguf.add_context_length (params.n_ctx) + self.gguf.add_embedding_length (params.n_embd) + self.gguf.add_block_count (params.n_layer) + self.gguf.add_feed_forward_length (params.n_ff) + self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) + self.gguf.add_head_count (params.n_head) + self.gguf.add_head_count_kv (params.n_head_kv) + + if params.n_experts: + self.gguf.add_expert_count(params.n_experts) + + if params.n_experts_used: + self.gguf.add_expert_used_count(params.n_experts_used) + + if params.f_norm_eps: + self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + else: + raise ValueError('f_norm_eps is None') + + if params.f_rope_freq_base is not None: + self.gguf.add_rope_freq_base(params.f_rope_freq_base) + + if params.rope_scaling_type: + assert params.f_rope_scale is not None + self.gguf.add_rope_scaling_type(params.rope_scaling_type) + self.gguf.add_rope_scaling_factor(params.f_rope_scale) + + if params.n_orig_ctx is not None: + self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) + + if params.rope_finetuned is not None: + self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) + + if params.ftype is not None: + self.gguf.add_file_type(params.ftype) + + def handle_tokenizer_model(self, vocab: Vocab) -> str: + # Map the vocab types to the supported tokenizer models + tokenizer_model = { + SentencePieceVocab: "llama", + HfVocab: "llama", + BpeVocab: "gpt2", + }.get(type(vocab)) + + # Block if vocab type is not predefined + if tokenizer_model is None: + raise ValueError("Unknown vocab type: Not supported") + + return tokenizer_model + + def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: + tokens = [] + scores = [] + toktypes = [] + + # NOTE: `all_tokens` returns the base vocabulary and added tokens + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + return tokens, scores, toktypes + + def add_meta_vocab(self, vocab: Vocab) -> None: + # Handle the tokenizer model + tokenizer_model = self.handle_tokenizer_model(vocab) + + # Ensure that tokenizer_model is added to the GGUF model + self.gguf.add_tokenizer_model(tokenizer_model) + + # Extract model vocabulary for model conversion + tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) + + # Add extracted token information for model conversion + self.gguf.add_token_list(tokens) + self.gguf.add_token_scores(scores) + self.gguf.add_token_types(toktypes) + + def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None: + svocab.add_to_gguf(self.gguf) + + def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: + n_elements = int(np.prod(tensor.shape)) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype + data_nbytes = tensor.data_type.elements_to_bytes(n_elements) + self.gguf.add_tensor_info(name, 
tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) + + def write_meta(self) -> None: + self.gguf.write_header_to_file() + self.gguf.write_kv_data_to_file() + + def write_tensor_info(self) -> None: + self.gguf.write_ti_data_to_file() + + def close(self) -> None: + self.gguf.close() + + @staticmethod + def write_vocab_only( + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab = pad_vocab) + + of = OutputFile(fname_out, endianess=endianess) + + # meta data + of.add_meta_arch(params) + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + + of.write_meta() + + of.close() + + @staticmethod + def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]: + name, lazy_tensor = item + tensor = lazy_tensor.load().to_ggml() + return (lazy_tensor.data_type, tensor.ndarray) + + @staticmethod + def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: + dt, arr = item + if not isinstance(dt, QuantizedDataType): + return arr + return dt.quantize(arr) + + @staticmethod + def write_all( + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab=pad_vocab) + + of = OutputFile(fname_out, endianess=endianess) + + # meta data + of.add_meta_arch(params) + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + + # tensor info + for name, lazy_tensor in model.items(): + of.add_tensor_info(name, lazy_tensor) + + of.write_meta() + of.write_tensor_info() + + # tensor data + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map( + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, + use_processpool_executor=True, + ) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) + + start = time.time() + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + elapsed = time.time() - start + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print( + f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" + ) + of.gguf.write_tensor_data(ndarray) + + of.close() + + +def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: + wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type + + if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): + return GGMLFileType.AllF32 + if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): + return GGMLFileType.MostlyF16 + if output_type_str == "q8_0": + return GGMLFileType.MostlyQ8_0 + + name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} + + raise Exception(f"Unexpected combination of types: {name_to_type}") + + +def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: + return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) + for (name, tensor) in model.items()} + + +def convert_model_names(model: LazyModel, params: Params) -> LazyModel: 
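+    '''Rename tensors to the GGUF naming scheme, permuting HF q/k projection
+    weights and splitting Baichuan-style W_pack tensors into q/k/v first.'''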
+ tmap = gguf.TensorNameMap(ARCH, params.n_layer) + should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) + + tmp = model + + # HF models permut or pack some of the tensors, so we need to undo that + for i in itertools.count(): + if f"model.layers.{i}.self_attn.q_proj.weight" in model: + print(f"Permuting layer {i}") + tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) + tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) + # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + elif f"model.layers.{i}.self_attn.W_pack.weight" in model: + print(f"Unpacking and permuting layer {i}") + tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) + tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) + tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) + del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] + else: + break + + out: LazyModel = {} + for name, lazy_tensor in model.items(): + tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) + if name_new is None: + raise Exception(f"Unexpected tensor name: {name}") + + if tensor_type in should_skip: + print(f"skipping tensor {name_new}") + continue + + print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") + out[name_new] = lazy_tensor + + return out + + +def nth_multifile_path(path: Path, n: int) -> Path | None: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the nth path in the model. + ''' + # Support the following patterns: + patterns: list[tuple[str, str]] = [ + # - x.00.pth, x.01.pth, etc. + (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), + # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. + (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), + # x.bin, x.bin.1, etc. + (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') + ] + for regex, replacement in patterns: + if re.search(regex, path.name): + new_path = path.with_name(re.sub(regex, replacement, path.name)) + if new_path.exists(): + return new_path + return None + + +def find_multifile_paths(path: Path) -> list[Path]: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the whole list of paths in the model. + ''' + ret: list[Path] = [] + for i in itertools.count(): + nth_path = nth_multifile_path(path, i) + if nth_path is None: + break + ret.append(nth_path) + if not ret: + # No matches. This should only happen if the file was named, e.g., + # foo.0, and there was no file named foo. Oh well, try to process it + # as a single file. 
+ return [path] + return ret + + +def load_some_model(path: Path) -> ModelPlus: + '''Load a model of any supported format.''' + # Be extra-friendly and accept either a file or a directory: + if path.is_dir(): + # Check if it's a set of safetensors files first + globs = ["model-00001-of-*.safetensors", "model.safetensors"] + files = [file for glob in globs for file in path.glob(glob)] + if not files: + # Try the PyTorch patterns too, with lower priority + globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] + files = [file for glob in globs for file in path.glob(glob)] + if not files: + raise Exception(f"Can't find model in directory {path}") + if len(files) > 1: + raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") + path = files[0] + + paths = find_multifile_paths(path) + models_plus: list[ModelPlus] = [] + for path in paths: + print(f"Loading model file {path}") + models_plus.append(lazy_load_file(path)) + + model_plus = merge_multifile_models(models_plus) + return model_plus + + +class VocabFactory: + def __init__(self, path: Path): + self.path = path + self.files: dict[str, Path | None] = { + "tokenizer.model": None, + "vocab.json": None, + "tokenizer.json": None, + } + self._detect_files() + + def _detect_files(self): + for file in self.files.keys(): + file_path = self.path / file + parent_file_path = self.path.parent / file + if file_path.exists(): + self.files[file] = file_path + elif parent_file_path.exists(): + self.files[file] = parent_file_path + print(f"Found vocab files: {self.files}") + + def _select_file(self, vocabtype: str | None) -> Path: + if vocabtype in ["spm", "bpe"]: + for file_key in self.files.keys(): + if (file := self.files[file_key]) is not None: + return file + raise FileNotFoundError(f"{vocabtype} vocab not found.") + if vocabtype == "hfft": + # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file + return self.path + raise ValueError(f"Unsupported vocabulary type {vocabtype}") + + def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: + load_merges = vocabtype == "bpe" + n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None + return gguf.SpecialVocab( + model_parent_path, + load_merges=load_merges, + special_token_types=None, # Predetermined or passed as a parameter + n_vocab=n_vocab, + ) + + def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + path = self._select_file(vocabtype) + print(f"Loading vocab file '{path}', type '{vocabtype}'") + + added_tokens_path = path.parent / "added_tokens.json" + vocab: Vocab + if vocabtype == "bpe": + vocab = BpeVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + elif vocabtype == "spm": + vocab = SentencePieceVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + elif vocabtype == "hfft": + vocab = HfVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + else: + raise ValueError(f"Unsupported vocabulary type {vocabtype}") + # FIXME: Respect --vocab-dir? 
+ special_vocab = self._create_special_vocab( + vocab, + vocabtype, + model_parent_path, + ) + return vocab, special_vocab + + +def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: + namestr = { + GGMLFileType.AllF32: "f32", + GGMLFileType.MostlyF16: "f16", + GGMLFileType.MostlyQ8_0:"q8_0", + }[file_type] + ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" + if ret in model_paths: + sys.stderr.write( + f"Error: Default output path ({ret}) would overwrite the input. " + "Please explicitly specify a path using --outfile.\n") + sys.exit(1) + return ret + + +def do_dump_model(model_plus: ModelPlus) -> None: + print(f"model_plus.paths = {model_plus.paths!r}") + print(f"model_plus.format = {model_plus.format!r}") + print(f"model_plus.vocab = {model_plus.vocab!r}") + for name, lazy_tensor in model_plus.model.items(): + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") + + +def main(args_in: list[str] | None = None) -> None: + output_choices = ["f32", "f16"] + if np.uint32(1) == np.uint32(1).newbyteorder("<"): + # We currently only support Q8_0 output on little endian systems. + output_choices.append("q8_0") + vocab_types = ["spm", "bpe", "hfft"] + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) + parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") + + args = parser.parse_args(args_in) + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] + tmp_model_path = args.model / "weighted_model" + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at {tmp_model_path}.") + args.model = tmp_model_path + + if args.dump_single: + model_plus = 
lazy_load_file(args.model) + do_dump_model(model_plus) + return + + if not args.vocab_only: + model_plus = load_some_model(args.model) + else: + model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) + + if args.dump: + do_dump_model(model_plus) + return + endianess = gguf.GGUFEndian.LITTLE + if args.big_endian: + endianess = gguf.GGUFEndian.BIG + + params = Params.load(model_plus) + if params.n_ctx == -1: + if args.ctx is None: + raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n" + "Please specify one with --ctx:\n" + " - LLaMA v1: --ctx 2048\n" + " - LLaMA v2: --ctx 4096\n") + params.n_ctx = args.ctx + + if args.outtype: + params.ftype = { + "f32": GGMLFileType.AllF32, + "f16": GGMLFileType.MostlyF16, + "q8_0": GGMLFileType.MostlyQ8_0, + }[args.outtype] + + print(f"params = {params}") + + model_parent_path = model_plus.paths[0].parent + vocab_path = Path(args.vocab_dir or args.model or model_parent_path) + vocab_factory = VocabFactory(vocab_path) + vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) + + if args.vocab_only: + if not args.outfile: + raise ValueError("need --outfile if using --vocab-only") + outfile = args.outfile + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, + endianess=endianess, pad_vocab=args.pad_vocab) + print(f"Wrote {outfile}") + return + + if model_plus.vocab is not None and args.vocab_dir is None: + vocab = model_plus.vocab + + print(f"Vocab info: {vocab}") + print(f"Special vocab info: {special_vocab}") + + model = model_plus.model + model = convert_model_names(model, params) + ftype = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, ftype) + outfile = args.outfile or default_outfile(model_plus.paths, ftype) + + params.ftype = ftype + print(f"Writing {outfile}, format {ftype}") + + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) + print(f"Wrote {outfile}") + + +if __name__ == '__main__': + main() diff --git a/extensions/model-extension/scripts/gguf-py/LICENSE b/extensions/model-extension/scripts/gguf-py/LICENSE new file mode 100644 index 000000000..76f67efdc --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Georgi Gerganov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
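
For reference, a typical invocation of the `convert.py` script added above might look like this (a sketch; flags as defined in its `parse_args`, model path hypothetical):

```sh
# Writes an f16 GGUF next to the model unless --outfile is given.
python extensions/model-extension/scripts/convert.py /path/to/model \
    --outtype f16 --vocab-type spm
```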
diff --git a/extensions/model-extension/scripts/gguf-py/README.md b/extensions/model-extension/scripts/gguf-py/README.md new file mode 100644 index 000000000..22d7ffa52 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/README.md @@ -0,0 +1,81 @@ +## gguf + +This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) +(GGML Universal File) format. + +See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) +for an example of its usage. + +## Installation +```sh +pip install gguf +``` + +## API Examples/Simple Tools + +[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. + +[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console. + +[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key. + +[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files. + +## Development +Maintainers who participate in the development of this package are advised to install it in editable mode: + +```sh +cd /path/to/llama.cpp/gguf-py + +pip install --editable . +``` + +**Note**: This may require upgrading your pip installation, indicated by a message saying that editable installation currently requires `setup.py`. +In that case, upgrade pip to the latest version: + +```sh +pip install --upgrade pip +``` + +## Automatic publishing with CI + +There is a GitHub workflow that automatically creates a release when a tag in the expected format is pushed. + +1. Bump the version in `pyproject.toml`. +2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number. + +```sh +git tag -a gguf-v1.0.0 -m "Version 1.0 release" +``` + +3. Push the tags. + +```sh +git push origin --tags +``` + +## Manual publishing +If you want to publish the package manually for any reason, you need to have `twine` and `build` installed: + +```sh +pip install build twine +``` + +Then, follow these steps to release a new version: + +1. Bump the version in `pyproject.toml`. +2. Build the package: + +```sh +python -m build +``` + +3. Upload the generated distribution archives: + +```sh +python -m twine upload dist/* +``` + +## TODO +- [ ] Add tests +- [ ] Include conversion scripts as command line entry points in this package. 
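Alongside the write-side example the README points to (and which follows next), here is a minimal read-side sketch using the `GGUFReader` defined later in this diff; the file name is a placeholder:

```python
# Minimal sketch: print metadata fields and tensor info from a GGUF file.
from gguf import GGUFReader

reader = GGUFReader("example.gguf")  # placeholder path; mode defaults to read-only
for name, field in reader.fields.items():
    print(name, field.types)  # key name and its GGUF value type(s)
for tensor in reader.tensors:
    print(tensor.name, tensor.shape, tensor.tensor_type)
```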
diff --git a/extensions/model-extension/scripts/gguf-py/examples/writer.py b/extensions/model-extension/scripts/gguf-py/examples/writer.py new file mode 100755 index 000000000..f39eed1af --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/examples/writer.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path + +import numpy as np + +# Necessary to load the local gguf package +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from gguf import GGUFWriter # noqa: E402 + + +# Example usage: +def writer_example() -> None: + # Example usage with a file + gguf_writer = GGUFWriter("example.gguf", "llama") + + gguf_writer.add_architecture() + gguf_writer.add_block_count(12) + gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer + gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float + gguf_writer.add_custom_alignment(64) + + tensor1 = np.ones((32,), dtype=np.float32) * 100.0 + tensor2 = np.ones((64,), dtype=np.float32) * 101.0 + tensor3 = np.ones((96,), dtype=np.float32) * 102.0 + + gguf_writer.add_tensor("tensor1", tensor1) + gguf_writer.add_tensor("tensor2", tensor2) + gguf_writer.add_tensor("tensor3", tensor3) + + gguf_writer.write_header_to_file() + gguf_writer.write_kv_data_to_file() + gguf_writer.write_tensors_to_file() + + gguf_writer.close() + + +if __name__ == '__main__': + writer_example() diff --git a/extensions/model-extension/scripts/gguf-py/gguf/__init__.py b/extensions/model-extension/scripts/gguf-py/gguf/__init__.py new file mode 100644 index 000000000..110ab342c --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/gguf/__init__.py @@ -0,0 +1,5 @@ +from .constants import * +from .gguf_reader import * +from .gguf_writer import * +from .tensor_mapping import * +from .vocab import * diff --git a/extensions/model-extension/scripts/gguf-py/gguf/constants.py b/extensions/model-extension/scripts/gguf-py/gguf/constants.py new file mode 100644 index 000000000..1cfd41c0b --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/gguf/constants.py @@ -0,0 +1,665 @@ +from __future__ import annotations + +import sys +from enum import Enum, IntEnum, auto +from typing import Any + +# +# constants +# + +GGUF_MAGIC = 0x46554747 # "GGUF" +GGUF_VERSION = 3 +GGUF_DEFAULT_ALIGNMENT = 32 + +# +# metadata keys +# + + +class Keys: + class General: + ARCHITECTURE = "general.architecture" + QUANTIZATION_VERSION = "general.quantization_version" + ALIGNMENT = "general.alignment" + NAME = "general.name" + AUTHOR = "general.author" + URL = "general.url" + DESCRIPTION = "general.description" + LICENSE = "general.license" + SOURCE_URL = "general.source.url" + SOURCE_HF_REPO = "general.source.huggingface.repository" + FILE_TYPE = "general.file_type" + + class LLM: + CONTEXT_LENGTH = "{arch}.context_length" + EMBEDDING_LENGTH = "{arch}.embedding_length" + BLOCK_COUNT = "{arch}.block_count" + FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" + USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" + TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" + + class Attention: + HEAD_COUNT = "{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH = "{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + LAYERNORM_RMS_EPS = 
"{arch}.attention.layer_norm_rms_epsilon" + + class Rope: + DIMENSION_COUNT = "{arch}.rope.dimension_count" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + + class Tokenizer: + MODEL = "tokenizer.ggml.model" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" + + +# +# recommended mapping of model tensor names for storage in gguf +# + + +class MODEL_ARCH(IntEnum): + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + PERSIMMON = auto() + REFACT = auto() + BERT = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + PHI2 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + + +class MODEL_TENSOR(IntEnum): + TOKEN_EMBD = auto() + TOKEN_EMBD_NORM = auto() + TOKEN_TYPES = auto() + POS_EMBD = auto() + OUTPUT = auto() + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_ROT_EMBD = auto() + FFN_GATE_INP = auto() + FFN_NORM = auto() + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_ACT = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() + ATTN_Q_NORM = auto() + ATTN_K_NORM = auto() + + +MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.PERSIMMON: "persimmon", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", +} + +TENSOR_NAMES: dict[MODEL_TENSOR, str] = { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + 
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", +} + +MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.LLAMA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], + MODEL_ARCH.GPTNEOX: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.FALCON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.BAICHUAN: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.STARCODER: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.BERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.MPT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_ACT, + ], + MODEL_ARCH.GPTJ: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.PERSIMMON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], + MODEL_ARCH.REFACT: [ + 
MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.BLOOM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.STABLELM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.QWEN: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.QWEN2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.PLAMO: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.GPT2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.PHI2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.CODESHELL: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.ORION: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.INTERNLM2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + 
MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.MINICPM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], + # TODO +} + +# tensors that will not be serialized +MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.LLAMA: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], + MODEL_ARCH.BAICHUAN: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], + MODEL_ARCH.PERSIMMON: [ + MODEL_TENSOR.ROPE_FREQS, + ], + MODEL_ARCH.QWEN: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], + MODEL_ARCH.CODESHELL: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], + MODEL_ARCH.ORION: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], +} + +# +# types +# + + +class TokenType(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class RopeScalingType(Enum): + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' + + +class GGMLQuantizationType(IntEnum): + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 + + +class GGUFEndian(IntEnum): + LITTLE = 0 + BIG = 1 + + +class GGUFValueType(IntEnum): + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 + FLOAT32 = 6 + BOOL = 7 + STRING = 8 + ARRAY = 9 + UINT64 = 10 + INT64 = 11 + FLOAT64 = 12 + + @staticmethod + def get_type(val: Any) -> GGUFValueType: + if isinstance(val, (str, bytes, bytearray)): + return GGUFValueType.STRING + elif isinstance(val, list): + return GGUFValueType.ARRAY + elif isinstance(val, float): + return GGUFValueType.FLOAT32 + elif isinstance(val, bool): + return GGUFValueType.BOOL + elif isinstance(val, int): + return GGUFValueType.INT32 + # TODO: need help with 64-bit types in Python + else: + print("Unknown type:", type(val)) + sys.exit() + + +# Note: Does not support GGML_QKK_64 +QK_K = 256 +# Items here are (block size, type size) +GGML_QUANT_SIZES = { + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), +} + + +# Aliases for backward compatibility. 
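The backward-compatibility aliases follow; first, a note on the table just defined. Each `GGML_QUANT_SIZES` entry pairs a quantization type with its (block size, type size), which is exactly what `GGUFReader._build_tensors` later uses to size tensors without decoding them. A minimal sketch of that arithmetic, using the Q4_0 entry defined above:

```python
# Sketch: on-disk size of a Q4_0 tensor with 4096 x 4096 elements.
# Q4_0 stores each block of 32 elements in 2 + 16 = 18 bytes.
from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q4_0]  # (32, 18)
n_elems = 4096 * 4096
n_bytes = n_elems * type_size // block_size
print(n_bytes)  # 9437184
```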
+ +# general +KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE +KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION +KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT +KEY_GENERAL_NAME = Keys.General.NAME +KEY_GENERAL_AUTHOR = Keys.General.AUTHOR +KEY_GENERAL_URL = Keys.General.URL +KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION +KEY_GENERAL_LICENSE = Keys.General.LICENSE +KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL +KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO +KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE + +# LLM +KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH +KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH +KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT +KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH +KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL +KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT + +# attention +KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT +KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV +KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS +KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV +KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS +KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS + +# RoPE +KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT +KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE +KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE +KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR +KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN +KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED + +# tokenization +KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL +KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST +KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE +KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES +KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES +KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID +KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID +KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID +KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON +KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf.py new file mode 100644 index 000000000..651a81eb8 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/gguf/gguf.py @@ -0,0 +1,15 @@ +# This file is left for compatibility. If you want to use the GGUF API from Python, +# don't import gguf/gguf.py directly. If you're looking for examples, see the +# examples/ directory of gguf-py. + +import importlib +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +# Compatibility for people trying to import gguf/gguf.py directly instead of as a package. +importlib.invalidate_caches() +import gguf # noqa: E402 + +importlib.reload(gguf) diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py new file mode 100644 index 000000000..5b6d4ba6b --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py @@ -0,0 +1,264 @@ +# +# GGUF file reading/modification support. For API usage information, +# please see the files in scripts/ for some fairly simple examples. 
+# +from __future__ import annotations + +import os +from collections import OrderedDict +from typing import Any, Literal, NamedTuple, TypeVar, Union + +import numpy as np +import numpy.typing as npt + +if __name__ == "__main__": + import sys + from pathlib import Path + + # Allow running file in package as a script. + sys.path.insert(0, str(Path(__file__).parent.parent)) + +from gguf.constants import ( + GGML_QUANT_SIZES, + GGUF_DEFAULT_ALIGNMENT, + GGUF_MAGIC, + GGUF_VERSION, + GGMLQuantizationType, + GGUFValueType, +) + + +READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] + + +class ReaderField(NamedTuple): + # Offset to start of this field. + offset: int + + # Name of the field (not necessarily from file data). + name: str + + # Data parts. Some types have multiple components, such as strings + # that consist of a length followed by the string data. + parts: list[npt.NDArray[Any]] = [] + + # Indexes into parts that we can call the actual data. For example + # an array of strings will be populated with indexes to the actual + # string data. + data: list[int] = [-1] + + types: list[GGUFValueType] = [] + + +class ReaderTensor(NamedTuple): + name: str + tensor_type: GGMLQuantizationType + shape: npt.NDArray[np.uint32] + n_elements: int + n_bytes: int + data_offset: int + data: npt.NDArray[Any] + field: ReaderField + + +class GGUFReader: + # I - same as host, S - swapped + byte_order: Literal['I', 'S'] = 'I' + alignment: int = GGUF_DEFAULT_ALIGNMENT + + # Note: Internal helper, API may change. + gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = { + GGUFValueType.UINT8: np.uint8, + GGUFValueType.INT8: np.int8, + GGUFValueType.UINT16: np.uint16, + GGUFValueType.INT16: np.int16, + GGUFValueType.UINT32: np.uint32, + GGUFValueType.INT32: np.int32, + GGUFValueType.FLOAT32: np.float32, + GGUFValueType.UINT64: np.uint64, + GGUFValueType.INT64: np.int64, + GGUFValueType.FLOAT64: np.float64, + GGUFValueType.BOOL: np.bool_, + } + + def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'): + self.data = np.memmap(path, mode = mode) + offs = 0 + if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC: + raise ValueError('GGUF magic invalid') + offs += 4 + temp_version = self._get(offs, np.uint32) + if temp_version[0] & 65535 == 0: + # If we get 0 here that means it's (probably) a GGUF file created for + # the opposite byte order of the machine this script is running on. 
+ self.byte_order = 'S' + temp_version = temp_version.newbyteorder(self.byte_order) + version = temp_version[0] + if version not in READER_SUPPORTED_VERSIONS: + raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle') + self.fields: OrderedDict[str, ReaderField] = OrderedDict() + self.tensors: list[ReaderTensor] = [] + offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32])) + temp_counts = self._get(offs, np.uint64, 2) + offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64])) + offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64])) + tensor_count, kv_count = temp_counts + offs = self._build_fields(offs, kv_count) + offs, tensors_fields = self._build_tensors_fields(offs, tensor_count) + new_align = self.fields.get('general.alignment') + if new_align is not None: + if new_align.types != [GGUFValueType.UINT32]: + raise ValueError('Bad type for general.alignment field') + self.alignment = new_align.parts[-1][0] + padding = offs % self.alignment + if padding != 0: + offs += self.alignment - padding + self._build_tensors(offs, tensors_fields) + + _DT = TypeVar('_DT', bound = npt.DTypeLike) + + # Fetch a key/value metadata field by key. + def get_field(self, key: str) -> Union[ReaderField, None]: + return self.fields.get(key, None) + + # Fetch a tensor from the list by index. + def get_tensor(self, idx: int) -> ReaderTensor: + return self.tensors[idx] + + def _get( + self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None, + ) -> npt.NDArray[Any]: + count = int(count) + itemsize = int(np.empty([], dtype = dtype).itemsize) + end_offs = offset + itemsize * count + return ( + self.data[offset:end_offs] + .view(dtype = dtype)[:count] + .newbyteorder(override_order or self.byte_order) + ) + + def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: + if field.name in self.fields: + raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') + self.fields[field.name] = field + return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts) + + def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]: + slen = self._get(offset, np.uint64) + return slen, self._get(offset + 8, np.uint8, slen[0]) + + def _get_field_parts( + self, orig_offs: int, raw_type: int, + ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]: + offs = orig_offs + types: list[GGUFValueType] = [] + gtype = GGUFValueType(raw_type) + types.append(gtype) + # Handle strings. + if gtype == GGUFValueType.STRING: + sparts: list[npt.NDArray[Any]] = list(self._get_str(offs)) + size = sum(int(part.nbytes) for part in sparts) + return size, sparts, [1], types + # Check if it's a simple scalar type. + nptype = self.gguf_scalar_to_np.get(gtype) + if nptype is not None: + val = self._get(offs, nptype) + return int(val.nbytes), [val], [0], types + # Handle arrays. 
+ if gtype == GGUFValueType.ARRAY: + raw_itype = self._get(offs, np.uint32) + offs += int(raw_itype.nbytes) + alen = self._get(offs, np.uint64) + offs += int(alen.nbytes) + aparts: list[npt.NDArray[Any]] = [raw_itype, alen] + data_idxs: list[int] = [] + for idx in range(alen[0]): + curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0]) + if idx == 0: + types += curr_types + idxs_offs = len(aparts) + aparts += curr_parts + data_idxs += (idx + idxs_offs for idx in curr_idxs) + offs += curr_size + return offs - orig_offs, aparts, data_idxs, types + # We can't deal with this one. + raise ValueError(f'Unknown/unhandled field type {gtype}') + + def _get_tensor(self, orig_offs: int) -> ReaderField: + offs = orig_offs + name_len, name_data = self._get_str(offs) + offs += int(name_len.nbytes + name_data.nbytes) + n_dims = self._get(offs, np.uint32) + offs += int(n_dims.nbytes) + dims = self._get(offs, np.uint64, n_dims[0]) + offs += int(dims.nbytes) + raw_dtype = self._get(offs, np.uint32) + offs += int(raw_dtype.nbytes) + offset_tensor = self._get(offs, np.uint64) + offs += int(offset_tensor.nbytes) + return ReaderField( + orig_offs, + str(bytes(name_data), encoding = 'utf-8'), + [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor], + [1, 3, 4, 5], + ) + + def _build_fields(self, offs: int, count: int) -> int: + for _ in range(count): + orig_offs = offs + kv_klen, kv_kdata = self._get_str(offs) + offs += int(kv_klen.nbytes + kv_kdata.nbytes) + raw_kv_type = self._get(offs, np.uint32) + offs += int(raw_kv_type.nbytes) + parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type] + idxs_offs = len(parts) + field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0]) + parts += field_parts + self._push_field(ReaderField( + orig_offs, + str(bytes(kv_kdata), encoding = 'utf-8'), + parts, + [idx + idxs_offs for idx in field_idxs], + field_types, + ), skip_sum = True) + offs += field_size + return offs + + def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]: + tensor_fields = [] + for _ in range(count): + field = self._get_tensor(offs) + offs += sum(int(part.nbytes) for part in field.parts) + tensor_fields.append(field) + return offs, tensor_fields + + def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: + tensors = [] + for field in fields: + _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts + ggml_type = GGMLQuantizationType(raw_dtype[0]) + n_elems = np.prod(dims) + block_size, type_size = GGML_QUANT_SIZES[ggml_type] + n_bytes = n_elems * type_size // block_size + data_offs = int(start_offs + offset_tensor[0]) + item_type: npt.DTypeLike + if ggml_type == GGMLQuantizationType.F32: + item_count = n_elems + item_type = np.float32 + elif ggml_type == GGMLQuantizationType.F16: + item_count = n_elems + item_type = np.float16 + else: + item_count = n_bytes + item_type = np.uint8 + tensors.append(ReaderTensor( + name = str(bytes(name_data), encoding = 'utf-8'), + tensor_type = ggml_type, + shape = dims, + n_elements = n_elems, + n_bytes = n_bytes, + data_offset = data_offs, + data = self._get(data_offs, item_type, item_count), + field = field, + )) + self.tensors = tensors diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py new file mode 100644 index 000000000..16808196e --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py @@ 
-0,0 +1,427 @@ +from __future__ import annotations + +import os +import shutil +import struct +import tempfile +from enum import Enum, auto +from io import BufferedWriter +from typing import IO, Any, Sequence + +import numpy as np + +from .constants import ( + GGUF_DEFAULT_ALIGNMENT, + GGUF_MAGIC, + GGUF_VERSION, + GGMLQuantizationType, + GGUFEndian, + GGUFValueType, + Keys, + RopeScalingType, + TokenType, +) + + +class WriterState(Enum): + EMPTY = auto() + HEADER = auto() + KV_DATA = auto() + TI_DATA = auto() + + +class GGUFWriter: + fout: BufferedWriter + temp_file: tempfile.SpooledTemporaryFile[bytes] | None + tensors: list[np.ndarray[Any, Any]] + _simple_value_packing = { + GGUFValueType.UINT8: "B", + GGUFValueType.INT8: "b", + GGUFValueType.UINT16: "H", + GGUFValueType.INT16: "h", + GGUFValueType.UINT32: "I", + GGUFValueType.INT32: "i", + GGUFValueType.FLOAT32: "f", + GGUFValueType.UINT64: "Q", + GGUFValueType.INT64: "q", + GGUFValueType.FLOAT64: "d", + GGUFValueType.BOOL: "?", + } + + def __init__( + self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True, + endianess: GGUFEndian = GGUFEndian.LITTLE, + ): + self.fout = open(path, "wb") + self.arch = arch + self.endianess = endianess + self.offset_tensor = 0 + self.data_alignment = GGUF_DEFAULT_ALIGNMENT + self.kv_data = bytearray() + self.kv_data_count = 0 + self.ti_data = bytearray() + self.ti_data_count = 0 + self.use_temp_file = use_temp_file + self.temp_file = None + self.tensors = [] + print("gguf: This GGUF file is for {0} Endian only".format( + "Big" if self.endianess == GGUFEndian.BIG else "Little", + )) + self.state = WriterState.EMPTY + + self.add_architecture() + + def write_header_to_file(self) -> None: + if self.state is not WriterState.EMPTY: + raise ValueError(f'Expected output file to be empty, got {self.state}') + + self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True) + self._write_packed("I", GGUF_VERSION) + self._write_packed("Q", self.ti_data_count) + self._write_packed("Q", self.kv_data_count) + self.flush() + self.state = WriterState.HEADER + + def write_kv_data_to_file(self) -> None: + if self.state is not WriterState.HEADER: + raise ValueError(f'Expected output file to contain the header, got {self.state}') + + self.fout.write(self.kv_data) + self.flush() + self.state = WriterState.KV_DATA + + def write_ti_data_to_file(self) -> None: + if self.state is not WriterState.KV_DATA: + raise ValueError(f'Expected output file to contain KV data, got {self.state}') + + self.fout.write(self.ti_data) + self.flush() + self.state = WriterState.TI_DATA + + def add_key(self, key: str) -> None: + self.add_val(key, GGUFValueType.STRING, add_vtype=False) + + def add_uint8(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.UINT8) + + def add_int8(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.INT8) + + def add_uint16(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.UINT16) + + def add_int16(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.INT16) + + def add_uint32(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.UINT32) + + def add_int32(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.INT32) + + def add_float32(self, key: str, val: float) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.FLOAT32) + + def add_uint64(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.UINT64) + + def add_int64(self, key: str, val: int) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.INT64) + + def add_float64(self, key: str, val: float) -> None: + self.add_key(key) + 
self.add_val(val, GGUFValueType.FLOAT64) + + def add_bool(self, key: str, val: bool) -> None: + self.add_key(key) + self.add_val(val, GGUFValueType.BOOL) + + def add_string(self, key: str, val: str) -> None: + if not val: + return + self.add_key(key) + self.add_val(val, GGUFValueType.STRING) + + def add_array(self, key: str, val: Sequence[Any]) -> None: + if not isinstance(val, Sequence): + raise ValueError("Value must be a sequence for array type") + + self.add_key(key) + self.add_val(val, GGUFValueType.ARRAY) + + def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None: + if vtype is None: + vtype = GGUFValueType.get_type(val) + + if add_vtype: + self.kv_data += self._pack("I", vtype) + self.kv_data_count += 1 + + pack_fmt = self._simple_value_packing.get(vtype) + if pack_fmt is not None: + self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL) + elif vtype == GGUFValueType.STRING: + encoded_val = val.encode("utf8") if isinstance(val, str) else val + self.kv_data += self._pack("Q", len(encoded_val)) + self.kv_data += encoded_val + elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: + ltype = GGUFValueType.get_type(val[0]) + if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): + raise ValueError("All items in a GGUF array should be of the same type") + self.kv_data += self._pack("I", ltype) + self.kv_data += self._pack("Q", len(val)) + for item in val: + self.add_val(item, add_vtype=False) + else: + raise ValueError("Invalid GGUF metadata value type or value") + + @staticmethod + def ggml_pad(x: int, n: int) -> int: + return ((x + n - 1) // n) * n + + def add_tensor_info( + self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], + tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None, + ) -> None: + if self.state is not WriterState.EMPTY: + raise ValueError(f'Expected output file to be empty, got {self.state}') + + if raw_dtype is None and tensor_dtype not in (np.float32, np.float16): + raise ValueError("Only F32 and F16 tensors are supported for now") + + encoded_name = name.encode("utf8") + self.ti_data += self._pack("Q", len(encoded_name)) + self.ti_data += encoded_name + n_dims = len(tensor_shape) + self.ti_data += self._pack("I", n_dims) + for i in range(n_dims): + self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) + if raw_dtype is None: + dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 + else: + dtype = raw_dtype + self.ti_data += self._pack("I", dtype) + self.ti_data += self._pack("Q", self.offset_tensor) + self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) + self.ti_data_count += 1 + + def add_tensor( + self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, + raw_dtype: GGMLQuantizationType | None = None, + ) -> None: + if self.endianess == GGUFEndian.BIG: + tensor.byteswap(inplace=True) + if self.use_temp_file and self.temp_file is None: + fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) + fp.seek(0) + self.temp_file = fp + + shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape + self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype) + + if self.temp_file is None: + self.tensors.append(tensor) + return + + tensor.tofile(self.temp_file) + self.write_padding(self.temp_file, tensor.nbytes) + + def write_padding(self, fp: 
IO[bytes], n: int, align: int | None = None) -> None: + pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n + if pad != 0: + fp.write(bytes([0] * pad)) + + def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: + if self.state is not WriterState.TI_DATA: + raise ValueError(f'Expected output file to contain tensor info, got {self.state}') + + if self.endianess == GGUFEndian.BIG: + tensor.byteswap(inplace=True) + self.write_padding(self.fout, self.fout.tell()) + tensor.tofile(self.fout) + self.write_padding(self.fout, tensor.nbytes) + + def write_tensors_to_file(self) -> None: + self.write_ti_data_to_file() + + self.write_padding(self.fout, self.fout.tell()) + + if self.temp_file is None: + while True: + try: + tensor = self.tensors.pop(0) + except IndexError: + break + tensor.tofile(self.fout) + self.write_padding(self.fout, tensor.nbytes) + return + + self.temp_file.seek(0) + + shutil.copyfileobj(self.temp_file, self.fout) + self.flush() + self.temp_file.close() + + def flush(self) -> None: + self.fout.flush() + + def close(self) -> None: + self.fout.close() + + def add_architecture(self) -> None: + self.add_string(Keys.General.ARCHITECTURE, self.arch) + + def add_author(self, author: str) -> None: + self.add_string(Keys.General.AUTHOR, author) + + def add_tensor_data_layout(self, layout: str) -> None: + self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) + + def add_url(self, url: str) -> None: + self.add_string(Keys.General.URL, url) + + def add_description(self, description: str) -> None: + self.add_string(Keys.General.DESCRIPTION, description) + + def add_source_url(self, url: str) -> None: + self.add_string(Keys.General.SOURCE_URL, url) + + def add_source_hf_repo(self, repo: str) -> None: + self.add_string(Keys.General.SOURCE_HF_REPO, repo) + + def add_file_type(self, ftype: int) -> None: + self.add_uint32(Keys.General.FILE_TYPE, ftype) + + def add_name(self, name: str) -> None: + self.add_string(Keys.General.NAME, name) + + def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None: + self.add_uint32( + Keys.General.QUANTIZATION_VERSION, quantization_version) + + def add_custom_alignment(self, alignment: int) -> None: + self.data_alignment = alignment + self.add_uint32(Keys.General.ALIGNMENT, alignment) + + def add_context_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) + + def add_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_block_count(self, length: int) -> None: + self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) + + def add_feed_forward_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) + + def add_parallel_residual(self, use: bool) -> None: + self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) + + def add_head_count(self, count: int) -> None: + self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) + + def add_head_count_kv(self, count: int) -> None: + self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) + + def add_key_length(self, length: int) -> None: + self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length) + + def add_value_length(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) + + def add_max_alibi_bias(self, 
bias: float) -> None: + self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) + + def add_clamp_kqv(self, value: float) -> None: + self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + + def add_expert_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) + + def add_expert_used_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count) + + def add_layer_norm_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) + + def add_layer_norm_rms_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + + def add_rope_dimension_count(self, count: int) -> None: + self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) + + def add_rope_freq_base(self, value: float) -> None: + self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) + + def add_rope_scaling_type(self, value: RopeScalingType) -> None: + self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value) + + def add_rope_scaling_factor(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) + + def add_rope_scaling_orig_ctx_len(self, value: int) -> None: + self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) + + def add_rope_scaling_finetuned(self, value: bool) -> None: + self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) + + def add_tokenizer_model(self, model: str) -> None: + self.add_string(Keys.Tokenizer.MODEL, model) + + def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: + self.add_array(Keys.Tokenizer.LIST, tokens) + + def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: + self.add_array(Keys.Tokenizer.MERGES, merges) + + def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None: + self.add_array(Keys.Tokenizer.TOKEN_TYPE, types) + + def add_token_scores(self, scores: Sequence[float]) -> None: + self.add_array(Keys.Tokenizer.SCORES, scores) + + def add_bos_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.BOS_ID, id) + + def add_eos_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.EOS_ID, id) + + def add_unk_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.UNK_ID, id) + + def add_sep_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.SEP_ID, id) + + def add_pad_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.PAD_ID, id) + + def add_add_bos_token(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_BOS, value) + + def add_add_eos_token(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_EOS, value) + + def add_add_space_prefix(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) + + def add_chat_template(self, value: str) -> None: + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: + pack_prefix = '' + if not skip_pack_prefix: + pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>' + return struct.pack(f'{pack_prefix}{fmt}', value) + + def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: + self.fout.write(self._pack(fmt, value, skip_pack_prefix)) diff --git 
a/extensions/model-extension/scripts/gguf-py/gguf/py.typed b/extensions/model-extension/scripts/gguf-py/gguf/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py b/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py new file mode 100644 index 000000000..4f16d8504 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py @@ -0,0 +1,332 @@ +from __future__ import annotations + +from typing import Sequence + +from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES + + +class TensorNameMap: + mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { + # Token embeddings + MODEL_TENSOR.TOKEN_EMBD: ( + "gpt_neox.embed_in", # gptneox + "transformer.wte", # gpt2 gpt-j mpt refact qwen + "transformer.word_embeddings", # falcon + "word_embeddings", # bloom + "model.embed_tokens", # llama-hf + "tok_embeddings", # llama-pth + "embeddings.word_embeddings", # bert + "language_model.embedding.word_embeddings", # persimmon + "wte", # gpt2 + "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 + ), + + # Token type embeddings + MODEL_TENSOR.TOKEN_TYPES: ( + "embeddings.token_type_embeddings", # bert + ), + + # Normalization of token embeddings + MODEL_TENSOR.TOKEN_EMBD_NORM: ( + "word_embeddings_layernorm", # bloom + ), + + # Position embeddings + MODEL_TENSOR.POS_EMBD: ( + "transformer.wpe", # gpt2 + "embeddings.position_embeddings", # bert + "wpe", # gpt2 + ), + + # Output + MODEL_TENSOR.OUTPUT: ( + "embed_out", # gptneox + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen + "output", # llama-pth bloom internlm2 + "word_embeddings_for_head", # persimmon + "lm_head.linear", # phi2 + ), + + # Output norm + MODEL_TENSOR.OUTPUT_NORM: ( + "gpt_neox.final_layer_norm", # gptneox + "transformer.ln_f", # gpt2 gpt-j falcon + "model.norm", # llama-hf baichuan internlm2 + "norm", # llama-pth + "embeddings.LayerNorm", # bert + "transformer.norm_f", # mpt + "ln_f", # refact bloom qwen gpt2 + "language_model.encoder.final_layernorm", # persimmon + "model.final_layernorm", # persimmon + "lm_head.ln", # phi2 + ), + + # Rope frequencies + MODEL_TENSOR.ROPE_FREQS: ( + "rope.freqs", # llama-pth + ), + } + + block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { + # Attention norm + MODEL_TENSOR.ATTN_NORM: ( + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth + "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "language_model.encoder.layers.{bid}.input_layernorm", # persimmon + "model.layers.{bid}.ln1", # yi + "h.{bid}.ln_1", # gpt2 + "transformer.h.{bid}.ln", # phi2 + "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 + ), + + # Attention norm 2 + MODEL_TENSOR.ATTN_NORM_2: ( + "transformer.h.{bid}.ln_attn", # falcon40b + ), + + # Attention query-key-value + MODEL_TENSOR.ATTN_QKV: ( + "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox + "transformer.h.{bid}.attn.c_attn", # gpt2 qwen + "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.h.{bid}.self_attention.query_key_value", # falcon + "h.{bid}.self_attention.query_key_value", # bloom + 
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon + "model.layers.{bid}.self_attn.query_key_value", # persimmon + "h.{bid}.attn.c_attn", # gpt2 + "transformer.h.{bid}.mixer.Wqkv", # phi2 + ), + + # Attention query + MODEL_TENSOR.ATTN_Q: ( + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq" # internlm2 + ), + + # Attention key + MODEL_TENSOR.ATTN_K: ( + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk" # internlm2 + ), + + # Attention value + MODEL_TENSOR.ATTN_V: ( + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv" # internlm2 + ), + + # Attention output + MODEL_TENSOR.ATTN_OUT: ( + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + ), + + # Rotary embeddings + MODEL_TENSOR.ATTN_ROT_EMBD: ( + "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo + "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell + ), + + # Feed-forward norm + MODEL_TENSOR.FFN_NORM: ( + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 refact qwen + "h.{bid}.post_attention_layernorm", # bloom + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf + "layers.{bid}.ffn_norm", # llama-pth + "encoder.layer.{bid}.output.LayerNorm", # bert + "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon + "model.layers.{bid}.ln2", # yi + "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 + ), + + MODEL_TENSOR.FFN_GATE_INP: ( + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral + ), + + # Feed-forward up + MODEL_TENSOR.FFN_UP: ( + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "h.{bid}.mlp.dense_h_to_4h", # bloom + "model.layers.{bid}.mlp.up_proj", # llama-hf refact + "layers.{bid}.feed_forward.w3", # llama-pth + 
"encoder.layer.{bid}.intermediate.dense", # bert + "transformer.h.{bid}.mlp.fc_in", # gpt-j + "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "transformer.h.{bid}.mlp.w1", # qwen + "h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 + ), + + MODEL_TENSOR.FFN_UP_EXP: ( + "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral + "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral + ), + + # AWQ-activation gate + MODEL_TENSOR.FFN_ACT: ( + "transformer.blocks.{bid}.ffn.act", # mpt + ), + + # Feed-forward gate + MODEL_TENSOR.FFN_GATE: ( + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + ), + + MODEL_TENSOR.FFN_GATE_EXP: ( + "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral + "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral + ), + + # Feed-forward down + MODEL_TENSOR.FFN_DOWN: ( + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "h.{bid}.mlp.dense_4h_to_h", # bloom + "model.layers.{bid}.mlp.down_proj", # llama-hf + "layers.{bid}.feed_forward.w2", # llama-pth + "encoder.layer.{bid}.output.dense", # bert + "transformer.h.{bid}.mlp.fc_out", # gpt-j + "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "h.{bid}.mlp.c_proj", # gpt2 + "transformer.h.{bid}.mlp.fc2", # phi2 + "model.layers.{bid}.mlp.fc2", # phi2 + "model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 + ), + + MODEL_TENSOR.FFN_DOWN_EXP: ( + "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral + "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral + ), + + MODEL_TENSOR.ATTN_Q_NORM: ( + "language_model.encoder.layers.{bid}.self_attention.q_layernorm", + "model.layers.{bid}.self_attn.q_layernorm", # persimmon + ), + + MODEL_TENSOR.ATTN_K_NORM: ( + "language_model.encoder.layers.{bid}.self_attention.k_layernorm", + "model.layers.{bid}.self_attn.k_layernorm", # persimmon + ), + + MODEL_TENSOR.ROPE_FREQS: ( + "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon + ), + } + + mapping: dict[str, tuple[MODEL_TENSOR, str]] + + def __init__(self, arch: MODEL_ARCH, n_blocks: int): + self.mapping = {} + for tensor, keys in self.mappings_cfg.items(): + if tensor not in MODEL_TENSORS[arch]: + continue + tensor_name = TENSOR_NAMES[tensor] + self.mapping[tensor_name] = (tensor, tensor_name) + for key in keys: + self.mapping[key] = (tensor, tensor_name) + for bid in range(n_blocks): + for tensor, keys in self.block_mappings_cfg.items(): + if tensor not in MODEL_TENSORS[arch]: + continue + # TODO: make this configurable + n_experts = 8 + for xid in range(n_experts): + tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) + self.mapping[tensor_name] = (tensor, tensor_name) + for key in keys: + key = key.format(bid = bid, xid = xid) + self.mapping[key] = (tensor, tensor_name) + + def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> 
tuple[MODEL_TENSOR, str] | None: + result = self.mapping.get(key) + if result is not None: + return result + for suffix in try_suffixes: + if key.endswith(suffix): + result = self.mapping.get(key[:-len(suffix)]) + if result is not None: + return result[0], result[1] + suffix + return None + + def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) + if result is None: + return None + return result[1] + + def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) + if result is None: + return None + return result[0] + + def __getitem__(self, key: str) -> str: + try: + return self.mapping[key][1] + except KeyError: + raise KeyError(key) + + def __contains__(self, key: str) -> bool: + return key in self.mapping + + def __repr__(self) -> str: + return repr(self.mapping) + + +def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap: + return TensorNameMap(arch, n_blocks) diff --git a/extensions/model-extension/scripts/gguf-py/gguf/vocab.py b/extensions/model-extension/scripts/gguf-py/gguf/vocab.py new file mode 100644 index 000000000..cd1942975 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/gguf/vocab.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from typing import Any, Callable + +from .gguf_writer import GGUFWriter + + +class SpecialVocab: + merges: list[str] + add_special_token: dict[str, bool] + special_token_ids: dict[str, int] + chat_template: str | None + + def __init__( + self, path: str | os.PathLike[str], load_merges: bool = False, + special_token_types: tuple[str, ...] 
| None = None,
+        n_vocab: int | None = None,
+    ):
+        self.special_token_ids = {}
+        self.add_special_token = {}
+        self.n_vocab = n_vocab
+        self.load_merges = load_merges
+        self.merges = []
+        self.chat_template = None
+        if special_token_types is not None:
+            self.special_token_types = special_token_types
+        else:
+            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
+        self._load(Path(path))
+
+    def __repr__(self) -> str:
+        return '<SpecialVocab with {} merges, special tokens {}, add special tokens {}>'.format(
+            len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset",
+        )
+
+    def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
+        if self.merges:
+            if not quiet:
+                print(f'gguf: Adding {len(self.merges)} merge(s).')
+            gw.add_token_merges(self.merges)
+        elif self.load_merges:
+            print(
+                'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
+                file = sys.stderr,
+            )
+        for typ, tokid in self.special_token_ids.items():
+            id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
+            if id_handler is None:
+                print(
+                    f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
+                    file = sys.stderr,
+                )
+                continue
+            if not quiet:
+                print(f'gguf: Setting special token type {typ} to {tokid}')
+            id_handler(tokid)
+        for typ, value in self.add_special_token.items():
+            add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
+            if add_handler is None:
+                print(
+                    f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
+                    file = sys.stderr,
+                )
+                continue
+            if not quiet:
+                print(f'gguf: Setting add_{typ}_token to {value}')
+            add_handler(value)
+        if self.chat_template is not None:
+            if not quiet:
+                print(f'gguf: Setting chat_template to {self.chat_template}')
+            gw.add_chat_template(self.chat_template)
+
+    def _load(self, path: Path) -> None:
+        self._try_load_from_tokenizer_json(path)
+        self._try_load_from_config_json(path)
+        if self.load_merges and not self.merges:
+            self._try_load_merges_txt(path)
+
+    def _try_load_merges_txt(self, path: Path) -> bool:
+        merges_file = path / 'merges.txt'
+        if not merges_file.is_file():
+            return False
+        with open(merges_file, 'r', encoding = 'utf-8') as fp:
+            first_line = next(fp, '').strip()
+            if not first_line.startswith('#'):
+                fp.seek(0)
+                line_num = 0
+            else:
+                line_num = 1
+            merges = []
+            for line in fp:
+                line_num += 1
+                line = line.strip()
+                if not line:
+                    continue
+                parts = line.split(None, 3)
+                if len(parts) != 2:
+                    print(
+                        f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
+                        file = sys.stderr,
+                    )
+                    continue
+                merges.append(f'{parts[0]} {parts[1]}')
+        self.merges = merges
+        return True
+
+    def _set_special_token(self, typ: str, tid: Any) -> None:
+        if not isinstance(tid, int):
+            return
+        if tid < 0:
+            raise ValueError(f'invalid value for special token type {typ}: {tid}')
+        if self.n_vocab is None or tid < self.n_vocab:
+            if typ in self.special_token_ids:
+                return
+            self.special_token_ids[typ] = tid
+            return
+        print(
+            f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
+            file = sys.stderr,
+        )
+
+    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
+        tokenizer_file = path / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, encoding = 'utf-8') as f:
+                tokenizer = json.load(f)
+            if self.load_merges:
+                merges = tokenizer.get('model', {}).get('merges')
+                if isinstance(merges, list) and merges and isinstance(merges[0], str):
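+                    # tokenizer.json stores each merge as a single space-separated
+                    # pair of token strings (e.g. "Ġ t"); they are kept verbatim.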
+                    self.merges = merges
+            added_tokens = tokenizer.get('added_tokens', {})
+        else:
+            added_tokens = {}
+        tokenizer_config_file = path / 'tokenizer_config.json'
+        if not tokenizer_config_file.is_file():
+            return True
+        with open(tokenizer_config_file, encoding = 'utf-8') as f:
+            tokenizer_config = json.load(f)
+        chat_template = tokenizer_config.get('chat_template')
+        if chat_template is None or isinstance(chat_template, str):
+            self.chat_template = chat_template
+        else:
+            print(
+                f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
+                file = sys.stderr
+            )
+        for typ in self.special_token_types:
+            add_entry = tokenizer_config.get(f'add_{typ}_token')
+            if isinstance(add_entry, bool):
+                self.add_special_token[typ] = add_entry
+            if not added_tokens:
+                # We will need this to get the content for the token, so if it's empty
+                # may as well just give up.
+                continue
+            entry = tokenizer_config.get(f'{typ}_token')
+            if isinstance(entry, str):
+                tc_content = entry
+            elif isinstance(entry, dict):
+                entry_content = entry.get('content')
+                if not isinstance(entry_content, str):
+                    continue
+                tc_content = entry_content
+            else:
+                continue
+            # We only need the first match here.
+            maybe_token_id = next(
+                (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content),
+                None,
+            )
+            self._set_special_token(typ, maybe_token_id)
+        return True
+
+    def _try_load_from_config_json(self, path: Path) -> bool:
+        config_file = path / 'config.json'
+        if not config_file.is_file():
+            return False
+        with open(config_file, encoding = 'utf-8') as f:
+            config = json.load(f)
+        for typ in self.special_token_types:
+            self._set_special_token(typ, config.get(f'{typ}_token_id'))
+        return True
diff --git a/extensions/model-extension/scripts/gguf-py/pyproject.toml b/extensions/model-extension/scripts/gguf-py/pyproject.toml
new file mode 100644
index 000000000..9789c2c87
--- /dev/null
+++ b/extensions/model-extension/scripts/gguf-py/pyproject.toml
@@ -0,0 +1,35 @@
+[tool.poetry]
+name = "gguf"
+version = "0.7.0"
+description = "Read and write ML models in GGUF for GGML"
+authors = ["GGML <ggml@ggml.ai>"]
+packages = [
+    {include = "gguf"},
+    {include = "gguf/py.typed"},
+    {include = "scripts"},
+]
+readme = "README.md"
+homepage = "https://ggml.ai"
+repository = "https://github.com/ggerganov/llama.cpp"
+keywords = ["ggml", "gguf", "llama.cpp"]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+[tool.poetry.dependencies]
+python = ">=3.8"
+numpy = ">=1.17"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
+gguf-dump = "scripts:gguf_dump_entrypoint"
+gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
diff --git a/extensions/model-extension/scripts/gguf-py/scripts/__init__.py b/extensions/model-extension/scripts/gguf-py/scripts/__init__.py
new file mode 100644
index 000000000..77132db7a
--- /dev/null
+++ b/extensions/model-extension/scripts/gguf-py/scripts/__init__.py
@@ -0,0 +1,12 @@
+import os
+
+from importlib import import_module
+
+
+os.environ["NO_LOCAL_GGUF"] = "TRUE"
+
+gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main
+gguf_dump_entrypoint = import_module("scripts.gguf-dump").main
+gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main
+
+del
import_module, os diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py b/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py new file mode 100755 index 000000000..10a16ad06 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +import numpy as np + +# Necessary to load the local gguf package +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): + sys.path.insert(0, str(Path(__file__).parent.parent)) + +import gguf + + +def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: + if np.uint32(1) == np.uint32(1).newbyteorder("<"): + # Host is little endian + host_endian = "little" + swapped_endian = "big" + else: + # Sorry PDP or other weird systems that don't use BE or LE. + host_endian = "big" + swapped_endian = "little" + if reader.byte_order == "S": + file_endian = swapped_endian + else: + file_endian = host_endian + order = host_endian if args.order == "native" else args.order + print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") + if file_endian == order: + print(f"* File is already {order.upper()} endian. Nothing to do.") + sys.exit(0) + print("* Checking tensors for conversion compatibility") + for tensor in reader.tensors: + if tensor.tensor_type not in ( + gguf.GGMLQuantizationType.F32, + gguf.GGMLQuantizationType.F16, + gguf.GGMLQuantizationType.Q8_0, + ): + raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") + print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") + if args.dry_run: + return + print("\n*** Warning *** Warning *** Warning **") + print("* This conversion process may damage the file. Ensure you have a backup.") + if order != host_endian: + print("* Requested endian differs from host, you will not be able to load the model on this machine.") + print("* The file will be modified immediately, so if conversion fails or is interrupted") + print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") + response = input("YES, I am sure> ") + if response != "YES": + print("You didn't enter YES. Okay then, see ya!") + sys.exit(0) + print(f"\n* Converting fields ({len(reader.fields)})") + for idx, field in enumerate(reader.fields.values()): + print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") + for part in field.parts: + part.byteswap(inplace=True) + print(f"\n* Converting tensors ({len(reader.tensors)})") + for idx, tensor in enumerate(reader.tensors): + print( + f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, " + f"elements={tensor.n_elements}... ", + end="", + ) + tensor_type = tensor.tensor_type + for part in tensor.field.parts: + part.byteswap(inplace=True) + if tensor_type != gguf.GGMLQuantizationType.Q8_0: + tensor.data.byteswap(inplace=True) + print() + continue + # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes + block_size = 34 + n_blocks = len(tensor.data) // block_size + for block_num in range(n_blocks): + block_offs = block_num * block_size + # I know I said f16, but it doesn't matter here - any simple 16 bit type works. 
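+            # The 32 int8 quants that follow the scale are single bytes and so
+            # endian-neutral; only this two-byte scale needs to be swapped.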
+ delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + if block_num % 100000 == 0: + print(f"[{(n_blocks - block_num) // 1000}K]", end="") + sys.stdout.flush() + print() + print("* Completion") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Convert GGUF file byte order") + parser.add_argument( + "model", type=str, + help="GGUF format model filename", + ) + parser.add_argument( + "order", type=str, choices=['big', 'little', 'native'], + help="Requested byte order", + ) + parser.add_argument( + "--dry-run", action="store_true", + help="Don't actually change anything", + ) + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) + print(f'* Loading: {args.model}') + reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+') + convert_byteorder(reader, args) + + +if __name__ == "__main__": + main() diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py b/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py new file mode 100755 index 000000000..dbf891508 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path +from typing import Any + +import numpy as np + +# Necessary to load the local gguf package +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): + sys.path.insert(0, str(Path(__file__).parent.parent)) + +from gguf import GGUFReader, GGUFValueType # noqa: E402 + + +def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: + host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG' + if reader.byte_order == 'S': + file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE' + else: + file_endian = host_endian + return (host_endian, file_endian) + + +# For more information about what field.parts and field.data represent, +# please see the comments in the modify_gguf.py example. 
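+# In short: field.parts holds every NDArray slice that makes up a field (key
+# length, key bytes, value type, value data), while field.data lists the
+# indexes of just the parts that carry the actual value(s).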
+def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: + host_endian, file_endian = get_file_host_endian(reader) + print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') + print(f'\n* Dumping {len(reader.fields)} key/value pair(s)') + for n, field in enumerate(reader.fields.values(), 1): + if not field.types: + pretty_type = 'N/A' + elif field.types[0] == GGUFValueType.ARRAY: + nest_count = len(field.types) - 1 + pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count + else: + pretty_type = str(field.types[-1].name) + print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '') + if len(field.types) == 1: + curr_type = field.types[0] + if curr_type == GGUFValueType.STRING: + print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '') + elif field.types[0] in reader.gguf_scalar_to_np: + print(' = {0}'.format(field.parts[-1][0]), end = '') + print() + if args.no_tensors: + return + print(f'\n* Dumping {len(reader.tensors)} tensor(s)') + for n, tensor in enumerate(reader.tensors, 1): + prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape))) + print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') + + +def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: + import json + host_endian, file_endian = get_file_host_endian(reader) + metadata: dict[str, Any] = {} + tensors: dict[str, Any] = {} + result = { + "filename": args.model, + "endian": file_endian, + "metadata": metadata, + "tensors": tensors, + } + for idx, field in enumerate(reader.fields.values()): + curr: dict[str, Any] = { + "index": idx, + "type": field.types[0].name if field.types else 'UNKNOWN', + "offset": field.offset, + } + metadata[field.name] = curr + if field.types[:1] == [GGUFValueType.ARRAY]: + curr["array_types"] = [t.name for t in field.types][1:] + if not args.json_array: + continue + itype = field.types[-1] + if itype == GGUFValueType.STRING: + curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] + else: + curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()] + elif field.types[0] == GGUFValueType.STRING: + curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8") + else: + curr["value"] = field.parts[-1].tolist()[0] + if not args.no_tensors: + for idx, tensor in enumerate(reader.tensors): + tensors[tensor.name] = { + "index": idx, + "shape": tensor.shape.tolist(), + "type": tensor.tensor_type.name, + "offset": tensor.field.offset, + } + json.dump(result, sys.stdout) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Dump GGUF file metadata") + parser.add_argument("model", type=str, help="GGUF format model filename") + parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") + parser.add_argument("--json", action="store_true", help="Produce JSON output") + parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) + if not args.json: + print(f'* Loading: {args.model}') + reader = GGUFReader(args.model, 'r') + if args.json: + dump_metadata_json(reader, args) + else: + dump_metadata(reader, args) + + +if __name__ == '__main__': + main() diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py 
b/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py new file mode 100755 index 000000000..3ebdfa898 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +import argparse +import os +import sys +from pathlib import Path + +# Necessary to load the local gguf package +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): + sys.path.insert(0, str(Path(__file__).parent.parent)) + +from gguf import GGUFReader # noqa: E402 + + +def minimal_example(filename: str) -> None: + reader = GGUFReader(filename, 'r+') + field = reader.fields['tokenizer.ggml.bos_token_id'] + if field is None: + return + part_index = field.data[0] + field.parts[part_index][0] = 2 # Set tokenizer.ggml.bos_token_id to 2 + # + # So what's this field.data thing? It's helpful because field.parts contains + # _every_ part of the GGUF field. For example, tokenizer.ggml.bos_token_id consists + # of: + # + # Part index 0: Key length (27) + # Part index 1: Key data ("tokenizer.ggml.bos_token_id") + # Part index 2: Field type (4, the id for GGUFValueType.UINT32) + # Part index 3: Field value + # + # Note also that each part is an NDArray slice, so even a part that + # is only a single value like the key length will be a NDArray of + # the key length type (numpy.uint32). + # + # The .data attribute in the Field is a list of relevant part indexes + # and doesn't contain internal GGUF details like the key length part. + # In this case, .data will be [3] - just the part index of the + # field value itself. + + +def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: + field = reader.get_field(args.key) + if field is None: + print(f'! Field {repr(args.key)} not found', file = sys.stderr) + sys.exit(1) + # Note that field.types is a list of types. This is because the GGUF + # format supports arrays. For example, an array of UINT32 would + # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32] + handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None + if handler is None: + print( + f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}', + file = sys.stderr, + ) + sys.exit(1) + current_value = field.parts[field.data[0]][0] + new_value = handler(args.value) + print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') + if current_value == new_value: + print(f'- Key {repr(args.key)} already set to requested value {current_value}') + sys.exit(0) + if args.dry_run: + sys.exit(0) + if not args.force: + print('*** Warning *** Warning *** Warning **') + print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') + print('* Enter exactly YES if you are positive you want to proceed:') + response = input('YES, I am sure> ') + if response != 'YES': + print("You didn't enter YES. Okay then, see ya!") + sys.exit(0) + field.parts[field.data[0]][0] = new_value + print('* Field changed. 
Successful completion.') + + +def main() -> None: + parser = argparse.ArgumentParser(description="Set a simple value in GGUF file metadata") + parser.add_argument("model", type=str, help="GGUF format model filename") + parser.add_argument("key", type=str, help="Metadata key to set") + parser.add_argument("value", type=str, help="Metadata value to set") + parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything") + parser.add_argument("--force", action="store_true", help="Change the field without confirmation") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) + print(f'* Loading: {args.model}') + reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+') + set_metadata(reader, args) + + +if __name__ == '__main__': + main() diff --git a/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py b/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py new file mode 100644 index 000000000..0adeb7d55 --- /dev/null +++ b/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py @@ -0,0 +1,7 @@ +import gguf # noqa: F401 + +# TODO: add tests + + +def test_write_gguf() -> None: + pass diff --git a/extensions/huggingface-extension/scripts/install_deps.py b/extensions/model-extension/scripts/install_deps.py similarity index 100% rename from extensions/huggingface-extension/scripts/install_deps.py rename to extensions/model-extension/scripts/install_deps.py diff --git a/extensions/huggingface-extension/scripts/version.txt b/extensions/model-extension/scripts/version.txt similarity index 100% rename from extensions/huggingface-extension/scripts/version.txt rename to extensions/model-extension/scripts/version.txt diff --git a/extensions/model-extension/src/@types/InvalidHostError.ts b/extensions/model-extension/src/@types/InvalidHostError.ts new file mode 100644 index 000000000..47262206e --- /dev/null +++ b/extensions/model-extension/src/@types/InvalidHostError.ts @@ -0,0 +1,6 @@ +export class InvalidHostError extends Error { + constructor(message: string) { + super(message) + this.name = 'InvalidHostError' + } +} diff --git a/extensions/model-extension/src/@types/NotSupportModelError.ts b/extensions/model-extension/src/@types/NotSupportModelError.ts new file mode 100644 index 000000000..0a1946176 --- /dev/null +++ b/extensions/model-extension/src/@types/NotSupportModelError.ts @@ -0,0 +1,6 @@ +export class NotSupportedModelError extends Error { + constructor(message: string) { + super(message) + this.name = 'NotSupportedModelError' + } +} diff --git a/extensions/model-extension/src/@types/global.d.ts b/extensions/model-extension/src/@types/global.d.ts index a72b5188e..3878d4bf2 100644 --- a/extensions/model-extension/src/@types/global.d.ts +++ b/extensions/model-extension/src/@types/global.d.ts @@ -1,8 +1,7 @@ export {} declare global { declare const DEFAULT_MODEL: object - declare const MODULE_PATH: string - declare const VERSION: string + declare const NODE: string interface Core { api: APIFunctions diff --git a/extensions/model-extension/src/index.ts b/extensions/model-extension/src/index.ts index 9dd106868..dbe7605ea 100644 --- a/extensions/model-extension/src/index.ts +++ b/extensions/model-extension/src/index.ts @@ -17,9 +17,19 @@ import { baseName, GpuSetting, DownloadRequest, + executeOnMain, + HuggingFaceRepoData, + Quantization, + log, + getFileSize, + AllQuantizations, + ModelEvent, } from '@janhq/core' import { extractFileName } from './helpers/path' +import { GGUFMetadata, gguf } from '@huggingface/gguf' +import { 
NotSupportedModelError } from './@types/NotSupportModelError' +import { InvalidHostError } from './@types/InvalidHostError' /** * A extension for models @@ -35,6 +45,17 @@ export default class JanModelExtension extends ModelExtension { ] private static readonly _tensorRtEngineFormat = '.engine' private static readonly _supportedGpuArch = ['ampere', 'ada'] + private static readonly _safetensorsRegexs = [ + /model\.safetensors$/, + /model-[0-9]+-of-[0-9]+\.safetensors$/, + ] + private static readonly _pytorchRegexs = [ + /pytorch_model\.bin$/, + /consolidated\.[0-9]+\.pth$/, + /pytorch_model-[0-9]+-of-[0-9]+\.bin$/, + /.*\.pt$/, + ] + interrupted = false /** * Called when the extension is loaded. @@ -49,7 +70,7 @@ export default class JanModelExtension extends ModelExtension { * Called when the extension is unloaded. * @override */ - onUnload(): void {} + async onUnload() {} /** * Downloads a machine learning model. @@ -65,7 +86,11 @@ export default class JanModelExtension extends ModelExtension { // create corresponding directory const modelDirPath = await joinPath([JanModelExtension._homeDir, model.id]) if (!(await fs.existsSync(modelDirPath))) await fs.mkdir(modelDirPath) - + const modelJsonPath = await joinPath([modelDirPath, 'model.json']) + if (!(await fs.existsSync(modelJsonPath))) { + await fs.writeFileSync(modelJsonPath, JSON.stringify(model, null, 2)) + events.emit(ModelEvent.OnModelsUpdate, {}) + } if (model.engine === InferenceEngine.nitro_tensorrt_llm) { if (!gpuSettings || gpuSettings.gpus.length === 0) { console.error('No GPU found. Please check your GPU setting.') @@ -140,6 +165,84 @@ export default class JanModelExtension extends ModelExtension { } } + private toHuggingFaceUrl(repoId: string): string { + try { + const url = new URL(repoId) + if (url.host !== 'huggingface.co') { + throw new InvalidHostError(`Invalid Hugging Face repo URL: ${repoId}`) + } + + const paths = url.pathname.split('/').filter((e) => e.trim().length > 0) + if (paths.length < 2) { + throw new InvalidHostError(`Invalid Hugging Face repo URL: ${repoId}`) + } + + return `${url.origin}/api/models/${paths[0]}/${paths[1]}` + } catch (err) { + if (err instanceof InvalidHostError) { + throw err + } + + if (repoId.startsWith('https')) { + throw new Error(`Cannot parse url: ${repoId}`) + } + + return `https://huggingface.co/api/models/${repoId}` + } + } + + async fetchHuggingFaceRepoData(repoId: string): Promise { + const sanitizedUrl = this.toHuggingFaceUrl(repoId) + console.debug('sanitizedUrl', sanitizedUrl) + + const res = await fetch(sanitizedUrl) + const response = await res.json() + if (response['error'] != null) { + throw new Error(response['error']) + } + + const data = response as HuggingFaceRepoData + + if (data.tags.indexOf('gguf') === -1) { + throw new NotSupportedModelError( + `${repoId} is not supported. 
Only GGUF models are supported.`
+      )
+    }
+
+    const promises: Promise<number>[] = []
+
+    // fetching file sizes
+    const url = new URL(sanitizedUrl)
+    const paths = url.pathname.split('/').filter((e) => e.trim().length > 0)
+
+    for (const sibling of data.siblings) {
+      const downloadUrl = `https://huggingface.co/${paths[2]}/${paths[3]}/resolve/main/${sibling.rfilename}`
+      sibling.downloadUrl = downloadUrl
+      promises.push(getFileSize(downloadUrl))
+    }
+
+    const result = await Promise.all(promises)
+    for (let i = 0; i < data.siblings.length; i++) {
+      data.siblings[i].fileSize = result[i]
+    }
+
+    AllQuantizations.forEach((quantization) => {
+      data.siblings.forEach((sibling) => {
+        if (!sibling.quantization && sibling.rfilename.includes(quantization)) {
+          sibling.quantization = quantization
+        }
+      })
+    })
+
+    data.modelUrl = `https://huggingface.co/${paths[2]}/${paths[3]}`
+    return data
+  }
+
+  async fetchModelMetadata(url: string): Promise<GGUFMetadata> {
+    const { metadata } = await gguf(url)
+    return metadata
+  }
+
   /**
    * Specifically for Jan server.
    */
@@ -453,7 +556,7 @@ export default class JanModelExtension extends ModelExtension {
     return model
   }

-  private async getDefaultModel(): Promise<Model> {
+  override async getDefaultModel(): Promise<Model> {
     const defaultModel = DEFAULT_MODEL as Model
     return defaultModel
   }
@@ -674,4 +777,218 @@ export default class JanModelExtension extends ModelExtension {
     return importedModels
     )
   }
+
+  private getGgufFileList(
+    repoData: HuggingFaceRepoData,
+    selectedQuantization: Quantization
+  ): string[] {
+    return repoData.siblings
+      .map((file) => file.rfilename)
+      .filter((file) => file.indexOf(selectedQuantization) !== -1)
+      .filter((file) => file.endsWith('.gguf'))
+  }
+
+  private getFileList(repoData: HuggingFaceRepoData): string[] {
+    // Prefer SafeTensors weights; fall back to PyTorch checkpoints
+    const modelFiles = repoData.siblings
+      .map((file) => file.rfilename)
+      .filter((file) =>
+        JanModelExtension._safetensorsRegexs.some((regex) => regex.test(file))
+      )
+    if (modelFiles.length === 0) {
+      repoData.siblings.forEach((file) => {
+        if (
+          JanModelExtension._pytorchRegexs.some((regex) =>
+            regex.test(file.rfilename)
+          )
+        ) {
+          modelFiles.push(file.rfilename)
+        }
+      })
+    }
+
+    const vocabFiles = [
+      'tokenizer.model',
+      'vocab.json',
+      'tokenizer.json',
+    ].filter((file) =>
+      repoData.siblings.some((sibling) => sibling.rfilename === file)
+    )
+
+    const etcFiles = repoData.siblings
+      .map((file) => file.rfilename)
+      .filter(
+        (file) =>
+          (file.endsWith('.json') && !vocabFiles.includes(file)) ||
+          file.endsWith('.txt') ||
+          file.endsWith('.py') ||
+          file.endsWith('.tiktoken')
+      )
+
+    return [...modelFiles, ...vocabFiles, ...etcFiles]
+  }
+
+  private async getModelDirPath(repoID: string): Promise<string> {
+    const modelName = repoID.split('/').slice(1).join('/')
+    return joinPath([await getJanDataFolderPath(), 'models', modelName])
+  }
+
+  private async getConvertedModelPath(repoID: string): Promise<string> {
+    const modelName = repoID.split('/').slice(1).join('/')
+    const modelDirPath = await this.getModelDirPath(repoID)
+    return joinPath([modelDirPath, modelName + '.gguf'])
+  }
+
+  private async getQuantizedModelPath(
+    repoID: string,
+    quantization: Quantization
+  ): Promise<string> {
+    const modelName = repoID.split('/').slice(1).join('/')
+    const modelDirPath = await this.getModelDirPath(repoID)
+    return joinPath([
+      modelDirPath,
+      modelName + `-${quantization.toLowerCase()}.gguf`,
+    ])
+  }
+  private getCtxLength(config: {
+    max_sequence_length?: number
+    max_position_embeddings?: number
+    n_ctx?: number
+  }): number {
+    if
(config.max_sequence_length) return config.max_sequence_length
+    if (config.max_position_embeddings) return config.max_position_embeddings
+    if (config.n_ctx) return config.n_ctx
+    return 2048
+  }
+
+  /**
+   * Converts a Hugging Face model to GGUF.
+   * @param repoID - The repo ID of the model to convert.
+   * @returns A promise that resolves when the conversion is complete.
+   */
+  async convert(repoID: string): Promise<void> {
+    if (this.interrupted) return
+    const modelDirPath = await this.getModelDirPath(repoID)
+    const modelOutPath = await this.getConvertedModelPath(repoID)
+    if (!(await fs.existsSync(modelDirPath))) {
+      throw new Error('Model dir not found')
+    }
+    if (await fs.existsSync(modelOutPath)) return
+
+    await executeOnMain(NODE, 'installDeps')
+    if (this.interrupted) return
+
+    try {
+      await executeOnMain(
+        NODE,
+        'convertHf',
+        modelDirPath,
+        modelOutPath + '.temp'
+      )
+    } catch (err) {
+      log(`[Conversion]::Debug: Error using hf-to-gguf.py, trying convert.py`)
+
+      let ctx = 2048
+      try {
+        const config = await fs.readFileSync(
+          await joinPath([modelDirPath, 'config.json']),
+          'utf8'
+        )
+        const configParsed = JSON.parse(config)
+        ctx = this.getCtxLength(configParsed)
+        configParsed.max_sequence_length = ctx
+        await fs.writeFileSync(
+          await joinPath([modelDirPath, 'config.json']),
+          JSON.stringify(configParsed, null, 2)
+        )
+      } catch (err) {
+        log(`${err}`)
+        // ignore missing config.json
+      }
+
+      const bpe = await fs.existsSync(
+        await joinPath([modelDirPath, 'vocab.json'])
+      )
+
+      await executeOnMain(
+        NODE,
+        'convert',
+        modelDirPath,
+        modelOutPath + '.temp',
+        {
+          ctx,
+          bpe,
+        }
+      )
+    }
+    await executeOnMain(
+      NODE,
+      'renameSync',
+      modelOutPath + '.temp',
+      modelOutPath
+    )
+
+    for (const file of await fs.readdirSync(modelDirPath)) {
+      if (
+        modelOutPath.endsWith(file) ||
+        (file.endsWith('config.json') && !file.endsWith('_config.json'))
+      )
+        continue
+      await fs.unlinkSync(await joinPath([modelDirPath, file]))
+    }
+  }
+
+  /**
+   * Quantizes a GGUF model.
+   * @param repoID - The repo ID of the model to quantize.
+   * @param quantization - The quantization to use.
+   * @returns A promise that resolves when the quantization is complete.
+   */
+  async quantize(repoID: string, quantization: Quantization): Promise<void> {
+    if (this.interrupted) return
+    const modelDirPath = await this.getModelDirPath(repoID)
+    const modelOutPath = await this.getQuantizedModelPath(repoID, quantization)
+    if (!(await fs.existsSync(modelDirPath))) {
+      throw new Error('Model dir not found')
+    }
+    if (await fs.existsSync(modelOutPath)) return
+
+    await executeOnMain(
+      NODE,
+      'quantize',
+      await this.getConvertedModelPath(repoID),
+      modelOutPath + '.temp',
+      quantization
+    )
+    await executeOnMain(
+      NODE,
+      'renameSync',
+      modelOutPath + '.temp',
+      modelOutPath
+    )
+
+    await fs.unlinkSync(await this.getConvertedModelPath(repoID))
+  }
+
+  /**
+   * Cancels the conversion of the current Hugging Face model.
+   * @param repoID - The repository ID to cancel.
+   * @param repoData - The repository data to cancel.
+   * @returns {Promise<void>} A promise that resolves when the download has been cancelled.
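+   * Aborts each pending file download, then asks the main process to kill any
+   * conversion scripts that are still running.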
+ */ + async cancelConvert( + repoID: string, + repoData: HuggingFaceRepoData + ): Promise { + this.interrupted = true + const modelDirPath = await this.getModelDirPath(repoID) + const files = this.getFileList(repoData) + for (const file of files) { + const filePath = file + const localPath = await joinPath([modelDirPath, filePath]) + await abortDownload(localPath) + } + + executeOnMain(NODE, 'killProcesses') + } } diff --git a/extensions/huggingface-extension/src/node/index.ts b/extensions/model-extension/src/node/index.ts similarity index 100% rename from extensions/huggingface-extension/src/node/index.ts rename to extensions/model-extension/src/node/index.ts diff --git a/extensions/monitoring-extension/src/index.ts b/extensions/monitoring-extension/src/index.ts index 166627fd3..1d21fde77 100644 --- a/extensions/monitoring-extension/src/index.ts +++ b/extensions/monitoring-extension/src/index.ts @@ -1,7 +1,6 @@ import { GpuSetting, MonitoringExtension, - MonitoringInterface, OperatingSystemInfo, executeOnMain, } from '@janhq/core' diff --git a/extensions/monitoring-extension/src/node/index.ts b/extensions/monitoring-extension/src/node/index.ts index 049620026..980ee75d1 100644 --- a/extensions/monitoring-extension/src/node/index.ts +++ b/extensions/monitoring-extension/src/node/index.ts @@ -335,7 +335,7 @@ const updateCudaExistence = async ( // Attempt to query CUDA using NVIDIA SMI if (!cudaExists) { - await new Promise((resolve, reject) => { + await new Promise((resolve) => { exec('nvidia-smi', (error, stdout) => { if (!error) { const regex = /CUDA\s*Version:\s*(\d+\.\d+)/g diff --git a/extensions/monitoring-extension/src/node/logger.ts b/extensions/monitoring-extension/src/node/logger.ts index 9bafa7451..29a391313 100644 --- a/extensions/monitoring-extension/src/node/logger.ts +++ b/extensions/monitoring-extension/src/node/logger.ts @@ -126,6 +126,10 @@ export class FileLogger extends Logger { const writeLog = (message: string, logPath: string) => { if (!fs.existsSync(logPath)) { + const logDirectory = path.join(getJanDataFolderPath(), 'logs') + if (!fs.existsSync(logDirectory)) { + fs.mkdirSync(logDirectory) + } fs.writeFileSync(logPath, message) } else { const logFile = fs.createWriteStream(logPath, { diff --git a/extensions/tensorrt-llm-extension/package.json b/extensions/tensorrt-llm-extension/package.json index c8eafb10d..c5cb54809 100644 --- a/extensions/tensorrt-llm-extension/package.json +++ b/extensions/tensorrt-llm-extension/package.json @@ -55,7 +55,6 @@ "@janhq/core": "file:../../core", "decompress": "^4.2.1", "fetch-retry": "^5.0.6", - "path-browserify": "^1.0.1", "rxjs": "^7.8.1", "tcp-port-used": "^1.0.2", "terminate": "^2.6.1", diff --git a/server/helpers/logger.ts b/server/helpers/logger.ts index c8d4af428..2e6147386 100644 --- a/server/helpers/logger.ts +++ b/server/helpers/logger.ts @@ -14,22 +14,45 @@ export class Logger implements FastifyBaseLogger { silent = () => {} - info = function (msg: any) { - log(msg) + info = (obj?: any, msg?: string, ...args: any[]) => { + if (obj?.res?.raw?.statusCode || obj?.req?.url) { + log( + `[SERVER]::${JSON.stringify({ + level: obj?.level, + time: obj?.time, + hostname: obj?.hostname, + reqId: obj?.req?.id ?? 
obj?.res?.request?.id, + res: { + statusCode: obj?.res?.raw?.statusCode, + }, + req: { + method: obj?.req?.method, + url: obj?.req?.url, + path: obj?.req?.path, + hostname: obj?.req?.hostname, + remoteAddress: obj?.req?.remoteAddress, + remotePort: obj?.req?.remotePort, + }, + msg, + responseTime: obj?.responseTime, + ...args, + })}` + ) + } } - error = function (msg: any) { - log(msg) + error = function (message: any) { + log(`[SERVER]::${JSON.stringify(message)}`) } - debug = function (msg: any) { - log(msg) + debug = function (message: any) { + log(`[SERVER]::${JSON.stringify(message)}`) } - fatal = function (msg: any) { - log(msg) + fatal = function (message: any) { + log(`[SERVER]::${JSON.stringify(message)}`) } - warn = function (msg: any) { - log(msg) + warn = function (message: any) { + log(`[SERVER]::${JSON.stringify(message)}`) } - trace = function (msg: any) { - log(msg) + trace = function (message: any) { + log(`[SERVER]::${JSON.stringify(message)}`) } } diff --git a/uikit/src/modal/index.tsx b/uikit/src/modal/index.tsx index 1c0586637..288631c3d 100644 --- a/uikit/src/modal/index.tsx +++ b/uikit/src/modal/index.tsx @@ -33,7 +33,7 @@ const ModalContent = React.forwardRef< {children} diff --git a/web/containers/DropdownListSidebar/index.tsx b/web/containers/DropdownListSidebar/index.tsx index 361ae658c..5bb3d29cb 100644 --- a/web/containers/DropdownListSidebar/index.tsx +++ b/web/containers/DropdownListSidebar/index.tsx @@ -202,7 +202,7 @@ const DropdownListSidebar = ({ @@ -266,12 +266,21 @@ const DropdownListSidebar = ({ value={x.id} className={twMerge( x.id === selectedModel?.id && 'bg-secondary', - 'my-0 pb-8 pt-4' + 'my-0 py-2' )} > -
<span className="…">
-                  {x.name}
-                </span>
+                <div className="…">
+                  <div className="…">
+                    <span className="…">
+                      {x.name}
+                    </span>
+                  </div>
+                  <div className="…">
+                    <span className="…">
+                      {x.id}
+                    </span>
+                  </div>
+                </div>
                   {toGibibytes(x.metadata.size)}
@@ -283,10 +292,12 @@ const DropdownListSidebar = ({
-                  <span className="…">{x.id}</span>
+                  <div className="…">
+                    <span className="…">
+                      {x.id}
+                    </span>
+                  </div>
                   {clipboard.copied && copyId === x.id ? (
                     …
                   ) : (
                     …
diff --git a/web/containers/Layout/BottomBar/SystemMonitor/TableActiveModel/index.tsx b/web/containers/Layout/BottomBar/SystemMonitor/TableActiveModel/index.tsx
index 8bcccdba2..afdb553f4 100644
--- a/web/containers/Layout/BottomBar/SystemMonitor/TableActiveModel/index.tsx
+++ b/web/containers/Layout/BottomBar/SystemMonitor/TableActiveModel/index.tsx
@@ -34,7 +34,7 @@ const TableActiveModel = () => {
   return (
-        …
+        …
           {col}
@@ -46,17 +46,27 @@ const TableActiveModel = () => {
-            <td className="…">{activeModel.name}</td>
-            <td className="…">{activeModel.id}</td>
-
+            <td className="…">
+              <span className="…">
+                {activeModel.name}
+              </span>
+            </td>
+
+            <td className="…">
+              <span className="…">
+                {activeModel.id}
+              </span>
+            </td>
+
             <td className="…">
               {toGibibytes(activeModel.metadata.size)}
-            <td className="…">
+            <td className="…">
               v{activeModel.version}
-            </td>
+            </td>
   )
 }
diff --git a/web/containers/ListContainer/index.tsx b/web/containers/ListContainer/index.tsx
new file mode 100644
index 000000000..fdf34b9ec
--- /dev/null
+++ b/web/containers/ListContainer/index.tsx
@@ -0,0 +1,29 @@
+import { ReactNode, useEffect, useRef } from 'react'
+
+type Props = {
+  children: ReactNode
+}
+
+const ListContainer: React.FC<Props> = ({ children }) => {
+  const listRef = useRef<HTMLDivElement>(null)
+
+  useEffect(() => {
+    const scrollHeight = listRef.current?.scrollHeight ?? 0
+
+    listRef.current?.scrollTo({
+      top: scrollHeight,
+      behavior: 'smooth',
+    })
+  })
+
+  return (
+    <div ref={listRef} className="…">
+      {children}
+    </div>
+ ) +} + +export default ListContainer diff --git a/web/containers/Providers/EventHandler.tsx b/web/containers/Providers/EventHandler.tsx index 110d36e36..f772dd6cb 100644 --- a/web/containers/Providers/EventHandler.tsx +++ b/web/containers/Providers/EventHandler.tsx @@ -34,6 +34,9 @@ import { updateThreadAtom, } from '@/helpers/atoms/Thread.atom' +const maxWordForThreadTitle = 10 +const defaultThreadTitle = 'New Thread' + export default function EventHandler({ children }: { children: ReactNode }) { const messages = useAtomValue(getCurrentChatMessagesAtom) const addNewMessage = useSetAtom(addNewMessageAtom) @@ -90,34 +93,64 @@ export default function EventHandler({ children }: { children: ReactNode }) { } const thread = threadsRef.current?.find((e) => e.id == message.thread_id) + if (!thread) { + console.warn( + `Failed to update title for thread ${message.thread_id}: Thread not found!` + ) + return + } + const messageContent = message.content[0]?.text?.value + if (!messageContent) { + console.warn( + `Failed to update title for thread ${message.thread_id}: Responded content is null!` + ) + return + } // The thread title should not be updated if the message is less than 10 words // And no new line character is present // And non-alphanumeric characters should be removed - if (thread && messageContent && !messageContent.includes('\n')) { - // Remove non-alphanumeric characters - const cleanedMessageContent = messageContent - .replace(/[^a-z0-9\s]/gi, '') - .trim() - // Split the message into words - const words = cleanedMessageContent.split(' ') - // Check if the message is less than 10 words - if (words.length < 10) { + if (messageContent.includes('\n')) { + console.warn( + `Failed to update title for thread ${message.thread_id}: Title can't contain new line character!` + ) + return + } + + // Remove non-alphanumeric characters + const cleanedMessageContent = messageContent + .replace(/[^a-z0-9\s]/gi, '') + .trim() + + // Split the message into words + const words = cleanedMessageContent.split(' ') + + if (words.length >= maxWordForThreadTitle) { + console.warn( + `Failed to update title for thread ${message.thread_id}: Title can't be greater than ${maxWordForThreadTitle} words!` + ) + return + } + + const updatedThread: Thread = { + ...thread, + + title: cleanedMessageContent, + metadata: thread.metadata, + } + + extensionManager + .get(ExtensionTypeEnum.Conversational) + ?.saveThread({ + ...updatedThread, + }) + .then(() => { // Update the Thread title with the response of the inference on the 1st prompt updateThread({ - ...thread, - title: cleanedMessageContent, - metadata: thread.metadata, + ...updatedThread, }) - - extensionManager - .get(ExtensionTypeEnum.Conversational) - ?.saveThread({ - ...thread, - }) - } - } + }) }, [updateThread] ) @@ -142,33 +175,32 @@ export default function EventHandler({ children }: { children: ReactNode }) { setIsGeneratingResponse(false) const thread = threadsRef.current?.find((e) => e.id == message.thread_id) - if (thread) { - const messageContent = message.content[0]?.text?.value - const metadata = { - ...thread.metadata, - ...(messageContent && { lastMessage: messageContent }), - } + if (!thread) return + const messageContent = message.content[0]?.text?.value + const metadata = { + ...thread.metadata, + ...(messageContent && { lastMessage: messageContent }), + } - updateThread({ + updateThread({ + ...thread, + metadata, + }) + + extensionManager + .get(ExtensionTypeEnum.Conversational) + ?.saveThread({ ...thread, metadata, }) - extensionManager - 
.get(ExtensionTypeEnum.Conversational) - ?.saveThread({ - ...thread, - metadata, - }) + // If this is not the summary of the Thread, don't need to add it to the Thread + extensionManager + .get(ExtensionTypeEnum.Conversational) + ?.addNewMessage(message) - // If this is not the summary of the Thread, don't need to add it to the Thread - extensionManager - .get(ExtensionTypeEnum.Conversational) - ?.addNewMessage(message) - - // Attempt to generate the title of the Thread when needed - generateThreadTitle(message, thread) - } + // Attempt to generate the title of the Thread when needed + generateThreadTitle(message, thread) }, [setIsGeneratingResponse, updateMessage, updateThread, updateThreadWaiting] ) @@ -181,6 +213,7 @@ export default function EventHandler({ children }: { children: ReactNode }) { break default: updateThreadMessage(message) + break } }, [updateThreadMessage, updateThreadTitle] @@ -188,54 +221,52 @@ export default function EventHandler({ children }: { children: ReactNode }) { const generateThreadTitle = (message: ThreadMessage, thread: Thread) => { // If this is the first ever prompt in the thread - if ( - thread && - thread.title?.trim() === 'New Thread' && - activeModelRef.current - ) { - // This is the first time message comes in on a new thread - // Summarize the first message, and make that the title of the Thread - // 1. Get the summary of the first prompt using whatever engine user is currently using - const threadMessages = messagesRef?.current - - if (!threadMessages || threadMessages.length === 0) return - - const summarizeFirstPrompt = `Summarize in a 5-word Title. Give the title only. "${threadMessages[0].content[0].text.value}"` - // Prompt: Given this query from user {query}, return to me the summary in 5 words as the title - const msgId = ulid() - const messages: ChatCompletionMessage[] = [ - { - role: ChatCompletionRole.System, - content: - 'The conversation below is for a text summarization, user asks assistant to summarize a text and assistant should response in just less than 10 words', - }, - { - role: ChatCompletionRole.User, - content: summarizeFirstPrompt, - }, - ] - - const messageRequest: MessageRequest = { - id: msgId, - threadId: message.thread_id, - type: MessageRequestType.Summary, - messages, - model: { - ...activeModelRef.current, - parameters: { - stream: false, - }, - }, - } - - // 2. Update the title with the result of the inference - setTimeout(() => { - const engine = EngineManager.instance().get( - messageRequest.model?.engine ?? activeModelRef.current?.engine ?? '' - ) - engine?.inference(messageRequest) - }, 1000) + if (thread.title?.trim() !== defaultThreadTitle) { + return } + + if (!activeModelRef.current) { + return + } + + // This is the first time message comes in on a new thread + // Summarize the first message, and make that the title of the Thread + // 1. Get the summary of the first prompt using whatever engine user is currently using + const threadMessages = messagesRef?.current + + if (!threadMessages || threadMessages.length === 0) return + + const summarizeFirstPrompt = `Summarize in a ${maxWordForThreadTitle}-word Title. Give the title only. 
"${threadMessages[0].content[0].text.value}"` + + // Prompt: Given this query from user {query}, return to me the summary in 10 words as the title + const msgId = ulid() + const messages: ChatCompletionMessage[] = [ + { + role: ChatCompletionRole.User, + content: summarizeFirstPrompt, + }, + ] + + const messageRequest: MessageRequest = { + id: msgId, + threadId: message.thread_id, + type: MessageRequestType.Summary, + messages, + model: { + ...activeModelRef.current, + parameters: { + stream: false, + }, + }, + } + + // 2. Update the title with the result of the inference + setTimeout(() => { + const engine = EngineManager.instance().get( + messageRequest.model?.engine ?? activeModelRef.current?.engine ?? '' + ) + engine?.inference(messageRequest) + }, 1000) } useEffect(() => { @@ -244,14 +275,13 @@ export default function EventHandler({ children }: { children: ReactNode }) { events.on(MessageEvent.OnMessageUpdate, onMessageResponseUpdate) events.on(ModelEvent.OnModelStopped, onModelStopped) } - }, [onNewMessageResponse, onMessageResponseUpdate, onModelStopped]) - useEffect(() => { return () => { events.off(MessageEvent.OnMessageResponse, onNewMessageResponse) events.off(MessageEvent.OnMessageUpdate, onMessageResponseUpdate) events.off(ModelEvent.OnModelStopped, onModelStopped) } }, [onNewMessageResponse, onMessageResponseUpdate, onModelStopped]) + return {children} } diff --git a/web/containers/Providers/EventListener.tsx b/web/containers/Providers/EventListener.tsx index 20fc6dde2..b35ab2e43 100644 --- a/web/containers/Providers/EventListener.tsx +++ b/web/containers/Providers/EventListener.tsx @@ -2,7 +2,7 @@ import { PropsWithChildren, useCallback, useEffect } from 'react' import React from 'react' -import { DownloadEvent, events, DownloadState } from '@janhq/core' +import { DownloadEvent, events, DownloadState, ModelEvent } from '@janhq/core' import { useSetAtom } from 'jotai' import { setDownloadStateAtom } from '@/hooks/useDownloadState' @@ -64,6 +64,7 @@ const EventListenerWrapper = ({ children }: PropsWithChildren) => { if (state.downloadType !== 'extension') { setDownloadState(state) } + events.emit(ModelEvent.OnModelsUpdate, {}) }, [setDownloadState] ) diff --git a/web/helpers/atoms/HFConverter.atom.ts b/web/helpers/atoms/HFConverter.atom.ts deleted file mode 100644 index 717ab05a9..000000000 --- a/web/helpers/atoms/HFConverter.atom.ts +++ /dev/null @@ -1,44 +0,0 @@ -import { HuggingFaceRepoData } from '@janhq/core' -import { atom } from 'jotai' - -export const repoIDAtom = atom(null) -export const loadingAtom = atom(false) -export const fetchErrorAtom = atom(null) -export const conversionStatusAtom = atom< - | 'downloading' - | 'converting' - | 'quantizing' - | 'done' - | 'stopping' - | 'generating' - | null ->(null) -export const conversionErrorAtom = atom(null) -const _repoDataAtom = atom(null) -const _unsupportedAtom = atom(false) - -export const resetAtom = atom(null, (_get, set) => { - set(repoIDAtom, null) - set(loadingAtom, false) - set(fetchErrorAtom, null) - set(conversionStatusAtom, null) - set(conversionErrorAtom, null) - set(_repoDataAtom, null) - set(_unsupportedAtom, false) -}) - -export const repoDataAtom = atom( - (get) => get(_repoDataAtom), - (_get, set, repoData: HuggingFaceRepoData) => { - set(_repoDataAtom, repoData) - if ( - !repoData.tags.includes('transformers') || - (!repoData.tags.includes('pytorch') && - !repoData.tags.includes('safetensors')) - ) { - set(_unsupportedAtom, true) - } - } -) - -export const unsupportedAtom = atom((get) => 
get(_unsupportedAtom)) diff --git a/web/helpers/atoms/HuggingFace.atom.ts b/web/helpers/atoms/HuggingFace.atom.ts new file mode 100644 index 000000000..514efb186 --- /dev/null +++ b/web/helpers/atoms/HuggingFace.atom.ts @@ -0,0 +1,12 @@ +import { HuggingFaceRepoData } from '@janhq/core/.' +import { atom } from 'jotai' + +// modals +export type ImportHuggingFaceModelStage = 'NONE' | 'REPO_DETAIL' + +export const importingHuggingFaceRepoDataAtom = atom< + HuggingFaceRepoData | undefined +>(undefined) + +export const importHuggingFaceModelStageAtom = + atom('NONE') diff --git a/web/helpers/atoms/Model.atom.ts b/web/helpers/atoms/Model.atom.ts index da6dc5918..b50293df4 100644 --- a/web/helpers/atoms/Model.atom.ts +++ b/web/helpers/atoms/Model.atom.ts @@ -46,6 +46,8 @@ export const removeDownloadedModelAtom = atom( export const configuredModelsAtom = atom([]) +export const defaultModelAtom = atom(undefined) + /// TODO: move this part to another atom // store the paths of the models that are being imported export const importingModelsAtom = atom([]) diff --git a/web/hooks/useConvertHuggingFaceModel.ts b/web/hooks/useConvertHuggingFaceModel.ts deleted file mode 100644 index 0616c4ee7..000000000 --- a/web/hooks/useConvertHuggingFaceModel.ts +++ /dev/null @@ -1,79 +0,0 @@ -import { - ExtensionTypeEnum, - HuggingFaceExtension, - HuggingFaceRepoData, - Quantization, -} from '@janhq/core' - -import { useAtomValue, useSetAtom } from 'jotai' - -import { extensionManager } from '@/extension/ExtensionManager' -import { ignoreSslAtom, proxyAtom } from '@/helpers/atoms/AppConfig.atom' -import { - conversionStatusAtom, - conversionErrorAtom, -} from '@/helpers/atoms/HFConverter.atom' - -export const useConvertHuggingFaceModel = () => { - const proxy = useAtomValue(proxyAtom) - const ignoreSSL = useAtomValue(ignoreSslAtom) - const setConversionStatus = useSetAtom(conversionStatusAtom) - const setConversionError = useSetAtom(conversionErrorAtom) - - const convertHuggingFaceModel = async ( - repoID: string, - repoData: HuggingFaceRepoData, - quantization: Quantization - ) => { - const extension = await extensionManager.get( - ExtensionTypeEnum.HuggingFace - ) - try { - if (extension) { - extension.interrupted = false - } - setConversionStatus('downloading') - await extension?.downloadModelFiles(repoID, repoData, { - ignoreSSL, - proxy, - }) - if (extension?.interrupted) return - setConversionStatus('converting') - await extension?.convert(repoID) - if (extension?.interrupted) return - setConversionStatus('quantizing') - await extension?.quantize(repoID, quantization) - if (extension?.interrupted) return - setConversionStatus('generating') - await extension?.generateMetadata(repoID, repoData, quantization) - setConversionStatus('done') - } catch (err) { - if (extension?.interrupted) return - extension?.cancelConvert(repoID, repoData) - if (typeof err === 'number') { - setConversionError(new Error(`exit code: ${err}`)) - } else { - setConversionError(err as Error) - } - console.error(err) - } - } - - const cancelConvertHuggingFaceModel = async ( - repoID: string, - repoData: HuggingFaceRepoData - ) => { - const extension = await extensionManager.get( - ExtensionTypeEnum.HuggingFace - ) - - setConversionStatus('stopping') - await extension?.cancelConvert(repoID, repoData) - setConversionStatus(null) - } - - return { - convertHuggingFaceModel, - cancelConvertHuggingFaceModel, - } -} diff --git a/web/hooks/useGetHFRepoData.ts b/web/hooks/useGetHFRepoData.ts index d14458854..3dab2c72e 100644 --- 
a/web/hooks/useGetHFRepoData.ts +++ b/web/hooks/useGetHFRepoData.ts @@ -1,31 +1,41 @@ -import { useAtomValue, useSetAtom } from 'jotai' +import { useCallback, useState } from 'react' import { - repoDataAtom, - repoIDAtom, - loadingAtom, - fetchErrorAtom, -} from '@/helpers/atoms/HFConverter.atom' + ExtensionTypeEnum, + HuggingFaceRepoData, + ModelExtension, +} from '@janhq/core' + +import { extensionManager } from '@/extension' export const useGetHFRepoData = () => { - const repoID = useAtomValue(repoIDAtom) - const setRepoData = useSetAtom(repoDataAtom) - const setLoading = useSetAtom(loadingAtom) - const setFetchError = useSetAtom(fetchErrorAtom) + const [error, setError] = useState(undefined) + const [loading, setLoading] = useState(false) - const getRepoData = async () => { - setLoading(true) + const getHfRepoData = useCallback(async (repoId: string) => { try { - const res = await fetch(`https://huggingface.co/api/models/${repoID}`) - const data = await res.json() - setRepoData(data) + setError(undefined) + setLoading(true) + const data = await extensionGetHfRepoData(repoId) + return data } catch (err) { - setFetchError( - Error("The repo does not exist or you don't have access to it.") - ) + console.error(err) + if (err instanceof Error) { + setError(err.message) + } + throw err + } finally { + setLoading(false) } - setLoading(false) - } + }, []) - return getRepoData + return { loading, error, getHfRepoData } +} + +const extensionGetHfRepoData = async ( + repoId: string +): Promise => { + return extensionManager + .get(ExtensionTypeEnum.Model) + ?.fetchHuggingFaceRepoData(repoId) } diff --git a/web/hooks/useModels.ts b/web/hooks/useModels.ts index b2aa0b518..5a6f13e03 100644 --- a/web/hooks/useModels.ts +++ b/web/hooks/useModels.ts @@ -13,25 +13,37 @@ import { useSetAtom } from 'jotai' import { extensionManager } from '@/extension' import { configuredModelsAtom, + defaultModelAtom, downloadedModelsAtom, } from '@/helpers/atoms/Model.atom' const useModels = () => { const setDownloadedModels = useSetAtom(downloadedModelsAtom) const setConfiguredModels = useSetAtom(configuredModelsAtom) + const setDefaultModel = useSetAtom(defaultModelAtom) const getData = useCallback(() => { const getDownloadedModels = async () => { const models = await getLocalDownloadedModels() setDownloadedModels(models) } + const getConfiguredModels = async () => { const models = await getLocalConfiguredModels() setConfiguredModels(models) } - getDownloadedModels() - getConfiguredModels() - }, [setDownloadedModels, setConfiguredModels]) + + const getDefaultModel = async () => { + const defaultModel = await getLocalDefaultModel() + setDefaultModel(defaultModel) + } + + Promise.all([ + getDownloadedModels(), + getConfiguredModels(), + getDefaultModel(), + ]) + }, [setDownloadedModels, setConfiguredModels, setDefaultModel]) useEffect(() => { // Try get data on mount @@ -46,6 +58,11 @@ const useModels = () => { }, [getData]) } +const getLocalDefaultModel = async (): Promise => + extensionManager + .get(ExtensionTypeEnum.Model) + ?.getDefaultModel() + const getLocalConfiguredModels = async (): Promise => extensionManager .get(ExtensionTypeEnum.Model) diff --git a/web/hooks/useUpdateModelParameters.ts b/web/hooks/useUpdateModelParameters.ts index dda3e2ecf..a1461bac1 100644 --- a/web/hooks/useUpdateModelParameters.ts +++ b/web/hooks/useUpdateModelParameters.ts @@ -34,13 +34,10 @@ export default function useUpdateModelParameters() { const updateModelParameter = useCallback( async (thread: Thread, settings: 
diff --git a/web/hooks/useUpdateModelParameters.ts b/web/hooks/useUpdateModelParameters.ts
index dda3e2ecf..a1461bac1 100644
--- a/web/hooks/useUpdateModelParameters.ts
+++ b/web/hooks/useUpdateModelParameters.ts
@@ -34,13 +34,10 @@ export default function useUpdateModelParameters() {
   const updateModelParameter = useCallback(
     async (thread: Thread, settings: UpdateModelParameter) => {
-      const params = settings.modelId
-        ? settings.params
-        : { ...activeModelParams, ...settings.params }
-
-      const updatedModelParams: ModelParams = {
-        ...params,
-      }
+      const toUpdateSettings = processStopWords(settings.params ?? {})
+      const updatedModelParams = settings.modelId
+        ? toUpdateSettings
+        : { ...activeModelParams, ...toUpdateSettings }
 
       // update the state
       setThreadModelParams(thread.id, updatedModelParams)
@@ -73,5 +70,13 @@
     [activeModelParams, selectedModel, setThreadModelParams]
   )
 
+  const processStopWords = (params: ModelParams): ModelParams => {
+    if ('stop' in params && typeof params['stop'] === 'string') {
+      // Input as string but stop words accept an array of strings (space as separator)
+      params['stop'] = (params['stop'] as string).split(' ')
+    }
+    return params
+  }
+
   return { updateModelParameter }
 }
diff --git a/web/package.json b/web/package.json
index de42f053c..a654e3a5c 100644
--- a/web/package.json
+++ b/web/package.json
@@ -23,10 +23,12 @@
     "framer-motion": "^10.16.4",
     "highlight.js": "^11.9.0",
     "jotai": "^2.6.0",
+    "katex": "^0.16.10",
     "lodash": "^4.17.21",
     "lucide-react": "^0.291.0",
     "marked": "^9.1.2",
     "marked-highlight": "^2.0.6",
+    "marked-katex-extension": "^5.0.1",
     "next": "14.0.1",
     "next-themes": "^0.2.1",
     "postcss": "8.4.31",
diff --git a/web/public/icons/Jan_AppIcon.svg b/web/public/icons/Jan_AppIcon.svg
deleted file mode 100644
index a823ab3ed..000000000
--- a/web/public/icons/Jan_AppIcon.svg
+++ /dev/null
@@ -1,15 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/web/public/icons/ViewGrid.svg b/web/public/icons/ViewGrid.svg
deleted file mode 100644
index 2a1fd31b1..000000000
--- a/web/public/icons/ViewGrid.svg
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
diff --git a/web/public/icons/avatar.svg b/web/public/icons/avatar.svg
deleted file mode 100644
index 30d300289..000000000
--- a/web/public/icons/avatar.svg
+++ /dev/null
@@ -1 +0,0 @@
-U
\ No newline at end of file
diff --git a/web/public/icons/ic_sidebar_off.svg b/web/public/icons/ic_sidebar_off.svg
deleted file mode 100644
index 64dbf3dab..000000000
--- a/web/public/icons/ic_sidebar_off.svg
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
diff --git a/web/public/icons/ic_sidebar_on.svg b/web/public/icons/ic_sidebar_on.svg
deleted file mode 100644
index 5a0c91a53..000000000
--- a/web/public/icons/ic_sidebar_on.svg
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
diff --git a/web/public/images/banner.jpg b/web/public/images/banner.jpg
deleted file mode 100644
index dddc713a9..000000000
Binary files a/web/public/images/banner.jpg and /dev/null differ
diff --git a/web/public/images/mobile.jpg b/web/public/images/mobile.jpg
deleted file mode 100644
index f33682659..000000000
Binary files a/web/public/images/mobile.jpg and /dev/null differ
diff --git a/web/public/images/preview.jpg b/web/public/images/preview.jpg
deleted file mode 100644
index 1fca575ce..000000000
Binary files a/web/public/images/preview.jpg and /dev/null differ
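The `katex` and `marked-katex-extension` dependencies added above feed the LaTeX rendering wired into `SimpleTextMessage` below. Reduced to a standalone sketch, using only the calls visible in this diff:

```ts
import { Marked } from 'marked'
import markedKatex from 'marked-katex-extension'

const marked = new Marked()

// throwOnError: false lets malformed TeX degrade to plain text instead of throwing.
marked.use(markedKatex({ throwOnError: false }))

// $...$ spans are now emitted as KaTeX markup; katex's stylesheet must be loaded separately.
const html = marked.parse('Euler: $e^{i\\pi} + 1 = 0$')
console.log(html)
```

The `katex` package itself ships the stylesheet and fonts that the generated markup needs, which is presumably why both dependencies are added together.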
diff --git a/web/screens/Chat/ChatBody/index.tsx b/web/screens/Chat/ChatBody/index.tsx
index ed950c699..5f89b76cd 100644
--- a/web/screens/Chat/ChatBody/index.tsx
+++ b/web/screens/Chat/ChatBody/index.tsx
@@ -1,8 +1,8 @@
-import ScrollToBottom from 'react-scroll-to-bottom'
-
 import { MessageStatus } from '@janhq/core'
 import { useAtomValue } from 'jotai'
 
+import ListContainer from '@/containers/ListContainer'
+
 import { loadModelErrorAtom } from '@/hooks/useActiveModel'
 
 import ChatItem from '../ChatItem'
@@ -26,7 +26,7 @@ const ChatBody: React.FC = () => {
   if (messages.length === 0) return
 
   return (
-
+
       {messages.map((message, index) => (
         {message.status !== MessageStatus.Error &&
@@ -43,7 +43,7 @@ const ChatBody: React.FC = () => {
       ))}
 
       {loadModelError && }
-
+
   )
 }
diff --git a/web/screens/Chat/EngineSetting/index.tsx b/web/screens/Chat/EngineSetting/index.tsx
index 57b19a484..6e8ef73a8 100644
--- a/web/screens/Chat/EngineSetting/index.tsx
+++ b/web/screens/Chat/EngineSetting/index.tsx
@@ -1,4 +1,4 @@
-import { SettingComponentProps } from '@janhq/core/.'
+import { SettingComponentProps } from '@janhq/core'
 
 import SettingComponentBuilder from '../../Chat/ModelSetting/SettingComponent'
diff --git a/web/screens/Chat/ModelSetting/index.tsx b/web/screens/Chat/ModelSetting/index.tsx
index e345279c3..fdfee07bf 100644
--- a/web/screens/Chat/ModelSetting/index.tsx
+++ b/web/screens/Chat/ModelSetting/index.tsx
@@ -1,6 +1,6 @@
 import React from 'react'
 
-import { SettingComponentProps } from '@janhq/core/.'
+import { SettingComponentProps } from '@janhq/core'
 
 import SettingComponentBuilder from './SettingComponent'
diff --git a/web/screens/Chat/ModelSetting/predefinedComponent.ts b/web/screens/Chat/ModelSetting/predefinedComponent.ts
index 4879732b1..a52214e38 100644
--- a/web/screens/Chat/ModelSetting/predefinedComponent.ts
+++ b/web/screens/Chat/ModelSetting/predefinedComponent.ts
@@ -36,7 +36,7 @@ export const presetConfiguration: Record<string, SettingComponentProps> = {
       min: 0,
       max: 4096,
       step: 128,
-      value: 4096,
+      value: 2048,
     },
     requireModelReload: true,
     configType: 'setting',
diff --git a/web/screens/Chat/Sidebar/index.tsx b/web/screens/Chat/Sidebar/index.tsx
index 62bda54d0..6829ac9ff 100644
--- a/web/screens/Chat/Sidebar/index.tsx
+++ b/web/screens/Chat/Sidebar/index.tsx
@@ -65,7 +65,9 @@ const Sidebar: React.FC = () => {
       modelEngineParams,
       selectedModel
     )
-    return componentDataEngineSetting.filter((x) => x.key !== 'prompt_template')
+    return componentDataEngineSetting.filter(
+      (x) => x.key !== 'prompt_template' && x.key !== 'embedding'
+    )
   }, [activeModelParams, selectedModel])
 
   const promptTemplateSettings = useMemo(() => {
diff --git a/web/screens/Chat/SimpleTextMessage/index.tsx b/web/screens/Chat/SimpleTextMessage/index.tsx
index 376d011eb..489d28732 100644
--- a/web/screens/Chat/SimpleTextMessage/index.tsx
+++ b/web/screens/Chat/SimpleTextMessage/index.tsx
@@ -19,8 +19,8 @@ import hljs from 'highlight.js'
 import { useAtomValue } from 'jotai'
 import { FolderOpenIcon } from 'lucide-react'
 import { Marked, Renderer } from 'marked'
-
 import { markedHighlight } from 'marked-highlight'
+import markedKatex from 'marked-katex-extension'
 
 import { twMerge } from 'tailwind-merge'
@@ -100,6 +100,8 @@ const SimpleTextMessage: React.FC = (props) => {
     }
   )
 
+  marked.use(markedKatex({ throwOnError: false }))
+
   const { onViewFile, onViewFileContainer } = usePath()
   const parsedText = marked.parse(text)
   const [tokenCount, setTokenCount] = useState(0)
@@ -297,7 +299,7 @@
           ) : (
= ({ model }) => { Tags -
- {model.metadata.tags.map((tag, i) => ( +
+ {model.metadata.tags.map((tag: string) => ( { - // This component only loads when repoData is not null - const repoData = useAtomValue(repoDataAtom)! - // This component only loads when conversionStatus is not null - const conversionStatus = useAtomValue(conversionStatusAtom)! - - return ( - <> -
-

Hugging Face Converter

-
-
-

- An error occurred while {conversionStatus} model {repoData.id}. -

-

Please close this modal and try again.

-
- - ) -} diff --git a/web/screens/ExploreModels/HuggingFaceConvertingModal/index.tsx b/web/screens/ExploreModels/HuggingFaceConvertingModal/index.tsx deleted file mode 100644 index 175722dda..000000000 --- a/web/screens/ExploreModels/HuggingFaceConvertingModal/index.tsx +++ /dev/null @@ -1,73 +0,0 @@ -import { useEffect, useState } from 'react' - -import { Button } from '@janhq/uikit' -import { useAtomValue } from 'jotai' - -import { useConvertHuggingFaceModel } from '@/hooks/useConvertHuggingFaceModel' - -import { - conversionStatusAtom, - repoDataAtom, -} from '@/helpers/atoms/HFConverter.atom' - -export const HuggingFaceConvertingModal = () => { - // This component only loads when repoData is not null - const repoData = useAtomValue(repoDataAtom)! - // This component only loads when conversionStatus is not null - const conversionStatus = useAtomValue(conversionStatusAtom)! - const [status, setStatus] = useState('') - const { cancelConvertHuggingFaceModel } = useConvertHuggingFaceModel() - - useEffect(() => { - switch (conversionStatus) { - case 'downloading': - setStatus('Downloading files...') - break - case 'converting': - setStatus('Converting...') - break - case 'quantizing': - setStatus('Quantizing...') - break - case 'stopping': - setStatus('Stopping...') - break - case 'generating': - setStatus('Generating metadata...') - break - } - }, [conversionStatus]) - - const onStopClick = () => { - cancelConvertHuggingFaceModel(repoData.id, repoData) - } - - return ( - <> -
-

Hugging Face Converter

-
- {conversionStatus === 'done' ? ( -
-

Done!

-

Now you can use the model on Jan as usual. Have fun!

-
- ) : ( - <> -
-

{status}

-
- - - )} - - ) -} diff --git a/web/screens/ExploreModels/HuggingFaceModal/index.tsx b/web/screens/ExploreModels/HuggingFaceModal/index.tsx deleted file mode 100644 index 9051e15e6..000000000 --- a/web/screens/ExploreModels/HuggingFaceModal/index.tsx +++ /dev/null @@ -1,70 +0,0 @@ -import { CommandModal, Modal, ModalContent } from '@janhq/uikit' -import { useAtomValue, useSetAtom } from 'jotai' - -import { HuggingFaceConvertingErrorModal } from '../HuggingFaceConvertingErrorModal' -import { HuggingFaceConvertingModal } from '../HuggingFaceConvertingModal' -import { HuggingFaceRepoDataLoadedModal } from '../HuggingFaceRepoDataLoadedModal' -import { HuggingFaceSearchErrorModal } from '../HuggingFaceSearchErrorModal' -import { HuggingFaceSearchModal } from '../HuggingFaceSearchModal' - -import { - repoDataAtom, - fetchErrorAtom, - resetAtom, - conversionStatusAtom, - conversionErrorAtom, -} from '@/helpers/atoms/HFConverter.atom' - -const HuggingFaceModal = ({ - ...props -}: Omit[0], 'children'>) => { - const repoData = useAtomValue(repoDataAtom) - const fetchError = useAtomValue(fetchErrorAtom) - const conversionStatus = useAtomValue(conversionStatusAtom) - const conversionError = useAtomValue(conversionErrorAtom) - const setReset = useSetAtom(resetAtom) - - return ( - { - if (open === false) { - if ( - !repoData || - ['done', 'stopping'].includes(conversionStatus ?? '') || - conversionError - ) { - setReset() - } - } - if (props.onOpenChange) { - props.onOpenChange(open) - } - }} - > - -
-
- {repoData ? ( - conversionStatus ? ( - conversionError ? ( - - ) : ( - - ) - ) : ( - - ) - ) : fetchError ? ( - - ) : ( - - )} -
-
-
-
- ) -} - -export { HuggingFaceModal } diff --git a/web/screens/ExploreModels/HuggingFaceRepoDataLoadedModal/index.tsx b/web/screens/ExploreModels/HuggingFaceRepoDataLoadedModal/index.tsx deleted file mode 100644 index 32284ede5..000000000 --- a/web/screens/ExploreModels/HuggingFaceRepoDataLoadedModal/index.tsx +++ /dev/null @@ -1,100 +0,0 @@ -import { useState } from 'react' - -import { Quantization } from '@janhq/core' -import { - Button, - Select, - SelectContent, - SelectGroup, - SelectItem, - SelectPortal, - SelectTrigger, - SelectValue, -} from '@janhq/uikit' -import { useAtomValue } from 'jotai' - -import { twMerge } from 'tailwind-merge' - -import { useConvertHuggingFaceModel } from '@/hooks/useConvertHuggingFaceModel' - -import { - loadingAtom, - repoDataAtom, - unsupportedAtom, -} from '@/helpers/atoms/HFConverter.atom' - -export const HuggingFaceRepoDataLoadedModal = () => { - const loading = useAtomValue(loadingAtom) - // This component only loads when repoData is not null - const repoData = useAtomValue(repoDataAtom)! - const unsupported = useAtomValue(unsupportedAtom) - const [quantization, setQuantization] = useState( - Quantization.Q4_K_M - ) - const { convertHuggingFaceModel } = useConvertHuggingFaceModel() - - const onValueSelected = (value: Quantization) => { - setQuantization(value) - } - const onConvertClick = () => { - convertHuggingFaceModel(repoData.id, repoData, quantization) - } - - return ( - <> -
-

Hugging Face Converter

-

Found the repository!

-
-
-

{repoData.id}

-

- {unsupported - ? '❌ This model is not supported!' - : '✅ This model is supported!'} -

- {repoData.tags?.includes('gguf') ? ( -

...But you can import it manually!

- ) : null} -
- - - - ) -} diff --git a/web/screens/ExploreModels/HuggingFaceSearchErrorModal/index.tsx b/web/screens/ExploreModels/HuggingFaceSearchErrorModal/index.tsx deleted file mode 100644 index 4cb58332b..000000000 --- a/web/screens/ExploreModels/HuggingFaceSearchErrorModal/index.tsx +++ /dev/null @@ -1,32 +0,0 @@ -import { Button } from '@janhq/uikit' -import { useAtomValue } from 'jotai' - -import { useGetHFRepoData } from '@/hooks/useGetHFRepoData' - -import { fetchErrorAtom, loadingAtom } from '@/helpers/atoms/HFConverter.atom' - -export const HuggingFaceSearchErrorModal = () => { - // This component only loads when fetchError is not null - const fetchError = useAtomValue(fetchErrorAtom)! - const loading = useAtomValue(loadingAtom) - - const getRepoData = useGetHFRepoData() - - return ( - <> -
-

Error!

-

Fetch error

-
-

{fetchError.message}

- - - ) -} diff --git a/web/screens/ExploreModels/HuggingFaceSearchModal/index.tsx b/web/screens/ExploreModels/HuggingFaceSearchModal/index.tsx deleted file mode 100644 index 3add92ed1..000000000 --- a/web/screens/ExploreModels/HuggingFaceSearchModal/index.tsx +++ /dev/null @@ -1,45 +0,0 @@ -import { Button, Input } from '@janhq/uikit' -import { useSetAtom, useAtomValue } from 'jotai' - -import { useGetHFRepoData } from '@/hooks/useGetHFRepoData' - -import { repoIDAtom, loadingAtom } from '@/helpers/atoms/HFConverter.atom' - -export const HuggingFaceSearchModal = () => { - const setRepoID = useSetAtom(repoIDAtom) - const loading = useAtomValue(loadingAtom) - - const getRepoData = useGetHFRepoData() - - const onKeyDown = (e: React.KeyboardEvent) => { - if (e.key === 'Enter') { - e.preventDefault() - getRepoData() - } - } - - return ( - <> -
-

Hugging Face Converter

-

Type the repository id below

-
- { - setRepoID(e.target.value) - }} - onKeyDown={onKeyDown} - /> - - - ) -} diff --git a/web/screens/ExploreModels/index.tsx b/web/screens/ExploreModels/index.tsx index 484e62b0e..f2e19661e 100644 --- a/web/screens/ExploreModels/index.tsx +++ b/web/screens/ExploreModels/index.tsx @@ -1,7 +1,6 @@ import { useCallback, useState } from 'react' import { - Input, ScrollArea, Select, SelectTrigger, @@ -13,12 +12,13 @@ import { } from '@janhq/uikit' import { useAtomValue, useSetAtom } from 'jotai' -import { UploadIcon, SearchIcon } from 'lucide-react' +import { UploadIcon } from 'lucide-react' import { setImportModelStageAtom } from '@/hooks/useImportModel' +import ModelSearch from '../Settings/Models/ModelSearch' + import ExploreModelList from './ExploreModelList' -import { HuggingFaceModal } from './HuggingFaceModal' import { configuredModelsAtom, @@ -33,7 +33,6 @@ const ExploreModelsScreen = () => { const [searchValue, setsearchValue] = useState('') const [sortSelected, setSortSelected] = useState('All Models') - const [showHuggingFaceModal, setShowHuggingFaceModal] = useState(false) const setImportModelStage = useSetAtom(setImportModelStageAtom) const filteredModels = configuredModels.filter((x) => { @@ -56,6 +55,10 @@ const ExploreModelsScreen = () => { setImportModelStage('SELECTING_MODEL') }, [setImportModelStage]) + const onSearchUpdate = useCallback((input: string) => { + setsearchValue(input) + }, []) + return (
{ >
-
{ alt="Hub Banner" className="w-full object-cover" /> -
+
-
- - setsearchValue(e.target.value)} - /> -
+
- {/* {experimentalFeature && ( -
-

- Convert from Hugging Face -

-
- )} */}
- {/* Temporary hide tabs */} - {/*
-
setTabActive('Model')} - > - - Model -
- - -
setTabActive('Assistant')} - > - - Assistant -
-
- - Coming Soon - - -
-
*/} - +
+ {loading && ( + + )} +
+ ) +} + +export default ModelSearch diff --git a/web/screens/Settings/Models/Row.tsx b/web/screens/Settings/Models/Row.tsx index 1d9283efa..5da7f9177 100644 --- a/web/screens/Settings/Models/Row.tsx +++ b/web/screens/Settings/Models/Row.tsx @@ -63,19 +63,23 @@ export default function RowModel(props: RowModelProps) { return ( - {props.data.name} - {props.data.id} - + +

{props.data.name}

+ + +

{props.data.id}

+ + {props.data.metadata.size ? toGibibytes(props.data.metadata.size) : '-'} - + v{props.data.version} - + {isRemoteModel ? ( )} - + {!isRemoteModel && (
{
   const downloadedModels = useAtomValue(downloadedModelsAtom)
   const setImportModelStage = useSetAtom(setImportModelStageAtom)
-  const [searchValue, setsearchValue] = useState('')
   const { onDropModels } = useDropModelBinaries()
+  const [searchText, setSearchText] = useState('')
 
-  const filteredDownloadedModels = downloadedModels
-    .filter((x) => x.name?.toLowerCase().includes(searchValue.toLowerCase()))
-    .sort((a, b) => a.name.localeCompare(b.name))
+  const filteredDownloadedModels = useMemo(
+    () =>
+      downloadedModels
+        .filter((e) =>
+          e.name.toLowerCase().includes(searchText.toLowerCase().trim())
+        )
+        .sort((a, b) => a.name.localeCompare(b.name)),
+    [downloadedModels, searchText]
+  )
 
   const { getRootProps, isDragActive } = useDropzone({
     noClick: true,
@@ -38,6 +45,10 @@ {
     setImportModelStage('SELECTING_MODEL')
   }, [setImportModelStage])
 
+  const onSearchChange = useCallback((input: string) => {
+    setSearchText(input)
+  }, [])
+
   return (
       {isDragActive && (
-
-
-            { setsearchValue(e.target.value) }} />
-
+
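Both the Hub screen and this Settings list now delegate their search box to the shared `ModelSearch` component, feeding it a `(input: string) => void` callback (`onSearchUpdate` and `onSearchChange` above). The component's actual prop name is not visible in this diff, so the `onSearchLocal` prop in the sketch below is a placeholder, as is the surrounding component:

```tsx
// Sketch of the new wiring; ModelSearch's real prop name is not visible in
// this diff, so onSearchLocal is a placeholder.
import { useCallback, useMemo, useState } from 'react'

import ModelSearch from '@/screens/Settings/Models/ModelSearch'

const SearchableModelList = ({ names }: { names: string[] }) => {
  const [searchText, setSearchText] = useState('')

  const onSearchChange = useCallback((input: string) => {
    setSearchText(input)
  }, [])

  // Same normalization the Settings screen uses: lowercase + trim before matching.
  const filtered = useMemo(
    () =>
      names.filter((n) =>
        n.toLowerCase().includes(searchText.toLowerCase().trim())
      ),
    [names, searchText]
  )

  return (
    <>
      <ModelSearch onSearchLocal={onSearchChange} />
      <ul>
        {filtered.map((n) => (
          <li key={n}>{n}</li>
        ))}
      </ul>
    </>
  )
}
```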