diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 000000000..982e9f412
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,37 @@
+---
+name: "🖋️ Report"
+about: Create a report to help us improve Jan
+title: 'bug: [DESCRIPTION]'
+labels: 'type: bug'
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**Steps to reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your issue.
+
+**Environment details**
+- Operating System: [Specify your OS. e.g., MacOS Sonoma 14.2.1, Windows 11, Ubuntu 22, etc]
+- Jan Version: [e.g., 0.4.xxx nightly or manual]
+- Processor: [e.g., Apple M1, Intel Core i7, AMD Ryzen 5, etc]
+- RAM: [e.g., 8GB, 16GB]
+- Any additional relevant hardware specifics: [e.g., Graphics card, SSD/HDD]
+
+**Logs**
+If the cause of the error is not clear, kindly provide your usage logs: https://jan.ai/docs/troubleshooting#how-to-get-error-logs
+
+**Additional context**
+Add any other context or information that could be helpful in diagnosing the problem.
diff --git a/.github/workflows/jan-electron-build-nightly.yml b/.github/workflows/jan-electron-build-nightly.yml
index 26bbcc672..d666bdc56 100644
--- a/.github/workflows/jan-electron-build-nightly.yml
+++ b/.github/workflows/jan-electron-build-nightly.yml
@@ -10,7 +10,7 @@ on:
         description: 'Public Provider'
         options:
           - none
-          - cloudflare-r2
+          - aws-s3
         default: none
 
 jobs:
@@ -28,10 +28,10 @@ jobs:
            echo "::set-output name=ref::${{ github.ref }}"
          else
            if [ "${{ github.event_name }}" == "schedule" ]; then
-             echo "::set-output name=public_provider::cloudflare-r2"
+             echo "::set-output name=public_provider::aws-s3"
              echo "::set-output name=ref::refs/heads/dev"
            elif [ "${{ github.event_name }}" == "push" ]; then
-             echo "::set-output name=public_provider::cloudflare-r2"
+             echo "::set-output name=public_provider::aws-s3"
              echo "::set-output name=ref::${{ github.ref }}"
            else
              echo "::set-output name=public_provider::none"
@@ -112,13 +112,13 @@ jobs:
          cat ./latest-mac.yml
 
      - name: Upload latest-mac.yml
-        if: ${{ needs.set-public-provider.outputs.public_provider == 'cloudflare-r2' }}
+        if: ${{ needs.set-public-provider.outputs.public_provider == 'aws-s3' }}
        run: |
-          aws s3api put-object --endpoint-url https://${{ secrets.CLOUDFLARE_ACCOUNT_ID }}.r2.cloudflarestorage.com --bucket ${{ secrets.CLOUDFLARE_R2_BUCKET_NAME }} --key "latest/latest-mac.yml" --body "./latest-mac.yml"
+          aws s3 cp ./latest-mac.yml "s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/latest/latest-mac.yml"
        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_R2_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }}
-          AWS_DEFAULT_REGION: auto
+          AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: ${{ secrets.DELTA_AWS_REGION }}
          AWS_EC2_METADATA_DISABLED: "true"
 
 
@@ -147,7 +147,7 @@ jobs:
  noti-discord-manual-and-update-url-readme:
    needs: [build-macos-x64, build-macos-arm64, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, combine-latest-mac-yml]
    secrets: inherit
-    if: github.event_name == 'workflow_dispatch' && github.event.inputs.public_provider == 'cloudflare-r2'
+    if: github.event_name
== 'workflow_dispatch' && github.event.inputs.public_provider == 'aws-s3' uses: ./.github/workflows/template-noti-discord-and-update-url-readme.yml with: ref: refs/heads/dev diff --git a/.github/workflows/jan-server-build-nightly.yml b/.github/workflows/jan-server-build-nightly.yml deleted file mode 100644 index 29e13804e..000000000 --- a/.github/workflows/jan-server-build-nightly.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Docker Builder - Nightly / Manual - -on: - push: - branches: - - main - - feature/helmchart-and-ci-jan-server - paths-ignore: - - 'README.md' - - 'docs/**' - schedule: - - cron: '0 21 * * 1,2,3' # At 8 PM UTC on Monday, Tuesday, and Wednesday which is 4 AM UTC+7 Tuesday, Wednesday, and Thursday - workflow_dispatch: - -jobs: - # Job create Update app version based on latest release tag with build number and save to output - get-update-version: - uses: ./.github/workflows/template-get-update-version.yml - - build-cpu: - uses: ./.github/workflows/template-build-jan-server.yml - permissions: - packages: write - secrets: inherit - needs: [get-update-version] - with: - dockerfile_path: ./Dockerfile - docker_image_tag: "ghcr.io/janhq/jan-server:dev-cpu-latest,ghcr.io/janhq/jan-server:dev-cpu-${{ needs.get-update-version.outputs.new_version }}" - - build-gpu: - uses: ./.github/workflows/template-build-jan-server.yml - permissions: - packages: write - secrets: inherit - needs: [get-update-version] - with: - dockerfile_path: ./Dockerfile.gpu - docker_image_tag: "ghcr.io/janhq/jan-server:dev-cuda-12.2-latest,ghcr.io/janhq/jan-server:dev-cuda-12.2-${{ needs.get-update-version.outputs.new_version }}" - - diff --git a/.github/workflows/jan-server-build.yml b/.github/workflows/jan-server-build.yml deleted file mode 100644 index 503efd298..000000000 --- a/.github/workflows/jan-server-build.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Docker Builder - Tag - -on: - push: - tags: ["v[0-9]+.[0-9]+.[0-9]+"] - -jobs: - # Job create Update app version based on latest release tag with build number and save to output - get-update-version: - uses: ./.github/workflows/template-get-update-version.yml - - build-cpu: - permissions: - packages: write - uses: ./.github/workflows/template-build-jan-server.yml - secrets: inherit - needs: [get-update-version] - with: - dockerfile_path: ./Dockerfile - docker_image_tag: "ghcr.io/janhq/jan-server:cpu-latest,ghcr.io/janhq/jan-server:cpu-${{ needs.get-update-version.outputs.new_version }}" - - build-gpu: - permissions: - packages: write - uses: ./.github/workflows/template-build-jan-server.yml - secrets: inherit - needs: [get-update-version] - with: - dockerfile_path: ./Dockerfile.gpu - docker_image_tag: "ghcr.io/janhq/jan-server:cuda-12.2-latest,ghcr.io/janhq/jan-server:cuda-12.2-${{ needs.get-update-version.outputs.new_version }}" diff --git a/.github/workflows/template-build-linux-x64.yml b/.github/workflows/template-build-linux-x64.yml index 08cb1dada..e64b23f40 100644 --- a/.github/workflows/template-build-linux-x64.yml +++ b/.github/workflows/template-build-linux-x64.yml @@ -10,23 +10,21 @@ on: required: true type: string default: none - description: 'none: build only, github: build and publish to github, cloudflare: build and publish to cloudflare' + description: 'none: build only, github: build and publish to github, aws s3: build and publish to aws s3' new_version: required: true type: string default: '' - cloudflare_r2_path: + aws_s3_prefix: required: false type: string default: '/latest/' secrets: - CLOUDFLARE_R2_BUCKET_NAME: + DELTA_AWS_S3_BUCKET_NAME: 
required: false - CLOUDFLARE_R2_ACCESS_KEY_ID: + DELTA_AWS_ACCESS_KEY_ID: required: false - CLOUDFLARE_R2_SECRET_ACCESS_KEY: - required: false - CLOUDFLARE_ACCOUNT_ID: + DELTA_AWS_SECRET_ACCESS_KEY: required: false jobs: @@ -58,7 +56,7 @@ jobs: mv /tmp/package.json electron/package.json jq --arg version "${{ inputs.new_version }}" '.version = $version' web/package.json > /tmp/package.json mv /tmp/package.json web/package.json - jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "bucket": "${{ secrets.CLOUDFLARE_R2_BUCKET_NAME }}", "region": "auto", "endpoint": "https://${{ secrets.CLOUDFLARE_ACCOUNT_ID }}.r2.cloudflarestorage.com", "path": "${{ inputs.cloudflare_r2_path }}", "channel": "latest"}]' electron/package.json > /tmp/package.json + jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "acl": null, "bucket": "${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}", "region": "${{ secrets.DELTA_AWS_REGION}}", "path": "${{ inputs.aws_s3_prefix }}", "channel": "latest"}]' electron/package.json > /tmp/package.json mv /tmp/package.json electron/package.json cat electron/package.json @@ -76,7 +74,7 @@ jobs: env: VERSION_TAG: ${{ inputs.new_version }} - - name: Build and publish app to cloudflare r2 or github artifactory + - name: Build and publish app to aws s3 r2 or github artifactory if: inputs.public_provider != 'github' run: | # check public_provider is true or not @@ -88,9 +86,10 @@ jobs: fi env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_R2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} AWS_EC2_METADATA_DISABLED: "true" + AWS_MAX_ATTEMPTS: "5" - name: Build and publish app to github if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') && inputs.public_provider == 'github' diff --git a/.github/workflows/template-build-macos-arm64.yml b/.github/workflows/template-build-macos-arm64.yml index a5bc1e539..753ad6793 100644 --- a/.github/workflows/template-build-macos-arm64.yml +++ b/.github/workflows/template-build-macos-arm64.yml @@ -10,23 +10,21 @@ on: required: true type: string default: none - description: 'none: build only, github: build and publish to github, cloudflare: build and publish to cloudflare' + description: 'none: build only, github: build and publish to github, aws s3: build and publish to aws s3' new_version: required: true type: string default: '' - cloudflare_r2_path: + aws_s3_prefix: required: false type: string default: '/latest/' secrets: - CLOUDFLARE_R2_BUCKET_NAME: + DELTA_AWS_S3_BUCKET_NAME: required: false - CLOUDFLARE_R2_ACCESS_KEY_ID: + DELTA_AWS_ACCESS_KEY_ID: required: false - CLOUDFLARE_R2_SECRET_ACCESS_KEY: - required: false - CLOUDFLARE_ACCOUNT_ID: + DELTA_AWS_SECRET_ACCESS_KEY: required: false CODE_SIGN_P12_BASE64: required: false @@ -70,7 +68,7 @@ jobs: jq --arg version "${{ inputs.new_version }}" '.version = $version' web/package.json > /tmp/package.json mv /tmp/package.json web/package.json - jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "bucket": "${{ secrets.CLOUDFLARE_R2_BUCKET_NAME }}", "region": "auto", "endpoint": "https://${{ secrets.CLOUDFLARE_ACCOUNT_ID 
}}.r2.cloudflarestorage.com", "path": "${{ inputs.cloudflare_r2_path }}", "channel": "latest"}]' electron/package.json > /tmp/package.json + jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "acl": null, "bucket": "${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}", "region": "${{ secrets.DELTA_AWS_REGION}}", "path": "${{ inputs.aws_s3_prefix }}", "channel": "latest"}]' electron/package.json > /tmp/package.json mv /tmp/package.json electron/package.json jq --arg teamid "${{ secrets.APPLE_TEAM_ID }}" '.build.mac.notarize.teamId = $teamid' electron/package.json > /tmp/package.json @@ -107,7 +105,7 @@ jobs: p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }} p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }} - - name: Build and publish app to cloudflare r2 or github artifactory + - name: Build and publish app to aws s3 r2 or github artifactory if: inputs.public_provider != 'github' run: | # check public_provider is true or not @@ -126,10 +124,11 @@ jobs: APPLE_APP_SPECIFIC_PASSWORD: ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }} APP_PATH: "." DEVELOPER_ID: ${{ secrets.DEVELOPER_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_R2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: auto AWS_EC2_METADATA_DISABLED: "true" + AWS_MAX_ATTEMPTS: "5" - name: Build and publish app to github if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') && inputs.public_provider == 'github' diff --git a/.github/workflows/template-build-macos-x64.yml b/.github/workflows/template-build-macos-x64.yml index d9543194d..769e0f808 100644 --- a/.github/workflows/template-build-macos-x64.yml +++ b/.github/workflows/template-build-macos-x64.yml @@ -10,23 +10,21 @@ on: required: true type: string default: none - description: 'none: build only, github: build and publish to github, cloudflare: build and publish to cloudflare' + description: 'none: build only, github: build and publish to github, aws s3: build and publish to aws s3' new_version: required: true type: string default: '' - cloudflare_r2_path: + aws_s3_prefix: required: false type: string default: '/latest/' secrets: - CLOUDFLARE_R2_BUCKET_NAME: + DELTA_AWS_S3_BUCKET_NAME: required: false - CLOUDFLARE_R2_ACCESS_KEY_ID: + DELTA_AWS_ACCESS_KEY_ID: required: false - CLOUDFLARE_R2_SECRET_ACCESS_KEY: - required: false - CLOUDFLARE_ACCOUNT_ID: + DELTA_AWS_SECRET_ACCESS_KEY: required: false CODE_SIGN_P12_BASE64: required: false @@ -70,7 +68,7 @@ jobs: jq --arg version "${{ inputs.new_version }}" '.version = $version' web/package.json > /tmp/package.json mv /tmp/package.json web/package.json - jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "bucket": "${{ secrets.CLOUDFLARE_R2_BUCKET_NAME }}", "region": "auto", "endpoint": "https://${{ secrets.CLOUDFLARE_ACCOUNT_ID }}.r2.cloudflarestorage.com", "path": "${{ inputs.cloudflare_r2_path }}", "channel": "latest"}]' electron/package.json > /tmp/package.json + jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "acl": null, "bucket": "${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}", "region": "${{ secrets.DELTA_AWS_REGION}}", "path": "${{ inputs.aws_s3_prefix }}", "channel": "latest"}]' 
electron/package.json > /tmp/package.json mv /tmp/package.json electron/package.json jq --arg teamid "${{ secrets.APPLE_TEAM_ID }}" '.build.mac.notarize.teamId = $teamid' electron/package.json > /tmp/package.json @@ -107,7 +105,7 @@ jobs: p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }} p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }} - - name: Build and publish app to cloudflare r2 or github artifactory + - name: Build and publish app to aws s3 r2 or github artifactory if: inputs.public_provider != 'github' run: | # check public_provider is true or not @@ -126,10 +124,11 @@ jobs: APPLE_APP_SPECIFIC_PASSWORD: ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }} APP_PATH: "." DEVELOPER_ID: ${{ secrets.DEVELOPER_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_R2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: auto AWS_EC2_METADATA_DISABLED: "true" + AWS_MAX_ATTEMPTS: "5" - name: Build and publish app to github if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') && inputs.public_provider == 'github' diff --git a/.github/workflows/template-build-windows-x64.yml b/.github/workflows/template-build-windows-x64.yml index b81997bde..491a8f031 100644 --- a/.github/workflows/template-build-windows-x64.yml +++ b/.github/workflows/template-build-windows-x64.yml @@ -10,23 +10,21 @@ on: required: true type: string default: none - description: 'none: build only, github: build and publish to github, cloudflare: build and publish to cloudflare' + description: 'none: build only, github: build and publish to github, aws s3: build and publish to aws s3' new_version: required: true type: string default: '' - cloudflare_r2_path: + aws_s3_prefix: required: false type: string default: '/latest/' secrets: - CLOUDFLARE_R2_BUCKET_NAME: + DELTA_AWS_S3_BUCKET_NAME: required: false - CLOUDFLARE_R2_ACCESS_KEY_ID: + DELTA_AWS_ACCESS_KEY_ID: required: false - CLOUDFLARE_R2_SECRET_ACCESS_KEY: - required: false - CLOUDFLARE_ACCOUNT_ID: + DELTA_AWS_SECRET_ACCESS_KEY: required: false AZURE_KEY_VAULT_URI: required: false @@ -71,7 +69,7 @@ jobs: jq --arg version "${{ inputs.new_version }}" '.version = $version' web/package.json > /tmp/package.json mv /tmp/package.json web/package.json - jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "bucket": "${{ secrets.CLOUDFLARE_R2_BUCKET_NAME }}", "region": "auto", "endpoint": "https://${{ secrets.CLOUDFLARE_ACCOUNT_ID }}.r2.cloudflarestorage.com", "path": "${{ inputs.cloudflare_r2_path }}", "channel": "latest"}]' electron/package.json > /tmp/package.json + jq '.build.publish = [{"provider": "generic", "url": "${{ secrets.CLOUDFLARE_R2_PUBLIC_URL }}", "channel": "latest"}, {"provider": "s3", "acl": null, "bucket": "${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}", "region": "${{ secrets.DELTA_AWS_REGION}}", "path": "${{ inputs.aws_s3_prefix }}", "channel": "latest"}]' electron/package.json > /tmp/package.json mv /tmp/package.json electron/package.json jq '.build.win.sign = "./sign.js"' electron/package.json > /tmp/package.json @@ -99,7 +97,7 @@ jobs: run: | dotnet tool install --global AzureSignTool - - name: Build and publish app to cloudflare r2 or github artifactory + - name: Build and publish app to aws s3 r2 or github artifactory shell: bash if: inputs.public_provider != 'github' run: | @@ -116,10 
+114,11 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} AZURE_CERT_NAME: ${{ secrets.AZURE_CERT_NAME }} - AWS_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_R2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: auto AWS_EC2_METADATA_DISABLED: "true" + AWS_MAX_ATTEMPTS: "5" - name: Build app and publish app to github if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') && inputs.public_provider == 'github' diff --git a/.gitignore b/.gitignore index 0b6f98465..d3b50445e 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,5 @@ extensions/*-extension/bin/vulkaninfo # Turborepo .turbo +electron/test-data +electron/test-results diff --git a/.husky/pre-commit b/.husky/pre-commit index 0da96d6ba..a4aa5add4 100644 --- a/.husky/pre-commit +++ b/.husky/pre-commit @@ -1,4 +1 @@ -#!/usr/bin/env sh -. "$(dirname -- "$0")/_/husky.sh" - -npx pretty-quick --staged +npm run lint --fix \ No newline at end of file diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 7fbbda2cf..000000000 --- a/Dockerfile +++ /dev/null @@ -1,60 +0,0 @@ -FROM node:20-bookworm AS base - -# 1. Install dependencies only when needed -FROM base AS builder - -# Install g++ 11 -RUN apt update && apt install -y gcc-11 g++-11 cpp-11 jq xsel && rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -# Install dependencies based on the preferred package manager -COPY . ./ - -RUN export NITRO_VERSION=$(cat extensions/inference-nitro-extension/bin/version.txt) && \ - jq --arg nitroVersion $NITRO_VERSION '(.scripts."downloadnitro:linux" | gsub("\\${NITRO_VERSION}"; $nitroVersion)) | gsub("\r"; "")' extensions/inference-nitro-extension/package.json > /tmp/newcommand.txt && export NEW_COMMAND=$(sed 's/^"//;s/"$//' /tmp/newcommand.txt) && jq --arg newCommand "$NEW_COMMAND" '.scripts."downloadnitro:linux" = $newCommand' extensions/inference-nitro-extension/package.json > /tmp/package.json && mv /tmp/package.json extensions/inference-nitro-extension/package.json -RUN make install-and-build - -# # 2. 
Rebuild the source code only when needed -FROM base AS runner - -# Install g++ 11 -RUN apt update && apt install -y gcc-11 g++-11 cpp-11 jq xsel && rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -# Copy the package.json and yarn.lock of root yarn space to leverage Docker cache -COPY --from=builder /app/package.json ./package.json -COPY --from=builder /app/node_modules ./node_modules/ -COPY --from=builder /app/yarn.lock ./yarn.lock - -# Copy the package.json, yarn.lock, and build output of server yarn space to leverage Docker cache -COPY --from=builder /app/core ./core/ -COPY --from=builder /app/server ./server/ -RUN cd core && yarn install && yarn run build -RUN yarn workspace @janhq/server install && yarn workspace @janhq/server build -COPY --from=builder /app/docs/openapi ./docs/openapi/ - -# Copy pre-install dependencies -COPY --from=builder /app/pre-install ./pre-install/ - -# Copy the package.json, yarn.lock, and output of web yarn space to leverage Docker cache -COPY --from=builder /app/joi ./joi/ -COPY --from=builder /app/web ./web/ - -RUN yarn workspace @janhq/joi install && yarn workspace @janhq/joi build -RUN yarn workspace @janhq/web install - -RUN npm install -g serve@latest - -EXPOSE 1337 3000 3928 - -ENV JAN_API_HOST 0.0.0.0 -ENV JAN_API_PORT 1337 - -ENV API_BASE_URL http://localhost:1337 - -CMD ["sh", "-c", "export NODE_ENV=production && yarn workspace @janhq/web build && cd web && npx serve out & cd server && node build/main.js"] - -# docker build -t jan . -# docker run -p 1337:1337 -p 3000:3000 -p 3928:3928 jan diff --git a/Dockerfile.gpu b/Dockerfile.gpu deleted file mode 100644 index 195a28d42..000000000 --- a/Dockerfile.gpu +++ /dev/null @@ -1,87 +0,0 @@ -# Please change the base image to the appropriate CUDA version base on NVIDIA Driver Compatibility -# Run nvidia-smi to check the CUDA version and the corresponding driver version -# Then update the base image to the appropriate CUDA version refer https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags - -FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base - -# 1. Install dependencies only when needed -FROM base AS builder - -# Install g++ 11 -RUN apt update && apt install -y gcc-11 g++-11 cpp-11 jq xsel curl gnupg make python3-dev && curl -sL https://deb.nodesource.com/setup_20.x | bash - && apt install nodejs -y && rm -rf /var/lib/apt/lists/* - -# Update alternatives for GCC and related tools -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 110 \ - --slave /usr/bin/g++ g++ /usr/bin/g++-11 \ - --slave /usr/bin/gcov gcov /usr/bin/gcov-11 \ - --slave /usr/bin/gcc-ar gcc-ar /usr/bin/gcc-ar-11 \ - --slave /usr/bin/gcc-ranlib gcc-ranlib /usr/bin/gcc-ranlib-11 && \ - update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-11 110 - -RUN npm install -g yarn - -WORKDIR /app - -# Install dependencies based on the preferred package manager -COPY . ./ - -RUN export NITRO_VERSION=$(cat extensions/inference-nitro-extension/bin/version.txt) && \ - jq --arg nitroVersion $NITRO_VERSION '(.scripts."downloadnitro:linux" | gsub("\\${NITRO_VERSION}"; $nitroVersion)) | gsub("\r"; "")' extensions/inference-nitro-extension/package.json > /tmp/newcommand.txt && export NEW_COMMAND=$(sed 's/^"//;s/"$//' /tmp/newcommand.txt) && jq --arg newCommand "$NEW_COMMAND" '.scripts."downloadnitro:linux" = $newCommand' extensions/inference-nitro-extension/package.json > /tmp/package.json && mv /tmp/package.json extensions/inference-nitro-extension/package.json -RUN make install-and-build - -# # 2. 
Rebuild the source code only when needed -FROM base AS runner - -# Install g++ 11 -RUN apt update && apt install -y gcc-11 g++-11 cpp-11 jq xsel curl gnupg make python3-dev && curl -sL https://deb.nodesource.com/setup_20.x | bash - && apt-get install nodejs -y && rm -rf /var/lib/apt/lists/* - -# Update alternatives for GCC and related tools -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 110 \ - --slave /usr/bin/g++ g++ /usr/bin/g++-11 \ - --slave /usr/bin/gcov gcov /usr/bin/gcov-11 \ - --slave /usr/bin/gcc-ar gcc-ar /usr/bin/gcc-ar-11 \ - --slave /usr/bin/gcc-ranlib gcc-ranlib /usr/bin/gcc-ranlib-11 && \ - update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-11 110 - -RUN npm install -g yarn - -WORKDIR /app - -# Copy the package.json and yarn.lock of root yarn space to leverage Docker cache -COPY --from=builder /app/package.json ./package.json -COPY --from=builder /app/node_modules ./node_modules/ -COPY --from=builder /app/yarn.lock ./yarn.lock - -# Copy the package.json, yarn.lock, and build output of server yarn space to leverage Docker cache -COPY --from=builder /app/core ./core/ -COPY --from=builder /app/server ./server/ -RUN cd core && yarn install && yarn run build -RUN yarn workspace @janhq/server install && yarn workspace @janhq/server build -COPY --from=builder /app/docs/openapi ./docs/openapi/ - -# Copy pre-install dependencies -COPY --from=builder /app/pre-install ./pre-install/ - -# Copy the package.json, yarn.lock, and output of web yarn space to leverage Docker cache -COPY --from=builder /app/joi ./joi/ -COPY --from=builder /app/web ./web/ - -RUN yarn workspace @janhq/joi install && yarn workspace @janhq/joi build -RUN yarn workspace @janhq/web install - -RUN npm install -g serve@latest - -EXPOSE 1337 3000 3928 - -ENV LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/cuda-12.0/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} - -ENV JAN_API_HOST 0.0.0.0 -ENV JAN_API_PORT 1337 - -ENV API_BASE_URL http://localhost:1337 - -CMD ["sh", "-c", "export NODE_ENV=production && yarn workspace @janhq/web build && cd web && npx serve out & cd server && node build/main.js"] - -# pre-requisites: nvidia-docker -# docker build -t jan-gpu . 
-f Dockerfile.gpu -# docker run -p 1337:1337 -p 3000:3000 -p 3928:3928 --gpus all jan-gpu diff --git a/charts/server/Chart.lock b/charts/server/Chart.lock deleted file mode 100644 index 915788d61..000000000 --- a/charts/server/Chart.lock +++ /dev/null @@ -1,6 +0,0 @@ -dependencies: -- name: common - repository: oci://ghcr.io/janhq/charts - version: 0.1.2 -digest: sha256:35e98bde174130787755b0f8ea2359b7b6790d965a7157c2f7cabf1bc8c04471 -generated: "2024-02-20T16:20:37.6530108+07:00" diff --git a/charts/server/Chart.yaml b/charts/server/Chart.yaml deleted file mode 100644 index fb2e1c91b..000000000 --- a/charts/server/Chart.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v2 -name: jan-server -description: A Helm chart for Kubernetes -type: application -version: 0.1.0 -appVersion: '1.0.0' -dependencies: - - name: common - version: 0.1.2 # common-chart-version - repository: oci://ghcr.io/janhq/charts diff --git a/charts/server/charts/common-0.1.2.tgz b/charts/server/charts/common-0.1.2.tgz deleted file mode 100644 index 946617eab..000000000 Binary files a/charts/server/charts/common-0.1.2.tgz and /dev/null differ diff --git a/charts/server/config.json b/charts/server/config.json deleted file mode 100644 index 62e9682fa..000000000 --- a/charts/server/config.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "image-list": "server=ghcr.io/janhq/jan-server", - "platforms": "linux/amd64" -} \ No newline at end of file diff --git a/charts/server/values.yaml b/charts/server/values.yaml deleted file mode 100644 index b31f47656..000000000 --- a/charts/server/values.yaml +++ /dev/null @@ -1,256 +0,0 @@ -common: - imageTag: v0.4.6-cpu - # DO NOT CHANGE THE LINE ABOVE. MAKE ALL CHANGES BELOW - - # Global pvc for all workload - pvc: - enabled: false - name: 'janroot' - accessModes: 'ReadWriteOnce' - storageClassName: '' - capacity: '50Gi' - - # Global image pull secret - imagePullSecrets: [] - - externalSecret: - create: false - name: '' - annotations: {} - - nameOverride: 'jan-server' - fullnameOverride: 'jan-server' - - serviceAccount: - create: true - annotations: {} - name: 'jan-server-service-account' - - podDisruptionBudget: - create: false - minAvailable: 1 - - workloads: - - name: server - image: - repository: ghcr.io/janhq/jan-server - pullPolicy: Always - - command: ['/bin/sh', '-c'] - args: ['cd server && node build/main.js'] - - replicaCount: 1 - ports: - containerPort: 1337 - - strategy: - canary: - steps: - - setWeight: 50 - - pause: { duration: 1m } - - ingress: - enabled: true - className: 'nginx' - annotations: - nginx.ingress.kubernetes.io/proxy-body-size: '100m' - nginx.ingress.kubernetes.io/proxy-read-timeout: '1800' - nginx.ingress.kubernetes.io/proxy-send-timeout: '1800' - # cert-manager.io/cluster-issuer: 'jan-ai-dns01-cluster-issuer' - # nginx.ingress.kubernetes.io/force-ssl-redirect: 'true' - nginx.ingress.kubernetes.io/backend-protocol: HTTP - hosts: - - host: server.local - paths: - - path: / - pathType: Prefix - tls: - [] - # - hosts: - # - server-dev.jan.ai - # secretName: jan-server-prod-tls-v2 - - instrumentation: - enabled: false - podAnnotations: {} - - podSecurityContext: {} - - securityContext: {} - - service: - externalLabel: {} - type: ClusterIP - port: 1337 - targetPort: 1337 - - # If you want to use GPU, please uncomment the following lines and change imageTag to the one with GPU support - resources: - # limits: - # nvidia.com/gpu: 1 - requests: - cpu: 2000m - memory: 8192M - - # If you want to use pv, please uncomment the following lines and enable pvc.enabled - volumes: - [] - 
# - name: janroot - # persistentVolumeClaim: - # claimName: janroot - - volumeMounts: - [] - # - name: janroot - # mountPath: /app/server/build/jan - - # AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, S3_BUCKET_NAME, AWS_ENDPOINT, AWS_REGION should mount as a secret env instead of plain text here - # Change API_BASE_URL to your server's public domain - env: - - name: API_BASE_URL - value: 'http://server.local' - - lifecycle: {} - autoscaling: - enabled: false - minReplicas: 2 - maxReplicas: 3 - targetCPUUtilizationPercentage: 95 - targetMemoryUtilizationPercentage: 95 - - kedaScaling: - enabled: false # ignore if autoscaling.enable = true - cooldownPeriod: 30 - pollingInterval: 2 - minReplicas: 1 - maxReplicas: 5 - metricName: celery_queue_length - query: celery_queue_length{queue_name="myqueue"} # change queue_name here - serverAddress: http://prometheus-prod-kube-prome-prometheus.monitoring.svc:9090 - threshold: '3' - - nodeSelector: {} - - tolerations: [] - - podSecurityGroup: - enabled: false - securitygroupid: [] - - # Reloader Option - reloader: 'false' - vpa: - enabled: false - - - name: web - image: - repository: ghcr.io/janhq/jan-server - pullPolicy: Always - - command: ['/bin/sh', '-c'] - args: - [ - 'export NODE_ENV=production && yarn workspace @janhq/web build && cd web && npx serve out', - ] - - replicaCount: 1 - ports: - containerPort: 3000 - - strategy: - canary: - steps: - - setWeight: 50 - - pause: { duration: 1m } - - ingress: - enabled: true - className: 'nginx' - annotations: - nginx.ingress.kubernetes.io/proxy-body-size: '100m' - nginx.ingress.kubernetes.io/proxy-read-timeout: '1800' - nginx.ingress.kubernetes.io/proxy-send-timeout: '1800' - # cert-manager.io/cluster-issuer: 'jan-ai-dns01-cluster-issuer' - # nginx.ingress.kubernetes.io/force-ssl-redirect: 'true' - nginx.ingress.kubernetes.io/backend-protocol: HTTP - hosts: - - host: web.local - paths: - - path: / - pathType: Prefix - tls: - [] - # - hosts: - # - server-dev.jan.ai - # secretName: jan-server-prod-tls-v2 - - instrumentation: - enabled: false - podAnnotations: {} - - podSecurityContext: {} - - securityContext: {} - - service: - externalLabel: {} - type: ClusterIP - port: 3000 - targetPort: 3000 - - resources: - limits: - cpu: 1000m - memory: 2048M - requests: - cpu: 50m - memory: 500M - - volumes: - [] - # - name: janroot - # persistentVolumeClaim: - # claimName: janroot - - volumeMounts: - [] - # - name: janroot - # mountPath: /app/server/build/jan - - # AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, S3_BUCKET_NAME, AWS_ENDPOINT, AWS_REGION should mount as a secret env instead of plain text here - # Change API_BASE_URL to your server's public domain - env: - - name: API_BASE_URL - value: 'http://server.local' - - lifecycle: {} - autoscaling: - enabled: true - minReplicas: 1 - maxReplicas: 3 - targetCPUUtilizationPercentage: 95 - targetMemoryUtilizationPercentage: 95 - - kedaScaling: - enabled: false # ignore if autoscaling.enable = true - cooldownPeriod: 30 - pollingInterval: 2 - minReplicas: 1 - maxReplicas: 5 - metricName: celery_queue_length - query: celery_queue_length{queue_name="myqueue"} # change queue_name here - serverAddress: http://prometheus-prod-kube-prome-prometheus.monitoring.svc:9090 - threshold: '3' - - nodeSelector: {} - - tolerations: [] - - podSecurityGroup: - enabled: false - securitygroupid: [] - - # Reloader Option - reloader: 'false' - vpa: - enabled: false diff --git a/core/src/browser/extension.ts b/core/src/browser/extension.ts index 18a6e4491..603445745 100644 --- 
a/core/src/browser/extension.ts +++ b/core/src/browser/extension.ts @@ -118,10 +118,21 @@ export abstract class BaseExtension implements ExtensionType { setting.extensionName = this.name }) try { - await fs.mkdir(extensionSettingFolderPath) + if (!(await fs.existsSync(extensionSettingFolderPath))) + await fs.mkdir(extensionSettingFolderPath) const settingFilePath = await joinPath([extensionSettingFolderPath, this.settingFileName]) - if (await fs.existsSync(settingFilePath)) return + // Persists new settings + if (await fs.existsSync(settingFilePath)) { + const oldSettings = JSON.parse(await fs.readFileSync(settingFilePath, 'utf-8')) + settings.forEach((setting) => { + // Keep setting value + if (setting.controllerProps && Array.isArray(oldSettings)) + setting.controllerProps.value = oldSettings.find( + (e: any) => e.key === setting.key + )?.controllerProps?.value + }) + } await fs.writeFileSync(settingFilePath, JSON.stringify(settings, null, 2)) } catch (err) { console.error(err) @@ -168,6 +179,7 @@ export abstract class BaseExtension implements ExtensionType { ]) try { + if (!(await fs.existsSync(settingPath))) return [] const content = await fs.readFileSync(settingPath, 'utf-8') const settings: SettingComponentProps[] = JSON.parse(content) return settings diff --git a/core/src/browser/extensions/engines/OAIEngine.ts b/core/src/browser/extensions/engines/OAIEngine.ts index 01ef55e5e..a8dde4677 100644 --- a/core/src/browser/extensions/engines/OAIEngine.ts +++ b/core/src/browser/extensions/engines/OAIEngine.ts @@ -89,6 +89,7 @@ export abstract class OAIEngine extends AIEngine { model: model.id, stream: true, ...model.parameters, + ...(this.provider === 'nitro' ? { engine: 'cortex.llamacpp'} : {}), } if (this.transformPayload) { requestBody = this.transformPayload(requestBody) diff --git a/core/src/browser/fs.ts b/core/src/browser/fs.ts index cca9bb1d3..9240b3876 100644 --- a/core/src/browser/fs.ts +++ b/core/src/browser/fs.ts @@ -58,6 +58,15 @@ const appendFileSync = (...args: any[]) => globalThis.core.api?.appendFileSync(. const copyFile: (src: string, dest: string) => Promise = (src, dest) => globalThis.core.api?.copyFile(src, dest) +/** + * Gets the list of gguf files in a directory + * + * @param path - The paths to the file. + * @returns {Promise<{any}>} - A promise that resolves with the list of gguf and non-gguf files + */ +const getGgufFiles: (paths: string[]) => Promise = ( + paths) => globalThis.core.api?.getGgufFiles(paths) + /** * Gets the file's stats. 
* @@ -84,4 +93,5 @@ export const fs = { copyFile, fileStat, writeBlob, + getGgufFiles, } diff --git a/core/src/node/api/processors/app.ts b/core/src/node/api/processors/app.ts index c98060da4..15460ba56 100644 --- a/core/src/node/api/processors/app.ts +++ b/core/src/node/api/processors/app.ts @@ -77,8 +77,8 @@ export class App implements Processor { port: args?.port, isCorsEnabled: args?.isCorsEnabled, isVerboseEnabled: args?.isVerboseEnabled, - schemaPath: join(await appResourcePath(), 'docs', 'openapi', 'jan.yaml'), - baseDir: join(await appResourcePath(), 'docs', 'openapi'), + schemaPath: join(appResourcePath(), 'docs', 'openapi', 'jan.yaml'), + baseDir: join(appResourcePath(), 'docs', 'openapi'), prefix: args?.prefix, }) } diff --git a/core/src/node/api/processors/extension.ts b/core/src/node/api/processors/extension.ts index df5d2d945..c8637d004 100644 --- a/core/src/node/api/processors/extension.ts +++ b/core/src/node/api/processors/extension.ts @@ -42,7 +42,7 @@ export class Extension implements Processor { * @returns An array of paths to the base extensions. */ async baseExtensions() { - const baseExtensionPath = join(await appResourcePath(), 'pre-install') + const baseExtensionPath = join(appResourcePath(), 'pre-install') return readdirSync(baseExtensionPath) .filter((file) => extname(file) === '.tgz') .map((file) => join(baseExtensionPath, file)) diff --git a/core/src/node/api/processors/fsExt.ts b/core/src/node/api/processors/fsExt.ts index 155732cfc..4d113e1ee 100644 --- a/core/src/node/api/processors/fsExt.ts +++ b/core/src/node/api/processors/fsExt.ts @@ -1,7 +1,7 @@ -import { join } from 'path' -import fs from 'fs' +import { basename, join } from 'path' +import fs, { readdirSync } from 'fs' import { appResourcePath, normalizeFilePath, validatePath } from '../../helper/path' -import { getJanDataFolderPath, getJanDataFolderPath as getPath } from '../../helper' +import { defaultAppConfig, getJanDataFolderPath, getJanDataFolderPath as getPath } from '../../helper' import { Processor } from './Processor' import { FileStat } from '../../../types' @@ -28,9 +28,10 @@ export class FSExt implements Processor { return appResourcePath() } - // Handles the 'getUserHomePath' IPC event. This event is triggered to get the user home path. + // Handles the 'getUserHomePath' IPC event. This event is triggered to get the user app data path. + // CAUTION: This would not return OS home path but the app data path. getUserHomePath() { - return process.env[process.platform == 'win32' ? 
'USERPROFILE' : 'HOME'] + return defaultAppConfig().data_folder } // handle fs is directory here @@ -79,4 +80,53 @@ export class FSExt implements Processor { }) }) } + + async getGgufFiles(paths: string[]) { + const sanitizedFilePaths: { + path: string + name: string + size: number + }[] = [] + for (const filePath of paths) { + const normalizedPath = normalizeFilePath(filePath) + + const isExist = fs.existsSync(normalizedPath) + if (!isExist) continue + const fileStats = fs.statSync(normalizedPath) + if (!fileStats) continue + if (!fileStats.isDirectory()) { + const fileName = await basename(normalizedPath) + sanitizedFilePaths.push({ + path: normalizedPath, + name: fileName, + size: fileStats.size, + }) + } else { + // allowing only one level of directory + const files = await readdirSync(normalizedPath) + + for (const file of files) { + const fullPath = await join(normalizedPath, file) + const fileStats = await fs.statSync(fullPath) + if (!fileStats || fileStats.isDirectory()) continue + + sanitizedFilePaths.push({ + path: fullPath, + name: file, + size: fileStats.size, + }) + } + } + } + const unsupportedFiles = sanitizedFilePaths.filter( + (file) => !file.path.endsWith('.gguf') + ) + const supportedFiles = sanitizedFilePaths.filter((file) => + file.path.endsWith('.gguf') + ) + return { + unsupportedFiles, + supportedFiles, + } + } } diff --git a/core/src/node/api/restful/v1.ts b/core/src/node/api/restful/v1.ts index 5eb8f5067..9d57de841 100644 --- a/core/src/node/api/restful/v1.ts +++ b/core/src/node/api/restful/v1.ts @@ -1,16 +1,16 @@ import { HttpServer } from '../HttpServer' import { commonRouter } from './common' -import { downloadRouter } from './app/download' -import { handleRequests } from './app/handlers' export const v1Router = async (app: HttpServer) => { // MARK: Public API Routes app.register(commonRouter) // MARK: Internal Application Routes - handleRequests(app) + // DEPRECATED: Vulnerability possible issues + // handleRequests(app) // Expanded route for tracking download progress // TODO: Replace by Observer Wrapper (ZeroMQ / Vanilla Websocket) - app.register(downloadRouter) + // DEPRECATED: Jan FE Docker deploy is deprecated + // app.register(downloadRouter) } diff --git a/core/src/node/helper/config.ts b/core/src/node/helper/config.ts index 1a341a625..8bf48d629 100644 --- a/core/src/node/helper/config.ts +++ b/core/src/node/helper/config.ts @@ -1,25 +1,18 @@ import { AppConfiguration, SettingComponentProps } from '../../types' -import { join } from 'path' +import { join, resolve } from 'path' import fs from 'fs' import os from 'os' import childProcess from 'child_process' - const configurationFileName = 'settings.json' -// TODO: do no specify app name in framework module -// TODO: do not default the os.homedir -const defaultJanDataFolder = join(os?.homedir() || '', 'jan') -const defaultAppConfig: AppConfiguration = { - data_folder: defaultJanDataFolder, - quick_ask: false, -} - /** * Getting App Configurations. * * @returns {AppConfiguration} The app configurations. 
*/ export const getAppConfigurations = (): AppConfiguration => { + const appDefaultConfiguration = defaultAppConfig() + if (process.env.CI === 'e2e') return appDefaultConfiguration // Retrieve Application Support folder path // Fallback to user home directory if not found const configurationFile = getConfigurationFilePath() @@ -27,8 +20,8 @@ export const getAppConfigurations = (): AppConfiguration => { if (!fs.existsSync(configurationFile)) { // create default app config if we don't have one console.debug(`App config not found, creating default config at ${configurationFile}`) - fs.writeFileSync(configurationFile, JSON.stringify(defaultAppConfig)) - return defaultAppConfig + fs.writeFileSync(configurationFile, JSON.stringify(appDefaultConfiguration)) + return appDefaultConfiguration } try { @@ -38,7 +31,7 @@ export const getAppConfigurations = (): AppConfiguration => { return appConfigurations } catch (err) { console.error(`Failed to read app config, return default config instead! Err: ${err}`) - return defaultAppConfig + return defaultAppConfig() } } @@ -155,3 +148,22 @@ export const getEngineConfiguration = async (engineId: string) => { full_url: fullUrl, } } + +/** + * Default app configurations + * App Data Folder default to Electron's userData + * %APPDATA% on Windows + * $XDG_CONFIG_HOME or ~/.config on Linux + * ~/Library/Application Support on macOS + */ +export const defaultAppConfig = (): AppConfiguration => { + const { app } = require('electron') + const defaultJanDataFolder = join(app?.getPath('userData') ?? os?.homedir() ?? '', 'data') + return { + data_folder: + process.env.CI === 'e2e' + ? (process.env.APP_CONFIG_PATH ?? resolve('./test-data')) + : defaultJanDataFolder, + quick_ask: false, + } +} diff --git a/core/src/node/helper/path.ts b/core/src/node/helper/path.ts index a2d57ed3e..8115383bb 100644 --- a/core/src/node/helper/path.ts +++ b/core/src/node/helper/path.ts @@ -11,34 +11,41 @@ export function normalizeFilePath(path: string): string { return path.replace(/^(file:[\\/]+)([^:\s]+)$/, '$2') } -export async function appResourcePath(): Promise { - let electron: any = undefined - +/** + * App resources path + * Returns string - The current application directory. + */ +export function appResourcePath() { try { - const moduleName = 'electron' - electron = await import(moduleName) + const electron = require('electron') + // electron + if (electron && electron.protocol) { + let appPath = join(electron.app.getAppPath(), '..', 'app.asar.unpacked') + + if (!electron.app.isPackaged) { + // for development mode + appPath = join(electron.app.getAppPath()) + } + return appPath + } } catch (err) { console.error('Electron is not available') } - // electron - if (electron && electron.protocol) { - let appPath = join(electron.app.getAppPath(), '..', 'app.asar.unpacked') - - if (!electron.app.isPackaged) { - // for development mode - appPath = join(electron.app.getAppPath()) - } - return appPath - } // server return join(global.core.appPath(), '../../..') } export function validatePath(path: string) { - const janDataFolderPath = getJanDataFolderPath() + const appDataFolderPath = getJanDataFolderPath() + const resourcePath = appResourcePath() + const applicationSupportPath = global.core?.appPath() ?? 
resourcePath const absolutePath = resolve(__dirname, path) - if (!absolutePath.startsWith(janDataFolderPath)) { + if ( + ![appDataFolderPath, resourcePath, applicationSupportPath].some((whiteListedPath) => + absolutePath.startsWith(whiteListedPath) + ) + ) { throw new Error(`Invalid path: ${absolutePath}`) } } diff --git a/core/src/types/api/index.ts b/core/src/types/api/index.ts index e50dce6de..e1d1b28da 100644 --- a/core/src/types/api/index.ts +++ b/core/src/types/api/index.ts @@ -105,6 +105,7 @@ export enum FileManagerRoute { getUserHomePath = 'getUserHomePath', fileStat = 'fileStat', writeBlob = 'writeBlob', + getGgufFiles = 'getGgufFiles', } export type ApiFunction = (...args: any[]) => any diff --git a/core/src/types/model/modelEntity.ts b/core/src/types/model/modelEntity.ts index 426b30846..f154f7f04 100644 --- a/core/src/types/model/modelEntity.ts +++ b/core/src/types/model/modelEntity.ts @@ -25,6 +25,10 @@ export enum InferenceEngine { triton_trtllm = 'triton_trtllm', nitro_tensorrt_llm = 'nitro-tensorrt-llm', cohere = 'cohere', + nvidia = 'nvidia', + cortex_llamacpp = 'cortex.llamacpp', + cortex_onnx = 'cortex.onnx', + cortex_tensorrtllm = 'cortex.tensorrt-llm', } export type ModelArtifact = { @@ -103,6 +107,9 @@ export type ModelMetadata = { tags: string[] size: number cover?: string + // These settings to preserve model settings across threads + default_ctx_len?: number + default_max_tokens?: number } /** diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml deleted file mode 100644 index 2e09d641b..000000000 --- a/docker-compose-dev.yml +++ /dev/null @@ -1,171 +0,0 @@ -# Docker Compose file for setting up Minio, createbuckets, app_cpu, and app_gpu services - -version: '3.7' - -services: - # Minio service for object storage - minio: - image: minio/minio - volumes: - - minio_data:/data - ports: - - '9000:9000' - - '9001:9001' - environment: - # Set the root user and password for Minio - MINIO_ROOT_USER: minioadmin # This acts as AWS_ACCESS_KEY - MINIO_ROOT_PASSWORD: minioadmin # This acts as AWS_SECRET_ACCESS_KEY - command: server --console-address ":9001" /data - restart: always - healthcheck: - test: ['CMD', 'curl', '-f', 'http://localhost:9000/minio/health/live'] - interval: 30s - timeout: 20s - retries: 3 - networks: - vpcbr: - ipv4_address: 10.5.0.2 - - # createbuckets service to create a bucket and set its policy - createbuckets: - image: minio/mc - depends_on: - - minio - entrypoint: > - /bin/sh -c " - /usr/bin/mc alias set myminio http://minio:9000 minioadmin minioadmin; - /usr/bin/mc mb myminio/mybucket; - /usr/bin/mc policy set public myminio/mybucket; - exit 0; - " - networks: - vpcbr: - - # app_cpu service for running the CPU version of the application - app_cpu_s3fs: - image: jan:latest - volumes: - - app_data_cpu_s3fs:/app/server/build/jan - build: - context: . 
- dockerfile: Dockerfile - environment: - # Set the AWS access key, secret access key, bucket name, endpoint, and region for app_cpu - AWS_ACCESS_KEY_ID: minioadmin - AWS_SECRET_ACCESS_KEY: minioadmin - S3_BUCKET_NAME: mybucket - AWS_ENDPOINT: http://10.5.0.2:9000 - AWS_REGION: us-east-1 - API_BASE_URL: http://localhost:1337 - restart: always - profiles: - - cpu-s3fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.3 - - # app_gpu service for running the GPU version of the application - app_gpu_s3fs: - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - image: jan-gpu:latest - volumes: - - app_data_gpu_s3fs:/app/server/build/jan - build: - context: . - dockerfile: Dockerfile.gpu - restart: always - environment: - # Set the AWS access key, secret access key, bucket name, endpoint, and region for app_gpu - AWS_ACCESS_KEY_ID: minioadmin - AWS_SECRET_ACCESS_KEY: minioadmin - S3_BUCKET_NAME: mybucket - AWS_ENDPOINT: http://10.5.0.2:9000 - AWS_REGION: us-east-1 - API_BASE_URL: http://localhost:1337 - profiles: - - gpu-s3fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.4 - - app_cpu_fs: - image: jan:latest - volumes: - - app_data_cpu_fs:/app/server/build/jan - build: - context: . - dockerfile: Dockerfile - environment: - API_BASE_URL: http://localhost:1337 - restart: always - profiles: - - cpu-fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.5 - - # app_gpu service for running the GPU version of the application - app_gpu_fs: - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - image: jan-gpu:latest - volumes: - - app_data_gpu_fs:/app/server/build/jan - build: - context: . 
- dockerfile: Dockerfile.gpu - restart: always - environment: - API_BASE_URL: http://localhost:1337 - profiles: - - gpu-fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.6 - -volumes: - minio_data: - app_data_cpu_s3fs: - app_data_gpu_s3fs: - app_data_cpu_fs: - app_data_gpu_fs: - -networks: - vpcbr: - driver: bridge - ipam: - config: - - subnet: 10.5.0.0/16 - gateway: 10.5.0.1 -# Usage: -# - Run 'docker compose -f docker-compose-dev.yml --profile cpu-s3fs up -d' to start the app_cpu service -# - Run 'docker compose -f docker-compose-dev.yml --profile gpu-s3fs up -d' to start the app_gpu service -# - Run 'docker compose -f docker-compose-dev.yml --profile cpu-fs up -d' to start the app_cpu service -# - Run 'docker compose -f docker-compose-dev.yml --profile gpu-fs up -d' to start the app_gpu service diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 1e5660c12..000000000 --- a/docker-compose.yml +++ /dev/null @@ -1,159 +0,0 @@ -# Docker Compose file for setting up Minio, createbuckets, app_cpu, and app_gpu services - -version: '3.7' - -services: - # Minio service for object storage - minio: - image: minio/minio - volumes: - - minio_data:/data - ports: - - '9000:9000' - - '9001:9001' - environment: - # Set the root user and password for Minio - MINIO_ROOT_USER: minioadmin # This acts as AWS_ACCESS_KEY - MINIO_ROOT_PASSWORD: minioadmin # This acts as AWS_SECRET_ACCESS_KEY - command: server --console-address ":9001" /data - restart: always - healthcheck: - test: ['CMD', 'curl', '-f', 'http://localhost:9000/minio/health/live'] - interval: 30s - timeout: 20s - retries: 3 - networks: - vpcbr: - ipv4_address: 10.5.0.2 - - # createbuckets service to create a bucket and set its policy - createbuckets: - image: minio/mc - depends_on: - - minio - entrypoint: > - /bin/sh -c " - /usr/bin/mc alias set myminio http://minio:9000 minioadmin minioadmin; - /usr/bin/mc mb myminio/mybucket; - /usr/bin/mc policy set public myminio/mybucket; - exit 0; - " - networks: - vpcbr: - - # app_cpu service for running the CPU version of the application - app_cpu_s3fs: - volumes: - - app_data_cpu_s3fs:/app/server/build/jan - image: ghcr.io/janhq/jan-server:dev-cpu-latest - environment: - # Set the AWS access key, secret access key, bucket name, endpoint, and region for app_cpu - AWS_ACCESS_KEY_ID: minioadmin - AWS_SECRET_ACCESS_KEY: minioadmin - S3_BUCKET_NAME: mybucket - AWS_ENDPOINT: http://10.5.0.2:9000 - AWS_REGION: us-east-1 - API_BASE_URL: http://localhost:1337 - restart: always - profiles: - - cpu-s3fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.3 - - # app_gpu service for running the GPU version of the application - app_gpu_s3fs: - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - image: ghcr.io/janhq/jan-server:dev-cuda-12.2-latest - volumes: - - app_data_gpu_s3fs:/app/server/build/jan - restart: always - environment: - # Set the AWS access key, secret access key, bucket name, endpoint, and region for app_gpu - AWS_ACCESS_KEY_ID: minioadmin - AWS_SECRET_ACCESS_KEY: minioadmin - S3_BUCKET_NAME: mybucket - AWS_ENDPOINT: http://10.5.0.2:9000 - AWS_REGION: us-east-1 - API_BASE_URL: http://localhost:1337 - profiles: - - gpu-s3fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.4 - - app_cpu_fs: - image: ghcr.io/janhq/jan-server:dev-cpu-latest - volumes: - - 
app_data_cpu_fs:/app/server/build/jan - environment: - API_BASE_URL: http://localhost:1337 - restart: always - profiles: - - cpu-fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.5 - - # app_gpu service for running the GPU version of the application - app_gpu_fs: - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - image: ghcr.io/janhq/jan-server:dev-cuda-12.2-latest - volumes: - - app_data_gpu_fs:/app/server/build/jan - restart: always - environment: - API_BASE_URL: http://localhost:1337 - profiles: - - gpu-fs - ports: - - '3000:3000' - - '1337:1337' - - '3928:3928' - networks: - vpcbr: - ipv4_address: 10.5.0.6 - -volumes: - minio_data: - app_data_cpu_s3fs: - app_data_gpu_s3fs: - app_data_cpu_fs: - app_data_gpu_fs: - -networks: - vpcbr: - driver: bridge - ipam: - config: - - subnet: 10.5.0.0/16 - gateway: 10.5.0.1 -# Usage: -# - Run 'docker compose --profile cpu-s3fs up -d' to start the app_cpu service -# - Run 'docker compose --profile gpu-s3fs up -d' to start the app_gpu service -# - Run 'docker compose --profile cpu-fs up -d' to start the app_cpu service -# - Run 'docker compose --profile gpu-fs up -d' to start the app_gpu service diff --git a/electron/managers/mainWindowConfig.ts b/electron/managers/mainWindowConfig.ts index c3f9c01bd..82c437106 100644 --- a/electron/managers/mainWindowConfig.ts +++ b/electron/managers/mainWindowConfig.ts @@ -1,8 +1,10 @@ const DEFAULT_MIN_WIDTH = 400 +const DEFAULT_MIN_HEIGHT = 600 export const mainWindowConfig: Electron.BrowserWindowConstructorOptions = { skipTaskbar: false, minWidth: DEFAULT_MIN_WIDTH, + minHeight: DEFAULT_MIN_HEIGHT, show: true, transparent: true, frame: false, diff --git a/electron/utils/migration.ts b/electron/utils/migration.ts index defe0cebb..52ee45ed0 100644 --- a/electron/utils/migration.ts +++ b/electron/utils/migration.ts @@ -12,9 +12,9 @@ import { } from 'fs' import Store from 'electron-store' import { - getJanExtensionsPath, getJanDataFolderPath, appResourcePath, + getJanExtensionsPath, } from '@janhq/core/node' /** @@ -28,8 +28,9 @@ export async function migrate() { if (store.get('migrated_version') !== app.getVersion()) { console.debug('start migration:', store.get('migrated_version')) - // if (existsSync(getJanExtensionsPath())) - // rmdirSync(getJanExtensionsPath(), { recursive: true }) + if (existsSync(getJanExtensionsPath())) + rmdirSync(getJanExtensionsPath(), { recursive: true }) + await migrateThemes() store.set('migrated_version', app.getVersion()) @@ -43,9 +44,9 @@ async function migrateThemes() { if (!existsSync(join(getJanDataFolderPath(), 'themes'))) mkdirSync(join(getJanDataFolderPath(), 'themes'), { recursive: true }) - const themes = readdirSync(join(await appResourcePath(), 'themes')) + const themes = readdirSync(join(appResourcePath(), 'themes')) for (const theme of themes) { - const themePath = join(await appResourcePath(), 'themes', theme) + const themePath = join(appResourcePath(), 'themes', theme) if (existsSync(themePath) && !lstatSync(themePath).isDirectory()) { continue } diff --git a/extensions/inference-anthropic-extension/resources/settings.json b/extensions/inference-anthropic-extension/resources/settings.json index bb35e6b3d..9ca4405ac 100644 --- a/extensions/inference-anthropic-extension/resources/settings.json +++ b/extensions/inference-anthropic-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "anthropic-api-key", + "title": "API Key", + "description": "The 
Anthropic API uses API keys for authentication. Visit your [API Keys](https://console.anthropic.com/settings/keys) page to retrieve the API key you'll use in your requests.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,16 +20,5 @@ "placeholder": "https://api.anthropic.com/v1/messages", "value": "https://api.anthropic.com/v1/messages" } - }, - { - "key": "anthropic-api-key", - "title": "API Key", - "description": "The Anthropic API uses API keys for authentication. Visit your [API Keys](https://console.anthropic.com/settings/keys) page to retrieve the API key you'll use in your requests.", - "controllerType": "input", - "controllerProps": { - "placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" - } } -] \ No newline at end of file +] diff --git a/extensions/inference-cohere-extension/resources/settings.json b/extensions/inference-cohere-extension/resources/settings.json index 2a32b57f8..79150d7e5 100644 --- a/extensions/inference-cohere-extension/resources/settings.json +++ b/extensions/inference-cohere-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "cohere-api-key", + "title": "API Key", + "description": "The Cohere API uses API keys for authentication. Visit your [API Keys](https://dashboard.cohere.com/api-keys) page to retrieve the API key you'll use in your requests.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,16 +20,5 @@ "placeholder": "https://api.cohere.ai/v1/chat", "value": "https://api.cohere.ai/v1/chat" } - }, - { - "key": "cohere-api-key", - "title": "API Key", - "description": "The Cohere API uses API keys for authentication. Visit your [API Keys](https://dashboard.cohere.com/api-keys) page to retrieve the API key you'll use in your requests.", - "controllerType": "input", - "controllerProps": { - "placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" - } } ] diff --git a/extensions/inference-groq-extension/resources/settings.json b/extensions/inference-groq-extension/resources/settings.json index 493b602cd..767fec0ba 100644 --- a/extensions/inference-groq-extension/resources/settings.json +++ b/extensions/inference-groq-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "groq-api-key", + "title": "API Key", + "description": "The Groq API uses API keys for authentication. Visit your [API Keys](https://console.groq.com/keys) page to retrieve the API key you'll use in your requests.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,16 +20,5 @@ "placeholder": "https://api.groq.com/openai/v1/chat/completions", "value": "https://api.groq.com/openai/v1/chat/completions" } - }, - { - "key": "groq-api-key", - "title": "API Key", - "description": "The Groq API uses API keys for authentication. 
Visit your [API Keys](https://console.groq.com/keys) page to retrieve the API key you'll use in your requests.", - "controllerType": "input", - "controllerProps": { - "placeholder": "gsk_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" - } } ] diff --git a/extensions/inference-martian-extension/resources/settings.json b/extensions/inference-martian-extension/resources/settings.json index bc83d76d4..2341ad6cd 100644 --- a/extensions/inference-martian-extension/resources/settings.json +++ b/extensions/inference-martian-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "martian-api-key", + "title": "API Key", + "description": "The Martian API uses API keys for authentication. Visit your [API Keys](https://withmartian.com/dashboard) page to retrieve the API key you'll use in your requests.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,16 +20,5 @@ "placeholder": "https://withmartian.com/api/openai/v1/chat/completions", "value": "https://withmartian.com/api/openai/v1/chat/completions" } - }, - { - "key": "martian-api-key", - "title": "API Key", - "description": "The Martian API uses API keys for authentication. Visit your [API Keys](https://withmartian.com/dashboard) page to retrieve the API key you'll use in your requests.", - "controllerType": "input", - "controllerProps": { - "placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" - } } ] diff --git a/extensions/inference-mistral-extension/resources/settings.json b/extensions/inference-mistral-extension/resources/settings.json index 2ca8ec7e5..963674b02 100644 --- a/extensions/inference-mistral-extension/resources/settings.json +++ b/extensions/inference-mistral-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "mistral-api-key", + "title": "API Key", + "description": "The Mistral API uses API keys for authentication. Visit your [API Keys](https://console.mistral.ai/api-keys/) page to retrieve the API key you'll use in your requests.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,16 +20,5 @@ "placeholder": "https://api.mistral.ai/v1/chat/completions", "value": "https://api.mistral.ai/v1/chat/completions" } - }, - { - "key": "mistral-api-key", - "title": "API Key", - "description": "The Mistral API uses API keys for authentication. 
Visit your [API Keys](https://console.mistral.ai/api-keys/) page to retrieve the API key you'll use in your requests.", - "controllerType": "input", - "controllerProps": { - "placeholder": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" - } } ] diff --git a/extensions/inference-nitro-extension/bin/version.txt b/extensions/inference-nitro-extension/bin/version.txt index 2b2a18d26..8f0916f76 100644 --- a/extensions/inference-nitro-extension/bin/version.txt +++ b/extensions/inference-nitro-extension/bin/version.txt @@ -1 +1 @@ -0.4.20 +0.5.0 diff --git a/extensions/inference-nitro-extension/download.bat b/extensions/inference-nitro-extension/download.bat index 9bd2d4b07..b7fbd3252 100644 --- a/extensions/inference-nitro-extension/download.bat +++ b/extensions/inference-nitro-extension/download.bat @@ -1,3 +1,3 @@ @echo off set /p CORTEX_VERSION=<./bin/version.txt -.\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-avx2-cuda-12-0.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-avx2-cuda-11-7.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7 && .\node_modules\.bin\download https://github.com/janhq/nitro/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-avx2.tar.gz -e --strip 1 -o ./bin/win-cpu && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/win-vulkan +.\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cpu && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-vulkan && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx.tar.gz -e --strip 1 -o ./bin/win-cpu/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/win-vulkan/engines/cortex.llamacpp diff --git a/extensions/inference-nitro-extension/package.json b/extensions/inference-nitro-extension/package.json index 3150108c4..7be4be69a 100644 --- a/extensions/inference-nitro-extension/package.json +++ b/extensions/inference-nitro-extension/package.json @@ -1,7 +1,7 @@ { "name": 
"@janhq/inference-cortex-extension", "productName": "Cortex Inference Engine", - "version": "1.0.14", + "version": "1.0.15", "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.", "main": "dist/index.js", "node": "dist/node/index.cjs.js", @@ -10,8 +10,8 @@ "scripts": { "test": "jest", "build": "tsc --module commonjs && rollup -c rollup.config.ts", - "downloadnitro:linux": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-avx2.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-avx2-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-avx2-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/linux-vulkan && chmod +x ./bin/linux-vulkan/cortex-cpp", - "downloadnitro:darwin": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-arm64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz --strip-components=1 -C ./bin/mac-arm64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz && chmod +x ./bin/mac-arm64/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-amd64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz --strip-components=1 -C ./bin/mac-amd64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz && chmod +x ./bin/mac-amd64/cortex-cpp", + "downloadnitro:linux": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-vulkan && chmod +x ./bin/linux-vulkan/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx.tar.gz -e --strip 1 -o ./bin/linux-cpu/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0/engines/cortex.llamacpp && download 
https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/linux-vulkan/engines/cortex.llamacpp", + "downloadnitro:darwin": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-arm64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz --strip-components=1 -C ./bin/mac-arm64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz && chmod +x ./bin/mac-arm64/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-amd64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz --strip-components=1 -C ./bin/mac-amd64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz && chmod +x ./bin/mac-amd64/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-arm64.tar.gz -e --strip 1 -o ./bin/mac-arm64/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-amd64.tar.gz -e --strip 1 -o ./bin/mac-amd64/engines/cortex.llamacpp", "downloadnitro:win32": "download.bat", "downloadnitro": "run-script-os", "build:publish:darwin": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && ../../.github/scripts/auto-sign.sh && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install", diff --git a/extensions/inference-nitro-extension/resources/models/gemma-2b/model.json b/extensions/inference-nitro-extension/resources/models/gemma-1.1-2b/model.json similarity index 68% rename from extensions/inference-nitro-extension/resources/models/gemma-2b/model.json rename to extensions/inference-nitro-extension/resources/models/gemma-1.1-2b/model.json index 68cff325a..56cd9c81c 100644 --- a/extensions/inference-nitro-extension/resources/models/gemma-2b/model.json +++ b/extensions/inference-nitro-extension/resources/models/gemma-1.1-2b/model.json @@ -1,20 +1,20 @@ { "sources": [ { - "filename": "gemma-2b-it-q4_k_m.gguf", - "url": "https://huggingface.co/lmstudio-ai/gemma-2b-it-GGUF/resolve/main/gemma-2b-it-q4_k_m.gguf" + "filename": "gemma-1.1-2b-it-q4_k_m.gguf", + "url": "https://huggingface.co/bartowski/gemma-1.1-2b-it-GGUF/resolve/main/gemma-1.1-2b-it-Q4_K_M.gguf" } ], - "id": "gemma-2b", + "id": "gemma-1.1-2b-it", "object": "model", - "name": "Gemma 2B Q4", + "name": "Gemma 1.1 2B Q4", "version": "1.3", "description": "Gemma is built from the same technology with Google's Gemini.", "format": "gguf", "settings": { "ctx_len": 8192, "prompt_template": "user\n{prompt}\nmodel", - "llama_model_path": "gemma-2b-it-q4_k_m.gguf", + "llama_model_path": "gemma-1.1-2b-it-Q4_K_M.gguf", "ngl": 19 }, "parameters": { @@ -29,7 +29,7 @@ "metadata": { "author": "Google", "tags": ["2B", "Finetuned", "Tiny"], - "size": 1500000000 + "size": 1630000000 }, "engine": "nitro" } diff --git a/extensions/inference-nitro-extension/resources/models/gemma-7b/model.json b/extensions/inference-nitro-extension/resources/models/gemma-1.1-7b/model.json similarity index 69% rename from 
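The `download.bat` and `downloadnitro` one-liners above are dense, so here is an illustrative TypeScript sketch of the URL pattern they assemble from `bin/version.txt`. The release URL formats are taken verbatim from the scripts; the standalone snippet itself is not part of the build.

```typescript
// Illustrative only: how the download scripts derive release URLs from
// bin/version.txt. The actual downloads are the shell/batch one-liners above.
import { readFileSync } from 'fs'

const cortexVersion = readFileSync('./bin/version.txt', 'utf8').trim() // "0.5.0" after this diff

// cortex-cpp binary, now a single flavour per platform/arch
const cortexCppUrl =
  `https://github.com/janhq/cortex/releases/download/v${cortexVersion}/` +
  `cortex-cpp-${cortexVersion}-windows-amd64.tar.gz`

// matching cortex.llamacpp engine, pinned to 0.1.25 independently of cortex-cpp
const llamacppEngineUrl =
  'https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/' +
  'cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-12-0.tar.gz'

console.log(cortexCppUrl, llamacppEngineUrl)
```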
extensions/inference-nitro-extension/resources/models/gemma-7b/model.json rename to extensions/inference-nitro-extension/resources/models/gemma-1.1-7b/model.json index 615f1149b..5bd89b478 100644 --- a/extensions/inference-nitro-extension/resources/models/gemma-7b/model.json +++ b/extensions/inference-nitro-extension/resources/models/gemma-1.1-7b/model.json @@ -1,20 +1,20 @@ { "sources": [ { - "filename": "gemma-7b-it-q4_K_M.gguf", - "url": "https://huggingface.co/mmnga/gemma-7b-it-gguf/resolve/main/gemma-7b-it-q4_K_M.gguf" + "filename": "gemma-1.1-7b-it-q4_K_M.gguf", + "url": "https://huggingface.co/bartowski/gemma-1.1-7b-it-GGUF/resolve/main/gemma-1.1-7b-it-Q4_K_M.gguf" } ], - "id": "gemma-7b", + "id": "gemma-1.1-7b-it", "object": "model", - "name": "Gemma 7B Q4", + "name": "Gemma 1.1 7B Q4", "version": "1.2", "description": "Google's Gemma is built for multilingual purpose", "format": "gguf", "settings": { "ctx_len": 8192, "prompt_template": "user\n{prompt}\nmodel", - "llama_model_path": "gemma-7b-it-q4_K_M.gguf", + "llama_model_path": "gemma-1.1-7b-it-q4_K_M.gguf", "ngl": 29 }, "parameters": { diff --git a/extensions/inference-nitro-extension/resources/models/gemma-2-27b/model.json b/extensions/inference-nitro-extension/resources/models/gemma-2-27b/model.json new file mode 100644 index 000000000..bdf2d5c9c --- /dev/null +++ b/extensions/inference-nitro-extension/resources/models/gemma-2-27b/model.json @@ -0,0 +1,42 @@ +{ + "sources": [ + { + "filename": "gemma-2-27b-it-Q4_K_M.gguf", + "url": "https://huggingface.co/bartowski/gemma-2-27b-it-GGUF/resolve/main/gemma-2-27b-it-Q4_K_M.gguf" + } + ], + "id": "gemma-2-27b-it", + "object": "model", + "name": "Gemma 2 27B Q4", + "version": "1.0", + "description": "Gemma is built from the same technology with Google's Gemini.", + "format": "gguf", + "settings": { + "ctx_len": 8192, + "prompt_template": "user\n{prompt}\nmodel\n\nmodel\n", + "llama_model_path": "gemma-2-27b-it-Q4_K_M.gguf", + "ngl": 47 + }, + "parameters": { + "temperature": 0.7, + "top_p": 0.95, + "stream": true, + "max_tokens": 8192, + "stop": [ + "" + ], + "frequency_penalty": 0, + "presence_penalty": 0 + }, + "metadata": { + "author": "Google", + "tags": [ + "27B", + "Conversational", + "Text-generation", + "Featured" + ], + "size": 16600000000 + }, + "engine": "nitro" +} diff --git a/extensions/inference-nitro-extension/resources/models/gemma-2-2b/model.json b/extensions/inference-nitro-extension/resources/models/gemma-2-2b/model.json new file mode 100644 index 000000000..1665f76ee --- /dev/null +++ b/extensions/inference-nitro-extension/resources/models/gemma-2-2b/model.json @@ -0,0 +1,43 @@ +{ + "sources": [ + { + "filename": "gemma-2-2b-it-Q4_K_M.gguf", + "url": "https://huggingface.co/bartowski/gemma-2-2b-it-GGUF/resolve/main/gemma-2-2b-it-Q4_K_M.gguf" + } + ], + "id": "gemma-2-2b-it", + "object": "model", + "name": "Gemma 2 2B Q4", + "version": "1.0", + "description": "Gemma is built from the same technology with Google's Gemini.", + "format": "gguf", + "settings": { + "ctx_len": 8192, + "prompt_template": "user\n{prompt}\nmodel\n\nmodel\n", + "llama_model_path": "gemma-2-2b-it-Q4_K_M.gguf", + "ngl": 27 + }, + "parameters": { + "temperature": 0.7, + "top_p": 0.95, + "stream": true, + "max_tokens": 8192, + "stop": [ + "" + ], + "frequency_penalty": 0, + "presence_penalty": 0 + }, + "metadata": { + "author": "Google", + "tags": [ + "2B", + "Tiny", + "Conversational", + "Text-generation", + "Featured" + ], + "size": 1710000000 + }, + "engine": "nitro" +} diff --git 
a/extensions/inference-nitro-extension/resources/models/gemma-2-9b/model.json b/extensions/inference-nitro-extension/resources/models/gemma-2-9b/model.json new file mode 100644 index 000000000..42e7dcee2 --- /dev/null +++ b/extensions/inference-nitro-extension/resources/models/gemma-2-9b/model.json @@ -0,0 +1,42 @@ +{ + "sources": [ + { + "filename": "gemma-2-9b-it-Q4_K_M.gguf", + "url": "https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf" + } + ], + "id": "gemma-2-9b-it", + "object": "model", + "name": "Gemma 2 9B Q4", + "version": "1.0", + "description": "Gemma is built from the same technology with Google's Gemini.", + "format": "gguf", + "settings": { + "ctx_len": 8192, + "prompt_template": "user\n{prompt}\nmodel\n\nmodel\n", + "llama_model_path": "gemma-2-9b-it-Q4_K_M.gguf", + "ngl": 43 + }, + "parameters": { + "temperature": 0.7, + "top_p": 0.95, + "stream": true, + "max_tokens": 8192, + "stop": [ + "" + ], + "frequency_penalty": 0, + "presence_penalty": 0 + }, + "metadata": { + "author": "Google", + "tags": [ + "9B", + "Conversational", + "Text-generation", + "Featured" + ], + "size": 5760000000 + }, + "engine": "nitro" +} diff --git a/extensions/inference-nitro-extension/resources/models/llama3-8b-instruct/model.json b/extensions/inference-nitro-extension/resources/models/llama3-8b-instruct/model.json index 313bf8425..ced7e1ca8 100644 --- a/extensions/inference-nitro-extension/resources/models/llama3-8b-instruct/model.json +++ b/extensions/inference-nitro-extension/resources/models/llama3-8b-instruct/model.json @@ -2,7 +2,7 @@ "sources": [ { "filename": "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf", - "url": "https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf" + "url": "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf" } ], "id": "llama3-8b-instruct", @@ -28,7 +28,7 @@ }, "metadata": { "author": "MetaAI", - "tags": ["7B", "Featured"], + "tags": ["8B", "Featured"], "size": 4920000000 }, "engine": "nitro" diff --git a/extensions/inference-nitro-extension/resources/models/llama3.1-70b-instruct/model.json b/extensions/inference-nitro-extension/resources/models/llama3.1-70b-instruct/model.json new file mode 100644 index 000000000..4d8eab7e3 --- /dev/null +++ b/extensions/inference-nitro-extension/resources/models/llama3.1-70b-instruct/model.json @@ -0,0 +1,42 @@ +{ + "sources": [ + { + "filename": "Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf", + "url": "https://huggingface.co/bartowski/Meta-Llama-3.1-70B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf" + } + ], + "id": "llama3.1-70b-instruct", + "object": "model", + "name": "Llama 3.1 70B Q4 Instruct", + "version": "1.0", + "description": "Meta's Llama 3.1 excels at general usage situations, including chat, general world knowledge, and coding.", + "format": "gguf", + "settings": { + "ctx_len": 131072, + "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "llama_model_path": "Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf", + "ngl": 33 + }, + "parameters": { + "temperature": 0.7, + "top_p": 0.95, + "stream": true, + "max_tokens": 8192, + "stop": [ + "<|end_of_text|>", + "<|eot_id|>", + "<|eom_id|>" + ], + "frequency_penalty": 0, + "presence_penalty": 0 + }, + "metadata": { + "author": "MetaAI", 
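The Llama 3.1 entries above declare a 131,072-token context window and a header-tag prompt template. As a rough sketch of how a template with `{system_message}` and `{prompt}` placeholders is rendered before a request, consider the following; the placeholder names and template string come from the model.json files in this diff, while the `applyTemplate` helper is illustrative and not Jan's actual implementation.

```typescript
// Sketch (assumption): rendering a model.json prompt_template such as the
// Llama 3.1 one above before the prompt is sent to the engine.
const llama31Template =
  '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>' +
  '<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>' +
  '<|start_header_id|>assistant<|end_header_id|>\n\n'

function applyTemplate(template: string, systemMessage: string, prompt: string): string {
  return template
    .replace('{system_message}', systemMessage)
    .replace('{prompt}', prompt)
}

console.log(applyTemplate(llama31Template, 'You are a helpful assistant.', 'Hello!'))
```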
+ "tags": [ + "70B", + "Featured" + ], + "size": 42500000000 + }, + "engine": "nitro" +} diff --git a/extensions/inference-nitro-extension/resources/models/llama3.1-8b-instruct/model.json b/extensions/inference-nitro-extension/resources/models/llama3.1-8b-instruct/model.json new file mode 100644 index 000000000..fe44b0b1c --- /dev/null +++ b/extensions/inference-nitro-extension/resources/models/llama3.1-8b-instruct/model.json @@ -0,0 +1,42 @@ +{ + "sources": [ + { + "filename": "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", + "url": "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf" + } + ], + "id": "llama3.1-8b-instruct", + "object": "model", + "name": "Llama 3.1 8B Q4 Instruct", + "version": "1.0", + "description": "Meta's Llama 3.1 excels at general usage situations, including chat, general world knowledge, and coding.", + "format": "gguf", + "settings": { + "ctx_len": 131072, + "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "llama_model_path": "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf", + "ngl": 33 + }, + "parameters": { + "temperature": 0.7, + "top_p": 0.95, + "stream": true, + "max_tokens": 8192, + "stop": [ + "<|end_of_text|>", + "<|eot_id|>", + "<|eom_id|>" + ], + "frequency_penalty": 0, + "presence_penalty": 0 + }, + "metadata": { + "author": "MetaAI", + "tags": [ + "8B", + "Featured" + ], + "size": 4920000000 + }, + "engine": "nitro" +} diff --git a/extensions/inference-nitro-extension/rollup.config.ts b/extensions/inference-nitro-extension/rollup.config.ts index 71712a4d6..fdd11f961 100644 --- a/extensions/inference-nitro-extension/rollup.config.ts +++ b/extensions/inference-nitro-extension/rollup.config.ts @@ -12,8 +12,8 @@ const codeninja7bJson = require('./resources/models/codeninja-1.0-7b/model.json' const commandr34bJson = require('./resources/models/command-r-34b/model.json') const deepseekCoder13bJson = require('./resources/models/deepseek-coder-1.3b/model.json') const deepseekCoder34bJson = require('./resources/models/deepseek-coder-34b/model.json') -const gemma2bJson = require('./resources/models/gemma-2b/model.json') -const gemma7bJson = require('./resources/models/gemma-7b/model.json') +const gemma112bJson = require('./resources/models/gemma-1.1-2b/model.json') +const gemma117bJson = require('./resources/models/gemma-1.1-7b/model.json') const llama2Chat70bJson = require('./resources/models/llama2-chat-70b/model.json') const llama2Chat7bJson = require('./resources/models/llama2-chat-7b/model.json') const llamacorn1bJson = require('./resources/models/llamacorn-1.1b/model.json') @@ -40,7 +40,11 @@ const aya35bJson = require('./resources/models/aya-23-35b/model.json') const phimediumJson = require('./resources/models/phi3-medium/model.json') const codestralJson = require('./resources/models/codestral-22b/model.json') const qwen2Json = require('./resources/models/qwen2-7b/model.json') - +const llama318bJson = require('./resources/models/llama3.1-8b-instruct/model.json') +const llama3170bJson = require('./resources/models/llama3.1-70b-instruct/model.json') +const gemma22bJson = require('./resources/models/gemma-2-2b/model.json') +const gemma29bJson = require('./resources/models/gemma-2-9b/model.json') +const gemma227bJson = require('./resources/models/gemma-2-27b/model.json') export default [ { @@ -60,8 +64,8 @@ export default [ 
commandr34bJson, deepseekCoder13bJson, deepseekCoder34bJson, - gemma2bJson, - gemma7bJson, + gemma112bJson, + gemma117bJson, llama2Chat70bJson, llama2Chat7bJson, llamacorn1bJson, @@ -87,7 +91,12 @@ export default [ aya8bJson, aya35bJson, codestralJson, - qwen2Json + qwen2Json, + llama318bJson, + llama3170bJson, + gemma22bJson, + gemma29bJson, + gemma227bJson ]), NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`), DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson), diff --git a/extensions/inference-nitro-extension/src/node/index.ts b/extensions/inference-nitro-extension/src/node/index.ts index 1b24e0a38..77ac9af7a 100644 --- a/extensions/inference-nitro-extension/src/node/index.ts +++ b/extensions/inference-nitro-extension/src/node/index.ts @@ -260,9 +260,14 @@ function loadLLMModel(settings: any): Promise { async function validateModelStatus(modelId: string): Promise { // Send a GET request to the validation URL. // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries. + log(`[CORTEX]::Debug: Validating model ${modelId}`) return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, { method: 'POST', - body: JSON.stringify({ model: modelId }), + body: JSON.stringify({ + model: modelId, + // TODO: force to use cortex llamacpp by default + engine: 'cortex.llamacpp' + }), headers: { 'Content-Type': 'application/json', }, @@ -288,8 +293,9 @@ async function validateModelStatus(modelId: string): Promise { return Promise.resolve() } } + const errorBody = await res.text() log( - `[CORTEX]::Debug: Validate model state failed with response ${JSON.stringify( + `[CORTEX]::Debug: Validate model state failed with response ${errorBody} and status is ${JSON.stringify( res.statusText )}` ) diff --git a/extensions/inference-nvidia-extension/resources/settings.json b/extensions/inference-nvidia-extension/resources/settings.json index e7647b562..6b2652653 100644 --- a/extensions/inference-nvidia-extension/resources/settings.json +++ b/extensions/inference-nvidia-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "nvidia-api-key", + "title": "API Key", + "description": "The NVIDIA API uses API keys for authentication. Visit your [API Keys](https://org.ngc.nvidia.com/setup/personal-keys) page to retrieve the API key you'll use in your requests..", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,17 +20,5 @@ "placeholder": "https://integrate.api.nvidia.com/v1/chat/completions", "value": "https://integrate.api.nvidia.com/v1/chat/completions" } - }, - { - "key": "nvidia-api-key", - "title": "API Key", - "description": "The NVIDIA API uses API keys for authentication. 
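The `validateModelStatus` change above now pins the request to the `cortex.llamacpp` engine and surfaces the response body on failure. Below is a minimal sketch of the equivalent request using plain `fetch`; in the extension the call goes through `fetchRetry` (3 attempts, 500 ms apart) against `NITRO_HTTP_VALIDATE_MODEL_URL`, and the standalone function here is illustrative only.

```typescript
// Illustrative only: the request shape validateModelStatus now sends.
async function checkModelLoaded(validateUrl: string, modelId: string): Promise<void> {
  const res = await fetch(validateUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    // engine is forced to cortex.llamacpp by default, as the TODO in the diff notes
    body: JSON.stringify({ model: modelId, engine: 'cortex.llamacpp' }),
  })
  if (!res.ok) {
    // mirrors the new error logging: include the body, not just statusText
    const errorBody = await res.text()
    throw new Error(`Validate model state failed: ${errorBody} (${res.statusText})`)
  }
}
```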
Visit your [API Keys](https://org.ngc.nvidia.com/setup/personal-keys) page to retrieve the API key you'll use in your requests..", - "controllerType": "input", - "controllerProps": { - "placeholder": "nvapi-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password", - "inputActions": ["unobscure", "copy"] - } } ] diff --git a/extensions/inference-openai-extension/resources/settings.json b/extensions/inference-openai-extension/resources/settings.json index ccd7dd545..db2e80c9b 100644 --- a/extensions/inference-openai-extension/resources/settings.json +++ b/extensions/inference-openai-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "openai-api-key", + "title": "API Key", + "description": "The OpenAI API uses API keys for authentication. Visit your [API Keys](https://platform.openai.com/account/api-keys) page to retrieve the API key you'll use in your requests.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,16 +20,5 @@ "placeholder": "https://api.openai.com/v1/chat/completions", "value": "https://api.openai.com/v1/chat/completions" } - }, - { - "key": "openai-api-key", - "title": "API Key", - "description": "The OpenAI API uses API keys for authentication. Visit your [API Keys](https://platform.openai.com/account/api-keys) page to retrieve the API key you'll use in your requests.", - "controllerType": "input", - "controllerProps": { - "placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" - } } ] diff --git a/extensions/inference-openrouter-extension/resources/settings.json b/extensions/inference-openrouter-extension/resources/settings.json index 85040e96b..189aee0a0 100644 --- a/extensions/inference-openrouter-extension/resources/settings.json +++ b/extensions/inference-openrouter-extension/resources/settings.json @@ -1,8 +1,20 @@ [ + { + "key": "openrouter-api-key", + "title": "API Key", + "description": "The OpenRouter API uses API keys for authentication. Visit your [API Keys](https://openrouter.ai/keys) page to retrieve the API key you'll use in your requests.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", - "description": "The endpoint to use for chat completions. See the [OpenRouter API documentation](https://openrouter.ai/docs) for more information.", + "description": "The endpoint to use for chat completions. See the [OpenRouter API documentation](https://openrouter.ai/docs/requests) for more information.", "controllerType": "input", "controllerProps": { "placeholder": "https://openrouter.ai/api/v1/chat/completions", @@ -10,14 +22,13 @@ } }, { - "key": "openrouter-api-key", - "title": "API Key", - "description": "The OpenRouter API uses API keys for authentication. Visit your [API Keys](https://openrouter.ai/keys) page to retrieve the API key you'll use in your requests.", + "key": "openrouter-model", + "title": "Model", + "description": "If the model parameter is omitted, the user or payer's default is used. 
Otherwise, remember to select a value for model from the [supported models](https://openrouter.ai/docs/models) or API, and include the organization prefix.", "controllerType": "input", "controllerProps": { - "placeholder": "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" + "placeholder": "Leave empty for default model", + "value": "" } } ] diff --git a/extensions/inference-openrouter-extension/src/index.ts b/extensions/inference-openrouter-extension/src/index.ts index 5417503e5..75d1188a8 100644 --- a/extensions/inference-openrouter-extension/src/index.ts +++ b/extensions/inference-openrouter-extension/src/index.ts @@ -8,22 +8,16 @@ import { RemoteOAIEngine } from '@janhq/core' import { PayloadType } from '@janhq/core' -import { ChatCompletionRole } from '@janhq/core' declare const SETTINGS: Array declare const MODELS: Array enum Settings { apiKey = 'openrouter-api-key', + model = 'openrouter-model', chatCompletionsEndPoint = 'chat-completions-endpoint', } -enum RoleType { - user = 'USER', - chatbot = 'CHATBOT', - system = 'SYSTEM', -} - /** * A class that implements the InferenceExtension interface from the @janhq/core package. * The class provides methods for initializing and stopping a model, and for making inference requests. @@ -32,6 +26,7 @@ enum RoleType { export default class JanInferenceOpenRouterExtension extends RemoteOAIEngine { inferenceUrl: string = '' provider: string = 'openrouter' + model?: string | undefined override async onLoad(): Promise { super.onLoad() @@ -45,6 +40,9 @@ export default class JanInferenceOpenRouterExtension extends RemoteOAIEngine { Settings.chatCompletionsEndPoint, '' ) + this.model = await this.getSetting(Settings.model, '') + // Openrouter uses default model on no model param set + if (!this.model?.length) this.model = undefined if (this.inferenceUrl.length === 0) { SETTINGS.forEach((setting) => { if (setting.key === Settings.chatCompletionsEndPoint) { @@ -54,6 +52,14 @@ export default class JanInferenceOpenRouterExtension extends RemoteOAIEngine { } } + override async headers(): Promise { + return { + 'Content-Type': 'application/json', + 'HTTP-Referer': 'https://jan.ai', + 'Authorization': `Bearer ${this.apiKey}`, + } + } + onSettingUpdate(key: string, value: T): void { if (key === Settings.apiKey) { this.apiKey = value as string @@ -69,8 +75,14 @@ export default class JanInferenceOpenRouterExtension extends RemoteOAIEngine { } else { this.inferenceUrl = value } + } else if (key === Settings.model) { + this.model = + typeof value === 'string' && value.length > 0 ? 
value : undefined } } - transformPayload = (payload: PayloadType)=>({...payload,model:"openrouter/auto"}) + transformPayload = (payload: PayloadType) => ({ + ...payload, + model: this.model, + }) } diff --git a/extensions/inference-triton-trtllm-extension/resources/settings.json b/extensions/inference-triton-trtllm-extension/resources/settings.json index 9c220eed7..26b80a686 100644 --- a/extensions/inference-triton-trtllm-extension/resources/settings.json +++ b/extensions/inference-triton-trtllm-extension/resources/settings.json @@ -1,4 +1,16 @@ [ + { + "key": "tritonllm-api-key", + "title": "API Key", + "description": "The Triton LLM API uses API keys for authentication.", + "controllerType": "input", + "controllerProps": { + "placeholder": "Insert API Key", + "value": "", + "type": "password", + "inputActions": ["unobscure", "copy"] + } + }, { "key": "chat-completions-endpoint", "title": "Chat Completions Endpoint", @@ -8,16 +20,5 @@ "placeholder": "http://localhost:8000/v2/models/tensorrt_llm_bls/generate", "value": "http://localhost:8000/v2/models/tensorrt_llm_bls/generate" } - }, - { - "key": "tritonllm-api-key", - "title": "Triton LLM API Key", - "description": "The Triton LLM API uses API keys for authentication.", - "controllerType": "input", - "controllerProps": { - "placeholder": "xxxxxxxxxxxxxxxxxxxx", - "value": "", - "type": "password" - } } ] diff --git a/extensions/model-extension/download.bat b/extensions/model-extension/download.bat deleted file mode 100644 index de055cb80..000000000 --- a/extensions/model-extension/download.bat +++ /dev/null @@ -1,3 +0,0 @@ -@echo off -set /p LLAMA_CPP_VERSION=<./scripts/version.txt -.\node_modules\.bin\download https://github.com/ggerganov/llama.cpp/archive/refs/tags/%LLAMA_CPP_VERSION%.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf .\scripts\llama.cpp.tar.gz "llama.cpp-%LLAMA_CPP_VERSION%/convert.py" "llama.cpp-%LLAMA_CPP_VERSION%/convert-hf-to-gguf.py" "llama.cpp-%LLAMA_CPP_VERSION%/gguf-py" && cpx "./llama.cpp-%LLAMA_CPP_VERSION%/**" "scripts" && rimraf "./scripts/llama.cpp.tar.gz" && rimraf "./llama.cpp-%LLAMA_CPP_VERSION%" \ No newline at end of file diff --git a/extensions/model-extension/package.json b/extensions/model-extension/package.json index 6bd8bbe5e..4a2c61b71 100644 --- a/extensions/model-extension/package.json +++ b/extensions/model-extension/package.json @@ -9,31 +9,25 @@ "license": "AGPL-3.0", "scripts": { "build": "tsc --module commonjs && rollup -c rollup.config.ts --configPlugin @rollup/plugin-typescript --bundleConfigAsCjs", - "download:llama": "run-script-os", - "download:llama:linux": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz --wildcards '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"", - "download:llama:darwin": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . 
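The OpenRouter change above makes model selection explicit: the new `openrouter-model` setting is stored on the class, and `transformPayload` forwards it, or omits `model` entirely so OpenRouter falls back to the account default, instead of the previous hard-coded `openrouter/auto`. A short illustrative check of that behaviour follows; the minimal payload type stands in for `@janhq/core`'s `PayloadType`, and the example model id is only an example of an organization-prefixed id.

```typescript
// Illustrative only: what transformPayload produces with and without the new
// openrouter-model setting.
type MinimalPayload = { messages: Array<{ role: string; content: string }>; model?: string }

function transformPayload(payload: MinimalPayload, model?: string): MinimalPayload {
  // model === undefined -> OpenRouter applies the user or payer's default model
  return { ...payload, model }
}

const base: MinimalPayload = { messages: [{ role: 'user', content: 'Hi' }] }
console.log(transformPayload(base, 'openai/gpt-4o')) // explicit, organization-prefixed id
console.log(transformPayload(base))                  // model left undefined -> account default
```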
--filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"", - "download:llama:win32": "download.bat", - "build:publish:linux": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install", - "build:publish:darwin": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && ../../.github/scripts/auto-sign.sh && npm pack && cpx *.tgz ../../pre-install", - "build:publish:win32": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install", - "build:publish": "run-script-os" + "build:publish": "rimraf *.tgz --glob && yarn build && npm pack && cpx *.tgz ../../pre-install" }, "devDependencies": { - "cpx": "^1.5.0", - "download-cli": "^1.1.1", - "rimraf": "^3.0.2", - "ts-loader": "^9.5.0", - "typescript": "5.3.3", "@rollup/plugin-commonjs": "^25.0.7", "@rollup/plugin-json": "^6.1.0", "@rollup/plugin-node-resolve": "^15.2.3", "@rollup/plugin-replace": "^5.0.5", "@rollup/plugin-typescript": "^11.1.6", "@types/pdf-parse": "^1.1.4", + "cpx": "^1.5.0", + "download-cli": "^1.1.1", + "rimraf": "^3.0.2", "rollup": "^2.38.5", "rollup-plugin-define": "^1.0.1", "rollup-plugin-sourcemaps": "^0.6.3", - "rollup-plugin-typescript2": "^0.36.0" + "rollup-plugin-typescript2": "^0.36.0", + "run-script-os": "^1.1.6", + "ts-loader": "^9.5.0", + "typescript": "5.3.3" }, "files": [ "dist/*", @@ -41,8 +35,15 @@ "README.md" ], "dependencies": { - "@janhq/core": "file:../../core", "@huggingface/gguf": "^0.0.11", + "@huggingface/jinja": "^0.3.0", + "@janhq/core": "file:../../core", + "hyllama": "^0.2.2", "python-shell": "^5.0.0" - } + }, + "bundleDependencies": [ + "hyllama", + "@huggingface/gguf", + "@huggingface/jinja" + ] } diff --git a/extensions/model-extension/rollup.config.ts b/extensions/model-extension/rollup.config.ts index aa22bd1f6..c3f3acc77 100644 --- a/extensions/model-extension/rollup.config.ts +++ b/extensions/model-extension/rollup.config.ts @@ -3,7 +3,7 @@ import sourceMaps from 'rollup-plugin-sourcemaps' import typescript from 'rollup-plugin-typescript2' import json from '@rollup/plugin-json' import replace from '@rollup/plugin-replace' - +import commonjs from '@rollup/plugin-commonjs' const settingJson = require('./resources/settings.json') const packageJson = require('./package.json') const defaultModelJson = require('./resources/default-model.json') @@ -39,6 +39,39 @@ export default [ browser: true, }), + // Resolve source maps to the original source + sourceMaps(), + ], + }, + { + input: `src/node/index.ts`, + output: [ + { + file: 'dist/node/index.cjs.js', + format: 'cjs', + sourcemap: true, + inlineDynamicImports: true, + }, + ], + // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash') + external: ['@janhq/core/node'], + watch: { + include: 'src/node/**', + }, + plugins: [ + // Allow json resolution + json(), + // Compile TypeScript files + typescript({ useTsconfigDeclarationDir: true }), + // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs) + commonjs(), + // Allow node_modules resolution, so you can use 'external' to control + // which 
external modules to include in the bundle + // https://github.com/rollup/rollup-plugin-node-resolve#usage + resolve({ + extensions: ['.ts', '.js', '.json'], + }), + // Resolve source maps to the original source sourceMaps(), ], diff --git a/extensions/model-extension/scripts/convert-hf-to-gguf.py b/extensions/model-extension/scripts/convert-hf-to-gguf.py deleted file mode 100755 index 0d4ea03b4..000000000 --- a/extensions/model-extension/scripts/convert-hf-to-gguf.py +++ /dev/null @@ -1,1720 +0,0 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import argparse -import contextlib -import json -import os -import re -import sys -from enum import IntEnum -from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast - -import numpy as np -import torch - -if TYPE_CHECKING: - from torch import Tensor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf - -from convert import HfVocab - - -# check for any of the given keys in the dictionary and return the value of the first key found -def get_key_opts(d, keys): - for k in keys: - if k in d: - return d[k] - print(f"Could not find any of {keys}") - sys.exit() - - -###### MODEL DEFINITIONS ###### - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class Model: - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): - self.dir_model = dir_model - self.ftype = ftype - self.fname_out = fname_out - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.is_safetensors = self._is_model_safetensors() - self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") - self.part_names = self._get_part_names() - self.hparams = Model.load_hparams(self.dir_model) - self.model_arch = self._get_model_architecture() - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) - - def set_vocab(self): - self._set_vocab_gpt2() - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for part_name in self.part_names: - print(f"gguf: loading model part '{part_name}'") - ctx: ContextManager[Any] - if self.is_safetensors: - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) - else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) - - with ctx as model_part: - for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - yield name, data - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - )) - if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: - self.gguf_writer.add_context_length(n_ctx) - if (n_embd := self.hparams.get("hidden_size")) is not None: - self.gguf_writer.add_embedding_length(n_embd) - if (n_ff := self.hparams.get("intermediate_size")) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - if (n_head := self.hparams.get("num_attention_heads")) is not None: - self.gguf_writer.add_head_count(n_head) - if (n_head_kv := 
self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - - if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file() - self.gguf_writer.close() - - def write_vocab(self): - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - @staticmethod - def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.endswith(prefix): - num_parts += 1 - - return num_parts - - @staticmethod - def load_hparams(dir_model): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @staticmethod - def from_model_architecture(model_architecture): - if model_architecture == "GPTNeoXForCausalLM": - return GPTNeoXModel - if model_architecture == "BloomForCausalLM": - return BloomModel - if model_architecture == "MPTForCausalLM": - return MPTModel - if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return BaichuanModel - if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): - return FalconModel - if model_architecture == "GPTBigCodeForCausalLM": - return StarCoderModel - if model_architecture == "GPTRefactForCausalLM": - return RefactModel - if model_architecture == "PersimmonForCausalLM": - return PersimmonModel - if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return 
StableLMModel - if model_architecture == "QWenLMHeadModel": - return QwenModel - if model_architecture == "Qwen2ForCausalLM": - return Model - if model_architecture == "MixtralForCausalLM": - return MixtralModel - if model_architecture == "GPT2LMHeadModel": - return GPT2Model - if model_architecture == "PhiForCausalLM": - return Phi2Model - if model_architecture == "PlamoForCausalLM": - return PlamoModel - if model_architecture == "CodeShellForCausalLM": - return CodeShellModel - if model_architecture == "OrionForCausalLM": - return OrionModel - if model_architecture == "InternLM2ForCausalLM": - return InternLM2Model - if model_architecture == "MiniCPMForCausalLM": - return MiniCPMModel - return Model - - def _is_model_safetensors(self) -> bool: - return Model.count_model_parts(self.dir_model, ".safetensors") > 0 - - def _get_part_names(self): - if self.is_safetensors: - if self.num_parts == 1: # there's only one .safetensors file - return ("model.safetensors",) - return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) - - if self.num_parts == 1: # there's only one .bin file - return ("pytorch_model.bin",) - return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - - def _get_model_architecture(self) -> gguf.MODEL_ARCH: - arch = self.hparams["architectures"][0] - if arch == "GPTNeoXForCausalLM": - return gguf.MODEL_ARCH.GPTNEOX - if arch == "BloomForCausalLM": - return gguf.MODEL_ARCH.BLOOM - if arch == "MPTForCausalLM": - return gguf.MODEL_ARCH.MPT - if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return gguf.MODEL_ARCH.BAICHUAN - if arch in ("FalconForCausalLM", "RWForCausalLM"): - return gguf.MODEL_ARCH.FALCON - if arch == "GPTBigCodeForCausalLM": - return gguf.MODEL_ARCH.STARCODER - if arch == "GPTRefactForCausalLM": - return gguf.MODEL_ARCH.REFACT - if arch == "PersimmonForCausalLM": - return gguf.MODEL_ARCH.PERSIMMON - if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return gguf.MODEL_ARCH.STABLELM - if arch == "QWenLMHeadModel": - return gguf.MODEL_ARCH.QWEN - if arch == "Qwen2ForCausalLM": - return gguf.MODEL_ARCH.QWEN2 - if arch == "MixtralForCausalLM": - return gguf.MODEL_ARCH.LLAMA - if arch == "GPT2LMHeadModel": - return gguf.MODEL_ARCH.GPT2 - if arch == "PhiForCausalLM": - return gguf.MODEL_ARCH.PHI2 - if arch == "PlamoForCausalLM": - return gguf.MODEL_ARCH.PLAMO - if arch == "CodeShellForCausalLM": - return gguf.MODEL_ARCH.CODESHELL - if arch == "OrionForCausalLM": - return gguf.MODEL_ARCH.ORION - if arch == "InternLM2ForCausalLM": - return gguf.MODEL_ARCH.INTERNLM2 - if arch == "MiniCPMForCausalLM": - return gguf.MODEL_ARCH.MINICPM - - raise NotImplementedError(f'Architecture "{arch}" not supported!') - - def _set_vocab_gpt2(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode('utf-8') - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if 
tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_qwen(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode("utf-8") - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json - if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_sentencepiece(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) - - tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.id_to_piece(token_id) - text = piece.encode("utf-8") - score = tokenizer.get_score(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.is_unknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.is_control(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.is_unused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.is_byte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - 
scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_hf(self): - path = self.dir_model - added_tokens_path = self.dir_model - vocab = HfVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - -class GPTNeoXModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - -class BloomModel(Model): - def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams["n_layer"] - tensors = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - has_lm_head = True - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - for name, data_torch in tensors.items(): - if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): - has_lm_head = False - - name = re.sub(r'transformer\.', '', name) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if 
data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) - data = np.concatenate( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) - data = np.concatenate( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.bias") - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "word_embeddings.weight": - self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - -class MPTModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any 
unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - if "scales" in name: - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) - if new_name is not None: - new_name = new_name.replace("scales", "act.scales") - else: - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - self.gguf_writer.add_tensor("output.weight", data) - - -class OrionModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - print("gguf: can not find ctx length parameter.") - sys.exit() - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - - def write_tensors(self): - # Collect tensors from generator object - model_kv = dict(self.get_tensors()) - block_count = self.hparams["num_hidden_layers"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is 
None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - -class BaichuanModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - print("gguf: can not find ctx length parameter.") - sys.exit() - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def write_tensors(self): - # Collect tensors from generator object - model_kv = dict(self.get_tensors()) - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - for i in range(block_count): - if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: - print(f"Unpacking and permuting layer {i}") - model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ - self._reverse_hf_permute_part(w, 0, head_count, head_count) - model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ - self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) - model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ - self._reverse_hf_part(w, 2) - del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - 
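
The Orion and Baichuan classes above probe the same three hyperparameter keys, in the same order, to locate the context length. Restated as a small helper purely for clarity (this function does not exist in the script; the sample hparams are invented):

def find_ctx_length(hparams):
    # Same fallback chain used by OrionModel / BaichuanModel.set_gguf_parameters().
    for key in ("max_sequence_length", "max_position_embeddings", "model_max_length"):
        if key in hparams:
            return hparams[key]
    raise KeyError("gguf: can not find ctx length parameter.")

print(find_ctx_length({"max_position_embeddings": 4096, "hidden_size": 5120}))  # -> 4096
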
- old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] 
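
The `_reverse_hf_permute` helper just above undoes the head-wise rotary permutation that the Hugging Face LLaMA/Baichuan conversion applies to q/k projection weights. A quick numpy round-trip check under that assumption (the `hf_permute` below is my reconstruction of the forward permutation, not code from this repository):

import numpy as np

def hf_permute(w, n_head):
    # Assumed forward permutation: (n_head, d/n_head/2, 2, ...) -> swap axes 1,2 -> flatten.
    return (w.reshape(n_head, w.shape[0] // n_head // 2, 2, *w.shape[1:])
            .swapaxes(1, 2)
            .reshape(w.shape))

def reverse_hf_permute(w, n_head):
    # Same logic as BaichuanModel._reverse_hf_permute above (n_kv_head == n_head case).
    return (w.reshape(n_head, 2, w.shape[0] // n_head // 2, *w.shape[1:])
            .swapaxes(1, 2)
            .reshape(w.shape))

rng = np.random.default_rng(0)
w = rng.standard_normal((64, 64)).astype(np.float32)  # toy 4-head, 64-dim projection
assert np.array_equal(reverse_hf_permute(hf_permute(w, 4), 4), w)  # permute then reverse is identity
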
- - -class FalconModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_name("Falcon") - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - head_dim = self.hparams["hidden_size"] // n_head - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class StarCoderModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("StarCoder") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -class RefactModel(Model): - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("Refact") - # refact uses Alibi. So this is from config.json which might be used by training. - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - block_count = self.hparams["n_layer"] - - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - tensors = dict(self.get_tensors()) - for i in range(block_count): - if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] - del tensors[f"transformer.h.{i}.attn.kv.weight"] - if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w - del tensors[f"transformer.h.{i}.attn.q.weight"] - if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: - tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] - tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] - del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - - for name, data_torch in tensors.items(): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = 
tensor_map.get_name(name, try_suffixes=(".weight",)) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class PersimmonModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - head_count = self.hparams["num_attention_heads"] - head_count_kv = head_count - hidden_size = self.hparams["hidden_size"] - - self.gguf_writer.add_name('persimmon-8b-chat') - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - - # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller - # than the head size? - # ref: https://github.com/ggerganov/llama.cpp/pull/4889 - # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) - self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) - - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - - def set_vocab(self): - self._set_vocab_sentencepiece() - # self.gguf_writer.add_bos_token_id(71013) - # self.gguf_writer.add_eos_token_id(71013) - - def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - if name.endswith(".self_attention.rotary_emb.inv_freq"): - continue - old_dtype = data_torch.dtype - # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) 
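
Every `write_tensors()` in this script repeats the same casting rules around the two output types (ftype 0 = f32, ftype 1 = f16). The policy restated as one standalone decision function, for clarity only (`choose_dtype` and the tensor names are illustrative, not part of the script):

import numpy as np

def choose_dtype(ftype, data_dtype, n_dims, name):
    if ftype == 0 and data_dtype == np.float16:
        return np.float32   # f32 output: everything is stored as f32
    if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
        return np.float32   # f16 output: 1-D tensors (norms, biases) are still stored as f32
    if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return np.float16   # f16 output: 2-D weight matrices are downcast to f16
    return data_dtype       # anything else keeps its dtype

assert choose_dtype(1, np.float32, 2, "model.layers.0.self_attn.q_proj.weight") == np.float16
assert choose_dtype(1, np.float32, 1, "model.layers.0.input_layernorm.weight") == np.float32
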
- data = data_torch.to(torch.float32).squeeze().numpy() - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - n_dims = len(data.shape) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - -class StableLMModel(Model): - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab - self._set_vocab_qwen() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(1e-5) - - -class MixtralModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - -class MiniCPMModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def set_vocab(self): - self._set_vocab_hf() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - n_head = self.hparams.get("num_attention_heads") - n_kv_head = self.hparams.get("num_key_value_heads") - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = 
self._reverse_hf_permute(data_torch, n_head, n_kv_head) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class QwenModel(Model): - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - self._set_vocab_qwen() - - def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - - def write_tensors(self): - block_count = self.hparams["num_hidden_layers"] - model_kv = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we 
use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - -class GPT2Model(Model): - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_ctx"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): - continue - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - # note: GPT2 output is tied to (same as) wte in original model - if new_name == "token_embd.weight": - print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor("output.weight", data) - - -class Phi2Model(Model): - def set_gguf_parameters(self): - block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"]) - - rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"]) - n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"]) - n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"]) - - self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"])) - - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(4 * n_embd) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"])) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_add_bos_token(False) - - -class PlamoModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name("PLaMo") - self.gguf_writer.add_context_length(4096) # not in config.json - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - - def shuffle_attn_q_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(8, 5, 128, 5120) - data_torch = torch.permute(data_torch, (1, 0, 2, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def shuffle_attn_output_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(5120, 8, 5, 128) - data_torch = torch.permute(data_torch, (0, 2, 1, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - if "self_attn.rotary_emb.inv_freq" in name: - continue - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = 
self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class CodeShellModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("CodeShell") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(10000.0) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - tensors = dict(self.get_tensors()) - has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() - for name, data_torch in tensors.items(): - # we don't need these - if name.endswith((".attn.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "transformer.wte.weight": - self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - -class InternLM2Model(Model): - def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) - - sentencepiece_model = model.ModelProto() - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - - tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.id_to_piece(token_id) - text = piece.encode("utf-8") - score = tokenizer.get_score(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - print(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉" - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.is_unknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.is_control(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.is_unused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.is_byte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - old_eos = special_vocab.special_token_ids["eos"] - if "chat" in os.path.basename(self.dir_model.absolute()): - # For the chat model, we replace the eos with '<|im_end|>'. 
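
As the comment above notes, chat-tuned InternLM2 checkpoints end each turn with a dedicated token, which the script accepts as either '<|im_end|>' or '[UNUSED_TOKEN_145]', whichever one the tokenizer actually encodes as a single piece; pointing the GGUF eos at it is what lets a conversation end normally. A sketch of that lookup, mirroring `_try_get_sft_eos()` defined a few lines further down (the model path is a placeholder):

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor("path/to/tokenizer.model")  # placeholder path
candidates = [tok for tok in ("<|im_end|>", "[UNUSED_TOKEN_145]") if len(sp.encode(tok)) == 1]
print("chat eos piece:", candidates)  # expect exactly one single-token candidate
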
- special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ -in chat mode so that the conversation can end normally.") - - special_vocab.add_to_gguf(self.gguf_writer) - - def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') - im_end_list = tokenizer.encode('<|im_end|>') - assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) - if len(unused_145_list) == 1: - eos_token = unused_145_list[0] - if len(im_end_list) == 1: - eos_token = im_end_list[0] - return eos_token - - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - - def post_write_tensors(self, tensor_map, name, data_torch): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - def write_tensors(self): - from einops import rearrange - - num_heads = self.hparams.get("num_attention_heads") - num_kv_heads = self.hparams.get("num_key_value_heads") - hidden_size = self.hparams.get("hidden_size") - q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads - num_groups = num_heads // q_per_kv - - block_count = self.hparams["num_hidden_layers"] - model_kv = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] - qkv = data_torch - qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] - # The model weights of q and k equire additional reshape. - q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - v = rearrange(v, " o g n i -> o (g n i)").T - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) - else: - self.post_write_tensors(tensor_map, name, data_torch) - - -###### CONVERSION LOGIC ###### - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--awq-path", type=Path, default=None, - help="Path to scale awq cache file") - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "--outtype", type=str, choices=["f32", "f16"], default="f16", - help="output format - use f32 for float32, f16 for float16", - ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") - parser.add_argument( - "model", type=Path, - help="directory containing model file", - ) - - return parser.parse_args() - - -def main() -> None: - args = parse_args() - - dir_model = args.model - - if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" - dir_model = tmp_model_path - if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at 
{tmp_model_path}.") - - if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file=sys.stderr) - sys.exit(1) - - ftype_map = { - "f32": gguf.GGMLQuantizationType.F32, - "f16": gguf.GGMLQuantizationType.F16, - } - - if args.outfile is not None: - fname_out = args.outfile - else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' - - print(f"Loading model: {dir_model.name}") - - hparams = Model.load_hparams(dir_model) - - with torch.inference_mode(): - model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) - - print("Set model parameters") - model_instance.set_gguf_parameters() - - print("Set model tokenizer") - model_instance.set_vocab() - - if args.vocab_only: - print(f"Exporting model vocab to '{fname_out}'") - model_instance.write_vocab() - else: - print(f"Exporting model to '{fname_out}'") - model_instance.write() - - print(f"Model successfully exported to '{fname_out}'") - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/convert.py b/extensions/model-extension/scripts/convert.py deleted file mode 100755 index 323e8058d..000000000 --- a/extensions/model-extension/scripts/convert.py +++ /dev/null @@ -1,1478 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import concurrent.futures -import enum -import faulthandler -import functools -import itertools -import json -import math -import mmap -import os -import pickle -import re -import signal -import struct -import sys -import time -import zipfile -from abc import ABCMeta, abstractmethod -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from dataclasses import dataclass -from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar - -import numpy as np -from sentencepiece import SentencePieceProcessor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf - -if TYPE_CHECKING: - from typing import TypeAlias - -if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): - faulthandler.register(signal.SIGUSR1) - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -ARCH = gguf.MODEL_ARCH.LLAMA - -DEFAULT_CONCURRENCY = 8 - -# -# data types -# - - -@dataclass(frozen=True) -class DataType: - name: str - dtype: np.dtype[Any] - valid_conversions: list[str] - - def elements_to_bytes(self, n_elements: int) -> int: - return n_elements * self.dtype.itemsize - - -@dataclass(frozen=True) -class UnquantizedDataType(DataType): - pass - - -DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) -DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) -DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) -DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) - - -@dataclass(frozen=True) -class QuantizedDataType(DataType): - block_size: int - quantized_dtype: np.dtype[Any] - ggml_type: gguf.GGMLQuantizationType - - def quantize(self, arr: NDArray) -> NDArray: - raise NotImplementedError(f'Quantization for {self.name} not implemented') - - def elements_to_bytes(self, n_elements: int) -> int: - assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} 
for {self.name} with block size {self.block_size}' - return self.quantized_dtype.itemsize * (n_elements // self.block_size) - - -@dataclass(frozen=True) -class Q8_0QuantizedDataType(QuantizedDataType): - # Mini Q8_0 quantization in Python! - def quantize(self, arr: NDArray) -> NDArray: - assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}' - assert arr.dtype == np.float32, f'Bad array type {arr.dtype}' - n_blocks = arr.size // self.block_size - blocks = arr.reshape((n_blocks, self.block_size)) - # Much faster implementation of block quantization contributed by @Cebtenzzre - - def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: - d = abs(blocks).max(axis = 1) / np.float32(127) - with np.errstate(divide = 'ignore'): - qs = (blocks / d[:, None]).round() - qs[d == 0] = 0 - yield from zip(d, qs) - return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) - - -DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', - dtype = np.dtype(np.float32), valid_conversions = [], - ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, - quantized_dtype = np.dtype([('d', ' DataType: - dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self) - if dt is None: - raise ValueError(self) - # 1D tensors are always F32. - return dt if len(tensor.shape) > 1 else DT_F32 - - -GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { - GGMLFileType.AllF32 : DT_F32, - GGMLFileType.MostlyF16 : DT_F16, - GGMLFileType.MostlyQ8_0: DT_Q8_0, -} - -# -# hparams loading -# - - -@dataclass -class Params: - n_vocab: int - n_embd: int - n_layer: int - n_ctx: int - n_ff: int - n_head: int - n_head_kv: int - n_experts: int | None = None - n_experts_used: int | None = None - f_norm_eps: float | None = None - - rope_scaling_type: gguf.RopeScalingType | None = None - f_rope_freq_base: float | None = None - f_rope_scale: float | None = None - n_orig_ctx: int | None = None - rope_finetuned: bool | None = None - - ftype: GGMLFileType | None = None - - # path to the directory containing the model files - path_model: Path | None = None - - @staticmethod - def guessed(model: LazyModel) -> Params: - # try transformer naming first - n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape - - # try transformer naming first - if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) - elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming - n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) - else: - n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) - - if n_layer < 1: - raise Exception("failed to guess 'n_layer'. 
This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - - n_head = n_embd // 128 # guessed - n_mult = 256 # guessed - - # TODO: verify this - n_ff = int(2 * (4 * n_embd) / 3) - n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) - - return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, - n_ctx = -1, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head, - f_norm_eps = 1e-5, - ) - - @staticmethod - def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: - config = json.load(open(config_path)) - - rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None - rope_scaling = config.get("rope_scaling") - - if rope_scaling is not None and (typ := rope_scaling.get("type")): - rope_factor = rope_scaling.get("factor") - f_rope_scale = rope_factor - if typ == "linear": - rope_scaling_type = gguf.RopeScalingType.LINEAR - elif typ == "yarn": - rope_scaling_type = gguf.RopeScalingType.YARN - n_orig_ctx = rope_scaling['original_max_position_embeddings'] - rope_finetuned = rope_scaling['finetuned'] - else: - raise NotImplementedError(f'Unknown rope scaling type: {typ}') - - if "max_sequence_length" in config: - n_ctx = config["max_sequence_length"] - elif "max_position_embeddings" in config: - n_ctx = config["max_position_embeddings"] - else: - raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - - n_experts = None - n_experts_used = None - - if "num_local_experts" in config: - n_experts = config["num_local_experts"] - n_experts_used = config["num_experts_per_tok"] - - return Params( - n_vocab = config["vocab_size"], - n_embd = config["hidden_size"], - n_layer = config["num_hidden_layers"], - n_ctx = n_ctx, - n_ff = config["intermediate_size"], - n_head = (n_head := config["num_attention_heads"]), - n_head_kv = config.get("num_key_value_heads", n_head), - n_experts = n_experts, - n_experts_used = n_experts_used, - f_norm_eps = config["rms_norm_eps"], - f_rope_freq_base = config.get("rope_theta"), - rope_scaling_type = rope_scaling_type, - f_rope_scale = f_rope_scale, - n_orig_ctx = n_orig_ctx, - rope_finetuned = rope_finetuned, - ) - - # LLaMA v2 70B params.json - # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} - @staticmethod - def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: - config = json.load(open(config_path)) - - n_experts = None - n_experts_used = None - f_rope_freq_base = None - - # hack to determine LLaMA v1 vs v2 vs CodeLlama - if config.get("moe"): - # Mixtral - n_ctx = 32768 - elif config.get("rope_theta") == 1000000: - # CodeLlama - n_ctx = 16384 - elif config["norm_eps"] == 1e-05: - # LLaMA v2 - n_ctx = 4096 - else: - # LLaMA v1 - n_ctx = 2048 - - if "layers.0.feed_forward.w1.weight" in model: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] - - if config.get("moe"): - n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] - n_experts = config["moe"]["num_experts"] - n_experts_used = config["moe"]["num_experts_per_tok"] - f_rope_freq_base = 1e6 - - return Params( - n_vocab = model["tok_embeddings.weight"].shape[0], - n_embd = config["dim"], - n_layer = config["n_layers"], - n_ctx = n_ctx, - n_ff = n_ff, - n_head = (n_head := config["n_heads"]), - n_head_kv = config.get("n_kv_heads", 
n_head), - n_experts = n_experts, - n_experts_used = n_experts_used, - f_norm_eps = config["norm_eps"], - f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), - ) - - @staticmethod - def load(model_plus: ModelPlus) -> Params: - hf_config_path = model_plus.paths[0].parent / "config.json" - orig_config_path = model_plus.paths[0].parent / "params.json" - - if hf_config_path.exists(): - params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) - elif orig_config_path.exists(): - params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) - elif model_plus.format != 'none': - params = Params.guessed(model_plus.model) - else: - raise ValueError('Cannot guess params when model format is none') - - params.path_model = model_plus.paths[0].parent - - return params - - -# -# vocab -# - -class BpeVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - if isinstance(self.bpe_tokenizer.get('model'), dict): - self.vocab = self.bpe_tokenizer["model"]["vocab"] - else: - self.vocab = self.bpe_tokenizer - added_tokens: dict[str, int] - if fname_added_tokens is not None: - # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - added_tokens = dict( - (item['content'], item['id']) - for item in tokenizer_json.get('added_tokens', []) - # Added tokens here can be duplicates of the main vocabulary. 
- if item['content'] not in self.bpe_tokenizer) - - vocab_size: int = len(self.vocab) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") - - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_dict = added_tokens - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} - - for i, _ in enumerate(self.vocab): - yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.CONTROL - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.bpe_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} - - vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - - new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} - expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) - - if expected_new_ids != actual_new_ids: - raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") - - # Token pieces that were added to the base vocabulary. - self.added_tokens_dict = added_tokens - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.sentencepiece_tokenizer - for i in range(tokenizer.vocab_size()): - piece = tokenizer.id_to_piece(i) - text: bytes = piece.encode("utf-8") - score: float = tokenizer.get_score(i) - - toktype = gguf.TokenType.NORMAL - if tokenizer.is_unknown(i): - toktype = gguf.TokenType.UNKNOWN - if tokenizer.is_control(i): - toktype = gguf.TokenType.CONTROL - - # NOTE: I think added_tokens are user defined. 
- # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto - # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED - - if tokenizer.is_unused(i): - toktype = gguf.TokenType.UNUSED - if tokenizer.is_byte(i): - toktype = gguf.TokenType.BYTE - - yield text, score, toktype - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.sentencepiece_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class HfVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: - try: - from transformers import AutoTokenizer - except ImportError as e: - raise ImportError( - "To use HfVocab, please install the `transformers` package. " - "You can install it with `pip install transformers`." - ) from e - - print("fname_tokenizer:", fname_tokenizer) - # Allow the tokenizer to default to slow or fast versions. - # Explicitly set tokenizer to use local paths. - self.tokenizer = AutoTokenizer.from_pretrained( - fname_tokenizer, - cache_dir=fname_tokenizer, - local_files_only=True, - ) - - # Initialize lists and dictionaries for added tokens - self.added_tokens_list = [] - self.added_tokens_dict = dict() - self.added_tokens_ids = set() - - # Process added tokens - for tok, tokidx in sorted( - self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] - ): - # Only consider added tokens that are not in the base vocabulary - if tokidx >= self.tokenizer.vocab_size: - self.added_tokens_list.append(tok) - self.added_tokens_dict[tok] = tokidx - self.added_tokens_ids.add(tokidx) - - # Store special tokens and their IDs - self.specials = { - tok: self.tokenizer.get_vocab()[tok] - for tok in self.tokenizer.all_special_tokens - } - self.special_ids = set(self.tokenizer.all_special_ids) - - # Set vocabulary sizes - self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - reverse_vocab = { - id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() - } - - for token_id in range(self.vocab_size_base): - # Skip processing added tokens here - if token_id in self.added_tokens_ids: - continue - - # Convert token text to bytes - token_text = reverse_vocab[token_id].encode("utf-8") - - # Yield token text, score, and type - yield token_text, self.get_token_score(token_id), self.get_token_type( - token_id, token_text, self.special_ids # Reuse already stored special IDs - ) - - def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: - # Special case for byte tokens - if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - return gguf.TokenType.BYTE - - # Determine token type based on whether it's a special token - return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - - def get_token_score(self, token_id: int) -> float: - # Placeholder for actual logic to determine the token's score - # This needs to be implemented based on specific requirements - return -1000.0 # Default score - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in 
self.added_tokens_list: - if text in self.specials: - toktype = self.get_token_type(self.specials[text], b'', self.special_ids) - score = self.get_token_score(self.specials[text]) - else: - toktype = gguf.TokenType.USER_DEFINED - score = -1000.0 - - yield text.encode("utf-8"), score, toktype - - def has_newline_token(self): - return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.hf_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab" - - -# -# data loading -# TODO: reuse (probably move to gguf.py?) -# - - -def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - -class Tensor(metaclass=ABCMeta): - data_type: DataType - - @abstractmethod - def astype(self, data_type: DataType) -> Tensor: ... - @abstractmethod - def permute(self, n_head: int, n_head_kv: int) -> Tensor: ... - @abstractmethod - def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ... - @abstractmethod - def part(self, n_part: int) -> UnquantizedTensor: ... - @abstractmethod - def to_ggml(self) -> GGMLCompatibleTensor: ... - - -def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: - assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}" - fp32_arr = bf16_arr.astype(np.uint32) << 16 - return fp32_arr.view(np.float32) - - -class UnquantizedTensor(Tensor): - def __init__(self, ndarray: NDArray) -> None: - assert isinstance(ndarray, np.ndarray) - self.ndarray = ndarray - self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] - - def astype(self, data_type: DataType) -> Tensor: - dtype = data_type.dtype - if self.data_type == DT_BF16: - self.ndarray = bf16_to_fp32(self.ndarray) - return UnquantizedTensor(self.ndarray.astype(dtype)) - - def to_ggml(self) -> UnquantizedTensor: - return self - - def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: - r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - - def part(self, n_part: int) -> UnquantizedTensor: - r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) - - def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor: - return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv)) - - -def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray: - tensor = lazy_tensor.load() - assert isinstance(tensor, UnquantizedTensor) - - # double-check: - actual_shape = list(tensor.ndarray.shape) - assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape) - if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype: - if convert: - tensor.ndarray = tensor.ndarray.astype(expected_dtype) - else: - raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}') - - return tensor.ndarray - - -GGMLCompatibleTensor = UnquantizedTensor - 
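The bf16 handling above hinges on one fact: a bfloat16 value is just the top 16 bits of the equivalent IEEE-754 float32, so widening and shifting recovers the float exactly. A minimal standalone sketch of the same conversion (numpy only, independent of the tensor classes above):

```python
import numpy as np

def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
    # Widen each 16-bit pattern to 32 bits, shift it into the high half,
    # then reinterpret the raw bits as float32.
    assert bf16_arr.dtype == np.uint16
    return (bf16_arr.astype(np.uint32) << 16).view(np.float32)

# 0x3FC0 is the bfloat16 bit pattern of 1.5 (float32 1.5 == 0x3FC00000).
print(bf16_to_fp32(np.array([0x3FC0, 0x0000], dtype=np.uint16)))  # [1.5 0. ]
```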
- -@dataclass -class LazyTensor: - _load: Callable[[], Tensor] - shape: list[int] - data_type: DataType - description: str - - def load(self) -> Tensor: - ret = self._load() - # Should be okay if it maps to the same numpy type? - assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ - (self.data_type, ret.data_type, self.description) - return ret - - def astype(self, data_type: DataType) -> LazyTensor: - self.validate_conversion_to(data_type) - - def load() -> Tensor: - return self.load().astype(data_type) - return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') - - def validate_conversion_to(self, data_type: DataType) -> None: - if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions: - raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') - - -LazyModel: TypeAlias = 'dict[str, LazyTensor]' - - -@dataclass -class ModelPlus: - model: LazyModel - paths: list[Path] # Where this was read from. - format: Literal['ggml', 'torch', 'safetensors', 'none'] - vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. - - -def merge_sharded(models: list[LazyModel]) -> LazyModel: - # Original LLaMA models have each file contain one part of each tensor. - # Use a dict instead of a set to preserve order. - names = {name: None for model in models for name in model} - - def convert(name: str) -> LazyTensor: - lazy_tensors: list[LazyTensor] = [model[name] for model in models] - if len(lazy_tensors) == 1: - # only one file; don't go through this procedure since there might - # be quantized tensors - return lazy_tensors[0] - if len(lazy_tensors[0].shape) == 1: - # the tensor is just duplicated in every file - return lazy_tensors[0] - if name.startswith('tok_embeddings.') or \ - name.endswith('.attention.wo.weight') or \ - name.endswith('.feed_forward.w2.weight'): - # split by columns - axis = 1 - else: - # split by rows - axis = 0 - concatenated_shape = list(lazy_tensors[0].shape) - concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) - - def load() -> UnquantizedTensor: - ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] - concatenated: NDArray = np.concatenate(ndarrays, axis=axis) - return UnquantizedTensor(concatenated) - description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' - return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) - return {name: convert(name) for name in names} - - -def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: - formats = set(mp.format for mp in models_plus) - assert len(formats) == 1, "different formats?" - format = formats.pop() - paths = [path for mp in models_plus for path in mp.paths] - # Use the first non-None vocab, if any. - try: - vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None) - except StopIteration: - vocab = None - - if any("model.embed_tokens.weight" in mp.model for mp in models_plus): - # Transformers models put different tensors in different files, but - # don't split individual tensors between files. 
- model: LazyModel = {} - for mp in models_plus: - model.update(mp.model) - else: - model = merge_sharded([mp.model for mp in models_plus]) - - return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types - - -def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: - def load() -> Tensor: - return lazy_tensor.load().permute(n_head, n_head_kv) - return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) - - -def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: - def load() -> Tensor: - return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) - s = lazy_tensor.shape.copy() - s[0] = s[0] // 3 - return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) - - -def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: - def load() -> Tensor: - return lazy_tensor.load().part(n_part) - s = lazy_tensor.shape.copy() - s[0] = s[0] // 3 - return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) - - -# Functionality that simulates `torch.load` but where individual tensors are -# only loaded into memory on demand, not all at once. -# PyTorch can't do this natively as of time of writing: -# - https://github.com/pytorch/pytorch/issues/64327 -# This allows us to de-shard without multiplying RAM usage, and also -# conveniently drops the PyTorch dependency (though we still need numpy). - - -@dataclass -class LazyStorageKind: - data_type: DataType - - -@dataclass -class LazyStorage: - load: Callable[[int, int], NDArray] - kind: LazyStorageKind - description: str - - -class LazyUnpickler(pickle.Unpickler): - def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): - super().__init__(fp) - self.data_base_path = data_base_path - self.zip_file = zip_file - - def persistent_load(self, pid: Any) -> Any: - assert pid[0] == 'storage' - assert isinstance(pid[1], LazyStorageKind) - data_type = pid[1].data_type - filename_stem = pid[2] - filename = f'{self.data_base_path}/{filename_stem}' - info = self.zip_file.getinfo(filename) - - def load(offset: int, elm_count: int) -> NDArray: - dtype = data_type.dtype - fp = self.zip_file.open(info) - fp.seek(offset * dtype.itemsize) - size = elm_count * dtype.itemsize - data = fp.read(size) - assert len(data) == size - return np.frombuffer(data, dtype) - description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' - return LazyStorage(load=load, kind=pid[1], description=description) - - @staticmethod - def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, - requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: - assert isinstance(storage, LazyStorage) - - def load() -> UnquantizedTensor: - elm_count = stride[0] * size[0] - return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) - description = f'pickled storage_offset={storage_offset} in {storage.description}' - return LazyTensor(load, list(size), storage.kind.data_type, description) - - @staticmethod - def rebuild_from_type_v2(func, new_type, args, state): - return func(*args) - - CLASSES: dict[tuple[str, str], Any] = { - # getattr used here as a workaround for mypy not being smart enough to determine - # the staticmethods have a __func__ attribute. 
- ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), - ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), - ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), - ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), - ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), - ('torch', 'IntStorage'): LazyStorageKind(DT_I32), - ('torch', 'Tensor'): LazyTensor, - } - - def find_class(self, module: str, name: str) -> Any: - if not module.startswith('torch'): - return super().find_class(module, name) - return self.CLASSES[(module, name)] - - -def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: - zf = zipfile.ZipFile(outer_fp) - pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] - assert len(pickle_paths) == 1, pickle_paths - pickle_fp = zf.open(pickle_paths[0], 'r') - unpickler = LazyUnpickler(pickle_fp, - data_base_path=pickle_paths[0][:-4], - zip_file=zf) - model = unpickler.load() - if 'model' in model: model = model['model'] - as_dict = dict(model.items()) - return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) - - -def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: - header_size, = struct.unpack(' LazyTensor: - data_type = SAFETENSORS_DATA_TYPES[info['dtype']] - numpy_dtype = data_type.dtype - shape: list[int] = info['shape'] - begin, end = info['data_offsets'] - assert 0 <= begin <= end <= len(byte_buf) - assert end - begin == math.prod(shape) * numpy_dtype.itemsize - buf = byte_buf[begin:end] - - def load() -> UnquantizedTensor: - return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) - description = f'safetensors begin={begin} end={end} type={data_type} path={path}' - return LazyTensor(load, shape, data_type, description) - model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'} - return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) - - -def must_read(fp: IO[bytes], length: int) -> bytes: - ret = fp.read(length) - if len(ret) < length: - raise Exception("unexpectedly reached end of file") - return ret - - -@functools.lru_cache(maxsize=None) -def lazy_load_file(path: Path) -> ModelPlus: - fp = open(path, 'rb') - first8 = fp.read(8) - fp.seek(0) - if first8[:2] == b'PK': - # A zip file, i.e. PyTorch format - return lazy_load_torch_file(fp, path) - elif struct.unpack(' Iterable[Out]: - '''Parallel map, but with backpressure. If the caller doesn't call `next` - fast enough, this will stop calling `func` at some point rather than - letting results pile up in memory. Specifically, there is a max of one - output value buffered per thread.''' - if concurrency < 2: - yield from map(func, iterable) - # Not reached. 
- iterable = iter(iterable) - executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor] - if use_processpool_executor: - executor_class = ProcessPoolExecutor - else: - executor_class = ThreadPoolExecutor - with executor_class(max_workers=max_workers) as executor: - futures: list[concurrent.futures.Future[Out]] = [] - done = False - for _ in range(concurrency): - try: - futures.append(executor.submit(func, next(iterable))) - except StopIteration: - done = True - break - - while futures: - result = futures.pop(0).result() - while not done and len(futures) < concurrency: - try: - futures.append(executor.submit(func, next(iterable))) - except StopIteration: - done = True - break - yield result - - -def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: - # Handle special case where the model's vocab size is not set - if params.n_vocab == -1: - raise ValueError( - f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?" - ) - - # Check for a vocab size mismatch - if params.n_vocab == vocab.vocab_size: - print("Ignoring added_tokens.json since model matches vocab size without it.") - return - - if pad_vocab and params.n_vocab > vocab.vocab_size: - pad_count = params.n_vocab - vocab.vocab_size - print( - f"Padding vocab with {pad_count} token(s) - through " - ) - for i in range(1, pad_count + 1): - vocab.added_tokens_dict[f""] = -1 - vocab.added_tokens_list.append(f"") - vocab.vocab_size = params.n_vocab - return - - msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})." - if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20: - msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." - if vocab.vocab_size < params.n_vocab: - msg += " Add the --pad-vocab option and try again." 
- - raise Exception(msg) - - -class OutputFile: - def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: - self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - - def add_meta_arch(self, params: Params) -> None: - name = "LLaMA" - - # TODO: better logic to determine model name - if params.n_ctx == 4096: - name = "LLaMA v2" - elif params.path_model is not None: - name = str(params.path_model.parent).split('/')[-1] - - self.gguf.add_name (name) - self.gguf.add_context_length (params.n_ctx) - self.gguf.add_embedding_length (params.n_embd) - self.gguf.add_block_count (params.n_layer) - self.gguf.add_feed_forward_length (params.n_ff) - self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) - self.gguf.add_head_count (params.n_head) - self.gguf.add_head_count_kv (params.n_head_kv) - - if params.n_experts: - self.gguf.add_expert_count(params.n_experts) - - if params.n_experts_used: - self.gguf.add_expert_used_count(params.n_experts_used) - - if params.f_norm_eps: - self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) - else: - raise ValueError('f_norm_eps is None') - - if params.f_rope_freq_base is not None: - self.gguf.add_rope_freq_base(params.f_rope_freq_base) - - if params.rope_scaling_type: - assert params.f_rope_scale is not None - self.gguf.add_rope_scaling_type(params.rope_scaling_type) - self.gguf.add_rope_scaling_factor(params.f_rope_scale) - - if params.n_orig_ctx is not None: - self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) - - if params.rope_finetuned is not None: - self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) - - if params.ftype is not None: - self.gguf.add_file_type(params.ftype) - - def handle_tokenizer_model(self, vocab: Vocab) -> str: - # Map the vocab types to the supported tokenizer models - tokenizer_model = { - SentencePieceVocab: "llama", - HfVocab: "llama", - BpeVocab: "gpt2", - }.get(type(vocab)) - - # Block if vocab type is not predefined - if tokenizer_model is None: - raise ValueError("Unknown vocab type: Not supported") - - return tokenizer_model - - def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: - tokens = [] - scores = [] - toktypes = [] - - # NOTE: `all_tokens` returns the base vocabulary and added tokens - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - return tokens, scores, toktypes - - def add_meta_vocab(self, vocab: Vocab) -> None: - # Handle the tokenizer model - tokenizer_model = self.handle_tokenizer_model(vocab) - - # Ensure that tokenizer_model is added to the GGUF model - self.gguf.add_tokenizer_model(tokenizer_model) - - # Extract model vocabulary for model conversion - tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) - - # Add extracted token information for model conversion - self.gguf.add_token_list(tokens) - self.gguf.add_token_scores(scores) - self.gguf.add_token_types(toktypes) - - def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None: - svocab.add_to_gguf(self.gguf) - - def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: - n_elements = int(np.prod(tensor.shape)) - raw_dtype = getattr(tensor.data_type, 'ggml_type', None) - data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype - data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - self.gguf.add_tensor_info(name, 
tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) - - def write_meta(self) -> None: - self.gguf.write_header_to_file() - self.gguf.write_kv_data_to_file() - - def write_tensor_info(self) -> None: - self.gguf.write_ti_data_to_file() - - def close(self) -> None: - self.gguf.close() - - @staticmethod - def write_vocab_only( - fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, - ) -> None: - check_vocab_size(params, vocab, pad_vocab = pad_vocab) - - of = OutputFile(fname_out, endianess=endianess) - - # meta data - of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) - - of.write_meta() - - of.close() - - @staticmethod - def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]: - name, lazy_tensor = item - tensor = lazy_tensor.load().to_ggml() - return (lazy_tensor.data_type, tensor.ndarray) - - @staticmethod - def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: - dt, arr = item - if not isinstance(dt, QuantizedDataType): - return arr - return dt.quantize(arr) - - @staticmethod - def write_all( - fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, - ) -> None: - check_vocab_size(params, vocab, pad_vocab=pad_vocab) - - of = OutputFile(fname_out, endianess=endianess) - - # meta data - of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) - - # tensor info - for name, lazy_tensor in model.items(): - of.add_tensor_info(name, lazy_tensor) - - of.write_meta() - of.write_tensor_info() - - # tensor data - ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) - if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = bounded_parallel_map( - OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, - use_processpool_executor=True, - ) - else: - ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) - - start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): - elapsed = time.time() - start - size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) - padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" - ) - of.gguf.write_tensor_data(ndarray) - - of.close() - - -def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type - - if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): - return GGMLFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): - return GGMLFileType.MostlyF16 - if output_type_str == "q8_0": - return GGMLFileType.MostlyQ8_0 - - name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} - - raise Exception(f"Unexpected combination of types: {name_to_type}") - - -def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: - return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) - for (name, tensor) in model.items()} - - -def convert_model_names(model: LazyModel, params: Params) -> LazyModel: 
- tmap = gguf.TensorNameMap(ARCH, params.n_layer) - should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) - - tmp = model - - # HF models permut or pack some of the tensors, so we need to undo that - for i in itertools.count(): - if f"model.layers.{i}.self_attn.q_proj.weight" in model: - print(f"Permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) - # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] - elif f"model.layers.{i}.self_attn.W_pack.weight" in model: - print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) - tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) - del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] - else: - break - - out: LazyModel = {} - for name, lazy_tensor in model.items(): - tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) - if name_new is None: - raise Exception(f"Unexpected tensor name: {name}") - - if tensor_type in should_skip: - print(f"skipping tensor {name_new}") - continue - - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") - out[name_new] = lazy_tensor - - return out - - -def nth_multifile_path(path: Path, n: int) -> Path | None: - '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return - the nth path in the model. - ''' - # Support the following patterns: - patterns: list[tuple[str, str]] = [ - # - x.00.pth, x.01.pth, etc. - (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), - # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. - (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), - # x.bin, x.bin.1, etc. - (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') - ] - for regex, replacement in patterns: - if re.search(regex, path.name): - new_path = path.with_name(re.sub(regex, replacement, path.name)) - if new_path.exists(): - return new_path - return None - - -def find_multifile_paths(path: Path) -> list[Path]: - '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return - the whole list of paths in the model. - ''' - ret: list[Path] = [] - for i in itertools.count(): - nth_path = nth_multifile_path(path, i) - if nth_path is None: - break - ret.append(nth_path) - if not ret: - # No matches. This should only happen if the file was named, e.g., - # foo.0, and there was no file named foo. Oh well, try to process it - # as a single file. 
- return [path] - return ret - - -def load_some_model(path: Path) -> ModelPlus: - '''Load a model of any supported format.''' - # Be extra-friendly and accept either a file or a directory: - if path.is_dir(): - # Check if it's a set of safetensors files first - globs = ["model-00001-of-*.safetensors", "model.safetensors"] - files = [file for glob in globs for file in path.glob(glob)] - if not files: - # Try the PyTorch patterns too, with lower priority - globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] - files = [file for glob in globs for file in path.glob(glob)] - if not files: - raise Exception(f"Can't find model in directory {path}") - if len(files) > 1: - raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") - path = files[0] - - paths = find_multifile_paths(path) - models_plus: list[ModelPlus] = [] - for path in paths: - print(f"Loading model file {path}") - models_plus.append(lazy_load_file(path)) - - model_plus = merge_multifile_models(models_plus) - return model_plus - - -class VocabFactory: - def __init__(self, path: Path): - self.path = path - self.files: dict[str, Path | None] = { - "tokenizer.model": None, - "vocab.json": None, - "tokenizer.json": None, - } - self._detect_files() - - def _detect_files(self): - for file in self.files.keys(): - file_path = self.path / file - parent_file_path = self.path.parent / file - if file_path.exists(): - self.files[file] = file_path - elif parent_file_path.exists(): - self.files[file] = parent_file_path - print(f"Found vocab files: {self.files}") - - def _select_file(self, vocabtype: str | None) -> Path: - if vocabtype in ["spm", "bpe"]: - for file_key in self.files.keys(): - if (file := self.files[file_key]) is not None: - return file - raise FileNotFoundError(f"{vocabtype} vocab not found.") - if vocabtype == "hfft": - # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file - return self.path - raise ValueError(f"Unsupported vocabulary type {vocabtype}") - - def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: - load_merges = vocabtype == "bpe" - n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None - return gguf.SpecialVocab( - model_parent_path, - load_merges=load_merges, - special_token_types=None, # Predetermined or passed as a parameter - n_vocab=n_vocab, - ) - - def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: - path = self._select_file(vocabtype) - print(f"Loading vocab file '{path}', type '{vocabtype}'") - - added_tokens_path = path.parent / "added_tokens.json" - vocab: Vocab - if vocabtype == "bpe": - vocab = BpeVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - elif vocabtype == "spm": - vocab = SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - elif vocabtype == "hfft": - vocab = HfVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") - # FIXME: Respect --vocab-dir? 
- special_vocab = self._create_special_vocab( - vocab, - vocabtype, - model_parent_path, - ) - return vocab, special_vocab - - -def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: - namestr = { - GGMLFileType.AllF32: "f32", - GGMLFileType.MostlyF16: "f16", - GGMLFileType.MostlyQ8_0:"q8_0", - }[file_type] - ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" - if ret in model_paths: - sys.stderr.write( - f"Error: Default output path ({ret}) would overwrite the input. " - "Please explicitly specify a path using --outfile.\n") - sys.exit(1) - return ret - - -def do_dump_model(model_plus: ModelPlus) -> None: - print(f"model_plus.paths = {model_plus.paths!r}") - print(f"model_plus.format = {model_plus.format!r}") - print(f"model_plus.vocab = {model_plus.vocab!r}") - for name, lazy_tensor in model_plus.model.items(): - print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") - - -def main(args_in: list[str] | None = None) -> None: - output_choices = ["f32", "f16"] - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # We currently only support Q8_0 output on little endian systems. - output_choices.append("q8_0") - vocab_types = ["spm", "bpe", "hfft"] - parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") - parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") - parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) - parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") - parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") - - args = parser.parse_args(args_in) - if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" - if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at {tmp_model_path}.") - args.model = tmp_model_path - - if args.dump_single: - model_plus = 
lazy_load_file(args.model) - do_dump_model(model_plus) - return - - if not args.vocab_only: - model_plus = load_some_model(args.model) - else: - model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) - - if args.dump: - do_dump_model(model_plus) - return - endianess = gguf.GGUFEndian.LITTLE - if args.big_endian: - endianess = gguf.GGUFEndian.BIG - - params = Params.load(model_plus) - if params.n_ctx == -1: - if args.ctx is None: - raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n" - "Please specify one with --ctx:\n" - " - LLaMA v1: --ctx 2048\n" - " - LLaMA v2: --ctx 4096\n") - params.n_ctx = args.ctx - - if args.outtype: - params.ftype = { - "f32": GGMLFileType.AllF32, - "f16": GGMLFileType.MostlyF16, - "q8_0": GGMLFileType.MostlyQ8_0, - }[args.outtype] - - print(f"params = {params}") - - model_parent_path = model_plus.paths[0].parent - vocab_path = Path(args.vocab_dir or args.model or model_parent_path) - vocab_factory = VocabFactory(vocab_path) - vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) - - if args.vocab_only: - if not args.outfile: - raise ValueError("need --outfile if using --vocab-only") - outfile = args.outfile - OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, - endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") - return - - if model_plus.vocab is not None and args.vocab_dir is None: - vocab = model_plus.vocab - - print(f"Vocab info: {vocab}") - print(f"Special vocab info: {special_vocab}") - - model = model_plus.model - model = convert_model_names(model, params) - ftype = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, ftype) - - params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") - - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, - concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/gguf-py/LICENSE b/extensions/model-extension/scripts/gguf-py/LICENSE deleted file mode 100644 index 76f67efdc..000000000 --- a/extensions/model-extension/scripts/gguf-py/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 Georgi Gerganov - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
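The conversion entry point removed above (main) accepts an optional argument list, so the same flow the CLI uses can also be driven from Python. A rough sketch with hypothetical paths, assuming the deleted script is importable as a module (called convert here purely for illustration):

```python
# Equivalent to: python convert.py models/my-model --outtype f16 --outfile models/my-model/ggml-model-f16.gguf
import convert  # hypothetical module name for the conversion script shown above

convert.main([
    "--outtype", "f16",                                  # f32 / f16 / q8_0 (q8_0 only on little-endian systems)
    "--outfile", "models/my-model/ggml-model-f16.gguf",  # hypothetical output path
    "models/my-model",                                   # directory containing config.json, tokenizer and weights
])
```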
diff --git a/extensions/model-extension/scripts/gguf-py/README.md b/extensions/model-extension/scripts/gguf-py/README.md deleted file mode 100644 index 22d7ffa52..000000000 --- a/extensions/model-extension/scripts/gguf-py/README.md +++ /dev/null @@ -1,81 +0,0 @@ -## gguf - -This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) -(GGML Universal File) format. - -See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) -as an example for its usage. - -## Installation -```sh -pip install gguf -``` - -## API Examples/Simple Tools - -[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. - -[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console. - -[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key. - -[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files. - -## Development -Maintainers who participate in development of this package are advised to install it in editable mode: - -```sh -cd /path/to/llama.cpp/gguf-py - -pip install --editable . -``` - -**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`. -In this case, upgrade Pip to the latest: - -```sh -pip install --upgrade pip -``` - -## Automatic publishing with CI - -There's a GitHub workflow to make a release automatically upon creation of tags in a specified format. - -1. Bump the version in `pyproject.toml`. -2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number. - -```sh -git tag -a gguf-v1.0.0 -m "Version 1.0 release" -``` - -3. Push the tags. - -```sh -git push origin --tags -``` - -## Manual publishing -If you want to publish the package manually for any reason, you need to have `twine` and `build` installed: - -```sh -pip install build twine -``` - -Then, follow these steps to release a new version: - -1. Bump the version in `pyproject.toml`. -2. Build the package: - -```sh -python -m build -``` - -3. Upload the generated distribution archives: - -```sh -python -m twine upload dist/* -``` - -## TODO -- [ ] Add tests -- [ ] Include conversion scripts as command line entry points in this package. 
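For the read side that gguf-dump.py builds on, a rough sketch follows. It assumes the class and attribute names match upstream gguf-py (GGUFReader with fields and tensors); the vendored gguf_reader.py removed further down may differ, so treat this as illustrative only.

```python
from gguf import GGUFReader  # assumes the package layout described in the README above

reader = GGUFReader("example.gguf")  # e.g. the file produced by examples/writer.py

# Key/value metadata section
for key in reader.fields:
    print(key)

# Tensor directory
for tensor in reader.tensors:
    print(tensor.name, tensor.shape)
```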
diff --git a/extensions/model-extension/scripts/gguf-py/examples/writer.py b/extensions/model-extension/scripts/gguf-py/examples/writer.py deleted file mode 100755 index f39eed1af..000000000 --- a/extensions/model-extension/scripts/gguf-py/examples/writer.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -import sys -from pathlib import Path - -import numpy as np - -# Necessary to load the local gguf package -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf import GGUFWriter # noqa: E402 - - -# Example usage: -def writer_example() -> None: - # Example usage with a file - gguf_writer = GGUFWriter("example.gguf", "llama") - - gguf_writer.add_architecture() - gguf_writer.add_block_count(12) - gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer - gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float - gguf_writer.add_custom_alignment(64) - - tensor1 = np.ones((32,), dtype=np.float32) * 100.0 - tensor2 = np.ones((64,), dtype=np.float32) * 101.0 - tensor3 = np.ones((96,), dtype=np.float32) * 102.0 - - gguf_writer.add_tensor("tensor1", tensor1) - gguf_writer.add_tensor("tensor2", tensor2) - gguf_writer.add_tensor("tensor3", tensor3) - - gguf_writer.write_header_to_file() - gguf_writer.write_kv_data_to_file() - gguf_writer.write_tensors_to_file() - - gguf_writer.close() - - -if __name__ == '__main__': - writer_example() diff --git a/extensions/model-extension/scripts/gguf-py/gguf/__init__.py b/extensions/model-extension/scripts/gguf-py/gguf/__init__.py deleted file mode 100644 index 110ab342c..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .constants import * -from .gguf_reader import * -from .gguf_writer import * -from .tensor_mapping import * -from .vocab import * diff --git a/extensions/model-extension/scripts/gguf-py/gguf/constants.py b/extensions/model-extension/scripts/gguf-py/gguf/constants.py deleted file mode 100644 index 1cfd41c0b..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/constants.py +++ /dev/null @@ -1,665 +0,0 @@ -from __future__ import annotations - -import sys -from enum import Enum, IntEnum, auto -from typing import Any - -# -# constants -# - -GGUF_MAGIC = 0x46554747 # "GGUF" -GGUF_VERSION = 3 -GGUF_DEFAULT_ALIGNMENT = 32 - -# -# metadata keys -# - - -class Keys: - class General: - ARCHITECTURE = "general.architecture" - QUANTIZATION_VERSION = "general.quantization_version" - ALIGNMENT = "general.alignment" - NAME = "general.name" - AUTHOR = "general.author" - URL = "general.url" - DESCRIPTION = "general.description" - LICENSE = "general.license" - SOURCE_URL = "general.source.url" - SOURCE_HF_REPO = "general.source.huggingface.repository" - FILE_TYPE = "general.file_type" - - class LLM: - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - BLOCK_COUNT = "{arch}.block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" - USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - - class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" - LAYERNORM_RMS_EPS = 
"{arch}.attention.layer_norm_rms_epsilon" - - class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - - class Tokenizer: - MODEL = "tokenizer.ggml.model" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" - - -# -# recommended mapping of model tensor names for storage in gguf -# - - -class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - PERSIMMON = auto() - REFACT = auto() - BERT = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - PHI2 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - - -class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE_INP = auto() - FFN_NORM = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - - -MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.PERSIMMON: "persimmon", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", -} - -TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - 
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", -} - -MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { - MODEL_ARCH.LLAMA: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_GATE_INP, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, - ], - MODEL_ARCH.GPTNEOX: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.FALCON: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_NORM_2, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.BAICHUAN: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.STARCODER: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.BERT: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.TOKEN_TYPES, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.MPT: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_ACT, - ], - MODEL_ARCH.GPTJ: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.PERSIMMON: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.ATTN_Q_NORM, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.REFACT: [ - 
MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.BLOOM: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.TOKEN_EMBD_NORM, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.STABLELM: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.QWEN: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.QWEN2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.PLAMO: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.GPT2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.PHI2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.CODESHELL: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.ORION: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.INTERNLM2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - 
MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.MINICPM: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_GATE_INP, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, - ], - # TODO -} - -# tensors that will not be serialized -MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { - MODEL_ARCH.LLAMA: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.BAICHUAN: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.PERSIMMON: [ - MODEL_TENSOR.ROPE_FREQS, - ], - MODEL_ARCH.QWEN: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.CODESHELL: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.ORION: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], -} - -# -# types -# - - -class TokenType(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' - - -class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 - - -class GGUFEndian(IntEnum): - LITTLE = 0 - BIG = 1 - - -class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 - FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 - UINT64 = 10 - INT64 = 11 - FLOAT64 = 12 - - @staticmethod - def get_type(val: Any) -> GGUFValueType: - if isinstance(val, (str, bytes, bytearray)): - return GGUFValueType.STRING - elif isinstance(val, list): - return GGUFValueType.ARRAY - elif isinstance(val, float): - return GGUFValueType.FLOAT32 - elif isinstance(val, bool): - return GGUFValueType.BOOL - elif isinstance(val, int): - return GGUFValueType.INT32 - # TODO: need help with 64-bit types in Python - else: - print("Unknown type:", type(val)) - sys.exit() - - -# Note: Does not support GGML_QKK_64 -QK_K = 256 -# Items here are (block size, type size) -GGML_QUANT_SIZES = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), - GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), -} - - -# Aliases for backward compatibility. 
- -# general -KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE -KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION -KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT -KEY_GENERAL_NAME = Keys.General.NAME -KEY_GENERAL_AUTHOR = Keys.General.AUTHOR -KEY_GENERAL_URL = Keys.General.URL -KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION -KEY_GENERAL_LICENSE = Keys.General.LICENSE -KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL -KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO -KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE - -# LLM -KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH -KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH -KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT -KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH -KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL -KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT - -# attention -KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT -KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV -KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS -KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV -KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS -KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS - -# RoPE -KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR -KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED - -# tokenization -KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL -KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST -KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE -KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES -KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES -KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID -KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID -KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID -KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID -KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON -KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf.py deleted file mode 100644 index 651a81eb8..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/gguf.py +++ /dev/null @@ -1,15 +0,0 @@ -# This file left for compatibility. If you want to use the GGUF API from Python -# then don't import gguf/gguf.py directly. If you're looking for examples, see the -# examples/ directory for gguf-py - -import importlib -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent)) - -# Compatibility for people trying to import gguf/gguf.py directly instead of as a package. -importlib.invalidate_caches() -import gguf # noqa: E402 - -importlib.reload(gguf) diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py deleted file mode 100644 index 5b6d4ba6b..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py +++ /dev/null @@ -1,264 +0,0 @@ -# -# GGUF file reading/modification support. For API usage information, -# please see the files scripts/ for some fairly simple examples. 
-# -from __future__ import annotations - -import os -from collections import OrderedDict -from typing import Any, Literal, NamedTuple, TypeVar, Union - -import numpy as np -import numpy.typing as npt - -if __name__ == "__main__": - import sys - from pathlib import Path - - # Allow running file in package as a script. - sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf.constants import ( - GGML_QUANT_SIZES, - GGUF_DEFAULT_ALIGNMENT, - GGUF_MAGIC, - GGUF_VERSION, - GGMLQuantizationType, - GGUFValueType, -) - - -READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] - - -class ReaderField(NamedTuple): - # Offset to start of this field. - offset: int - - # Name of the field (not necessarily from file data). - name: str - - # Data parts. Some types have multiple components, such as strings - # that consist of a length followed by the string data. - parts: list[npt.NDArray[Any]] = [] - - # Indexes into parts that we can call the actual data. For example - # an array of strings will be populated with indexes to the actual - # string data. - data: list[int] = [-1] - - types: list[GGUFValueType] = [] - - -class ReaderTensor(NamedTuple): - name: str - tensor_type: GGMLQuantizationType - shape: npt.NDArray[np.uint32] - n_elements: int - n_bytes: int - data_offset: int - data: npt.NDArray[Any] - field: ReaderField - - -class GGUFReader: - # I - same as host, S - swapped - byte_order: Literal['I' | 'S'] = 'I' - alignment: int = GGUF_DEFAULT_ALIGNMENT - - # Note: Internal helper, API may change. - gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = { - GGUFValueType.UINT8: np.uint8, - GGUFValueType.INT8: np.int8, - GGUFValueType.UINT16: np.uint16, - GGUFValueType.INT16: np.int16, - GGUFValueType.UINT32: np.uint32, - GGUFValueType.INT32: np.int32, - GGUFValueType.FLOAT32: np.float32, - GGUFValueType.UINT64: np.uint64, - GGUFValueType.INT64: np.int64, - GGUFValueType.FLOAT64: np.float64, - GGUFValueType.BOOL: np.bool_, - } - - def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'): - self.data = np.memmap(path, mode = mode) - offs = 0 - if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC: - raise ValueError('GGUF magic invalid') - offs += 4 - temp_version = self._get(offs, np.uint32) - if temp_version[0] & 65535 == 0: - # If we get 0 here that means it's (probably) a GGUF file created for - # the opposite byte order of the machine this script is running on. 
- self.byte_order = 'S' - temp_version = temp_version.newbyteorder(self.byte_order) - version = temp_version[0] - if version not in READER_SUPPORTED_VERSIONS: - raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle') - self.fields: OrderedDict[str, ReaderField] = OrderedDict() - self.tensors: list[ReaderTensor] = [] - offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32])) - temp_counts = self._get(offs, np.uint64, 2) - offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64])) - offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64])) - tensor_count, kv_count = temp_counts - offs = self._build_fields(offs, kv_count) - offs, tensors_fields = self._build_tensors_fields(offs, tensor_count) - new_align = self.fields.get('general.alignment') - if new_align is not None: - if new_align.types != [GGUFValueType.UINT32]: - raise ValueError('Bad type for general.alignment field') - self.alignment = new_align.parts[-1][0] - padding = offs % self.alignment - if padding != 0: - offs += self.alignment - padding - self._build_tensors(offs, tensors_fields) - - _DT = TypeVar('_DT', bound = npt.DTypeLike) - - # Fetch a key/value metadata field by key. - def get_field(self, key: str) -> Union[ReaderField, None]: - return self.fields.get(key, None) - - # Fetch a tensor from the list by index. - def get_tensor(self, idx: int) -> ReaderTensor: - return self.tensors[idx] - - def _get( - self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None, - ) -> npt.NDArray[Any]: - count = int(count) - itemsize = int(np.empty([], dtype = dtype).itemsize) - end_offs = offset + itemsize * count - return ( - self.data[offset:end_offs] - .view(dtype = dtype)[:count] - .newbyteorder(override_order or self.byte_order) - ) - - def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: - if field.name in self.fields: - raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') - self.fields[field.name] = field - return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts) - - def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]: - slen = self._get(offset, np.uint64) - return slen, self._get(offset + 8, np.uint8, slen[0]) - - def _get_field_parts( - self, orig_offs: int, raw_type: int, - ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]: - offs = orig_offs - types: list[GGUFValueType] = [] - gtype = GGUFValueType(raw_type) - types.append(gtype) - # Handle strings. - if gtype == GGUFValueType.STRING: - sparts: list[npt.NDArray[Any]] = list(self._get_str(offs)) - size = sum(int(part.nbytes) for part in sparts) - return size, sparts, [1], types - # Check if it's a simple scalar type. - nptype = self.gguf_scalar_to_np.get(gtype) - if nptype is not None: - val = self._get(offs, nptype) - return int(val.nbytes), [val], [0], types - # Handle arrays. 
- if gtype == GGUFValueType.ARRAY: - raw_itype = self._get(offs, np.uint32) - offs += int(raw_itype.nbytes) - alen = self._get(offs, np.uint64) - offs += int(alen.nbytes) - aparts: list[npt.NDArray[Any]] = [raw_itype, alen] - data_idxs: list[int] = [] - for idx in range(alen[0]): - curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0]) - if idx == 0: - types += curr_types - idxs_offs = len(aparts) - aparts += curr_parts - data_idxs += (idx + idxs_offs for idx in curr_idxs) - offs += curr_size - return offs - orig_offs, aparts, data_idxs, types - # We can't deal with this one. - raise ValueError('Unknown/unhandled field type {gtype}') - - def _get_tensor(self, orig_offs: int) -> ReaderField: - offs = orig_offs - name_len, name_data = self._get_str(offs) - offs += int(name_len.nbytes + name_data.nbytes) - n_dims = self._get(offs, np.uint32) - offs += int(n_dims.nbytes) - dims = self._get(offs, np.uint64, n_dims[0]) - offs += int(dims.nbytes) - raw_dtype = self._get(offs, np.uint32) - offs += int(raw_dtype.nbytes) - offset_tensor = self._get(offs, np.uint64) - offs += int(offset_tensor.nbytes) - return ReaderField( - orig_offs, - str(bytes(name_data), encoding = 'utf-8'), - [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor], - [1, 3, 4, 5], - ) - - def _build_fields(self, offs: int, count: int) -> int: - for _ in range(count): - orig_offs = offs - kv_klen, kv_kdata = self._get_str(offs) - offs += int(kv_klen.nbytes + kv_kdata.nbytes) - raw_kv_type = self._get(offs, np.uint32) - offs += int(raw_kv_type.nbytes) - parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type] - idxs_offs = len(parts) - field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0]) - parts += field_parts - self._push_field(ReaderField( - orig_offs, - str(bytes(kv_kdata), encoding = 'utf-8'), - parts, - [idx + idxs_offs for idx in field_idxs], - field_types, - ), skip_sum = True) - offs += field_size - return offs - - def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]: - tensor_fields = [] - for _ in range(count): - field = self._get_tensor(offs) - offs += sum(int(part.nbytes) for part in field.parts) - tensor_fields.append(field) - return offs, tensor_fields - - def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: - tensors = [] - for field in fields: - _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts - ggml_type = GGMLQuantizationType(raw_dtype[0]) - n_elems = np.prod(dims) - block_size, type_size = GGML_QUANT_SIZES[ggml_type] - n_bytes = n_elems * type_size // block_size - data_offs = int(start_offs + offset_tensor[0]) - item_type: npt.DTypeLike - if ggml_type == GGMLQuantizationType.F32: - item_count = n_elems - item_type = np.float32 - elif ggml_type == GGMLQuantizationType.F16: - item_count = n_elems - item_type = np.float16 - else: - item_count = n_bytes - item_type = np.uint8 - tensors.append(ReaderTensor( - name = str(bytes(name_data), encoding = 'utf-8'), - tensor_type = ggml_type, - shape = dims, - n_elements = n_elems, - n_bytes = n_bytes, - data_offset = data_offs, - data = self._get(data_offs, item_type, item_count), - field = field, - )) - self.tensors = tensors diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py deleted file mode 100644 index 16808196e..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py +++ /dev/null 
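The gguf_reader.py module removed above was the Python-side API the model extension used to inspect GGUF files; the index.ts changes later in this diff instead retrieve this metadata through a `retrieveGGUFMetadata` call on the main process. For reference, a minimal sketch of how the removed reader was typically driven, assuming the gguf-py package this PR deletes is still importable and using a placeholder `model.gguf` path:

```python
from gguf import GGUFReader
from gguf.constants import GGML_QUANT_SIZES

reader = GGUFReader("model.gguf", "r")  # memory-maps the file read-only

# Key/value metadata is exposed as an ordered dict of ReaderField objects.
for key in list(reader.fields)[:10]:
    print(key)

# Tensor info: n_bytes follows the GGML_QUANT_SIZES table above,
# i.e. n_elements * type_size // block_size.
for tensor in reader.tensors:
    block_size, type_size = GGML_QUANT_SIZES[tensor.tensor_type]
    assert tensor.n_bytes == tensor.n_elements * type_size // block_size
    print(tensor.name, tensor.tensor_type.name, tensor.shape.tolist(), tensor.n_bytes)
```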
@@ -1,427 +0,0 @@ -from __future__ import annotations - -import os -import shutil -import struct -import tempfile -from enum import Enum, auto -from io import BufferedWriter -from typing import IO, Any, Sequence - -import numpy as np - -from .constants import ( - GGUF_DEFAULT_ALIGNMENT, - GGUF_MAGIC, - GGUF_VERSION, - GGMLQuantizationType, - GGUFEndian, - GGUFValueType, - Keys, - RopeScalingType, - TokenType, -) - - -class WriterState(Enum): - EMPTY = auto() - HEADER = auto() - KV_DATA = auto() - TI_DATA = auto() - - -class GGUFWriter: - fout: BufferedWriter - temp_file: tempfile.SpooledTemporaryFile[bytes] | None - tensors: list[np.ndarray[Any, Any]] - _simple_value_packing = { - GGUFValueType.UINT8: "B", - GGUFValueType.INT8: "b", - GGUFValueType.UINT16: "H", - GGUFValueType.INT16: "h", - GGUFValueType.UINT32: "I", - GGUFValueType.INT32: "i", - GGUFValueType.FLOAT32: "f", - GGUFValueType.UINT64: "Q", - GGUFValueType.INT64: "q", - GGUFValueType.FLOAT64: "d", - GGUFValueType.BOOL: "?", - } - - def __init__( - self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True, - endianess: GGUFEndian = GGUFEndian.LITTLE, - ): - self.fout = open(path, "wb") - self.arch = arch - self.endianess = endianess - self.offset_tensor = 0 - self.data_alignment = GGUF_DEFAULT_ALIGNMENT - self.kv_data = bytearray() - self.kv_data_count = 0 - self.ti_data = bytearray() - self.ti_data_count = 0 - self.use_temp_file = use_temp_file - self.temp_file = None - self.tensors = [] - print("gguf: This GGUF file is for {0} Endian only".format( - "Big" if self.endianess == GGUFEndian.BIG else "Little", - )) - self.state = WriterState.EMPTY - - self.add_architecture() - - def write_header_to_file(self) -> None: - if self.state is not WriterState.EMPTY: - raise ValueError(f'Expected output file to be empty, got {self.state}') - - self._write_packed(" None: - if self.state is not WriterState.HEADER: - raise ValueError(f'Expected output file to contain the header, got {self.state}') - - self.fout.write(self.kv_data) - self.flush() - self.state = WriterState.KV_DATA - - def write_ti_data_to_file(self) -> None: - if self.state is not WriterState.KV_DATA: - raise ValueError(f'Expected output file to contain KV data, got {self.state}') - - self.fout.write(self.ti_data) - self.flush() - self.state = WriterState.TI_DATA - - def add_key(self, key: str) -> None: - self.add_val(key, GGUFValueType.STRING, add_vtype=False) - - def add_uint8(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT8) - - def add_int8(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT8) - - def add_uint16(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT16) - - def add_int16(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT16) - - def add_uint32(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT32) - - def add_int32(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT32) - - def add_float32(self, key: str, val: float) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.FLOAT32) - - def add_uint64(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT64) - - def add_int64(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT64) - - def add_float64(self, key: str, val: float) -> None: - self.add_key(key) - 
self.add_val(val, GGUFValueType.FLOAT64) - - def add_bool(self, key: str, val: bool) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.BOOL) - - def add_string(self, key: str, val: str) -> None: - if not val: - return - self.add_key(key) - self.add_val(val, GGUFValueType.STRING) - - def add_array(self, key: str, val: Sequence[Any]) -> None: - if not isinstance(val, Sequence): - raise ValueError("Value must be a sequence for array type") - - self.add_key(key) - self.add_val(val, GGUFValueType.ARRAY) - - def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None: - if vtype is None: - vtype = GGUFValueType.get_type(val) - - if add_vtype: - self.kv_data += self._pack("I", vtype) - self.kv_data_count += 1 - - pack_fmt = self._simple_value_packing.get(vtype) - if pack_fmt is not None: - self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL) - elif vtype == GGUFValueType.STRING: - encoded_val = val.encode("utf8") if isinstance(val, str) else val - self.kv_data += self._pack("Q", len(encoded_val)) - self.kv_data += encoded_val - elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: - ltype = GGUFValueType.get_type(val[0]) - if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): - raise ValueError("All items in a GGUF array should be of the same type") - self.kv_data += self._pack("I", ltype) - self.kv_data += self._pack("Q", len(val)) - for item in val: - self.add_val(item, add_vtype=False) - else: - raise ValueError("Invalid GGUF metadata value type or value") - - @staticmethod - def ggml_pad(x: int, n: int) -> int: - return ((x + n - 1) // n) * n - - def add_tensor_info( - self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], - tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None, - ) -> None: - if self.state is not WriterState.EMPTY: - raise ValueError(f'Expected output file to be empty, got {self.state}') - - if raw_dtype is None and tensor_dtype not in (np.float32, np.float16): - raise ValueError("Only F32 and F16 tensors are supported for now") - - encoded_name = name.encode("utf8") - self.ti_data += self._pack("Q", len(encoded_name)) - self.ti_data += encoded_name - n_dims = len(tensor_shape) - self.ti_data += self._pack("I", n_dims) - for i in range(n_dims): - self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) - if raw_dtype is None: - dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 - else: - dtype = raw_dtype - self.ti_data += self._pack("I", dtype) - self.ti_data += self._pack("Q", self.offset_tensor) - self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) - self.ti_data_count += 1 - - def add_tensor( - self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, - raw_dtype: GGMLQuantizationType | None = None, - ) -> None: - if self.endianess == GGUFEndian.BIG: - tensor.byteswap(inplace=True) - if self.use_temp_file and self.temp_file is None: - fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) - fp.seek(0) - self.temp_file = fp - - shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape - self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype) - - if self.temp_file is None: - self.tensors.append(tensor) - return - - tensor.tofile(self.temp_file) - self.write_padding(self.temp_file, tensor.nbytes) - - def write_padding(self, fp: 
IO[bytes], n: int, align: int | None = None) -> None: - pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n - if pad != 0: - fp.write(bytes([0] * pad)) - - def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: - if self.state is not WriterState.TI_DATA: - raise ValueError(f'Expected output file to contain tensor info, got {self.state}') - - if self.endianess == GGUFEndian.BIG: - tensor.byteswap(inplace=True) - self.write_padding(self.fout, self.fout.tell()) - tensor.tofile(self.fout) - self.write_padding(self.fout, tensor.nbytes) - - def write_tensors_to_file(self) -> None: - self.write_ti_data_to_file() - - self.write_padding(self.fout, self.fout.tell()) - - if self.temp_file is None: - while True: - try: - tensor = self.tensors.pop(0) - except IndexError: - break - tensor.tofile(self.fout) - self.write_padding(self.fout, tensor.nbytes) - return - - self.temp_file.seek(0) - - shutil.copyfileobj(self.temp_file, self.fout) - self.flush() - self.temp_file.close() - - def flush(self) -> None: - self.fout.flush() - - def close(self) -> None: - self.fout.close() - - def add_architecture(self) -> None: - self.add_string(Keys.General.ARCHITECTURE, self.arch) - - def add_author(self, author: str) -> None: - self.add_string(Keys.General.AUTHOR, author) - - def add_tensor_data_layout(self, layout: str) -> None: - self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) - - def add_url(self, url: str) -> None: - self.add_string(Keys.General.URL, url) - - def add_description(self, description: str) -> None: - self.add_string(Keys.General.DESCRIPTION, description) - - def add_source_url(self, url: str) -> None: - self.add_string(Keys.General.SOURCE_URL, url) - - def add_source_hf_repo(self, repo: str) -> None: - self.add_string(Keys.General.SOURCE_HF_REPO, repo) - - def add_file_type(self, ftype: int) -> None: - self.add_uint32(Keys.General.FILE_TYPE, ftype) - - def add_name(self, name: str) -> None: - self.add_string(Keys.General.NAME, name) - - def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None: - self.add_uint32( - Keys.General.QUANTIZATION_VERSION, quantization_version) - - def add_custom_alignment(self, alignment: int) -> None: - self.data_alignment = alignment - self.add_uint32(Keys.General.ALIGNMENT, alignment) - - def add_context_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) - - def add_embedding_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) - - def add_block_count(self, length: int) -> None: - self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) - - def add_feed_forward_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) - - def add_parallel_residual(self, use: bool) -> None: - self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) - - def add_head_count(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) - - def add_head_count_kv(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) - - def add_key_length(self, length: int) -> None: - self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length) - - def add_value_length(self, length: int) -> None: - self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) - - def add_max_alibi_bias(self, 
bias: float) -> None: - self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) - - def add_clamp_kqv(self, value: float) -> None: - self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) - - def add_expert_count(self, count: int) -> None: - self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) - - def add_expert_used_count(self, count: int) -> None: - self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count) - - def add_layer_norm_eps(self, value: float) -> None: - self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) - - def add_layer_norm_rms_eps(self, value: float) -> None: - self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) - - def add_rope_dimension_count(self, count: int) -> None: - self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) - - def add_rope_freq_base(self, value: float) -> None: - self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) - - def add_rope_scaling_type(self, value: RopeScalingType) -> None: - self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value) - - def add_rope_scaling_factor(self, value: float) -> None: - self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) - - def add_rope_scaling_orig_ctx_len(self, value: int) -> None: - self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) - - def add_rope_scaling_finetuned(self, value: bool) -> None: - self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) - - def add_tokenizer_model(self, model: str) -> None: - self.add_string(Keys.Tokenizer.MODEL, model) - - def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: - self.add_array(Keys.Tokenizer.LIST, tokens) - - def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: - self.add_array(Keys.Tokenizer.MERGES, merges) - - def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None: - self.add_array(Keys.Tokenizer.TOKEN_TYPE, types) - - def add_token_scores(self, scores: Sequence[float]) -> None: - self.add_array(Keys.Tokenizer.SCORES, scores) - - def add_bos_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.BOS_ID, id) - - def add_eos_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.EOS_ID, id) - - def add_unk_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.UNK_ID, id) - - def add_sep_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SEP_ID, id) - - def add_pad_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PAD_ID, id) - - def add_add_bos_token(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_BOS, value) - - def add_add_eos_token(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_EOS, value) - - def add_add_space_prefix(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) - - def add_chat_template(self, value: str) -> None: - self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - - def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: - pack_prefix = '' - if not skip_pack_prefix: - pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>' - return struct.pack(f'{pack_prefix}{fmt}', value) - - def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: - self.fout.write(self._pack(fmt, value, skip_pack_prefix)) diff --git 
a/extensions/model-extension/scripts/gguf-py/gguf/py.typed b/extensions/model-extension/scripts/gguf-py/gguf/py.typed deleted file mode 100644 index e69de29bb..000000000 diff --git a/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py b/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py deleted file mode 100644 index 4f16d8504..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py +++ /dev/null @@ -1,332 +0,0 @@ -from __future__ import annotations - -from typing import Sequence - -from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES - - -class TensorNameMap: - mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { - # Token embeddings - MODEL_TENSOR.TOKEN_EMBD: ( - "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen - "transformer.word_embeddings", # falcon - "word_embeddings", # bloom - "model.embed_tokens", # llama-hf - "tok_embeddings", # llama-pth - "embeddings.word_embeddings", # bert - "language_model.embedding.word_embeddings", # persimmon - "wte", # gpt2 - "transformer.embd.wte", # phi2 - "model.tok_embeddings", # internlm2 - ), - - # Token type embeddings - MODEL_TENSOR.TOKEN_TYPES: ( - "embeddings.token_type_embeddings", # bert - ), - - # Normalization of token embeddings - MODEL_TENSOR.TOKEN_EMBD_NORM: ( - "word_embeddings_layernorm", # bloom - ), - - # Position embeddings - MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", # gpt2 - "embeddings.position_embeddings", # bert - "wpe", # gpt2 - ), - - # Output - MODEL_TENSOR.OUTPUT: ( - "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen - "output", # llama-pth bloom internlm2 - "word_embeddings_for_head", # persimmon - "lm_head.linear", # phi2 - ), - - # Output norm - MODEL_TENSOR.OUTPUT_NORM: ( - "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 gpt-j falcon - "model.norm", # llama-hf baichuan internlm2 - "norm", # llama-pth - "embeddings.LayerNorm", # bert - "transformer.norm_f", # mpt - "ln_f", # refact bloom qwen gpt2 - "language_model.encoder.final_layernorm", # persimmon - "model.final_layernorm", # persimmon - "lm_head.ln", # phi2 - ), - - # Rope frequencies - MODEL_TENSOR.ROPE_FREQS: ( - "rope.freqs", # llama-pth - ), - } - - block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { - # Attention norm - MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "h.{bid}.input_layernorm", # bloom - "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf - "layers.{bid}.attention_norm", # llama-pth - "encoder.layer.{bid}.attention.output.LayerNorm", # bert - "language_model.encoder.layers.{bid}.input_layernorm", # persimmon - "model.layers.{bid}.ln1", # yi - "h.{bid}.ln_1", # gpt2 - "transformer.h.{bid}.ln", # phi2 - "model.layers.layers.{bid}.norm", # plamo - "model.layers.{bid}.attention_norm", # internlm2 - ), - - # Attention norm 2 - MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b - ), - - # Attention query-key-value - MODEL_TENSOR.ATTN_QKV: ( - "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox - "transformer.h.{bid}.attn.c_attn", # gpt2 qwen - "transformer.blocks.{bid}.attn.Wqkv", # mpt - "transformer.h.{bid}.self_attention.query_key_value", # falcon - "h.{bid}.self_attention.query_key_value", # bloom - 
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon - "model.layers.{bid}.self_attn.query_key_value", # persimmon - "h.{bid}.attn.c_attn", # gpt2 - "transformer.h.{bid}.mixer.Wqkv", # phi2 - ), - - # Attention query - MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf - "layers.{bid}.attention.wq", # llama-pth - "encoder.layer.{bid}.attention.self.query", # bert - "transformer.h.{bid}.attn.q_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.q_proj", # plamo - "model.layers.{bid}.attention.wq" # internlm2 - ), - - # Attention key - MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf - "layers.{bid}.attention.wk", # llama-pth - "encoder.layer.{bid}.attention.self.key", # bert - "transformer.h.{bid}.attn.k_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.k_proj", # plamo - "model.layers.{bid}.attention.wk" # internlm2 - ), - - # Attention value - MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf - "layers.{bid}.attention.wv", # llama-pth - "encoder.layer.{bid}.attention.self.value", # bert - "transformer.h.{bid}.attn.v_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.v_proj", # plamo - "model.layers.{bid}.attention.wv" # internlm2 - ), - - # Attention output - MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf - "layers.{bid}.attention.wo", # llama-pth - "encoder.layer.{bid}.attention.output.dense", # bert - "transformer.h.{bid}.attn.out_proj", # gpt-j - "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon - "model.layers.{bid}.self_attn.dense", # persimmon - "h.{bid}.attn.c_proj", # gpt2 - "transformer.h.{bid}.mixer.out_proj", # phi2 - "model.layers.layers.{bid}.self_attn.o_proj", # plamo - "model.layers.{bid}.attention.wo", # internlm2 - ), - - # Rotary embeddings - MODEL_TENSOR.ATTN_ROT_EMBD: ( - "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf - "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth - "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo - "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell - ), - - # Feed-forward norm - MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen - "h.{bid}.post_attention_layernorm", # bloom - "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf - "layers.{bid}.ffn_norm", # llama-pth - "encoder.layer.{bid}.output.LayerNorm", # bert - "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon - "model.layers.{bid}.ln2", # yi - "h.{bid}.ln_2", # gpt2 - "model.layers.{bid}.ffn_norm", # internlm2 - ), - - MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral - ), - - # Feed-forward up - MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 - "transformer.blocks.{bid}.ffn.up_proj", # mpt - "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon - "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact - "layers.{bid}.feed_forward.w3", # llama-pth - 
"encoder.layer.{bid}.intermediate.dense", # bert - "transformer.h.{bid}.mlp.fc_in", # gpt-j - "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "transformer.h.{bid}.mlp.w1", # qwen - "h.{bid}.mlp.c_fc", # gpt2 - "transformer.h.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.fc1", # phi2 - "model.layers.layers.{bid}.mlp.up_proj", # plamo - "model.layers.{bid}.feed_forward.w3", # internlm2 - ), - - MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral - ), - - # AWQ-activation gate - MODEL_TENSOR.FFN_ACT: ( - "transformer.blocks.{bid}.ffn.act", # mpt - ), - - # Feed-forward gate - MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - ), - - MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral - ), - - # Feed-forward down - MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.ffn.down_proj", # mpt - "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon - "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf - "layers.{bid}.feed_forward.w2", # llama-pth - "encoder.layer.{bid}.output.dense", # bert - "transformer.h.{bid}.mlp.fc_out", # gpt-j - "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "h.{bid}.mlp.c_proj", # gpt2 - "transformer.h.{bid}.mlp.fc2", # phi2 - "model.layers.{bid}.mlp.fc2", # phi2 - "model.layers.layers.{bid}.mlp.down_proj", # plamo - "model.layers.{bid}.feed_forward.w2", # internlm2 - ), - - MODEL_TENSOR.FFN_DOWN_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral - ), - - MODEL_TENSOR.ATTN_Q_NORM: ( - "language_model.encoder.layers.{bid}.self_attention.q_layernorm", - "model.layers.{bid}.self_attn.q_layernorm", # persimmon - ), - - MODEL_TENSOR.ATTN_K_NORM: ( - "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - "model.layers.{bid}.self_attn.k_layernorm", # persimmon - ), - - MODEL_TENSOR.ROPE_FREQS: ( - "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon - ), - } - - mapping: dict[str, tuple[MODEL_TENSOR, str]] - - def __init__(self, arch: MODEL_ARCH, n_blocks: int): - self.mapping = {} - for tensor, keys in self.mappings_cfg.items(): - if tensor not in MODEL_TENSORS[arch]: - continue - tensor_name = TENSOR_NAMES[tensor] - self.mapping[tensor_name] = (tensor, tensor_name) - for key in keys: - self.mapping[key] = (tensor, tensor_name) - for bid in range(n_blocks): - for tensor, keys in self.block_mappings_cfg.items(): - if tensor not in MODEL_TENSORS[arch]: - continue - # TODO: make this configurable - n_experts = 8 - for xid in range(n_experts): - tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) - self.mapping[tensor_name] = (tensor, tensor_name) - for key in keys: - key = key.format(bid = bid, xid = xid) - self.mapping[key] = (tensor, tensor_name) - - def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> 
tuple[MODEL_TENSOR, str] | None: - result = self.mapping.get(key) - if result is not None: - return result - for suffix in try_suffixes: - if key.endswith(suffix): - result = self.mapping.get(key[:-len(suffix)]) - if result is not None: - return result[0], result[1] + suffix - return None - - def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) - if result is None: - return None - return result[1] - - def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) - if result is None: - return None - return result[0] - - def __getitem__(self, key: str) -> str: - try: - return self.mapping[key][1] - except KeyError: - raise KeyError(key) - - def __contains__(self, key: str) -> bool: - return key in self.mapping - - def __repr__(self) -> str: - return repr(self.mapping) - - -def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap: - return TensorNameMap(arch, n_blocks) diff --git a/extensions/model-extension/scripts/gguf-py/gguf/vocab.py b/extensions/model-extension/scripts/gguf-py/gguf/vocab.py deleted file mode 100644 index cd1942975..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/vocab.py +++ /dev/null @@ -1,185 +0,0 @@ -from __future__ import annotations - -import json -import os -import sys -from pathlib import Path -from typing import Any, Callable - -from .gguf_writer import GGUFWriter - - -class SpecialVocab: - merges: list[str] - add_special_token: dict[str, bool] - special_token_ids: dict[str, int] - chat_template: str | None - - def __init__( - self, path: str | os.PathLike[str], load_merges: bool = False, - special_token_types: tuple[str, ...] 
| None = None, - n_vocab: int | None = None, - ): - self.special_token_ids = {} - self.add_special_token = {} - self.n_vocab = n_vocab - self.load_merges = load_merges - self.merges = [] - self.chat_template = None - if special_token_types is not None: - self.special_token_types = special_token_types - else: - self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad') - self._load(Path(path)) - - def __repr__(self) -> str: - return ''.format( - len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset", - ) - - def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: - if self.merges: - if not quiet: - print(f'gguf: Adding {len(self.merges)} merge(s).') - gw.add_token_merges(self.merges) - elif self.load_merges: - print( - 'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.', - file = sys.stderr, - ) - for typ, tokid in self.special_token_ids.items(): - id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) - if id_handler is None: - print( - f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', - file = sys.stderr, - ) - continue - if not quiet: - print(f'gguf: Setting special token type {typ} to {tokid}') - id_handler(tokid) - for typ, value in self.add_special_token.items(): - add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) - if add_handler is None: - print( - f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping', - file = sys.stderr, - ) - continue - if not quiet: - print(f'gguf: Setting add_{typ}_token to {value}') - add_handler(value) - if self.chat_template is not None: - if not quiet: - print(f'gguf: Setting chat_template to {self.chat_template}') - gw.add_chat_template(self.chat_template) - - def _load(self, path: Path) -> None: - self._try_load_from_tokenizer_json(path) - self._try_load_from_config_json(path) - if self.load_merges and not self.merges: - self._try_load_merges_txt(path) - - def _try_load_merges_txt(self, path: Path) -> bool: - merges_file = path / 'merges.txt' - if not merges_file.is_file(): - return False - with open(merges_file, 'r', encoding = 'utf-8') as fp: - first_line = next(fp, '').strip() - if not first_line.startswith('#'): - fp.seek(0) - line_num = 0 - else: - line_num = 1 - merges = [] - for line in fp: - line_num += 1 - line = line.strip() - if not line: - continue - parts = line.split(None, 3) - if len(parts) != 2: - print( - f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring', - file = sys.stderr, - ) - continue - merges.append(f'{parts[0]} {parts[1]}') - self.merges = merges - return True - - def _set_special_token(self, typ: str, tid: Any) -> None: - if not isinstance(tid, int): - return - if tid < 0: - raise ValueError(f'invalid value for special token type {typ}: {tid}') - if self.n_vocab is None or tid < self.n_vocab: - if typ in self.special_token_ids: - return - self.special_token_ids[typ] = tid - return - print( - f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping', - file = sys.stderr, - ) - - def _try_load_from_tokenizer_json(self, path: Path) -> bool: - tokenizer_file = path / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, encoding = 'utf-8') as f: - tokenizer = json.load(f) - if self.load_merges: - merges = tokenizer.get('model', {}).get('merges') - if isinstance(merges, list) and merges and isinstance(merges[0], str): - 
self.merges = merges - added_tokens = tokenizer.get('added_tokens', {}) - else: - added_tokens = {} - tokenizer_config_file = path / 'tokenizer_config.json' - if not tokenizer_config_file.is_file(): - return True - with open(tokenizer_config_file, encoding = 'utf-8') as f: - tokenizer_config = json.load(f) - chat_template = tokenizer_config.get('chat_template') - if chat_template is None or isinstance(chat_template, str): - self.chat_template = chat_template - else: - print( - f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring', - file = sys.stderr - ) - for typ in self.special_token_types: - add_entry = tokenizer_config.get(f'add_{typ}_token') - if isinstance(add_entry, bool): - self.add_special_token[typ] = add_entry - if not added_tokens: - # We will need this to get the content for the token, so if it's empty - # may as well just give up. - continue - entry = tokenizer_config.get(f'{typ}_token') - if isinstance(entry, str): - tc_content = entry - elif isinstance(entry, dict): - entry_content = entry.get('content') - if not isinstance(entry_content, str): - continue - tc_content = entry_content - else: - continue - # We only need the first match here. - maybe_token_id = next( - (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content), - None, - ) - self._set_special_token(typ, maybe_token_id) - return True - - def _try_load_from_config_json(self, path: Path) -> bool: - config_file = path / 'config.json' - if not config_file.is_file(): - return False - with open(config_file, encoding = 'utf-8') as f: - config = json.load(f) - for typ in self.special_token_types: - self._set_special_token(typ, config.get(f'{typ}_token_id')) - return True diff --git a/extensions/model-extension/scripts/gguf-py/pyproject.toml b/extensions/model-extension/scripts/gguf-py/pyproject.toml deleted file mode 100644 index 9789c2c87..000000000 --- a/extensions/model-extension/scripts/gguf-py/pyproject.toml +++ /dev/null @@ -1,35 +0,0 @@ -[tool.poetry] -name = "gguf" -version = "0.7.0" -description = "Read and write ML models in GGUF for GGML" -authors = ["GGML "] -packages = [ - {include = "gguf"}, - {include = "gguf/py.typed"}, - {include = "scripts"}, -] -readme = "README.md" -homepage = "https://ggml.ai" -repository = "https://github.com/ggerganov/llama.cpp" -keywords = ["ggml", "gguf", "llama.cpp"] -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] - -[tool.poetry.dependencies] -python = ">=3.8" -numpy = ">=1.17" - -[tool.poetry.dev-dependencies] -pytest = "^5.2" - -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" - -[tool.poetry.scripts] -gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" -gguf-dump = "scripts:gguf_dump_entrypoint" -gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" diff --git a/extensions/model-extension/scripts/gguf-py/scripts/__init__.py b/extensions/model-extension/scripts/gguf-py/scripts/__init__.py deleted file mode 100644 index 77132db7a..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import os - -from importlib import import_module - - -os.environ["NO_LOCAL_GGUF"] = "TRUE" - -gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main -gguf_dump_entrypoint = import_module("scripts.gguf-dump").main -gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main - -del 
import_module, os diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py b/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py deleted file mode 100755 index 10a16ad06..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import os -import sys -from pathlib import Path - -import numpy as np - -# Necessary to load the local gguf package -if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): - sys.path.insert(0, str(Path(__file__).parent.parent)) - -import gguf - - -def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # Host is little endian - host_endian = "little" - swapped_endian = "big" - else: - # Sorry PDP or other weird systems that don't use BE or LE. - host_endian = "big" - swapped_endian = "little" - if reader.byte_order == "S": - file_endian = swapped_endian - else: - file_endian = host_endian - order = host_endian if args.order == "native" else args.order - print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") - if file_endian == order: - print(f"* File is already {order.upper()} endian. Nothing to do.") - sys.exit(0) - print("* Checking tensors for conversion compatibility") - for tensor in reader.tensors: - if tensor.tensor_type not in ( - gguf.GGMLQuantizationType.F32, - gguf.GGMLQuantizationType.F16, - gguf.GGMLQuantizationType.Q8_0, - ): - raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") - print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") - if args.dry_run: - return - print("\n*** Warning *** Warning *** Warning **") - print("* This conversion process may damage the file. Ensure you have a backup.") - if order != host_endian: - print("* Requested endian differs from host, you will not be able to load the model on this machine.") - print("* The file will be modified immediately, so if conversion fails or is interrupted") - print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") - response = input("YES, I am sure> ") - if response != "YES": - print("You didn't enter YES. Okay then, see ya!") - sys.exit(0) - print(f"\n* Converting fields ({len(reader.fields)})") - for idx, field in enumerate(reader.fields.values()): - print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") - for part in field.parts: - part.byteswap(inplace=True) - print(f"\n* Converting tensors ({len(reader.tensors)})") - for idx, tensor in enumerate(reader.tensors): - print( - f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, " - f"elements={tensor.n_elements}... ", - end="", - ) - tensor_type = tensor.tensor_type - for part in tensor.field.parts: - part.byteswap(inplace=True) - if tensor_type != gguf.GGMLQuantizationType.Q8_0: - tensor.data.byteswap(inplace=True) - print() - continue - # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes - block_size = 34 - n_blocks = len(tensor.data) // block_size - for block_num in range(n_blocks): - block_offs = block_num * block_size - # I know I said f16, but it doesn't matter here - any simple 16 bit type works. 
- delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - if block_num % 100000 == 0: - print(f"[{(n_blocks - block_num) // 1000}K]", end="") - sys.stdout.flush() - print() - print("* Completion") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Convert GGUF file byte order") - parser.add_argument( - "model", type=str, - help="GGUF format model filename", - ) - parser.add_argument( - "order", type=str, choices=['big', 'little', 'native'], - help="Requested byte order", - ) - parser.add_argument( - "--dry-run", action="store_true", - help="Don't actually change anything", - ) - args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') - reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+') - convert_byteorder(reader, args) - - -if __name__ == "__main__": - main() diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py b/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py deleted file mode 100755 index dbf891508..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import os -import sys -from pathlib import Path -from typing import Any - -import numpy as np - -# Necessary to load the local gguf package -if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): - sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf import GGUFReader, GGUFValueType # noqa: E402 - - -def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: - host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG' - if reader.byte_order == 'S': - file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE' - else: - file_endian = host_endian - return (host_endian, file_endian) - - -# For more information about what field.parts and field.data represent, -# please see the comments in the modify_gguf.py example. 
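The comment above defers to an external modify_gguf.py example; the same layout is also spelled out in gguf-set-metadata.py further down. In short, ReaderField.parts holds every serialized component of a key/value entry in file order, while ReaderField.data lists only the indexes of the parts that carry the actual value. A small sketch for a scalar key, again assuming the removed gguf-py package and a placeholder path:

```python
from gguf import GGUFReader

reader = GGUFReader("model.gguf", "r")
field = reader.get_field("tokenizer.ggml.bos_token_id")
if field is not None:
    # parts, in file order: [key length, key bytes, value type id, value]
    # data holds only the value-carrying part indexes, here [3]
    bos_id = int(field.parts[field.data[0]][0])
    print("bos token id:", bos_id)
```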
-def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: - host_endian, file_endian = get_file_host_endian(reader) - print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') - print(f'\n* Dumping {len(reader.fields)} key/value pair(s)') - for n, field in enumerate(reader.fields.values(), 1): - if not field.types: - pretty_type = 'N/A' - elif field.types[0] == GGUFValueType.ARRAY: - nest_count = len(field.types) - 1 - pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count - else: - pretty_type = str(field.types[-1].name) - print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '') - if len(field.types) == 1: - curr_type = field.types[0] - if curr_type == GGUFValueType.STRING: - print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '') - elif field.types[0] in reader.gguf_scalar_to_np: - print(' = {0}'.format(field.parts[-1][0]), end = '') - print() - if args.no_tensors: - return - print(f'\n* Dumping {len(reader.tensors)} tensor(s)') - for n, tensor in enumerate(reader.tensors, 1): - prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape))) - print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') - - -def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: - import json - host_endian, file_endian = get_file_host_endian(reader) - metadata: dict[str, Any] = {} - tensors: dict[str, Any] = {} - result = { - "filename": args.model, - "endian": file_endian, - "metadata": metadata, - "tensors": tensors, - } - for idx, field in enumerate(reader.fields.values()): - curr: dict[str, Any] = { - "index": idx, - "type": field.types[0].name if field.types else 'UNKNOWN', - "offset": field.offset, - } - metadata[field.name] = curr - if field.types[:1] == [GGUFValueType.ARRAY]: - curr["array_types"] = [t.name for t in field.types][1:] - if not args.json_array: - continue - itype = field.types[-1] - if itype == GGUFValueType.STRING: - curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] - else: - curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()] - elif field.types[0] == GGUFValueType.STRING: - curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8") - else: - curr["value"] = field.parts[-1].tolist()[0] - if not args.no_tensors: - for idx, tensor in enumerate(reader.tensors): - tensors[tensor.name] = { - "index": idx, - "shape": tensor.shape.tolist(), - "type": tensor.tensor_type.name, - "offset": tensor.field.offset, - } - json.dump(result, sys.stdout) - - -def main() -> None: - parser = argparse.ArgumentParser(description="Dump GGUF file metadata") - parser.add_argument("model", type=str, help="GGUF format model filename") - parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") - parser.add_argument("--json", action="store_true", help="Produce JSON output") - parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") - args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - if not args.json: - print(f'* Loading: {args.model}') - reader = GGUFReader(args.model, 'r') - if args.json: - dump_metadata_json(reader, args) - else: - dump_metadata(reader, args) - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py 
b/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py deleted file mode 100755 index 3ebdfa898..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import os -import sys -from pathlib import Path - -# Necessary to load the local gguf package -if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): - sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf import GGUFReader # noqa: E402 - - -def minimal_example(filename: str) -> None: - reader = GGUFReader(filename, 'r+') - field = reader.fields['tokenizer.ggml.bos_token_id'] - if field is None: - return - part_index = field.data[0] - field.parts[part_index][0] = 2 # Set tokenizer.ggml.bos_token_id to 2 - # - # So what's this field.data thing? It's helpful because field.parts contains - # _every_ part of the GGUF field. For example, tokenizer.ggml.bos_token_id consists - # of: - # - # Part index 0: Key length (27) - # Part index 1: Key data ("tokenizer.ggml.bos_token_id") - # Part index 2: Field type (4, the id for GGUFValueType.UINT32) - # Part index 3: Field value - # - # Note also that each part is an NDArray slice, so even a part that - # is only a single value like the key length will be a NDArray of - # the key length type (numpy.uint32). - # - # The .data attribute in the Field is a list of relevant part indexes - # and doesn't contain internal GGUF details like the key length part. - # In this case, .data will be [3] - just the part index of the - # field value itself. - - -def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: - field = reader.get_field(args.key) - if field is None: - print(f'! Field {repr(args.key)} not found', file = sys.stderr) - sys.exit(1) - # Note that field.types is a list of types. This is because the GGUF - # format supports arrays. For example, an array of UINT32 would - # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32] - handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None - if handler is None: - print( - f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}', - file = sys.stderr, - ) - sys.exit(1) - current_value = field.parts[field.data[0]][0] - new_value = handler(args.value) - print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') - if current_value == new_value: - print(f'- Key {repr(args.key)} already set to requested value {current_value}') - sys.exit(0) - if args.dry_run: - sys.exit(0) - if not args.force: - print('*** Warning *** Warning *** Warning **') - print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') - print('* Enter exactly YES if you are positive you want to proceed:') - response = input('YES, I am sure> ') - if response != 'YES': - print("You didn't enter YES. Okay then, see ya!") - sys.exit(0) - field.parts[field.data[0]][0] = new_value - print('* Field changed. 
Successful completion.') - - -def main() -> None: - parser = argparse.ArgumentParser(description="Set a simple value in GGUF file metadata") - parser.add_argument("model", type=str, help="GGUF format model filename") - parser.add_argument("key", type=str, help="Metadata key to set") - parser.add_argument("value", type=str, help="Metadata value to set") - parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything") - parser.add_argument("--force", action="store_true", help="Change the field without confirmation") - args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') - reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+') - set_metadata(reader, args) - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py b/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py deleted file mode 100644 index 0adeb7d55..000000000 --- a/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py +++ /dev/null @@ -1,7 +0,0 @@ -import gguf # noqa: F401 - -# TODO: add tests - - -def test_write_gguf() -> None: - pass diff --git a/extensions/model-extension/scripts/install_deps.py b/extensions/model-extension/scripts/install_deps.py deleted file mode 100644 index 2dfabed07..000000000 --- a/extensions/model-extension/scripts/install_deps.py +++ /dev/null @@ -1,14 +0,0 @@ -import subprocess -import sys - -deps = [ - 'numpy~=1.24.4', - 'sentencepiece~=0.1.98', - 'transformers>=4.35.2,<5.0.0', - 'gguf>=0.1.0', - 'protobuf>=4.21.0,<5.0.0', - 'torch~=2.1.1', - 'packaging>=20.0', - 'tiktoken~=0.5.0' -] -subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', '--force-reinstall', *deps]) diff --git a/extensions/model-extension/scripts/version.txt b/extensions/model-extension/scripts/version.txt deleted file mode 100644 index f743d6c4a..000000000 --- a/extensions/model-extension/scripts/version.txt +++ /dev/null @@ -1 +0,0 @@ -b2106 \ No newline at end of file diff --git a/extensions/model-extension/src/index.ts b/extensions/model-extension/src/index.ts index 7561ee6ed..e2f68a58c 100644 --- a/extensions/model-extension/src/index.ts +++ b/extensions/model-extension/src/index.ts @@ -19,8 +19,6 @@ import { DownloadRequest, executeOnMain, HuggingFaceRepoData, - Quantization, - log, getFileSize, AllQuantizations, ModelEvent, @@ -353,7 +351,7 @@ export default class JanModelExtension extends ModelExtension { } /** - * Saves a machine learning model. + * Saves a model file. * @param model - The model to save. * @returns A Promise that resolves when the model is saved. */ @@ -565,6 +563,19 @@ export default class JanModelExtension extends ModelExtension { } const defaultModel = (await this.getDefaultModel()) as Model + const metadata = await executeOnMain( + NODE, + 'retrieveGGUFMetadata', + await joinPath([ + await getJanDataFolderPath(), + 'models', + dirName, + binaryFileName, + ]) + ) + + const eos_id = metadata?.['tokenizer.ggml.eos_token_id'] + if (!defaultModel) { console.error('Unable to find default model') return @@ -581,8 +592,20 @@ export default class JanModelExtension extends ModelExtension { filename: binaryFileName, }, ], + parameters: { + ...defaultModel.parameters, + stop: eos_id + ? [metadata['tokenizer.ggml.tokens'][eos_id] ?? ''] + : defaultModel.parameters.stop, + }, settings: { ...defaultModel.settings, + prompt_template: + metadata?.parsed_chat_template ?? 
+ defaultModel.settings.prompt_template, + ctx_len: + metadata?.['llama.context_length'] ?? defaultModel.settings.ctx_len, + ngl: (metadata?.['llama.block_count'] ?? 32) + 1, llama_model_path: binaryFileName, }, created: Date.now(), @@ -657,6 +680,13 @@ export default class JanModelExtension extends ModelExtension { return } + const metadata = await executeOnMain( + NODE, + 'retrieveGGUFMetadata', + modelBinaryPath + ) + const eos_id = metadata?.['tokenizer.ggml.eos_token_id'] + const binaryFileName = await baseName(modelBinaryPath) const model: Model = { @@ -669,8 +699,21 @@ export default class JanModelExtension extends ModelExtension { filename: binaryFileName, }, ], + parameters: { + ...defaultModel.parameters, + stop: eos_id + ? [metadata?.['tokenizer.ggml.tokens'][eos_id] ?? ''] + : defaultModel.parameters.stop, + }, + settings: { ...defaultModel.settings, + prompt_template: + metadata?.parsed_chat_template ?? + defaultModel.settings.prompt_template, + ctx_len: + metadata?.['llama.context_length'] ?? defaultModel.settings.ctx_len, + ngl: (metadata?.['llama.block_count'] ?? 32) + 1, llama_model_path: binaryFileName, }, created: Date.now(), @@ -710,9 +753,17 @@ export default class JanModelExtension extends ModelExtension { const updatedModel: Model = { ...model, ...modelInfo, + parameters: { + ...model.parameters, + ...modelInfo.parameters, + }, + settings: { + ...model.settings, + ...modelInfo.settings, + }, metadata: { ...model.metadata, - tags: modelInfo.metadata?.tags ?? [], + ...modelInfo.metadata, }, } @@ -826,218 +877,4 @@ export default class JanModelExtension extends ModelExtension { importedModels ) } - - private getGgufFileList( - repoData: HuggingFaceRepoData, - selectedQuantization: Quantization - ): string[] { - return repoData.siblings - .map((file) => file.rfilename) - .filter((file) => file.indexOf(selectedQuantization) !== -1) - .filter((file) => file.endsWith('.gguf')) - } - - private getFileList(repoData: HuggingFaceRepoData): string[] { - // SafeTensors first, if not, then PyTorch - const modelFiles = repoData.siblings - .map((file) => file.rfilename) - .filter((file) => - JanModelExtension._safetensorsRegexs.some((regex) => regex.test(file)) - ) - if (modelFiles.length === 0) { - repoData.siblings.forEach((file) => { - if ( - JanModelExtension._pytorchRegexs.some((regex) => - regex.test(file.rfilename) - ) - ) { - modelFiles.push(file.rfilename) - } - }) - } - - const vocabFiles = [ - 'tokenizer.model', - 'vocab.json', - 'tokenizer.json', - ].filter((file) => - repoData.siblings.some((sibling) => sibling.rfilename === file) - ) - - const etcFiles = repoData.siblings - .map((file) => file.rfilename) - .filter( - (file) => - (file.endsWith('.json') && !vocabFiles.includes(file)) || - file.endsWith('.txt') || - file.endsWith('.py') || - file.endsWith('.tiktoken') - ) - - return [...modelFiles, ...vocabFiles, ...etcFiles] - } - - private async getModelDirPath(repoID: string): Promise { - const modelName = repoID.split('/').slice(1).join('/') - return joinPath([await getJanDataFolderPath(), 'models', modelName]) - } - - private async getConvertedModelPath(repoID: string): Promise { - const modelName = repoID.split('/').slice(1).join('/') - const modelDirPath = await this.getModelDirPath(repoID) - return joinPath([modelDirPath, modelName + '.gguf']) - } - - private async getQuantizedModelPath( - repoID: string, - quantization: Quantization - ): Promise { - const modelName = repoID.split('/').slice(1).join('/') - const modelDirPath = await this.getModelDirPath(repoID) - 
return joinPath([ - modelDirPath, - modelName + `-${quantization.toLowerCase()}.gguf`, - ]) - } - private getCtxLength(config: { - max_sequence_length?: number - max_position_embeddings?: number - n_ctx?: number - }): number { - if (config.max_sequence_length) return config.max_sequence_length - if (config.max_position_embeddings) return config.max_position_embeddings - if (config.n_ctx) return config.n_ctx - return 2048 - } - - /** - * Converts a Hugging Face model to GGUF. - * @param repoID - The repo ID of the model to convert. - * @returns A promise that resolves when the conversion is complete. - */ - async convert(repoID: string): Promise { - if (this.interrupted) return - const modelDirPath = await this.getModelDirPath(repoID) - const modelOutPath = await this.getConvertedModelPath(repoID) - if (!(await fs.existsSync(modelDirPath))) { - throw new Error('Model dir not found') - } - if (await fs.existsSync(modelOutPath)) return - - await executeOnMain(NODE, 'installDeps') - if (this.interrupted) return - - try { - await executeOnMain( - NODE, - 'convertHf', - modelDirPath, - modelOutPath + '.temp' - ) - } catch (err) { - log(`[Conversion]::Debug: Error using hf-to-gguf.py, trying convert.py`) - - let ctx = 2048 - try { - const config = await fs.readFileSync( - await joinPath([modelDirPath, 'config.json']), - 'utf8' - ) - const configParsed = JSON.parse(config) - ctx = this.getCtxLength(configParsed) - configParsed.max_sequence_length = ctx - await fs.writeFileSync( - await joinPath([modelDirPath, 'config.json']), - JSON.stringify(configParsed, null, 2) - ) - } catch (err) { - log(`${err}`) - // ignore missing config.json - } - - const bpe = await fs.existsSync( - await joinPath([modelDirPath, 'vocab.json']) - ) - - await executeOnMain( - NODE, - 'convert', - modelDirPath, - modelOutPath + '.temp', - { - ctx, - bpe, - } - ) - } - await executeOnMain( - NODE, - 'renameSync', - modelOutPath + '.temp', - modelOutPath - ) - - for (const file of await fs.readdirSync(modelDirPath)) { - if ( - modelOutPath.endsWith(file) || - (file.endsWith('config.json') && !file.endsWith('_config.json')) - ) - continue - await fs.unlinkSync(await joinPath([modelDirPath, file])) - } - } - - /** - * Quantizes a GGUF model. - * @param repoID - The repo ID of the model to quantize. - * @param quantization - The quantization to use. - * @returns A promise that resolves when the quantization is complete. - */ - async quantize(repoID: string, quantization: Quantization): Promise { - if (this.interrupted) return - const modelDirPath = await this.getModelDirPath(repoID) - const modelOutPath = await this.getQuantizedModelPath(repoID, quantization) - if (!(await fs.existsSync(modelDirPath))) { - throw new Error('Model dir not found') - } - if (await fs.existsSync(modelOutPath)) return - - await executeOnMain( - NODE, - 'quantize', - await this.getConvertedModelPath(repoID), - modelOutPath + '.temp', - quantization - ) - await executeOnMain( - NODE, - 'renameSync', - modelOutPath + '.temp', - modelOutPath - ) - - await fs.unlinkSync(await this.getConvertedModelPath(repoID)) - } - - /** - * Cancels the convert of current Hugging Face model. - * @param repoID - The repository ID to cancel. - * @param repoData - The repository data to cancel. - * @returns {Promise} A promise that resolves when the download has been cancelled. 
- */ - async cancelConvert( - repoID: string, - repoData: HuggingFaceRepoData - ): Promise { - this.interrupted = true - const modelDirPath = await this.getModelDirPath(repoID) - const files = this.getFileList(repoData) - for (const file of files) { - const filePath = file - const localPath = await joinPath([modelDirPath, filePath]) - await abortDownload(localPath) - } - - executeOnMain(NODE, 'killProcesses') - } } diff --git a/extensions/model-extension/src/node/index.ts b/extensions/model-extension/src/node/index.ts index 991548e00..2b498f424 100644 --- a/extensions/model-extension/src/node/index.ts +++ b/extensions/model-extension/src/node/index.ts @@ -1,182 +1,47 @@ -import { PythonShell } from 'python-shell' -import { spawn, ChildProcess } from 'child_process' -import { resolve as presolve, join as pjoin } from 'path' -import { log, Quantization } from '@janhq/core/node' -import { statSync } from 'fs' -export { renameSync } from 'fs' +import { closeSync, openSync, readSync } from 'fs' +import { Template } from '@huggingface/jinja' +/** + * This is to retrieve the metadata from a GGUF file + * It uses hyllama and jinja from @huggingface module + */ +export const retrieveGGUFMetadata = async (ggufPath: string) => { + try { + const { ggufMetadata } = await import('hyllama') + // Read first 10mb of gguf file + const fd = openSync(ggufPath, 'r') + const buffer = new Uint8Array(10_000_000) + readSync(fd, buffer, 0, 10_000_000, 0) + closeSync(fd) -let pythonShell: PythonShell | undefined = undefined -let quantizeProcess: ChildProcess | undefined = undefined + // Parse metadata and tensor info + const { metadata } = ggufMetadata(buffer.buffer) -export const getSize = (path: string): number => statSync(path).size - -export const killProcesses = () => { - if (pythonShell) { - pythonShell.kill() - pythonShell = undefined - } - if (quantizeProcess) { - quantizeProcess.kill() - quantizeProcess = undefined + const template = new Template(metadata['tokenizer.chat_template']) + const eos_id = metadata['tokenizer.ggml.eos_token_id'] + const bos_id = metadata['tokenizer.ggml.bos_token_id'] + const eos_token = metadata['tokenizer.ggml.tokens'][eos_id] + const bos_token = metadata['tokenizer.ggml.tokens'][bos_id] + // Parse jinja template + const renderedTemplate = template.render({ + add_generation_prompt: true, + eos_token, + bos_token, + messages: [ + { + role: 'system', + content: '{system_message}', + }, + { + role: 'user', + content: '{prompt}', + }, + ], + }) + return { + ...metadata, + parsed_chat_template: renderedTemplate, + } + } catch (e) { + console.log('[MODEL_EXT]', e) } } - -export const getQuantizeExecutable = (): string => { - let binaryFolder = pjoin(__dirname, '..', 'bin') // Current directory by default - let binaryName = 'quantize' - /** - * The binary folder is different for each platform. 
- */ - if (process.platform === 'win32') { - binaryFolder = pjoin(binaryFolder, 'win') - binaryName = 'quantize.exe' - } else if (process.platform === 'darwin') { - /** - * For MacOS: mac-universal both Silicon and InteL - */ - binaryFolder = pjoin(binaryFolder, 'mac-universal') - } else { - binaryFolder = pjoin(binaryFolder, 'linux-cpu') - } - return pjoin(binaryFolder, binaryName) -} - -export const installDeps = (): Promise => { - return new Promise((resolve, reject) => { - const _pythonShell = new PythonShell( - presolve(__dirname, '..', 'scripts', 'install_deps.py') - ) - _pythonShell.on('message', (message) => { - log(`[Install Deps]::Debug: ${message}`) - }) - _pythonShell.on('stderr', (stderr) => { - log(`[Install Deps]::Error: ${stderr}`) - }) - _pythonShell.on('error', (err) => { - pythonShell = undefined - log(`[Install Deps]::Error: ${err}`) - reject(err) - }) - _pythonShell.on('close', () => { - const exitCode = _pythonShell.exitCode - pythonShell = undefined - log( - `[Install Deps]::Debug: Deps installation exited with code: ${exitCode}` - ) - exitCode === 0 ? resolve() : reject(exitCode) - }) - }) -} - -export const convertHf = async ( - modelDirPath: string, - outPath: string -): Promise => { - return await new Promise((resolve, reject) => { - const _pythonShell = new PythonShell( - presolve(__dirname, '..', 'scripts', 'convert-hf-to-gguf.py'), - { - args: [modelDirPath, '--outfile', outPath], - } - ) - pythonShell = _pythonShell - _pythonShell.on('message', (message) => { - log(`[Conversion]::Debug: ${message}`) - }) - _pythonShell.on('stderr', (stderr) => { - log(`[Conversion]::Error: ${stderr}`) - }) - _pythonShell.on('error', (err) => { - pythonShell = undefined - log(`[Conversion]::Error: ${err}`) - reject(err) - }) - _pythonShell.on('close', () => { - const exitCode = _pythonShell.exitCode - pythonShell = undefined - if (exitCode !== 0) { - log(`[Conversion]::Debug: Conversion exited with code: ${exitCode}`) - reject(exitCode) - } else { - resolve() - } - }) - }) -} - -export const convert = async ( - modelDirPath: string, - outPath: string, - { ctx, bpe }: { ctx?: number; bpe?: boolean } -): Promise => { - const args = [modelDirPath, '--outfile', outPath] - if (ctx) { - args.push('--ctx') - args.push(ctx.toString()) - } - if (bpe) { - args.push('--vocab-type') - args.push('bpe') - } - return await new Promise((resolve, reject) => { - const _pythonShell = new PythonShell( - presolve(__dirname, '..', 'scripts', 'convert.py'), - { - args, - } - ) - _pythonShell.on('message', (message) => { - log(`[Conversion]::Debug: ${message}`) - }) - _pythonShell.on('stderr', (stderr) => { - log(`[Conversion]::Error: ${stderr}`) - }) - _pythonShell.on('error', (err) => { - pythonShell = undefined - log(`[Conversion]::Error: ${err}`) - reject(err) - }) - _pythonShell.on('close', () => { - const exitCode = _pythonShell.exitCode - pythonShell = undefined - if (exitCode !== 0) { - log(`[Conversion]::Debug: Conversion exited with code: ${exitCode}`) - reject(exitCode) - } else { - resolve() - } - }) - }) -} - -export const quantize = async ( - modelPath: string, - outPath: string, - quantization: Quantization -): Promise => { - return await new Promise((resolve, reject) => { - const quantizeExecutable = getQuantizeExecutable() - const _quantizeProcess = spawn(quantizeExecutable, [ - modelPath, - outPath, - quantization, - ]) - quantizeProcess = _quantizeProcess - - _quantizeProcess.stdout?.on('data', (data) => { - log(`[Quantization]::Debug: ${data}`) - }) - 
_quantizeProcess.stderr?.on('data', (data) => { - log(`[Quantization]::Error: ${data}`) - }) - - _quantizeProcess.on('close', (code) => { - if (code !== 0) { - log(`[Quantization]::Debug: Quantization exited with code: ${code}`) - reject(code) - } else { - resolve() - } - }) - }) -} diff --git a/extensions/monitoring-extension/resources/settings.json b/extensions/monitoring-extension/resources/settings.json index 4e1d8d9d8..40b0b97f9 100644 --- a/extensions/monitoring-extension/resources/settings.json +++ b/extensions/monitoring-extension/resources/settings.json @@ -1,8 +1,8 @@ [ { "key": "log-enabled", - "title": "App Logging Enabled", - "description": "We recommend enabling this setting to help us improve the app. Your data will be kept private on your computer, and you can opt out at any time.", + "title": "Enable App Logs", + "description": "Saves app logs locally on your computer. This enables you to send us crash reports.", "controllerType": "checkbox", "controllerProps": { "value": true @@ -11,7 +11,7 @@ { "key": "log-cleaning-interval", "title": "Log Cleaning Interval", - "description": "Log cleaning interval in milliseconds.", + "description": "Automatically delete local logs after a certain time interval (in milliseconds).", "controllerType": "input", "controllerProps": { "value": "120000", @@ -19,4 +19,4 @@ "textAlign": "right" } } -] +] \ No newline at end of file diff --git a/joi/src/core/Input/index.tsx b/joi/src/core/Input/index.tsx index d82099e9c..99b7fe8ab 100644 --- a/joi/src/core/Input/index.tsx +++ b/joi/src/core/Input/index.tsx @@ -2,17 +2,30 @@ import React, { ReactNode, forwardRef } from 'react' import { twMerge } from 'tailwind-merge' import './styles.scss' +import { Cross2Icon } from '@radix-ui/react-icons' export interface Props extends React.InputHTMLAttributes { textAlign?: 'left' | 'right' prefixIcon?: ReactNode suffixIcon?: ReactNode onCLick?: () => void + clearable?: boolean + onClear?: () => void } const Input = forwardRef( ( - { className, type, textAlign, prefixIcon, suffixIcon, onClick, ...props }, + { + className, + type, + textAlign, + prefixIcon, + suffixIcon, + onClick, + onClear, + clearable, + ...props + }, ref ) => { return ( @@ -27,6 +40,11 @@ const Input = forwardRef( {suffixIcon} )} + {clearable && ( +
+ +
+ )} -
{title}
+ + {title} + {content} {!hideClose && ( diff --git a/joi/src/core/Modal/styles.scss b/joi/src/core/Modal/styles.scss index 755daaf3d..fcbf07105 100644 --- a/joi/src/core/Modal/styles.scss +++ b/joi/src/core/Modal/styles.scss @@ -42,7 +42,7 @@ fieldset, } &__title { - @apply line-clamp-1; + @apply leading-relaxed; margin: 0 0 8px 0; padding-right: 16px; font-weight: 600; diff --git a/joi/src/core/ScrollArea/index.tsx b/joi/src/core/ScrollArea/index.tsx index 3a2ffaaa8..2d44b4af8 100644 --- a/joi/src/core/ScrollArea/index.tsx +++ b/joi/src/core/ScrollArea/index.tsx @@ -9,7 +9,7 @@ const ScrollArea = React.forwardRef< React.ComponentPropsWithoutRef >(({ className, children, onScroll, ...props }, ref) => ( diff --git a/joi/src/core/ScrollArea/styles.scss b/joi/src/core/ScrollArea/styles.scss index cb5832c53..3ab0bd306 100644 --- a/joi/src/core/ScrollArea/styles.scss +++ b/joi/src/core/ScrollArea/styles.scss @@ -53,8 +53,8 @@ } ::-webkit-scrollbar { - width: 6px; - height: 6px; + width: 8px; + height: 8px; } ::-webkit-scrollbar-track, ::-webkit-scrollbar-thumb { diff --git a/joi/src/core/Tooltip/styles.scss b/joi/src/core/Tooltip/styles.scss index 04fb841c6..1ec9a5699 100644 --- a/joi/src/core/Tooltip/styles.scss +++ b/joi/src/core/Tooltip/styles.scss @@ -10,7 +10,7 @@ animation-timing-function: cubic-bezier(0.16, 1, 0.3, 1); will-change: transform, opacity; font-weight: 500; - z-index: 100; + z-index: 999999999; max-width: 240px; @apply text-sm leading-normal; } diff --git a/package.json b/package.json index 68c11c68c..c86405182 100644 --- a/package.json +++ b/package.json @@ -41,14 +41,16 @@ "build": "yarn build:web && yarn build:electron", "build:publish": "yarn copy:assets && yarn build:web && yarn workspace jan build:publish", "dev:joi": "yarn workspace @janhq/joi install && yarn workspace @janhq/joi dev", - "build:joi": "yarn workspace @janhq/joi install && yarn workspace @janhq/joi build" + "build:joi": "yarn workspace @janhq/joi install && yarn workspace @janhq/joi build", + "prepare": "husky" }, "devDependencies": { "concurrently": "^8.2.1", "cpx": "^1.5.0", + "husky": "^9.1.5", "rimraf": "^3.0.2", - "wait-on": "^7.0.1", - "run-script-os": "^1.1.6" + "run-script-os": "^1.1.6", + "wait-on": "^7.0.1" }, "version": "0.0.0" } diff --git a/specs/Makefile b/specs/Makefile deleted file mode 100644 index 4646dfdcd..000000000 --- a/specs/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -spec: - @echo "Initiating a Spec..." - @last_number=$$(ls $(CURDIR)/jan-[0-9][0-9][0-9]-* | sort -V | tail -n 1 | cut -d '-' -f 2); \ - last_number=$$(echo $$last_number | sed 's/^0*//'); \ - next_number=$$(printf "%03d" $$(( $$last_number + 1 ))); \ - read -p "Enter Spec title: " title; \ - title=$$(echo $$title | tr ' ' '-'); \ - cp $(CURDIR)/spec-template.md $(CURDIR)/jan-$$next_number-$$title.md; \ - date=$$(date +%Y-%m-%d); \ - usernames=$$(git config user.name); \ - sed -i '' 's/{SPEC-NUM}/'$$next_number'/g' $(CURDIR)/jan-$$next_number-$$title.md; \ - sed -i '' 's/{TITLE}/'$$title'/g' $(CURDIR)/jan-$$next_number-$$title.md; \ - sed -i '' 's/{DATE}/'$$date'/g' $(CURDIR)/jan-$$next_number-$$title.md; \ - sed -i '' 's/{USERNAMES}/'$$usernames'/g' $(CURDIR)/jan-$$next_number-$$title.md \ No newline at end of file diff --git a/specs/QA-checklist.md b/specs/QA-checklist.md new file mode 100644 index 000000000..8bb5168dd --- /dev/null +++ b/specs/QA-checklist.md @@ -0,0 +1,188 @@ +# Regression test + +**Release Version:** v0.6.0 + +**Operating System:** + +--- + +## A. Installation, Update, and Uninstallation + +### 1. 
Users install app (New user flow) + +- [ ] :rocket: Installation package is not corrupted and passes all security checks. +- [ ] :key: App launches successfully after installation. + +### 2. Users update app (Existing user flow) + +- [ ] :key: Validate that the update does not corrupt user data or settings. +- [ ] :key: App restarts or prompts the user to restart after an update. +- [ ] When updating the app, check if the `/models` directory has any JSON/YML files that change according to the update. +- [ ] Updating the app also updates extensions correctly; test the functionality changes. + +### 3. Users uninstall / close app + +- [ ] :key: After closing the app, all models are unloaded. +- [ ] :key::warning: Uninstallation process removes the app successfully from the system. +- [ ] Clean the data folder and open the app to check if it creates all the necessary folders, especially models and extensions. + + +## B. Overview + +### 1. Shortcut key + +- [ ] :key: Test each shortcut key to confirm it works as described (My models, navigating, opening, closing, etc.). + +### 2. Users check the `active model` + +- [ ] :key: The app correctly displays the state of the loading model (e.g., loading, ready, error). +- [ ] :key: Confirm that the app allows users to switch between models if multiple are available. +- [ ] Check that the app provides feedback or instructions if the model fails to load. +- [ ] Verify the troubleshooting assistant correctly captures hardware / log info [#1784](https://github.com/janhq/jan/issues/1784) + +## C. Thread + +### 1. Users can chat with Jan, the default assistant + +- [ ] :key: Sending a message enables users to receive responses from the model. +- [ ] :key: Conversation thread is maintained without any loss of data upon sending multiple messages. +- [ ] Users should be able to edit a message, and the assistant will re-generate the answer based on the edited version of the message. +- [ ] Test for the ability to send different types of messages (e.g., text, emojis, code blocks). +- [ ] Check the output format of the AI (code blocks, JSON, markdown, ...). +- [ ] :key: Validate the scroll functionality in the chat window for lengthy conversations. +- [ ] User can copy / delete the response. +- [ ] :key: Check that the `clear message` / `delete entire chat` buttons work. +- [ ] Deleting the entire chat retains the model instructions and settings. +- [ ] :key: Appropriate error handling and messaging if the assistant fails to respond. +- [ ] Test assistant's ability to maintain context over multiple exchanges. +- [ ] :key: Check the `create new chat` button; a new conversation will have an automatically generated thread title based on the user's message. +- [ ] The app can still handle changing `models` mid-thread. +- [ ] Check that the `regenerate` button renews the response (single / multiple times). +- [ ] Check that the `Instructions` update correctly after the user changes them mid-thread. + +### 2. Users can customize chat settings like model parameters via both the GUI & model.yml + +- [ ] Adjust model parameters (e.g., Temperature, Top K, Top P) from the GUI and verify they are reflected in the chat behavior. +- [ ] :key: Changes can be saved and persisted between sessions. +- [ ] Users can access and modify the model.yml file. +- [ ] :key: Changes made in model.yml are correctly applied to the chat session upon reload or restart. +- [ ] Check the maximum and minimum limits of the adjustable parameters and how they affect the assistant's responses.
+- [ ] :key: Users can switch between threads that use different models, and the app handles it. + +### 3. Model dropdown +- [ ] :key: Model list should highlight recommended models based on user RAM (this is not really correct; I think it's based on a static formula) +- [ ] Model size should be displayed (for both installed and imported models) + +### 4. Users can click on a history thread +- [ ] Chat window displays the entire conversation from the selected history thread without any missing messages. +- [ ] Historical threads reflect the exact state of the chat at that time, including settings. +- [ ] :key: Ability to delete or clean old threads. +- [ ] Changing the title of the thread updates correctly. + +### 5. Users can configure instructions for the assistant. +- [ ] Instructions set by the user are followed by the assistant in subsequent conversations. +- [ ] :key: Changes to instructions are updated in real time and do not require a restart of the application or session. +- [ ] :key: Ability to reset instructions to default or clear them completely. +- [ ] :key: RAG - Users can import documents and the system should process queries about the uploaded file, providing accurate and appropriate responses in the conversation thread. +- [ ] :key: Jan can see - Users can import an image, and a model with vision can generate responses (e.g. the LLaVA model). [#294](https://github.com/janhq/jan/issues/294) + + +## D. Hub + +### 1. Users can discover recommended models +- [ ] :key: Each model's recommendations are consistent with the user’s activity and preferences. +- [ ] Search models and verify results / actions on the results + +### 2. Users can download models suitable for their devices, e.g. compatible with their RAM + +- [ ] Model list should be in order: Featured > Remote > Local +- [ ] :key: Ensure that models are labeled with RAM requirements. +- [ ] :key: Check the download model functionality and validate that the cancel download feature works correctly. + +### 3. Users can download models via a HuggingFace URL [#1740](https://github.com/janhq/jan/issues/1740) + +- [ ] :key: Import via Hugging Face ID / full HuggingFace URL, and check that the progress bar reflects the download process +- [ ] :key: Test deeplink import [#2876](https://github.com/janhq/jan/issues/2876) +- [ ] :key: Users can use / remove the imported model. + +### 4. Users can import new models to the Hub + +- [ ] :key: Ensure models import successfully via drag / drop or GGUF upload. +- [ ] :key: Verify the Move model binary file / Keep Original Files & Symlink options are working +- [ ] Users can add more info to the imported model / edit its name +- [ ] :key: Ensure the new model updates after restarting the app. + + +### 5. Users can use the model as they want + +- [ ] :key: Check that the `start` / `stop` / `delete` buttons do exactly what they should. +- [ ] Check that starting another model stops the currently running model entirely. +- [ ] :rocket: Navigate to `hub` > Click the `Use` button to use a model. Expect to jump to a thread and see the model in the model selector dropdown. +- [ ] :key: Check that deleting a model deletes all of its files on the user's computer. +- [ ] :warning: The recommended tags should be presented correctly for the user's hardware. + +### 6. Users can integrate with a remote server +- [ ] :key: Import an OpenAI GPT model https://jan.ai/guides/using-models/integrate-with-remote-server/ and verify the model is displayed in the Hub / Thread dropdown +- [ ] Users can use the remote model properly (OpenAI GPT, Groq) + +## E. System Monitor + +### 1.
Users can see disk and RAM utilization + +- [ ] :key: Verify that the RAM and VRAM utilization graphs are accurately reported in real time. +- [ ] :key: Validate that the utilization percentages reflect the actual usage compared to the system's total available resources. +- [ ] :key: Ensure that the system monitor updates dynamically as models run and stop. + +### 2. Users can start and stop models based on system health + +- [ ] :key: Verify the `Start/Stop` action for a model and confirm that system resource usage reflects the change. +- [ ] Confirm that any changes in model status (start/stop) are logged or reported to the user for transparency. +- [ ] :key: Check the functionality of `App log` to ensure it opens the correct folder in the system file explorer. + +## F. Settings + +### 1. Appearance + +- [ ] :key: Test the `Light`, `Dark`, and `System` theme settings to ensure they are functioning as expected. +- [ ] Confirm that the application saves the theme preference and persists it across sessions. +- [ ] Validate that all elements of the UI are compatible with the theme changes and maintain legibility and contrast. + +### 2. Extensions [TBU] + +- [ ] Validate the `Install Extensions` process by selecting and installing a plugin file. +- [ ] Enable / disable extensions and verify the UI reflects the change accordingly + +### 3. Extension group + +- [ ] :key: Users can set a valid Endpoint and API Key to use remote models +- [ ] Monitoring extension should allow users to enable / disable logs and set the log cleaning interval + + +### 4. Advanced settings + +- [ ] :key: Test the `Experimental Mode` toggle to confirm it enables or disables experimental features as intended. +- [ ] :key: Check the functionality of `Open App Directory` to ensure it opens the correct folder in the system file explorer. +- [ ] Users can move the **Jan data folder** +- [ ] Validate that changes in advanced settings are applied immediately or provide appropriate instructions if a restart is needed. +- [ ] Attempt to download a model from the Hub using an **HTTP Proxy** [guideline](https://github.com/janhq/jan/pull/1562) +- [ ] Verify that logs older than 7 days or exceeding 1MB in size are automatically cleared upon starting the application. +- [ ] Users can click on the Reset button to **factory reset** app settings to their original state & delete all usage data. + - [ ] Keep the current app data location + - [ ] Reset the current app data location +- [ ] Users can enable the setting and chat using quick ask. + +### 5. Engine +- [ ] :key: TensorRT Engine - Users are able to chat with the model +- [ ] :key: ONNX Engine - Users are able to chat with the model +- [ ] :key: Other remote Engine - Users are able to chat with the model + +## G. Local API server + +### 1. Local Server Usage with Server Options +- [ ] :key: Explore the API Reference: Swagger API for sending/receiving requests (a sample request sketch follows this checklist) + - [ ] Use the default server options + - [ ] Configure and use custom server options +- [ ] Test starting/stopping the local API server with different models / model settings +- [ ] Server logs are captured with the correct Server Options provided +- [ ] Verify the functionality of the Open logs / Clear feature +- [ ] Ensure that threads and other functions impacting the model are disabled while the local server is running diff --git a/specs/README.md b/specs/README.md deleted file mode 100644 index 25b342ef5..000000000 --- a/specs/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Jan Improvement Proposals - -This is a repo of key architecture decisions for Jan.
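For the local API server checks in section G of the checklist above, a minimal smoke-test request is sketched below. It assumes the server is running with the default options and exposes an OpenAI-compatible chat completions route on `localhost:1337`; the host, port, and model id are placeholders to adjust to your own setup.

```ts
// Hypothetical smoke test for the local API server (checklist section G).
// Host, port, and model id are assumptions — use the values shown in the
// Local API Server screen and a model id returned by GET /v1/models.
async function smokeTest(): Promise<void> {
  const res = await fetch('http://localhost:1337/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'llama3-8b-instruct', // placeholder model id
      messages: [{ role: 'user', content: 'Reply with the single word: pong' }],
    }),
  })
  // A 2xx status and a non-empty choices array is a reasonable pass criterion.
  console.log(res.status, await res.json())
}

smokeTest().catch(console.error)
```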
- -[Read more about ADRs](https://github.com/joelparkerhenderson/architecture-decision-record) - -### Get started: - -```sh -# In root: -make newadr -``` - -### Template: -- **Status**: `pending`, `approved`, or `rejected` -- **Context**: a clearly defined problem/goal -- **Decisions**: the proposed architecture choices & changes -- **Consequences**: pros and cons of the decision -- **References**: any relevant materials to read \ No newline at end of file diff --git a/specs/adrs/adr-001-jan-deployable-cloud-native.md b/specs/adrs/adr-001-jan-deployable-cloud-native.md deleted file mode 100644 index d07bd26ff..000000000 --- a/specs/adrs/adr-001-jan-deployable-cloud-native.md +++ /dev/null @@ -1,54 +0,0 @@ -# ADR #001: Jan deployable cloud-native - -## Changelog - -- 23.10.03: Initial unfinished draft -- 23.10.16: Remove authentication - -## Authors - -- @nam-john-ho -- @louis - -## Context - -### Status Quo - -* User doesn't have a local GPU machine but wants to run Jan on a rented server -* User wants a quick, fast way to experiment with Jan on a rented GPU -* https://github.com/janhq/jan/issues/255 - -## Decision - -* This ADR aims to outline design decisions for deploying Jan in cloud native environments such as: Runpod, AWS, Azure, GCP in a fast and simple way. -* The current code-base should not change too much. -* The current plugins must be reusable across environments (Desktop, Cloud-native). - - -### Key Design Decisions -![Key Design](images/adr-001-02.png "Key Design") -#### Why middleware -* The /web codebase needs to operate in both browser and electron environments -* The /web codebase needs to route plugin routes accordingly, either to /server or /electron -* Middleware takes care of this -* We will have a /server codebase that takes care of routing to plugins -#### Unsuitable Alternatives -* Not possible to just run electron headless -* /web is on a different chromium window -* Does not have all the electron handlers -* Does not have the IPC handler - -## Alternative Approaches -Separated server process runs along side with electron. https://github.com/janhq/jan/pull/184/commits/6005409a945bb0e80a61132b9eb77f47f19d0aa6 - -## Considerations -* Due to the limitation of accessing the file system in web browsers, the first version of the web app will load all the current plugins by default, and users will not be able to add, remove, or update plugins. -* Simple authentication will be implemented as a plugin. - -## References - -- https://www.runpod.io/console/templates -- https://repost.aws/articles/ARQ0Tz9eorSL6EAus7XPMG-Q/how-to-install-textgen-webui-on-aws -- https://www.youtube.com/watch?v=_59AsSyMERQ -- https://gpus.llm-utils.org/running-llama-2-on-runpod-with-oobaboogas-text-generation-webui/ -- https://medium.com/@jarimh1984/installing-oobabooga-and-oobabooga-api-to-runpod-cloud-step-by-step-tutorial-47457974dfa5 diff --git a/specs/adrs/adr-002-jan-ai-apps.md b/specs/adrs/adr-002-jan-ai-apps.md deleted file mode 100644 index 61dfde847..000000000 --- a/specs/adrs/adr-002-jan-ai-apps.md +++ /dev/null @@ -1,55 +0,0 @@ -# ADR #002: Jan AI apps - -## Changelog -- Oct 4th 2023: Initial draft -- Oct 6th 2023: Update sample API - -## Authors -- @vuonghoainam - Hiro -- @louis-jan - -## Status -Proposed - -## Context - -### Business context -Jan can be a platform and let builders build their own `AI app` using existing tools -- Use-case 1: Medical AI startup uploads "case notes" to Jan, wants to ask it questions (i.e. 
medical audit) -- Use-case 2: Legal e-discovery: very large amount of documents (~10-15k pages) are uploaded, data is very private and cannot be leaked -- Use-case 3: Jan wants to use Jan to have a QnA chatbot to answer questions on docs -- Use-case 4: Jan wants to use Jan to have a codellama RAG on its own codebase, to generate new PRs - -### Extra context -- There are many use cases that the community can develop and sell to the users through Jan as plugin. Jan needs to streamline higher value chain. -- This brings more value and more option to all kind of user -- This can help building ecosystem and streamline value end to end (Jan, plugins/ model creators, Jan users - enterprise/ individual) -- We at Jan cannot build plugins more on our own, but this one should serve as featured example like [OpenAI Retrieval plugin](https://github.com/openai/chatgpt-retrieval-plugin) does. -- [#232](https://github.com/janhq/jan/issues/232) - -## Decision - -- User can browse and install plugins (with recommended model - llama2, claude, openai …) - This requires plugin dependencies. -- Jan provide consistent interface for plugin developer to use: - - Use LLM (this can be switched in runtime) - i.e Dev in llama2-7b but user can use with llama2-70b. Can choose another model as well - - Plugin can have API for CRUD indices in vectorDB/ DB, and Jan only exposes corresponding data to the app - - A place for a plugin to store the files for persistence -- This works seamlessly on desktop/ Jan hosted version with Jan API abstraction. - -### Simple UX -![UX](images/adr-002-01.png "UX") - -### Component design -![Component design](images/adr-002-02.png "Component design") - -## API -- `jan.plugin..(**args)` - -- `jan.core.db.sql.command()` -> CRUD/ query -- `jan.plugin.vectra.(**args)` -> CRUD/ query for -## Consequences -- Jan user can build their own AI apps (and buy from others too) in an easy way -- Clear design for plugin and Jan platform development - -## Reference -- [ADR-003](adr-003-jan-plugins.md) \ No newline at end of file diff --git a/specs/adrs/adr-003-jan-plugins.md b/specs/adrs/adr-003-jan-plugins.md deleted file mode 100644 index 8dd5b282a..000000000 --- a/specs/adrs/adr-003-jan-plugins.md +++ /dev/null @@ -1,65 +0,0 @@ -# ADR 003: JAN PLUGINS - -## Changelog - -- Oct 5th 2023: Initial draft - -## Status - -Accepted - -## Context - -Modular Architecture w/ Plugins: - -- Jan will have an architecture similar to VSCode or k8Lens -- "Desktop Application" whose functionality can be extended thru plugins -- Jan's architecture will need to accommodate plugins for (a) Persistence(b) IAM(c) Teams and RBAC(d) Policy engines(e) "Apps" (i.e. higher-order business logic)(f) Themes (UI) -- Nitro's architecture will need to accommodate plugins for different "model backends"(a) llama.cpp(b) rkwk (and others)(c) 3rd-party AIs - -## Decision - -![Architecture](./images/adr-003-01.png) - -## Consequences - -What becomes easier or more difficult to do because of this change? 
- -## CoreService API - -Jan frontend components will communicate with plugin functions via Service Interfaces: - -All of the available APIs are listed in [CoreService](../web/shared/coreService.ts) - -- Data Service: - - - GET_CONVERSATIONS: retrieve all of the conversations - - CREATE_CONVERSATION: start a new conversation - - DELETE_CONVERSATION: delete an existing conversation - - GET_CONVERSATION_MESSAGES: retrieve a certain conversation messages - - CREATE_MESSAGE: store a new message (both sent & received) - - UPDATE_MESSAGE: update an existing message (streaming) - - STORE_MODEL: store new model information (when clicking download) - - UPDATE_FINISHED_DOWNLOAD: mark a model as downloaded - - GET_UNFINISHED_DOWNLOAD_MODELS: retrieve all unfinished downloading model (TBD) - - GET_FINISHED_DOWNLOAD_MODELS: retrieve all finished downloading model (TBD) - - DELETE_DOWNLOAD_MODEL: delete a model (TBD) - - GET_MODEL_BY_ID: retrieve model information by its ID - -- Inference Service: - - - INFERENCE_URL: retrieve inference endpoint served by plugin - - INIT_MODEL: runs a model - - STOP_MODEL: stop a running model - -- Model Management Service: (TBD) - - - GET_AVAILABLE_MODELS: retrieve available models (deprecate soon) - - GET_DOWNLOADED_MODELS: (deprecated) - - DELETE_MODEL: (deprecated) - - DOWNLOAD_MODEL: start to download a model - - SEARCH_MODELS: explore models with search query on HuggingFace (TBD) - -- Monitoring service: - - GET_RESOURCES_INFORMATION: retrieve total & used memory information - - GET_CURRENT_LOAD_INFORMATION: retrieve CPU load information diff --git a/specs/adrs/adr-004-UI-Service.md b/specs/adrs/adr-004-UI-Service.md deleted file mode 100644 index cb15e9f99..000000000 --- a/specs/adrs/adr-004-UI-Service.md +++ /dev/null @@ -1,52 +0,0 @@ -# ADR 004: UI Service - -## Changelog - -- 10 Oct 2023: initial vision @dan-jan @0xSage - -## Status - -Proposed - -## Context - -Plugin devs need an API to change the Jan UI. Before we layer on more features, let's ensure good devex for feature building. - -## Decision - -![Jan UI Framework](./images/jan-ui-framework.png) - -- Side-Ribbon: Jan Apps - - - This is a protected area, for Apps - - Apps can define Left Panel, Center, and Right Panel - - We will only have 1 App for now (no need to build this abstraction yet) - - Future: Server mode (see LMStudio), Art Studio (Stable Diffusion) - -- Side-Ribbon: Global Settings - - - These will all open in a modal - - Currently: Model Store, Running Models - - Currently: User Login, Settings - -- Main Window and Right Panel - - - These will mainly be session-based - -- Console: production logs - -## UiService API - -We need a UI API for Plugins - -- e.g. Model Store plugin -> Registers "Global Settings" Icon, defines what will show up in the Modal -- e.g. 
Model Runner plugin -> Inference Parameters - -## Consequences - -- Increased code complexity - -## Reference - -- VSCode -- Obsidian diff --git a/specs/adrs/adr-005-model-installation.md b/specs/adrs/adr-005-model-installation.md deleted file mode 100644 index f0f45ffb1..000000000 --- a/specs/adrs/adr-005-model-installation.md +++ /dev/null @@ -1,48 +0,0 @@ -# ADR 005: model-installation - -## Changelog - -- 2023-10-18: Initial draft - -## Authors - -- 0xSage - -## Status - -Proposed - -## Context - -There are a few issues with our current model installation method (hardcoding jsons in /models repo): - -- Users want to add their own model binaries -- Maintaining /models is too manual - -## Decision - -Let Users download models on their own & manually import them to Jan via a "add a model" UI - -Links: - -- Github issue: https://github.com/janhq/jan/issues/359 -- Related issue: https://github.com/janhq/jan/issues/304 -- Designs: https://www.figma.com/file/JdK7cNIBeVdYeHxKiYeWtk/JAN---Web?type=design&node-id=4092-58218&mode=design&t=8OmFSG0E6I8Y3IjY-0 - -## Consequences - -Closed alternate solutions: - -- https://github.com/janhq/jan/issues/328 - -## Alternatives - -Thinking through the model selection experience, there are a few possibilities: - -1. [current] We hardcode models (via Github) to show up in Explore Models => unnecessarily manual, missing models users want -1. We mirror HF models for a faster download => users can also do nitro add llama2 -1. [CHOSEN] Users download models on their own & manually import them to Jan via a "add a model" UI => I like this option actually -1. [LATER] Users paste in a HF link and download the model in Explore Models => do we still render model cards for them? -1. Users manage their own models folder, e.g. /Users/nicole/models, then they set folder path in Jan. => this one needs a lot of designs/fe work - -## Reference diff --git a/specs/adrs/adr-006-jan-core-module.md b/specs/adrs/adr-006-jan-core-module.md deleted file mode 100644 index 5f3dd1ccb..000000000 --- a/specs/adrs/adr-006-jan-core-module.md +++ /dev/null @@ -1,36 +0,0 @@ -# ADR 006: jan-core-module - -## Changelog - -- 2023-10-19: Initial draft - -## Authors - -- Louis - -## Status - -Accepted - -## Context - -Currently, developers face several challenges while writing a plugin, which include: -- Registering functions using the function name as a string -- Invoking anonymous functions -- No access to native APIs or common functions for data insertion or retrieval -- Lack of communication between the app and plugins. - -## Decision - -Let developers install and import an npm module to develop our Plugin easier. - -Upon boot, Web plugs in window modules. Its components and plugins can then import the core to access exposed functions. - -![Jan Core Module](./images/jan-core-module.png) -## Consequences - -Separate PRs should be created for updating the core and app. For instance, if a new app enhancement requires the core module to expose a new API, a new core update must be published on npm to prevent CI failure. 
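To make the decision concrete, here is a minimal sketch of a plugin function written against the imported core module rather than string-registered handlers. `getJanDataFolderPath` and `joinPath` are core helpers that appear elsewhere in this PR; the wrapper function itself is illustrative only.

```ts
// Sketch only: a plugin resolving a model path through the imported core module.
// The helpers below are the ones used by the model extension in this PR;
// the function name and its caller are hypothetical.
import { getJanDataFolderPath, joinPath } from '@janhq/core'

export async function resolveModelPath(fileName: string): Promise<string> {
  const dataFolder = await getJanDataFolderPath()
  // Path helpers come from the core module, so the plugin needs no direct Node imports.
  return joinPath([dataFolder, 'models', fileName])
}
```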
- -## Alternatives - -## Reference diff --git a/specs/adrs/adr-007-jan-plugin-catalog.md b/specs/adrs/adr-007-jan-plugin-catalog.md deleted file mode 100644 index 0ba495471..000000000 --- a/specs/adrs/adr-007-jan-plugin-catalog.md +++ /dev/null @@ -1,35 +0,0 @@ -# ADR 007: jan-plugin-catalog - -## Changelog - -- 2023-10-19: Initial draft - -## Authors - -- Louis - -## Status - -Proposed - -## Context - -Users should be able to explore plugins, and developers need a channel to publish their plugins - -Lesson learned from the Model Catalog: we hosted everything on Github and attempted to retrieve it anonymously, which cost us a lot of effort and led to a limit rate issue. Let's say there are N items in the catalog, and we attempted to send N+1 requests at a time. It was costly and led to an API limit rate issue. - -## Decision - -1. Combine all JSON items in the catalog into one JSON catalog. Now we just need to work with one catalog file, which means only one request, but the rate limit issue still exists. -2. CDN - there are cool services out there which support OSS projects, such as [JSDELIVR](https://www.jsdelivr.com). -3. Downloading a JSON file is not a good approach, though. Exporting a module works better. Webpack + DefinePlugin should work. -4. Since we have created a new module, we want to publish it as well. Let's publish it on npm so everyone can install and use it. This is also to add a versioning feature. -5. Installing this npm module would require the user to update their app to the latest version. Instead, let's import the remote module via CDN, which requires just a few lines of code. - -![Jan Plugin Catalog](./images/jan-plugin-catalog.png) - -## Consequences - -## Alternatives - -## Reference diff --git a/specs/adrs/adr-008-Extensible-Jan-with-Docker.md b/specs/adrs/adr-008-Extensible-Jan-with-Docker.md deleted file mode 100644 index 05e72956a..000000000 --- a/specs/adrs/adr-008-Extensible-Jan-with-Docker.md +++ /dev/null @@ -1,36 +0,0 @@ -# ADR 008: Extensible-Jan-with-Docker - -## Changelog - -- 2023-10-24: Initial draft - -## Authors - -- @vuonghoainam - -## Status -Proposed - -## Context - -What is the issue that we're seeing that is motivating this decision or change? -- The A.I world is moving fast with multiple runtime/ prebaked environment. We or the builder cannot cover just everything but rather we should adopt it and facilitate it as much as possible within Jan. -- For `Run your own A.I`: Builder can build app on Jan (NodeJS env) and connect to external endpoint which serves the real A.I - - e.g 1: Nitro acting as proxy to `triton-inference-server` running within a Docker container controlled by Jan app - - e.g 2: Original models can be in many formats (pytorch, paddlepaddle). In order to run it with the most optimized version locally, there must be a step to transpile the model ([Ollama import model](https://github.com/jmorganca/ollama/blob/main/docs/import.md), Tensorrt). Btw Jan can prebuilt it and let user pull but later -- For `Build your own A.I`: User can fine tune model locally (of course Jan help it with remote but later) - -## Decision - -What is the change that we're proposing and/or doing? -- Add Docker client as Core module - [Docker node](https://github.com/apocas/dockerode) -- 2 example A.I app (adr-002) to demonstrate it and actually use! - -## Consequences - -What becomes easier or more difficult to do because of this change? 
-- We can extend limitlessly :D - -## Alternatives - -## Reference diff --git a/specs/images/adr-001-01.png b/specs/images/adr-001-01.png deleted file mode 100644 index c56ebb530..000000000 Binary files a/specs/images/adr-001-01.png and /dev/null differ diff --git a/specs/images/adr-001-02.png b/specs/images/adr-001-02.png deleted file mode 100644 index 8649b671a..000000000 Binary files a/specs/images/adr-001-02.png and /dev/null differ diff --git a/specs/images/adr-002-01.png b/specs/images/adr-002-01.png deleted file mode 100644 index 18b1df775..000000000 Binary files a/specs/images/adr-002-01.png and /dev/null differ diff --git a/specs/images/adr-002-02.png b/specs/images/adr-002-02.png deleted file mode 100644 index 2c2cb1b52..000000000 Binary files a/specs/images/adr-002-02.png and /dev/null differ diff --git a/specs/images/adr-003-01.png b/specs/images/adr-003-01.png deleted file mode 100644 index 6328ede26..000000000 Binary files a/specs/images/adr-003-01.png and /dev/null differ diff --git a/specs/images/jan-core-module.png b/specs/images/jan-core-module.png deleted file mode 100644 index 2aa6272ba..000000000 Binary files a/specs/images/jan-core-module.png and /dev/null differ diff --git a/specs/images/jan-plugin-catalog.png b/specs/images/jan-plugin-catalog.png deleted file mode 100644 index e6885e60e..000000000 Binary files a/specs/images/jan-plugin-catalog.png and /dev/null differ diff --git a/specs/images/jan-ui-framework.png b/specs/images/jan-ui-framework.png deleted file mode 100644 index 24beeefd5..000000000 Binary files a/specs/images/jan-ui-framework.png and /dev/null differ diff --git a/specs/jan-001-log-framework.md b/specs/jan-001-log-framework.md deleted file mode 100644 index 5af88d423..000000000 --- a/specs/jan-001-log-framework.md +++ /dev/null @@ -1,101 +0,0 @@ -# jan-001: Application Logs Framework - -| Proposal | jan-001 | -| ---------- | ----------------------------------------------------- | -| Title | App Logging | -| Authors | @louis-jan | -| Permalink | | -| Discussion | [issue #528](https://github.com/janhq/jan/issues/528) | -| Status | Idea | - -## Changelog - -| Date | Author | Changes | -| ------------ | ---------- | ------------- | -| Nov 2nd 2023 | @louis-jan | Initial Draft | - -## Summary - -This proposal suggests the implementation of an "App logging as file and log window" feature, which aims to address the problem of limited visibility into the operation of a production application. Currently, logs (info, verbose, error) are hidden, making it challenging for both users and developers to debug and support the application. The proposed solution involves logging application-wide activities to a file while also enabling real-time log monitoring through a dedicated log window within the application. - -## Motivation - -### Problem Description -The lack of proper logging in production applications results in several challenges: - -1. Debugging Difficulty: When an issue arises in a production environment, developers have limited access to essential information about what happened, making it challenging to diagnose and resolve problems effectively. -2. Support Challenges: Users often encounter errors or unexpected behavior, and support teams struggle to gather the necessary logs to understand the issue and provide a solution promptly. -3. Lack of Real-time Insights: Real-time monitoring is essential for identifying and responding to critical events. The absence of a log window within the application prevents timely reactions to events. 
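Before the use case below, a minimal sketch of the file-logging half of this proposal: leveled, timestamped entries appended to a single app-wide log file. The log file name, location, and category convention are placeholders; the spec deliberately leaves them as open questions.

```ts
// Sketch under assumed conventions: level + timestamp + category per entry.
// 'app.log' is a placeholder path; the default location is an open question below.
import { appendFileSync } from 'fs'

type LogLevel = 'info' | 'verbose' | 'error'

function writeLog(level: LogLevel, category: string, message: string): void {
  const entry = `${new Date().toISOString()} [${level.toUpperCase()}] [${category}] ${message}\n`
  appendFileSync('app.log', entry)
}

// Example call site, mirroring the checkout scenario described in this spec.
writeLog('error', 'checkout', 'Error when downloading model: <details>')
```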
- -### Use Case Example -Consider an e-commerce application. In the current state, when a user faces an issue during checkout, there's no easy way for support or development teams to see what went wrong in real time. This results in frustration for the user and a loss of business for the company - -```ts -# Current Status (Without the Feature) -try: - # Checkout logic - # ... -except Exception as e: - # Error handling - console.log(err) - # Insufficient logging -``` - -Without proper logging, it is challenging to diagnose the issue and provide immediate assistance. - -## Proposed solution - -### High-level overview -The proposed solution introduces the following key changes: - -1. Application-wide Logging: Implement a logging mechanism that logs application-wide activities to a designated log file. This ensures that all relevant information is captured for later analysis and debugging. -2. Real-time Log Window: Create a log window within the application that displays log entries in real time. Users and developers can open this window to monitor logs, allowing them to react promptly to events and errors. - -```ts -# With the Proposed Feature -try: - # Checkout logic - # ... -except Exception as e: - # Error handling - log.error(f"Error when downloading model: {e}") - # Proper logging - -``` - -![Image](https://github.com/janhq/jan/assets/133622055/b60f6976-8138-438e-aa4f-7e103037e124) - - -### Specification - -- The logging system will support different log levels (e.g., info, verbose, error) to ensure that the right level of detail is captured. -- Log entries will be timestamped and categorized to aid in the analysis and debugging process. -- The log window will provide options for filtering and searching log entries for ease of use. - - -### Compatibility - -This proposal aims to preserve backward compatibility by ensuring that the new logging system does not break existing functionality or affect existing applications negatively. It should not alter the semantics of valid programs. - - -### Other concerns - -- Implementation: Careful consideration should be given to the choice of logging framework and implementation details. -- Security: Access to logs and log window functionality should be restricted to authorized users to prevent potential security risks. - -### Open questions - -- What will be the default log file location, and how will it be configurable? -- Should log entries be persisted and rotated over time to prevent excessive file size? - -## Alternatives - -Alternative approaches may involve integrating with existing third-party logging systems or cloud-based log management platforms. However, this proposal focuses on a built-in solution for application-wide logging and real-time monitoring. - -## Related work - -This proposal is inspired by similar features in various application development frameworks and tools. - -## FAQ - -No frequently asked questions at this time. \ No newline at end of file diff --git a/specs/spec-template.md b/specs/spec-template.md deleted file mode 100644 index 32929affb..000000000 --- a/specs/spec-template.md +++ /dev/null @@ -1,33 +0,0 @@ -# jan-{SPEC-NUM}: {TITLE} - -| Proposal | jan-{SPEC-NUM} | -| ---------- | -------------- | -| Title | {TITLE} | -| Authors | | -| Permalink | | -| Discussion | | -| Status | Idea | - -## Changelog - -| Date | Author | Changes | -| ---- | ------ | ------------- | -| | | Initial Draft | - -## Abstract - -Summary. Please keep it very short. - -## Motivation - -Why? - -## Specification - -What, how? 
-- UX Mockups -- Code Interfaces - -## Appendix - -Everything else goes here. \ No newline at end of file diff --git a/web/app/search/layout.tsx b/web/app/search/layout.tsx index dedbe22f5..6c491c381 100644 --- a/web/app/search/layout.tsx +++ b/web/app/search/layout.tsx @@ -2,7 +2,7 @@ import { useEffect } from 'react' -import { AppConfiguration, getUserHomePath, joinPath } from '@janhq/core' +import { AppConfiguration, getUserHomePath } from '@janhq/core' import { useSetAtom } from 'jotai' @@ -38,8 +38,7 @@ export default function RootLayout() { useEffect(() => { async function getDefaultJanDataFolder() { - const homePath = await getUserHomePath() - const defaultJanDataFolder = await joinPath([homePath, 'jan']) + const defaultJanDataFolder = await getUserHomePath() setJanDefaultDataFolder(defaultJanDataFolder) } diff --git a/web/containers/BlankState/index.tsx b/web/containers/BlankState/index.tsx new file mode 100644 index 000000000..bcfa2b306 --- /dev/null +++ b/web/containers/BlankState/index.tsx @@ -0,0 +1,24 @@ +import { ReactNode } from 'react' + +import LogoMark from '@/containers/Brand/Logo/Mark' + +type Props = { + title: string + description?: string + action?: ReactNode +} + +const BlankState = ({ title, description, action }: Props) => { + return ( +
+ +

{title}

+ {description && ( +

{description}

+ )} + {action && action} +
+ ) +} + +export default BlankState diff --git a/web/containers/CenterPanelContainer/index.tsx b/web/containers/CenterPanelContainer/index.tsx index dd8fa0ae4..9ce81f184 100644 --- a/web/containers/CenterPanelContainer/index.tsx +++ b/web/containers/CenterPanelContainer/index.tsx @@ -9,9 +9,7 @@ import { reduceTransparentAtom } from '@/helpers/atoms/Setting.atom' const CenterPanelContainer = ({ children }: PropsWithChildren) => { const reduceTransparent = useAtomValue(reduceTransparentAtom) return ( -
+
{ + const [copyOverInstructionEnabled, setCopyOverInstructionEnabled] = useAtom( + copyOverInstructionEnabledAtom + ) + + const onSwitchToggled = useCallback( + (e: ChangeEvent) => { + setCopyOverInstructionEnabled(e.target.checked) + }, + [setCopyOverInstructionEnabled] + ) + + return ( +
+
Save instructions for new threads
+ +
+ ) +} + +export default CopyOverInstruction diff --git a/web/containers/Layout/BottomPanel/SystemMonitor/TableActiveModel/index.tsx b/web/containers/Layout/BottomPanel/SystemMonitor/TableActiveModel/index.tsx index 5ea32558c..c9d86e5e8 100644 --- a/web/containers/Layout/BottomPanel/SystemMonitor/TableActiveModel/index.tsx +++ b/web/containers/Layout/BottomPanel/SystemMonitor/TableActiveModel/index.tsx @@ -1,5 +1,3 @@ -import { Fragment } from 'react' - import { Tooltip, Button, Badge } from '@janhq/joi' import { useAtom } from 'jotai' @@ -8,19 +6,22 @@ import { useActiveModel } from '@/hooks/useActiveModel' import { toGibibytes } from '@/utils/converter' +import { localEngines } from '@/utils/modelEngine' + import { serverEnabledAtom } from '@/helpers/atoms/LocalServer.atom' -const Column = ['Name', 'Size', ''] +const Column = ['Model', 'Size', ''] const TableActiveModel = () => { const { activeModel, stateModel, stopModel } = useActiveModel() + const [serverEnabled, setServerEnabled] = useAtom(serverEnabledAtom) return ( -
-
+
+
- + {Column.map((col, i) => { return ( @@ -34,47 +35,53 @@ const TableActiveModel = () => { })} - {activeModel && ( - - - - - - + + + + - - - + disabled={!serverEnabled} + /> + + + + ) : ( + + + + + )}
-

{activeModel.name}

-
- - {toGibibytes(activeModel.metadata.size)} - - - { - stopModel() - window.core?.api?.stopServer() - setServerEnabled(false) - }} - > - Stop - - } - content="The API server is running, stop the model will + {activeModel && localEngines.includes(activeModel.engine) ? ( +
+

{activeModel.name}

+
+ + {activeModel.metadata.size + ? toGibibytes(activeModel.metadata.size) + : '-'} + + + { + stopModel() + window.core?.api?.stopServer() + setServerEnabled(false) + }} + > + Stop + + } + content="The API server is running, stop the model will also stop the server" - disabled={!serverEnabled} - /> -
No on-device model running
diff --git a/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx b/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx index 9d6311e73..7fb20b0a4 100644 --- a/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx +++ b/web/containers/Layout/BottomPanel/SystemMonitor/index.tsx @@ -88,7 +88,7 @@ const SystemMonitor = () => {
{
-
+
diff --git a/web/containers/Layout/TopPanel/index.tsx b/web/containers/Layout/TopPanel/index.tsx index 6dd9ba8a5..213f7dfa9 100644 --- a/web/containers/Layout/TopPanel/index.tsx +++ b/web/containers/Layout/TopPanel/index.tsx @@ -12,22 +12,29 @@ import { SquareIcon, PaletteIcon, XIcon, + PenSquareIcon, } from 'lucide-react' import { twMerge } from 'tailwind-merge' import LogoMark from '@/containers/Brand/Logo/Mark' +import { toaster } from '@/containers/Toast' + import { MainViewState } from '@/constants/screens' +import { useCreateNewThread } from '@/hooks/useCreateNewThread' + import { mainViewStateAtom, showLeftPanelAtom, showRightPanelAtom, } from '@/helpers/atoms/App.atom' +import { assistantsAtom } from '@/helpers/atoms/Assistant.atom' import { reduceTransparentAtom, selectedSettingAtom, } from '@/helpers/atoms/Setting.atom' +import { activeTabThreadRightPanelAtom } from '@/helpers/atoms/ThreadRightPanel.atom' const TopPanel = () => { const [showLeftPanel, setShowLeftPanel] = useAtom(showLeftPanelAtom) @@ -35,6 +42,21 @@ const TopPanel = () => { const [mainViewState, setMainViewState] = useAtom(mainViewStateAtom) const setSelectedSetting = useSetAtom(selectedSettingAtom) const reduceTransparent = useAtomValue(reduceTransparentAtom) + const { requestCreateNewThread } = useCreateNewThread() + const assistants = useAtomValue(assistantsAtom) + const [activeTabThreadRightPanel, setActiveTabThreadRightPanel] = useAtom( + activeTabThreadRightPanelAtom + ) + + const onCreateNewThreadClick = () => { + if (!assistants.length) + return toaster({ + title: 'No assistant available.', + description: `Could not create a new thread. Please add an assistant.`, + type: 'error', + }) + requestCreateNewThread(assistants[0]) + } return (
{ )} )} + {mainViewState === MainViewState.Thread && ( + + )}
{mainViewState !== MainViewState.Hub && mainViewState !== MainViewState.Settings && ( {showRightPanel ? ( - ) : ( - )} diff --git a/web/containers/LeftPanelContainer/index.tsx b/web/containers/LeftPanelContainer/index.tsx index 9cde257ed..3991757f3 100644 --- a/web/containers/LeftPanelContainer/index.tsx +++ b/web/containers/LeftPanelContainer/index.tsx @@ -106,7 +106,7 @@ const LeftPanelContainer = ({ children }: Props) => {
{ const currentScrollTop = event.currentTarget.scrollTop if (prevScrollTop.current > currentScrollTop) { - console.debug('User is manually scrolling up') isUserManuallyScrollingUp.current = true } else { const currentScrollTop = event.currentTarget.scrollTop @@ -23,7 +22,6 @@ const ListContainer = ({ children }: Props) => { const clientHeight = event.currentTarget.clientHeight if (currentScrollTop + clientHeight >= scrollHeight) { - console.debug('Scrolled to the bottom') isUserManuallyScrollingUp.current = false } } diff --git a/web/containers/ModelDropdown/index.tsx b/web/containers/ModelDropdown/index.tsx index c19fb64bd..07c2cba3a 100644 --- a/web/containers/ModelDropdown/index.tsx +++ b/web/containers/ModelDropdown/index.tsx @@ -1,11 +1,25 @@ import { useState, useMemo, useEffect, useCallback, useRef } from 'react' +import Image from 'next/image' + import { InferenceEngine } from '@janhq/core' -import { Badge, Input, ScrollArea, Select, useClickOutside } from '@janhq/joi' +import { + Badge, + Button, + Input, + ScrollArea, + Select, + useClickOutside, +} from '@janhq/joi' import { useAtom, useAtomValue, useSetAtom } from 'jotai' -import { ChevronDownIcon, DownloadCloudIcon, XIcon } from 'lucide-react' +import { + ChevronDownIcon, + ChevronUpIcon, + DownloadCloudIcon, + XIcon, +} from 'lucide-react' import { twMerge } from 'tailwind-merge' import ProgressCircle from '@/containers/Loader/ProgressCircle' @@ -22,13 +36,22 @@ import useUpdateModelParameters from '@/hooks/useUpdateModelParameters' import { formatDownloadPercentage, toGibibytes } from '@/utils/converter' +import { + getLogoEngine, + getTitleByEngine, + localEngines, + priorityEngine, +} from '@/utils/modelEngine' + import { extensionManager } from '@/extension' +import { preserveModelSettingsAtom } from '@/helpers/atoms/AppConfig.atom' import { inActiveEngineProviderAtom } from '@/helpers/atoms/Extension.atom' import { configuredModelsAtom, getDownloadingModelAtom, selectedModelAtom, + showEngineListModelAtom, } from '@/helpers/atoms/Model.atom' import { activeThreadAtom, @@ -41,14 +64,6 @@ type Props = { disabled?: boolean } -const engineHasLogo = [ - InferenceEngine.anthropic, - InferenceEngine.cohere, - InferenceEngine.martian, - InferenceEngine.mistral, - InferenceEngine.openai, -] - const ModelDropdown = ({ disabled, chatInputMode, @@ -75,12 +90,17 @@ const ModelDropdown = ({ const featuredModel = configuredModels.filter((x) => x.metadata.tags.includes('Featured') ) + const preserveModelSettings = useAtomValue(preserveModelSettingsAtom) useClickOutside(() => !filterOptionsOpen && setOpen(false), null, [ dropdownOptions, toggle, ]) + const [showEngineListModel, setShowEngineListModel] = useAtom( + showEngineListModelAtom + ) + const filteredDownloadedModels = useMemo( () => configuredModels @@ -92,16 +112,10 @@ const ModelDropdown = ({ return e.engine } if (searchFilter === 'local') { - return ( - e.engine === InferenceEngine.nitro || - e.engine === InferenceEngine.nitro_tensorrt_llm - ) + return localEngines.includes(e.engine) } if (searchFilter === 'remote') { - return ( - e.engine !== InferenceEngine.nitro && - e.engine !== InferenceEngine.nitro_tensorrt_llm - ) + return !localEngines.includes(e.engine) } }) .sort((a, b) => a.name.localeCompare(b.name)) @@ -149,14 +163,25 @@ const ModelDropdown = ({ if (activeThread) { // Default setting ctx_len for the model for a better onboarding experience // TODO: When Cortex support hardware instructions, we should remove this + const defaultContextLength = preserveModelSettings 
+ ? model?.metadata?.default_ctx_len + : 2048 + const defaultMaxTokens = preserveModelSettings + ? model?.metadata?.default_max_tokens + : 2048 const overriddenSettings = model?.settings.ctx_len && model.settings.ctx_len > 2048 - ? { ctx_len: 2048 } + ? { ctx_len: defaultContextLength ?? 2048 } + : {} + const overriddenParameters = + model?.parameters.max_tokens && model.parameters.max_tokens + ? { max_tokens: defaultMaxTokens ?? 2048 } : {} const modelParams = { ...model?.parameters, ...model?.settings, + ...overriddenParameters, ...overriddenSettings, } @@ -178,6 +203,7 @@ const ModelDropdown = ({ setSelectedModel, setThreadModelParams, updateModelParameter, + preserveModelSettings, ] ) @@ -230,10 +256,37 @@ const ModelDropdown = ({ .filter((x) => !inActiveEngineProvider.includes(x.engine)) .map((x) => x.engine) - const groupByEngine = findByEngine.filter(function (item, index) { - if (findByEngine.indexOf(item) === index) - return item !== InferenceEngine.nitro - }) + const groupByEngine = findByEngine + .filter(function (item, index) { + if (findByEngine.indexOf(item) === index) return item + }) + .sort((a, b) => { + if (priorityEngine.includes(a) && priorityEngine.includes(b)) { + return priorityEngine.indexOf(a) - priorityEngine.indexOf(b) + } else if (priorityEngine.includes(a)) { + return -1 + } else if (priorityEngine.includes(b)) { + return 1 + } else { + return 0 // Leave the rest in their original order + } + }) + + const getEngineStatusReady: InferenceEngine[] = extensionHasSettings + ?.filter((e) => e.apiKey.length > 0) + .map((x) => x.provider as InferenceEngine) + + useEffect(() => { + setShowEngineListModel((prev) => [ + ...prev, + ...(getEngineStatusReady as InferenceEngine[]), + ]) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [setShowEngineListModel, extensionHasSettings]) + + const isDownloadALocalModel = downloadedModels.some((x) => + localEngines.includes(x.engine) + ) if (strictedThread && !activeThread) { return null @@ -245,6 +298,7 @@ const ModelDropdown = ({ {chatInputMode ? ( setOpen(!open)} > @@ -312,219 +366,249 @@ const ModelDropdown = ({
- {searchFilter !== 'remote' && ( -
-
-
- Cortex -
-
- {filteredDownloadedModels - .filter((x) => { - if (searchText.length === 0) { - return downloadedModels.find((c) => c.id === x.id) - } else { - return x - } - }) - .filter((x) => x.engine === InferenceEngine.nitro).length !== - 0 ? ( -
    - {filteredDownloadedModels - ? filteredDownloadedModels - .filter((x) => x.engine === InferenceEngine.nitro) - .filter((x) => { - if (searchText.length === 0) { - return downloadedModels.find((c) => c.id === x.id) - } else { - return x - } - }) - .map((model) => { - const isDownloading = downloadingModels.some( - (md) => md.id === model.id - ) - const isdDownloaded = downloadedModels.some( - (c) => c.id === model.id - ) - return ( -
  • { - if (isdDownloaded) { - onClickModelItem(model.id) - } - }} - > -
    -

    - {model.name} -

    - -
    -
    - {!isdDownloaded && ( - - {toGibibytes(model.metadata.size)} - - )} - {!isDownloading && !isdDownloaded ? ( - downloadModel(model)} - /> - ) : ( - Object.values(downloadStates) - .filter((x) => x.modelId === model.id) - .map((item) => ( - - )) - )} -
    -
  • - ) - }) - : null} -
- ) : ( -
    - {featuredModel.map((model) => { - const isDownloading = downloadingModels.some( - (md) => md.id === model.id - ) - return ( -
  • -
    -

    - {model.name} -

    - -
    -
    - - {toGibibytes(model.metadata.size)} - - {!isDownloading ? ( - downloadModel(model)} - /> - ) : ( - Object.values(downloadStates) - .filter((x) => x.modelId === model.id) - .map((item) => ( - - )) - )} -
    -
  • - ) - })} -
- )} -
- )} - {groupByEngine.map((engine, i) => { - const apiKey = - extensionHasSettings.filter((x) => x.provider === engine)[0] - ?.apiKey.length > 1 + const apiKey = !localEngines.includes(engine) + ? extensionHasSettings.filter((x) => x.provider === engine)[0] + ?.apiKey.length > 1 + : true + const engineLogo = getLogoEngine(engine as InferenceEngine) + const showModel = showEngineListModel.includes(engine) + const onClickChevron = () => { + if (showModel) { + setShowEngineListModel((prev) => + prev.filter((item) => item !== engine) + ) + } else { + setShowEngineListModel((prev) => [...prev, engine]) + } + } return (
-
-
- {engine} -
-
- +
+
+ {engineLogo && ( + logo + )} +
+ {getTitleByEngine(engine)} +
+
+
+ {!localEngines.includes(engine) && ( + + )} + {!showModel ? ( + + ) : ( + + )}
+ + {engine === InferenceEngine.nitro && + !isDownloadALocalModel && + showModel && ( + <> + {!searchText.length ? ( +
    + {featuredModel.map((model) => { + const isDownloading = downloadingModels.some( + (md) => md.id === model.id + ) + return ( +
  • +
    +

    + {model.name} +

    + +
    +
    + + {toGibibytes(model.metadata.size)} + + {!isDownloading ? ( + downloadModel(model)} + /> + ) : ( + Object.values(downloadStates) + .filter((x) => x.modelId === model.id) + .map((item) => ( + + )) + )} +
    +
  • + ) + })} +
+ ) : ( + <> + {filteredDownloadedModels + .filter( + (x) => x.engine === InferenceEngine.nitro + ) + .filter((x) => { + if (searchText.length === 0) { + return downloadedModels.find( + (c) => c.id === x.id + ) + } else { + return x + } + }) + .map((model) => { + const isDownloading = downloadingModels.some( + (md) => md.id === model.id + ) + const isdDownloaded = downloadedModels.some( + (c) => c.id === model.id + ) + return ( +
  • { + if (isdDownloaded) { + onClickModelItem(model.id) + } + }} + > +
    +

    + {model.name} +

    + +
    +
    + {!isdDownloaded && ( + + {toGibibytes(model.metadata.size)} + + )} + {!isDownloading && !isdDownloaded ? ( + downloadModel(model)} + /> + ) : ( + Object.values(downloadStates) + .filter( + (x) => x.modelId === model.id + ) + .map((item) => ( + + )) + )} +
    +
  • + ) + })} + + )} + + )} +
      {filteredDownloadedModels .filter((x) => x.engine === engine) + .filter((y) => { + if (localEngines.includes(y.engine)) { + return downloadedModels.find((c) => c.id === y.id) + } else { + return y + } + }) .map((model) => { + if (!showModel) return null return (
    • { if ( - apiKey || - model.engine === - InferenceEngine.nitro_tensorrt_llm - ) { - onClickModelItem(model.id) - } + !apiKey && + !localEngines.includes(model.engine) + ) + return null + onClickModelItem(model.id) }} >
      - {engineHasLogo.map((x) => { - if (x === model.engine) { - return ( -
      - Model Provider -
      - ) - } - })} -

      +

      {model.name}

      diff --git a/web/containers/ModelSearch/index.tsx b/web/containers/ModelSearch/index.tsx index 9413030f4..10b6466a6 100644 --- a/web/containers/ModelSearch/index.tsx +++ b/web/containers/ModelSearch/index.tsx @@ -64,6 +64,11 @@ const ModelSearch = ({ onSearchLocal }: Props) => { [debounced] ) + const onClear = useCallback(() => { + setSearchText('') + debounced() + }, [debounced]) + const onKeyDown = useCallback( (e: React.KeyboardEvent) => { if (e.key === 'Enter' && !e.shiftKey && !e.nativeEvent.isComposing) { @@ -80,6 +85,9 @@ const ModelSearch = ({ onSearchLocal }: Props) => { placeholder="Search or paste Hugging Face URL" onChange={onSearchChanged} onKeyDown={onKeyDown} + value={searchText} + clearable={searchText.length > 0} + onClear={onClear} /> ) } diff --git a/web/containers/Providers/DataLoader.tsx b/web/containers/Providers/DataLoader.tsx index 269d2f877..4319c5eed 100644 --- a/web/containers/Providers/DataLoader.tsx +++ b/web/containers/Providers/DataLoader.tsx @@ -2,7 +2,7 @@ import { Fragment, ReactNode, useEffect } from 'react' -import { AppConfiguration, getUserHomePath, joinPath } from '@janhq/core' +import { AppConfiguration, getUserHomePath } from '@janhq/core' import { useSetAtom } from 'jotai' import useAssistants from '@/hooks/useAssistants' @@ -47,8 +47,7 @@ const DataLoader: React.FC = ({ children }) => { useEffect(() => { async function getDefaultJanDataFolder() { - const homePath = await getUserHomePath() - const defaultJanDataFolder = await joinPath([homePath, 'jan']) + const defaultJanDataFolder = await getUserHomePath() setJanDefaultDataFolder(defaultJanDataFolder) } diff --git a/web/containers/Providers/KeyListener.tsx b/web/containers/Providers/KeyListener.tsx index 2731846df..e76a84d61 100644 --- a/web/containers/Providers/KeyListener.tsx +++ b/web/containers/Providers/KeyListener.tsx @@ -2,7 +2,7 @@ import { Fragment, ReactNode, useEffect } from 'react' -import { useAtomValue, useSetAtom } from 'jotai' +import { useAtom, useAtomValue, useSetAtom } from 'jotai' import { MainViewState } from '@/constants/screens' @@ -14,6 +14,11 @@ import { showRightPanelAtom, } from '@/helpers/atoms/App.atom' import { assistantsAtom } from '@/helpers/atoms/Assistant.atom' +import { + activeThreadAtom, + modalActionThreadAtom, + ThreadModalAction, +} from '@/helpers/atoms/Thread.atom' type Props = { children: ReactNode @@ -22,31 +27,52 @@ type Props = { export default function KeyListener({ children }: Props) { const setShowLeftPanel = useSetAtom(showLeftPanelAtom) const setShowRightPanel = useSetAtom(showRightPanelAtom) - const setMainViewState = useSetAtom(mainViewStateAtom) + const [mainViewState, setMainViewState] = useAtom(mainViewStateAtom) const { requestCreateNewThread } = useCreateNewThread() const assistants = useAtomValue(assistantsAtom) + const activeThread = useAtomValue(activeThreadAtom) + const setModalActionThread = useSetAtom(modalActionThreadAtom) useEffect(() => { const onKeyDown = (e: KeyboardEvent) => { const prefixKey = isMac ? 
e.metaKey : e.ctrlKey - if (e.key === 'b' && prefixKey && e.shiftKey) { + if (e.code === 'KeyB' && prefixKey && e.shiftKey) { setShowRightPanel((showRightideBar) => !showRightideBar) return } - if (e.key === 'n' && prefixKey) { + if (e.code === 'Backspace' && prefixKey && e.shiftKey) { + if (!activeThread || mainViewState !== MainViewState.Thread) return + setModalActionThread({ + showModal: ThreadModalAction.Delete, + thread: activeThread, + }) + return + } + + if (e.code === 'KeyC' && prefixKey && e.shiftKey) { + if (!activeThread || mainViewState !== MainViewState.Thread) return + setModalActionThread({ + showModal: ThreadModalAction.Clean, + thread: activeThread, + }) + return + } + + if (e.code === 'KeyN' && prefixKey) { + if (mainViewState !== MainViewState.Thread) return requestCreateNewThread(assistants[0]) setMainViewState(MainViewState.Thread) return } - if (e.key === 'b' && prefixKey) { + if (e.code === 'KeyB' && prefixKey) { setShowLeftPanel((showLeftSideBar) => !showLeftSideBar) return } - if (e.key === ',' && prefixKey) { + if (e.code === 'Comma' && prefixKey) { setMainViewState(MainViewState.Settings) return } @@ -54,9 +80,12 @@ export default function KeyListener({ children }: Props) { document.addEventListener('keydown', onKeyDown) return () => document.removeEventListener('keydown', onKeyDown) }, [ + activeThread, assistants, + mainViewState, requestCreateNewThread, setMainViewState, + setModalActionThread, setShowLeftPanel, setShowRightPanel, ]) diff --git a/web/containers/RightPanelContainer/index.tsx b/web/containers/RightPanelContainer/index.tsx index e62d822bf..133130017 100644 --- a/web/containers/RightPanelContainer/index.tsx +++ b/web/containers/RightPanelContainer/index.tsx @@ -109,7 +109,7 @@ const RightPanelContainer = ({ children }: Props) => {
      { ) } + const apiKey = !localEngines.includes(engine) + ? extensionHasSettings.filter((x) => x.provider === engine)[0]?.apiKey + .length > 1 + : true + return ( ) } diff --git a/web/helpers/atoms/AppConfig.atom.ts b/web/helpers/atoms/AppConfig.atom.ts index f4acc7dc2..e7b7efaec 100644 --- a/web/helpers/atoms/AppConfig.atom.ts +++ b/web/helpers/atoms/AppConfig.atom.ts @@ -7,6 +7,7 @@ const VULKAN_ENABLED = 'vulkanEnabled' const IGNORE_SSL = 'ignoreSSLFeature' const HTTPS_PROXY_FEATURE = 'httpsProxyFeature' const QUICK_ASK_ENABLED = 'quickAskEnabled' +const PRESERVE_MODEL_SETTINGS = 'preserveModelSettings' export const janDataFolderPathAtom = atom('') @@ -23,3 +24,9 @@ export const vulkanEnabledAtom = atomWithStorage(VULKAN_ENABLED, false) export const quickAskEnabledAtom = atomWithStorage(QUICK_ASK_ENABLED, false) export const hostAtom = atom('http://localhost:1337/') + +// This feature is to allow user to cache model settings on thread creation +export const preserveModelSettingsAtom = atomWithStorage( + PRESERVE_MODEL_SETTINGS, + false +) diff --git a/web/helpers/atoms/Model.atom.ts b/web/helpers/atoms/Model.atom.ts index 7ad65a15e..77b1bfa4e 100644 --- a/web/helpers/atoms/Model.atom.ts +++ b/web/helpers/atoms/Model.atom.ts @@ -1,6 +1,8 @@ -import { ImportingModel, Model } from '@janhq/core' +import { ImportingModel, Model, InferenceEngine } from '@janhq/core' import { atom } from 'jotai' +import { localEngines } from '@/utils/modelEngine' + export const stateModel = atom({ state: 'start', loading: false, model: '' }) export const activeAssistantModelAtom = atom(undefined) @@ -32,6 +34,17 @@ export const removeDownloadingModelAtom = atom( export const downloadedModelsAtom = atom([]) +export const updateDownloadedModelAtom = atom( + null, + (get, set, updatedModel: Model) => { + const models: Model[] = get(downloadedModelsAtom).map((c) => + c.id === updatedModel.id ? 
updatedModel : c + ) + + set(downloadedModelsAtom, models) + } +) + export const removeDownloadedModelAtom = atom( null, (get, set, modelId: string) => { @@ -132,3 +145,5 @@ export const updateImportingModelAtom = atom( ) export const selectedModelAtom = atom(undefined) + +export const showEngineListModelAtom = atom(localEngines) diff --git a/web/helpers/atoms/Thread.atom.ts b/web/helpers/atoms/Thread.atom.ts index c3fdb8260..57d9d08cb 100644 --- a/web/helpers/atoms/Thread.atom.ts +++ b/web/helpers/atoms/Thread.atom.ts @@ -7,6 +7,13 @@ import { } from '@janhq/core' import { atom } from 'jotai' +import { atomWithStorage } from 'jotai/utils' + +export enum ThreadModalAction { + Clean = 'clean', + Delete = 'delete', + EditTitle = 'edit-title', +} export const engineParamsUpdateAtom = atom(false) @@ -131,3 +138,17 @@ export const setThreadModelParamsAtom = atom( set(threadModelParamsAtom, currentState) } ) + +const ACTIVE_SETTING_INPUT_BOX = 'activeSettingInputBox' +export const activeSettingInputBoxAtom = atomWithStorage( + ACTIVE_SETTING_INPUT_BOX, + false +) + +export const modalActionThreadAtom = atom<{ + showModal: ThreadModalAction | undefined + thread: Thread | undefined +}>({ + showModal: undefined, + thread: undefined, +}) diff --git a/web/helpers/atoms/ThreadRightPanel.atom.ts b/web/helpers/atoms/ThreadRightPanel.atom.ts index 88cbe0e21..904e08552 100644 --- a/web/helpers/atoms/ThreadRightPanel.atom.ts +++ b/web/helpers/atoms/ThreadRightPanel.atom.ts @@ -1,4 +1,6 @@ import { atom } from 'jotai' // Store tabs menu active state -export const activeTabThreadRightPanelAtom = atom('assistant') +export const activeTabThreadRightPanelAtom = atom( + 'assistant' +) diff --git a/web/hooks/useCreateNewThread.ts b/web/hooks/useCreateNewThread.ts index 5a1a32cb1..2e4760051 100644 --- a/web/hooks/useCreateNewThread.ts +++ b/web/hooks/useCreateNewThread.ts @@ -12,8 +12,11 @@ import { } from '@janhq/core' import { atom, useAtomValue, useSetAtom } from 'jotai' +import { copyOverInstructionEnabledAtom } from '@/containers/CopyInstruction' import { fileUploadAtom } from '@/containers/Providers/Jotai' +import { toaster } from '@/containers/Toast' + import { generateThreadId } from '@/utils/thread' import { useActiveModel } from './useActiveModel' @@ -23,7 +26,10 @@ import useSetActiveThread from './useSetActiveThread' import { extensionManager } from '@/extension' -import { experimentalFeatureEnabledAtom } from '@/helpers/atoms/AppConfig.atom' +import { + experimentalFeatureEnabledAtom, + preserveModelSettingsAtom, +} from '@/helpers/atoms/AppConfig.atom' import { selectedModelAtom } from '@/helpers/atoms/Model.atom' import { threadsAtom, @@ -31,6 +37,7 @@ import { updateThreadAtom, setThreadModelParamsAtom, isGeneratingResponseAtom, + activeThreadAtom, } from '@/helpers/atoms/Thread.atom' const createNewThreadAtom = atom(null, (get, set, newThread: Thread) => { @@ -57,6 +64,11 @@ export const useCreateNewThread = () => { const setFileUpload = useSetAtom(fileUploadAtom) const setSelectedModel = useSetAtom(selectedModelAtom) const setThreadModelParams = useSetAtom(setThreadModelParamsAtom) + const copyOverInstructionEnabled = useAtomValue( + copyOverInstructionEnabledAtom + ) + const preserveModelSettings = useAtomValue(preserveModelSettingsAtom) + const activeThread = useAtomValue(activeThreadAtom) const experimentalEnabled = useAtomValue(experimentalFeatureEnabledAtom) const setIsGeneratingResponse = useSetAtom(isGeneratingResponseAtom) @@ -83,7 +95,11 @@ export const useCreateNewThread = () => { const 
lastMessage = threads[0]?.metadata?.lastMessage if (!lastMessage && threads.length) { - return null + return toaster({ + title: 'No new thread created.', + description: `To avoid piling up empty threads, please reuse previous one before creating new.`, + type: 'warning', + }) } } @@ -93,18 +109,26 @@ export const useCreateNewThread = () => { enabled: true, settings: assistant.tools && assistant.tools[0].settings, } - + const defaultContextLength = preserveModelSettings + ? defaultModel?.metadata?.default_ctx_len + : 2048 + const defaultMaxTokens = preserveModelSettings + ? defaultModel?.metadata?.default_max_tokens + : 2048 const overriddenSettings = defaultModel?.settings.ctx_len && defaultModel.settings.ctx_len > 2048 - ? { ctx_len: 2048 } + ? { ctx_len: defaultContextLength ?? 2048 } : {} - const overriddenParameters = - defaultModel?.parameters.max_tokens && defaultModel.parameters.max_tokens - ? { max_tokens: 2048 } - : {} + const overriddenParameters = defaultModel?.parameters.max_tokens + ? { max_tokens: defaultMaxTokens ?? 2048 } + : {} const createdAt = Date.now() + let instructions: string | undefined = undefined + if (copyOverInstructionEnabled) { + instructions = activeThread?.assistants[0]?.instructions ?? undefined + } const assistantInfo: ThreadAssistantInfo = { assistant_id: assistant.id, assistant_name: assistant.name, @@ -116,7 +140,7 @@ export const useCreateNewThread = () => { { ...defaultModel?.parameters, ...overriddenParameters } ?? {}, engine: defaultModel?.engine, }, - instructions: assistant.instructions, + instructions, } const threadId = generateThreadId(assistant.id) diff --git a/web/hooks/useFactoryReset.ts b/web/hooks/useFactoryReset.ts index 8364ca10d..a8e3efb9a 100644 --- a/web/hooks/useFactoryReset.ts +++ b/web/hooks/useFactoryReset.ts @@ -34,7 +34,16 @@ export default function useFactoryReset() { } const janDataFolderPath = appConfiguration!.data_folder + // 1: Stop running model + setFactoryResetState(FactoryResetState.StoppingModel) + await stopModel() + await new Promise((resolve) => setTimeout(resolve, 4000)) + // 2: Delete the old jan data folder + setFactoryResetState(FactoryResetState.DeletingData) + await fs.rm(janDataFolderPath) + + // 3: Set the default jan data folder if (!keepCurrentFolder) { // set the default jan data folder to user's home directory const configuration: AppConfiguration = { @@ -44,17 +53,12 @@ export default function useFactoryReset() { await window.core?.api?.updateAppConfiguration(configuration) } - setFactoryResetState(FactoryResetState.StoppingModel) - await stopModel() - await new Promise((resolve) => setTimeout(resolve, 4000)) - - setFactoryResetState(FactoryResetState.DeletingData) - await fs.rm(janDataFolderPath) - + // 4: Clear app local storage setFactoryResetState(FactoryResetState.ClearLocalStorage) // reset the localStorage localStorage.clear() + // 5: Relaunch the app await window.core?.api?.relaunch() }, [defaultJanDataFolder, stopModel, setFactoryResetState] diff --git a/web/hooks/useImportModel.ts b/web/hooks/useImportModel.ts index 170f03b5e..b23f5a6fb 100644 --- a/web/hooks/useImportModel.ts +++ b/web/hooks/useImportModel.ts @@ -66,44 +66,13 @@ const useImportModel = () => { const sanitizeFilePaths = useCallback( async (filePaths: string[]) => { if (!filePaths || filePaths.length === 0) return - - const sanitizedFilePaths: FilePathWithSize[] = [] - for (const filePath of filePaths) { - const fileStats = await fs.fileStat(filePath, true) - if (!fileStats) continue - - if (!fileStats.isDirectory) { - const 
fileName = await baseName(filePath) - sanitizedFilePaths.push({ - path: filePath, - name: fileName, - size: fileStats.size, - }) - } else { - // allowing only one level of directory - const files = await fs.readdirSync(filePath) - - for (const file of files) { - const fullPath = await joinPath([filePath, file]) - const fileStats = await fs.fileStat(fullPath, true) - if (!fileStats || fileStats.isDirectory) continue - - sanitizedFilePaths.push({ - path: fullPath, - name: file, - size: fileStats.size, - }) - } - } + const { unsupportedFiles, supportedFiles } = (await fs.getGgufFiles( + filePaths + )) as unknown as { + unsupportedFiles: FilePathWithSize[] + supportedFiles: FilePathWithSize[] } - const unsupportedFiles = sanitizedFilePaths.filter( - (file) => !file.path.endsWith('.gguf') - ) - const supportedFiles = sanitizedFilePaths.filter((file) => - file.path.endsWith('.gguf') - ) - const importingModels: ImportingModel[] = supportedFiles.map( ({ path, name, size }: FilePathWithSize) => ({ importId: uuidv4(), diff --git a/web/hooks/useLoadTheme.ts b/web/hooks/useLoadTheme.ts index 8afba27c4..314aaf8be 100644 --- a/web/hooks/useLoadTheme.ts +++ b/web/hooks/useLoadTheme.ts @@ -42,6 +42,7 @@ export const useLoadTheme = async () => { ) const getThemes = useCallback(async () => { + if (!janDataFolderPath.length) return const folderPath = await joinPath([janDataFolderPath, 'themes']) const installedThemes = await fs.readdirSync(folderPath) diff --git a/web/hooks/useRecommendedModel.ts b/web/hooks/useRecommendedModel.ts index 8122e2b77..21a9c69e7 100644 --- a/web/hooks/useRecommendedModel.ts +++ b/web/hooks/useRecommendedModel.ts @@ -72,9 +72,6 @@ export default function useRecommendedModel() { // if we don't have [lastUsedModelId], then we can just use the first model // in the downloaded list if (!lastUsedModelId) { - console.debug( - `No last used model, using first model in list ${models[0].id}}` - ) setRecommendedModel(models[0]) return } @@ -90,7 +87,6 @@ export default function useRecommendedModel() { return } - console.debug(`Using last used model ${lastUsedModel.id}`) setRecommendedModel(lastUsedModel) // eslint-disable-next-line react-hooks/exhaustive-deps }, [getAndSortDownloadedModels, activeThread]) @@ -99,5 +95,9 @@ export default function useRecommendedModel() { getRecommendedModel() }, [getRecommendedModel]) - return { recommendedModel, downloadedModels: sortedModels } + return { + recommendedModel, + downloadedModels: sortedModels, + setRecommendedModel, + } } diff --git a/web/hooks/useUpdateModelParameters.ts b/web/hooks/useUpdateModelParameters.ts index d819a85ff..79d877456 100644 --- a/web/hooks/useUpdateModelParameters.ts +++ b/web/hooks/useUpdateModelParameters.ts @@ -4,16 +4,24 @@ import { ConversationalExtension, ExtensionTypeEnum, InferenceEngine, + Model, + ModelExtension, Thread, ThreadAssistantInfo, } from '@janhq/core' -import { useAtomValue, useSetAtom } from 'jotai' +import { useAtom, useAtomValue, useSetAtom } from 'jotai' import { toRuntimeParams, toSettingParams } from '@/utils/modelParam' +import useRecommendedModel from './useRecommendedModel' + import { extensionManager } from '@/extension' -import { selectedModelAtom } from '@/helpers/atoms/Model.atom' +import { preserveModelSettingsAtom } from '@/helpers/atoms/AppConfig.atom' +import { + selectedModelAtom, + updateDownloadedModelAtom, +} from '@/helpers/atoms/Model.atom' import { ModelParams, getActiveThreadModelParamsAtom, @@ -28,8 +36,11 @@ export type UpdateModelParameter = { export default function 
useUpdateModelParameters() { const activeModelParams = useAtomValue(getActiveThreadModelParamsAtom) - const selectedModel = useAtomValue(selectedModelAtom) + const [selectedModel, setSelectedModel] = useAtom(selectedModelAtom) const setThreadModelParams = useSetAtom(setThreadModelParamsAtom) + const updateDownloadedModel = useSetAtom(updateDownloadedModelAtom) + const preserveModelFeatureEnabled = useAtomValue(preserveModelSettingsAtom) + const { recommendedModel, setRecommendedModel } = useRecommendedModel() const updateModelParameter = useCallback( async (thread: Thread, settings: UpdateModelParameter) => { @@ -40,12 +51,11 @@ export default function useUpdateModelParameters() { // update the state setThreadModelParams(thread.id, updatedModelParams) + const runtimeParams = toRuntimeParams(updatedModelParams) + const settingParams = toSettingParams(updatedModelParams) const assistants = thread.assistants.map( (assistant: ThreadAssistantInfo) => { - const runtimeParams = toRuntimeParams(updatedModelParams) - const settingParams = toSettingParams(updatedModelParams) - assistant.model.parameters = runtimeParams assistant.model.settings = settingParams if (selectedModel) { @@ -65,14 +75,58 @@ export default function useUpdateModelParameters() { await extensionManager .get(ExtensionTypeEnum.Conversational) ?.saveThread(updatedThread) + + // Persists default settings to model file + // Do not overwrite ctx_len and max_tokens + if (preserveModelFeatureEnabled) { + const defaultContextLength = settingParams.ctx_len + const defaultMaxTokens = runtimeParams.max_tokens + + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-unused-vars + const { ctx_len, ...toSaveSettings } = settingParams + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-unused-vars + const { max_tokens, ...toSaveParams } = runtimeParams + + const updatedModel = { + id: settings.modelId ?? selectedModel?.id, + parameters: { + ...toSaveSettings, + }, + settings: { + ...toSaveParams, + }, + metadata: { + default_ctx_len: defaultContextLength, + default_max_tokens: defaultMaxTokens, + }, + } as Partial + + const model = await extensionManager + .get(ExtensionTypeEnum.Model) + ?.updateModelInfo(updatedModel) + if (model) updateDownloadedModel(model) + if (selectedModel?.id === model?.id) setSelectedModel(model) + if (recommendedModel?.id === model?.id) setRecommendedModel(model) + } }, - [activeModelParams, selectedModel, setThreadModelParams] + [ + activeModelParams, + selectedModel, + setThreadModelParams, + preserveModelFeatureEnabled, + updateDownloadedModel, + setSelectedModel, + recommendedModel, + setRecommendedModel, + ] ) const processStopWords = (params: ModelParams): ModelParams => { if ('stop' in params && typeof params['stop'] === 'string') { // Input as string but stop words accept an array of strings (space as separator) - params['stop'] = (params['stop'] as string).split(' ') + params['stop'] = (params['stop'] as string) + .split(' ') + .filter((e) => e.trim().length) } return params } diff --git a/web/package.json b/web/package.json index 53e04c3e6..b24163bb7 100644 --- a/web/package.json +++ b/web/package.json @@ -13,7 +13,6 @@ "compile": "tsc --noEmit -p . 
--pretty" }, "dependencies": { - "@headlessui/react": "^1.7.15", "@heroicons/react": "^2.0.18", "@hookform/resolvers": "^3.3.2", "@janhq/core": "link:./core", @@ -22,6 +21,7 @@ "class-variance-authority": "^0.7.0", "framer-motion": "^10.16.4", "highlight.js": "^11.9.0", + "postcss-url": "10.1.3", "jotai": "^2.6.0", "katex": "^0.16.10", "lodash": "^4.17.21", diff --git a/web/postcss.config.js b/web/postcss.config.js index 33ad091d2..fa30f9c8c 100644 --- a/web/postcss.config.js +++ b/web/postcss.config.js @@ -1,6 +1,10 @@ module.exports = { plugins: { - tailwindcss: {}, - autoprefixer: {}, + // eslint-disable-next-line @typescript-eslint/naming-convention + 'postcss-url': { + url: 'inline', + }, + 'tailwindcss': {}, + 'autoprefixer': {}, }, } diff --git a/web/public/images/ModelProvider/anthropic.svg b/web/public/images/ModelProvider/anthropic.svg index 7bb86df4a..1f3f18dcf 100644 --- a/web/public/images/ModelProvider/anthropic.svg +++ b/web/public/images/ModelProvider/anthropic.svg @@ -1,9 +1,9 @@ - - + + - - + + - + diff --git a/web/public/images/ModelProvider/cohere.svg b/web/public/images/ModelProvider/cohere.svg index 543bc2d6c..0ff4f0029 100644 --- a/web/public/images/ModelProvider/cohere.svg +++ b/web/public/images/ModelProvider/cohere.svg @@ -1,30 +1,9 @@ - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file + + + + + + + + + diff --git a/web/public/images/ModelProvider/cortex.svg b/web/public/images/ModelProvider/cortex.svg new file mode 100644 index 000000000..c0ebd58bf --- /dev/null +++ b/web/public/images/ModelProvider/cortex.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/web/public/images/ModelProvider/dot.svg b/web/public/images/ModelProvider/dot.svg new file mode 100644 index 000000000..f667c20b1 --- /dev/null +++ b/web/public/images/ModelProvider/dot.svg @@ -0,0 +1,3 @@ + + + diff --git a/web/public/images/ModelProvider/google.svg b/web/public/images/ModelProvider/google.svg new file mode 100644 index 000000000..1c44dd330 --- /dev/null +++ b/web/public/images/ModelProvider/google.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/web/public/images/ModelProvider/groq.svg b/web/public/images/ModelProvider/groq.svg new file mode 100644 index 000000000..9c2e0a34a --- /dev/null +++ b/web/public/images/ModelProvider/groq.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/web/public/images/ModelProvider/hugging-face.svg b/web/public/images/ModelProvider/hugging-face.svg new file mode 100644 index 000000000..9ac72080a --- /dev/null +++ b/web/public/images/ModelProvider/hugging-face.svg @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/web/public/images/ModelProvider/martian.svg b/web/public/images/ModelProvider/martian.svg index f63ded55a..b5ceacdf8 100644 --- a/web/public/images/ModelProvider/martian.svg +++ b/web/public/images/ModelProvider/martian.svg @@ -1,11 +1,11 @@ - - - - + + + + - - + + diff --git a/web/public/images/ModelProvider/meta.svg b/web/public/images/ModelProvider/meta.svg new file mode 100644 index 000000000..91bdf9783 --- /dev/null +++ b/web/public/images/ModelProvider/meta.svg @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/web/public/images/ModelProvider/mistral.svg b/web/public/images/ModelProvider/mistral.svg index 2bb14b9bc..22233c55c 100644 --- a/web/public/images/ModelProvider/mistral.svg +++ b/web/public/images/ModelProvider/mistral.svg @@ -1,32 +1,28 @@ - - - Mistral AI - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/web/public/images/ModelProvider/nitro.svg b/web/public/images/ModelProvider/nitro.svg new file mode 100644 index 000000000..775517a75 --- /dev/null +++ b/web/public/images/ModelProvider/nitro.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/web/public/images/ModelProvider/nvidia.svg b/web/public/images/ModelProvider/nvidia.svg new file mode 100644 index 000000000..09c2194ec --- /dev/null +++ b/web/public/images/ModelProvider/nvidia.svg @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/web/public/images/ModelProvider/openRouter.svg b/web/public/images/ModelProvider/openRouter.svg new file mode 100644 index 000000000..62ff2b424 --- /dev/null +++ b/web/public/images/ModelProvider/openRouter.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/web/public/images/ModelProvider/openai.svg b/web/public/images/ModelProvider/openai.svg index 433ae3d45..8f0785415 100644 --- a/web/public/images/ModelProvider/openai.svg +++ b/web/public/images/ModelProvider/openai.svg @@ -1,24 +1,9 @@ - - - - - - - - - + + + + + + + + + diff --git a/web/public/images/ModelProvider/send.svg b/web/public/images/ModelProvider/send.svg new file mode 100644 index 000000000..28d30299f --- /dev/null +++ b/web/public/images/ModelProvider/send.svg @@ -0,0 +1,3 @@ + + + diff --git a/web/screens/Hub/ModelList/index.tsx b/web/screens/Hub/ModelList/index.tsx index f3f39d373..aea67b4e3 100644 --- a/web/screens/Hub/ModelList/index.tsx +++ b/web/screens/Hub/ModelList/index.tsx @@ -31,14 +31,14 @@ const ModelList = ({ models }: Props) => { } }) featuredModels.sort((m1, m2) => m1.metadata.size - m2.metadata.size) - remoteModels.sort((m1, m2) => m1.name.localeCompare(m2.name)) localModels.sort((m1, m2) => m1.metadata.size - m2.metadata.size) remainingModels.sort((m1, m2) => m1.metadata.size - m2.metadata.size) + remoteModels.sort((m1, m2) => m1.name.localeCompare(m2.name)) return [ ...featuredModels, - ...remoteModels, ...localModels, ...remainingModels, + ...remoteModels, ] }, [models, downloadedModels]) diff --git a/web/screens/Hub/index.tsx b/web/screens/Hub/index.tsx index 190efa136..37adb717c 100644 --- a/web/screens/Hub/index.tsx +++ b/web/screens/Hub/index.tsx @@ -7,6 +7,7 @@ import { ScrollArea, Button, Select } from '@janhq/joi' import { useAtomValue, useSetAtom } from 'jotai' import { UploadIcon } from 'lucide-react' +import BlankState from '@/containers/BlankState' import CenterPanelContainer from '@/containers/CenterPanelContainer' import ModelSearch from '@/containers/ModelSearch' @@ -92,15 +93,19 @@ const HubScreen = () => {
    -
    - { + setSortSelected(value) + }} + options={sortMenus} + /> +
    + )}
    diff --git a/web/screens/Settings/Advanced/DataFolder/index.tsx b/web/screens/Settings/Advanced/DataFolder/index.tsx index 1ce06979c..3bb059a87 100644 --- a/web/screens/Settings/Advanced/DataFolder/index.tsx +++ b/web/screens/Settings/Advanced/DataFolder/index.tsx @@ -94,8 +94,7 @@ const DataFolder = () => {
    Jan Data Folder

    - Where messages, model configurations, and other user data are - placed. + Default location for messages and other user data.

    diff --git a/web/screens/Settings/Advanced/FactoryReset/index.tsx b/web/screens/Settings/Advanced/FactoryReset/index.tsx index faa0390cd..3bbce39ef 100644 --- a/web/screens/Settings/Advanced/FactoryReset/index.tsx +++ b/web/screens/Settings/Advanced/FactoryReset/index.tsx @@ -12,13 +12,14 @@ const FactoryReset = () => {
    -
    Reset to Factory Default
    +
    + Reset to Factory Settings +

    - Reset the application to its original state, deleting all your usage - data, including model customizations and conversation history. This - action is irreversible and recommended only if the application is in a - corrupted state. + Reset the application to its initial state, deleting all your usage + data, including conversation history. This action is irreversible and + recommended only if the application is in a corrupted state.

    - Enable experimental features that may be untested and unstable. + Enable new features that may be unstable.

    {
    )} + {experimentalEnabled && ( +
    +
    +
    +
    + Preserve Model Settings +
    +
    +

    + Save model settings changes directly to the model file so that + new threads will reuse the previous settings. +

    +
    + + setPreserveModelSettings(e.target.checked)} + /> +
    + )} + {/* Proxy */} diff --git a/web/screens/Settings/Appearance/index.tsx b/web/screens/Settings/Appearance/index.tsx index 279e0a816..393af2e47 100644 --- a/web/screens/Settings/Appearance/index.tsx +++ b/web/screens/Settings/Appearance/index.tsx @@ -57,7 +57,7 @@ export default function AppearanceOptions() {
    Appearance

    - Select a color theme + Select a color theme.

    } placeholder="Search" + value={searchText} onChange={(e) => setSearchText(e.target.value)} + clearable={searchText.length > 0} + onClear={() => setSearchText('')} />
    @@ -213,7 +216,7 @@ const ExtensionCatalog = () => { {coreActiveExtensions.length > 0 && (
    - Core Extention + Core Extension
    )} diff --git a/web/screens/Settings/Hotkeys/index.tsx b/web/screens/Settings/Hotkeys/index.tsx index 382efad2e..79227651e 100644 --- a/web/screens/Settings/Hotkeys/index.tsx +++ b/web/screens/Settings/Hotkeys/index.tsx @@ -9,25 +9,35 @@ const availableHotkeys = [ { combination: 'B', modifierKeys: [isMac ? '⌘' : 'Ctrl'], - description: 'Toggle collapsible left panel', + description: 'Toggle left panel', }, { combination: 'Shift B', modifierKeys: [isMac ? '⌘' : 'Ctrl'], - description: 'Toggle collapsible right panel', + description: 'Toggle right panel', + }, + { + combination: 'Shift Backspace', + modifierKeys: [isMac ? '⌘' : 'Ctrl'], + description: 'Delete current active thread', + }, + { + combination: 'Shift C', + modifierKeys: [isMac ? '⌘' : 'Ctrl'], + description: 'Clean current active thread', }, { combination: ',', modifierKeys: [isMac ? '⌘' : 'Ctrl'], - description: 'Navigate to setting page', + description: 'Navigate to settings', }, { combination: 'Enter', - description: 'Send a message', + description: 'Send a message (in input field)', }, { combination: 'Shift Enter', - description: 'Insert new line in input box', + description: 'Insert a new line (in input field)', }, { combination: 'Arrow Up', diff --git a/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadList/index.tsx b/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadList/index.tsx index 3078a8c36..e6285565f 100644 --- a/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadList/index.tsx +++ b/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadList/index.tsx @@ -29,7 +29,7 @@ const ModelDownloadList = () => { return (

    Available Versions

    - + {ggufModels.map((model, index) => { if (!model.downloadUrl) return null return ( diff --git a/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadRow/index.tsx b/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadRow/index.tsx index 7c2c3a2c9..98914c94e 100644 --- a/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadRow/index.tsx +++ b/web/screens/Settings/HuggingFaceRepoDetailModal/ModelDownloadRow/index.tsx @@ -114,7 +114,7 @@ const ModelDownloadRow: React.FC = ({ } return ( -
    +
    {quantization && ( @@ -124,7 +124,9 @@ const ModelDownloadRow: React.FC = ({

    {fileName}

    - {toGibibytes(fileSize)} + + {toGibibytes(fileSize)} +
    {isDownloaded ? ( diff --git a/web/screens/Settings/HuggingFaceRepoDetailModal/ModelSegmentInfo/index.tsx b/web/screens/Settings/HuggingFaceRepoDetailModal/ModelSegmentInfo/index.tsx index 5a63e5902..ba17e9b57 100644 --- a/web/screens/Settings/HuggingFaceRepoDetailModal/ModelSegmentInfo/index.tsx +++ b/web/screens/Settings/HuggingFaceRepoDetailModal/ModelSegmentInfo/index.tsx @@ -72,7 +72,7 @@ const ModelSegmentInfo = () => {
    -
    +
    {importingHuggingFaceRepoData.tags.map((tag) => ( {tag} diff --git a/web/screens/Settings/HuggingFaceRepoDetailModal/index.tsx b/web/screens/Settings/HuggingFaceRepoDetailModal/index.tsx index 95c01d0cf..33367bb18 100644 --- a/web/screens/Settings/HuggingFaceRepoDetailModal/index.tsx +++ b/web/screens/Settings/HuggingFaceRepoDetailModal/index.tsx @@ -40,9 +40,8 @@ const HuggingFaceRepoDetailModal = () => { fullPage content={
    -
    +
    -
    diff --git a/web/screens/Settings/MyModels/MyModelList/index.tsx b/web/screens/Settings/MyModels/MyModelList/index.tsx index 045f454c0..ae9c344cb 100644 --- a/web/screens/Settings/MyModels/MyModelList/index.tsx +++ b/web/screens/Settings/MyModels/MyModelList/index.tsx @@ -16,6 +16,8 @@ import useDeleteModel from '@/hooks/useDeleteModel' import { toGibibytes } from '@/utils/converter' +import { localEngines } from '@/utils/modelEngine' + import { serverEnabledAtom } from '@/helpers/atoms/LocalServer.atom' type Props = { @@ -44,33 +46,10 @@ const MyModelList = ({ model }: Props) => { } } - const engineHasLogo = [ - InferenceEngine.anthropic, - InferenceEngine.cohere, - InferenceEngine.martian, - InferenceEngine.mistral, - InferenceEngine.openai, - ] - return (
    - {engineHasLogo.map((x) => { - if (x === model.engine) { - return ( -
    - Model Provider -
    - ) - } - })}
    {
    - {model.engine === InferenceEngine.nitro && ( + {localEngines.includes(model.engine) && (
    {toGibibytes(model.metadata.size)} diff --git a/web/screens/Settings/MyModels/index.tsx b/web/screens/Settings/MyModels/index.tsx index d90081b6c..8dafd6e20 100644 --- a/web/screens/Settings/MyModels/index.tsx +++ b/web/screens/Settings/MyModels/index.tsx @@ -1,16 +1,24 @@ -import { useCallback, useMemo, useState } from 'react' +import { useCallback, useEffect, useMemo, useState } from 'react' import { useDropzone } from 'react-dropzone' +import Image from 'next/image' + import { InferenceEngine } from '@janhq/core' import { Button, ScrollArea } from '@janhq/joi' -import { useAtomValue, useSetAtom } from 'jotai' -import { UploadCloudIcon, UploadIcon } from 'lucide-react' +import { useAtom, useAtomValue, useSetAtom } from 'jotai' +import { + ChevronDownIcon, + ChevronUpIcon, + UploadCloudIcon, + UploadIcon, +} from 'lucide-react' import { twMerge } from 'tailwind-merge' +import BlankState from '@/containers/BlankState' import ModelSearch from '@/containers/ModelSearch' import SetupRemoteModel from '@/containers/SetupRemoteModel' @@ -18,15 +26,32 @@ import SetupRemoteModel from '@/containers/SetupRemoteModel' import useDropModelBinaries from '@/hooks/useDropModelBinaries' import { setImportModelStageAtom } from '@/hooks/useImportModel' +import { + getLogoEngine, + getTitleByEngine, + localEngines, + priorityEngine, +} from '@/utils/modelEngine' + import MyModelList from './MyModelList' -import { downloadedModelsAtom } from '@/helpers/atoms/Model.atom' +import { extensionManager } from '@/extension' +import { + downloadedModelsAtom, + showEngineListModelAtom, +} from '@/helpers/atoms/Model.atom' const MyModels = () => { const downloadedModels = useAtomValue(downloadedModelsAtom) const setImportModelStage = useSetAtom(setImportModelStageAtom) const { onDropModels } = useDropModelBinaries() const [searchText, setSearchText] = useState('') + const [showEngineListModel, setShowEngineListModel] = useAtom( + showEngineListModelAtom + ) + const [extensionHasSettings, setExtensionHasSettings] = useState< + { name?: string; setting: string; apiKey: string; provider: string }[] + >([]) const filteredDownloadedModels = useMemo( () => @@ -52,11 +77,73 @@ const MyModels = () => { setSearchText(input) }, []) + useEffect(() => { + const getAllSettings = async () => { + const extensionsMenu: { + name?: string + setting: string + apiKey: string + provider: string + }[] = [] + const extensions = extensionManager.getAll() + + for (const extension of extensions) { + if (typeof extension.getSettings === 'function') { + const settings = await extension.getSettings() + + if ( + (settings && settings.length > 0) || + (await extension.installationState()) !== 'NotRequired' + ) { + extensionsMenu.push({ + name: extension.productName, + setting: extension.name, + apiKey: + 'apiKey' in extension && typeof extension.apiKey === 'string' + ? extension.apiKey + : '', + provider: + 'provider' in extension && + typeof extension.provider === 'string' + ? 
extension.provider + : '', + }) + } + } + } + setExtensionHasSettings(extensionsMenu) + } + getAllSettings() + }, []) + const findByEngine = filteredDownloadedModels.map((x) => x.engine) - const groupByEngine = findByEngine.filter(function (item, index) { - if (findByEngine.indexOf(item) === index) - return item !== InferenceEngine.nitro - }) + const groupByEngine = findByEngine + .filter(function (item, index) { + if (findByEngine.indexOf(item) === index) return item + }) + .sort((a, b) => { + if (priorityEngine.includes(a) && priorityEngine.includes(b)) { + return priorityEngine.indexOf(a) - priorityEngine.indexOf(b) + } else if (priorityEngine.includes(a)) { + return -1 + } else if (priorityEngine.includes(b)) { + return 1 + } else { + return 0 // Leave the rest in their original order + } + }) + + const getEngineStatusReady: InferenceEngine[] = extensionHasSettings + ?.filter((e) => e.apiKey.length > 0) + .map((x) => x.provider as InferenceEngine) + + useEffect(() => { + setShowEngineListModel((prev) => [ + ...prev, + ...(getEngineStatusReady as InferenceEngine[]), + ]) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [setShowEngineListModel, extensionHasSettings]) return (
    @@ -97,46 +184,80 @@ const MyModels = () => {
    - {filteredDownloadedModels.filter( - (x) => x.engine === InferenceEngine.nitro - ).length !== 0 && ( -
    -
    -
    Cortex
    -
    -
    - {filteredDownloadedModels - ? filteredDownloadedModels - .filter((x) => x.engine === InferenceEngine.nitro) - .map((model) => { - return - }) - : null} -
    + {!groupByEngine.length ? ( +
    +
    + ) : ( + groupByEngine.map((engine, i) => { + const engineLogo = getLogoEngine(engine as InferenceEngine) + const showModel = showEngineListModel.includes(engine) + const onClickChevron = () => { + if (showModel) { + setShowEngineListModel((prev) => + prev.filter((item) => item !== engine) + ) + } else { + setShowEngineListModel((prev) => [...prev, engine]) + } + } + return ( +
    +
    +
    + {engineLogo && ( + logo + )} +
    + {getTitleByEngine(engine)} +
    +
    +
    + {!localEngines.includes(engine) && ( + + )} + {!showModel ? ( + + ) : ( + + )} +
    +
    +
    + {filteredDownloadedModels + ? filteredDownloadedModels + .filter((x) => x.engine === engine) + .map((model) => { + if (!showModel) return null + return ( + + ) + }) + : null} +
    +
    + ) + }) )} - - {groupByEngine.map((engine, i) => { - return ( -
    -
    -
    - {engine} -
    - -
    -
    - {filteredDownloadedModels - ? filteredDownloadedModels - .filter((x) => x.engine === engine) - .map((model) => { - return - }) - : null} -
    -
    - ) - })}
    diff --git a/web/screens/Thread/ThreadCenterPanel/ChatInput/index.tsx b/web/screens/Thread/ThreadCenterPanel/ChatInput/index.tsx index c5f35cce8..ff3f19e16 100644 --- a/web/screens/Thread/ThreadCenterPanel/ChatInput/index.tsx +++ b/web/screens/Thread/ThreadCenterPanel/ChatInput/index.tsx @@ -3,15 +3,8 @@ import { useEffect, useRef, useState } from 'react' import { MessageStatus } from '@janhq/core' -import { - TextArea, - Button, - Tooltip, - useClickOutside, - Badge, - useMediaQuery, -} from '@janhq/joi' -import { useAtom, useAtomValue, useSetAtom } from 'jotai' +import { TextArea, Button, Tooltip, useClickOutside, Badge } from '@janhq/joi' +import { useAtom, useAtomValue } from 'jotai' import { FileTextIcon, ImageIcon, @@ -20,7 +13,6 @@ import { SettingsIcon, ChevronUpIcon, Settings2Icon, - ShapesIcon, } from 'lucide-react' import { twMerge } from 'tailwind-merge' @@ -40,6 +32,7 @@ import { experimentalFeatureEnabledAtom } from '@/helpers/atoms/AppConfig.atom' import { getCurrentChatMessagesAtom } from '@/helpers/atoms/ChatMessage.atom' import { spellCheckAtom } from '@/helpers/atoms/Setting.atom' import { + activeSettingInputBoxAtom, activeThreadAtom, getActiveThreadIdAtom, isGeneratingResponseAtom, @@ -52,10 +45,13 @@ const ChatInput = () => { const activeThread = useAtomValue(activeThreadAtom) const { stateModel } = useActiveModel() const messages = useAtomValue(getCurrentChatMessagesAtom) - const [activeSetting, setActiveSetting] = useState(false) + // const [activeSetting, setActiveSetting] = useState(false) const spellCheck = useAtomValue(spellCheckAtom) const [currentPrompt, setCurrentPrompt] = useAtom(currentPromptAtom) + const [activeSettingInputBox, setActiveSettingInputBox] = useAtom( + activeSettingInputBoxAtom + ) const { sendChatMessage } = useSendChatMessage() const activeThreadId = useAtomValue(getActiveThreadIdAtom) @@ -70,7 +66,9 @@ const ChatInput = () => { const threadStates = useAtomValue(threadStatesAtom) const { stopInference } = useActiveModel() - const setActiveTabThreadRightPanel = useSetAtom(activeTabThreadRightPanelAtom) + const [activeTabThreadRightPanel, setActiveTabThreadRightPanel] = useAtom( + activeTabThreadRightPanelAtom + ) const isStreamingResponse = Object.values(threadStates).some( (threadState) => threadState.waitingForResponse @@ -83,8 +81,6 @@ const ChatInput = () => { const refAttachmentMenus = useClickOutside(() => setShowAttacmentMenus(false)) const [showRightPanel, setShowRightPanel] = useAtom(showRightPanelAtom) - const matches = useMediaQuery('(max-width: 880px)') - useEffect(() => { if (isWaitingToSend && activeThreadId) { setIsWaitingToSend(false) @@ -106,12 +102,14 @@ const ChatInput = () => { useEffect(() => { if (textareaRef.current?.clientHeight) { - textareaRef.current.style.height = activeSetting ? '100px' : '40px' + textareaRef.current.style.height = activeSettingInputBox + ? '100px' + : '40px' textareaRef.current.style.height = textareaRef.current.scrollHeight + 'px' textareaRef.current.style.overflow = textareaRef.current.clientHeight >= 390 ? 
'auto' : 'hidden' } - }, [textareaRef.current?.clientHeight, currentPrompt, activeSetting]) + }, [textareaRef.current?.clientHeight, currentPrompt, activeSettingInputBox]) const onKeyDown = async (e: React.KeyboardEvent) => { if (e.key === 'Enter' && !e.shiftKey && !e.nativeEvent.isComposing) { @@ -162,11 +160,11 @@ const ChatInput = () => { 'relative max-h-[400px] resize-none pr-20', fileUpload.length && 'rounded-t-none', experimentalFeature && 'pl-10', - activeSetting && 'pb-14 pr-16' + activeSettingInputBox && 'pb-14 pr-16' )} spellCheck={spellCheck} data-testid="txt-input-chat" - style={{ height: activeSetting ? '100px' : '40px' }} + style={{ height: activeSettingInputBox ? '100px' : '40px' }} ref={textareaRef} onKeyDown={onKeyDown} placeholder="Ask me anything" @@ -237,7 +235,7 @@ const ChatInput = () => { ref={refAttachmentMenus} className={twMerge( 'absolute bottom-14 left-0 z-30 w-36 cursor-pointer rounded-lg border border-[hsla(var(--app-border))] bg-[hsla(var(--app-bg))] py-1 shadow-sm', - activeSetting && 'bottom-28' + activeSettingInputBox && 'bottom-28' )} >
      @@ -320,12 +318,12 @@ const ChatInput = () => {
-          {!activeSetting && (
+          {!activeSettingInputBox && (
-          {activeSetting && (
+          {activeSettingInputBox && (
      { stateModel.loading && 'bg-transparent' )} > -
      +
      - - {experimentalFeature && ( + + {/* Temporary disable it */} + {/* {experimentalFeature && ( { setActiveTabThreadRightPanel('tools') if (matches) { @@ -423,9 +443,12 @@ const ChatInput = () => { /> Tools - )} + )} */}
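The ChatInput hunk above replaces the component-local `activeSetting` state with the shared `activeSettingInputBoxAtom`, so the input-settings bar state is no longer trapped inside this component and the textarea height logic keys off the atom instead. The atom itself is imported from `@/helpers/atoms/Thread.atom`, whose definition is not part of this section, so the sketch below is only an assumed shape plus a hypothetical consumer:

```tsx
import { atom, useAtom } from 'jotai'

// Assumed shape of activeSettingInputBoxAtom; the real definition lives in
// web/helpers/atoms/Thread.atom.ts, outside this hunk.
export const activeSettingInputBoxAtom = atom<boolean>(false)

// Hypothetical consumer: any component can toggle the settings bar, and
// ChatInput reacts by growing the textarea from 40px to 100px.
const InputSettingsToggle = () => {
  const [active, setActive] = useAtom(activeSettingInputBoxAtom)
  return (
    <button onClick={() => setActive(!active)}>
      {active ? 'Hide input settings' : 'Show input settings'}
    </button>
  )
}
```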
      - diff --git a/web/screens/Thread/ThreadLeftPanel/ModalDeleteThread/index.tsx b/web/screens/Thread/ThreadLeftPanel/ModalDeleteThread/index.tsx index 75c158c87..84656ad87 100644 --- a/web/screens/Thread/ThreadLeftPanel/ModalDeleteThread/index.tsx +++ b/web/screens/Thread/ThreadLeftPanel/ModalDeleteThread/index.tsx @@ -1,48 +1,41 @@ import { useCallback, memo } from 'react' import { Modal, ModalClose, Button } from '@janhq/joi' -import { Trash2Icon } from 'lucide-react' +import { useAtom } from 'jotai' import useDeleteThread from '@/hooks/useDeleteThread' -type Props = { - threadId: string - closeContextMenu?: () => void -} +import { + modalActionThreadAtom, + ThreadModalAction, +} from '@/helpers/atoms/Thread.atom' -const ModalDeleteThread = ({ threadId, closeContextMenu }: Props) => { +const ModalDeleteThread = () => { const { deleteThread } = useDeleteThread() + const [modalActionThread, setModalActionThread] = useAtom( + modalActionThreadAtom + ) const onDeleteThreadClick = useCallback( (e: React.MouseEvent) => { e.stopPropagation() - deleteThread(threadId) + deleteThread(modalActionThread.thread?.id as string) }, - [deleteThread, threadId] + [deleteThread, modalActionThread.thread?.id] ) + const onCloseModal = useCallback(() => { + setModalActionThread({ + showModal: undefined, + thread: undefined, + }) + }, [setModalActionThread]) + return ( { - if (open && closeContextMenu) { - closeContextMenu() - } - }} - trigger={ -
      e.stopPropagation()} - > - - - Delete thread - -
-      }
+      onOpenChange={onCloseModal}
+      open={modalActionThread.showModal === ThreadModalAction.Delete}
       content={

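Both thread modals now derive their open state from a shared `modalActionThreadAtom` instead of rendering their own triggers. The atom and the `ThreadModalAction` enum are imported from `web/helpers/atoms/Thread.atom.ts`, which is not shown in this section, so the definition below is an assumption; only the member names and the `{ showModal, thread }` shape are confirmed by the hunks, and the enum's string values are illustrative:

```ts
import { Thread } from '@janhq/core'
import { atom } from 'jotai'

// Assumed definition of the thread-modal action state.
export enum ThreadModalAction {
  Clean = 'clean',
  Delete = 'delete',
  EditTitle = 'edit-title',
}

export const modalActionThreadAtom = atom<{
  showModal: ThreadModalAction | undefined
  thread: Thread | undefined
}>({
  showModal: undefined,
  thread: undefined,
})
```

Each modal opens when `showModal` matches its own action and resets the atom in `onCloseModal`, which is why the per-thread trigger markup could be removed from the modal components.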
      diff --git a/web/screens/Thread/ThreadLeftPanel/ModalEditTitleThread/index.tsx b/web/screens/Thread/ThreadLeftPanel/ModalEditTitleThread/index.tsx index 297c8f182..ddeaedf40 100644 --- a/web/screens/Thread/ThreadLeftPanel/ModalEditTitleThread/index.tsx +++ b/web/screens/Thread/ThreadLeftPanel/ModalEditTitleThread/index.tsx @@ -1,57 +1,52 @@ import { useCallback, useLayoutEffect, memo, useState } from 'react' -import { Thread } from '@janhq/core' import { Modal, ModalClose, Button, Input } from '@janhq/joi' -import { PencilIcon } from 'lucide-react' +import { useAtom } from 'jotai' import { useCreateNewThread } from '@/hooks/useCreateNewThread' -type Props = { - thread: Thread - closeContextMenu?: () => void -} +import { + modalActionThreadAtom, + ThreadModalAction, +} from '@/helpers/atoms/Thread.atom' -const ModalEditTitleThread = ({ thread, closeContextMenu }: Props) => { - const [title, setTitle] = useState(thread.title) +const ModalEditTitleThread = () => { const { updateThreadMetadata } = useCreateNewThread() + const [modalActionThread, setModalActionThread] = useAtom( + modalActionThreadAtom + ) + const [title, setTitle] = useState(modalActionThread.thread?.title as string) useLayoutEffect(() => { - if (thread.title) { - setTitle(thread.title) + if (modalActionThread.thread?.title) { + setTitle(modalActionThread.thread?.title) } - }, [thread.title]) + }, [modalActionThread.thread?.title]) const onUpdateTitle = useCallback( (e: React.MouseEvent) => { e.stopPropagation() - + if (!modalActionThread.thread) return null updateThreadMetadata({ - ...thread, + ...modalActionThread?.thread, title: title || 'New Thread', }) }, - [thread, title, updateThreadMetadata] + [modalActionThread?.thread, title, updateThreadMetadata] ) + const onCloseModal = useCallback(() => { + setModalActionThread({ + showModal: undefined, + thread: undefined, + }) + }, [setModalActionThread]) + return ( { - if (open && closeContextMenu) { - closeContextMenu() - } - }} - trigger={ -

      e.stopPropagation()} - > - - - Edit title - -
-      }
+      onOpenChange={onCloseModal}
+      open={modalActionThread.showModal === ThreadModalAction.EditTitle}
       content={
      { - diff --git a/web/screens/Thread/ThreadLeftPanel/index.tsx b/web/screens/Thread/ThreadLeftPanel/index.tsx index 34d15932f..0b866ea26 100644 --- a/web/screens/Thread/ThreadLeftPanel/index.tsx +++ b/web/screens/Thread/ThreadLeftPanel/index.tsx @@ -8,28 +8,27 @@ import { useAtomValue, useSetAtom } from 'jotai' import { GalleryHorizontalEndIcon, MoreHorizontalIcon, - PenSquareIcon, + Paintbrush, + PencilIcon, + Trash2Icon, } from 'lucide-react' import { twMerge } from 'tailwind-merge' import LeftPanelContainer from '@/containers/LeftPanelContainer' -import { toaster } from '@/containers/Toast' import { useCreateNewThread } from '@/hooks/useCreateNewThread' import useRecommendedModel from '@/hooks/useRecommendedModel' import useSetActiveThread from '@/hooks/useSetActiveThread' -import ModalCleanThread from './ModalCleanThread' -import ModalDeleteThread from './ModalDeleteThread' -import ModalEditTitleThread from './ModalEditTitleThread' - import { assistantsAtom } from '@/helpers/atoms/Assistant.atom' import { editMessageAtom } from '@/helpers/atoms/ChatMessage.atom' import { getActiveThreadIdAtom, + modalActionThreadAtom, threadDataReadyAtom, + ThreadModalAction, threadsAtom, } from '@/helpers/atoms/Thread.atom' @@ -42,6 +41,7 @@ const ThreadLeftPanel = () => { const { requestCreateNewThread } = useCreateNewThread() const setEditMessage = useSetAtom(editMessageAtom) const { recommendedModel, downloadedModels } = useRecommendedModel() + const setModalActionThread = useSetAtom(modalActionThreadAtom) const [contextMenu, setContextMenu] = useState<{ visible: boolean @@ -87,18 +87,6 @@ const ThreadLeftPanel = () => { downloadedModels, ]) - const onCreateConversationClick = async () => { - if (assistants.length === 0) { - toaster({ - title: 'No assistant available.', - description: `Could not create a new thread. Please add an assistant.`, - type: 'error', - }) - } else { - requestCreateNewThread(assistants[0]) - } - } - const onContextMenu = (event: React.MouseEvent, thread: Thread) => { event.preventDefault() setContextMenu({ @@ -126,18 +114,6 @@ const ThreadLeftPanel = () => {
      ) : (
      - - {threads.map((thread) => (
      { 'visible' )} > - - - +
      { + setModalActionThread({ + showModal: ThreadModalAction.EditTitle, + thread, + }) + e.stopPropagation() + }} + > + + + Edit title + +
      +
      { + setModalActionThread({ + showModal: ThreadModalAction.Clean, + thread, + }) + e.stopPropagation() + }} + > + + + Clean thread + +
      +
      { + setModalActionThread({ + showModal: ThreadModalAction.Delete, + thread, + }) + e.stopPropagation() + }} + > + + + Delete thread + +
      {activeThreadId === thread.id && ( diff --git a/web/screens/Thread/ThreadRightPanel/Tools/index.tsx b/web/screens/Thread/ThreadRightPanel/Tools/index.tsx index 2b34cbb67..aaa81fe5d 100644 --- a/web/screens/Thread/ThreadRightPanel/Tools/index.tsx +++ b/web/screens/Thread/ThreadRightPanel/Tools/index.tsx @@ -183,19 +183,6 @@ const Tools = () => { your specific use case." /> -
      - - onTimeWeightedRetrieverSwitchUpdate(e.target.checked) - } - /> -
      @@ -223,6 +210,18 @@ const Tools = () => { also considers when they were added to give newer ones more importance." /> +
      + + onTimeWeightedRetrieverSwitchUpdate(e.target.checked) + } + /> +
      { { name: 'Model', value: 'model' }, ...(experimentalFeature ? [{ name: 'Tools', value: 'tools' }] : []), ]} - value={activeTabThreadRightPanel} + value={activeTabThreadRightPanel as string} onValueChange={(value) => setActiveTabThreadRightPanel(value)} > @@ -206,9 +207,11 @@ const ThreadRightPanel = () => { id="assistant-instructions" placeholder="Eg. You are a helpful assistant." value={activeThread?.assistants[0].instructions ?? ''} + rows={8} onChange={onAssistantInstructionChanged} />
      + {experimentalFeature && }
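The right panel stores its active tab in `activeTabThreadRightPanelAtom`, the same atom the (now commented-out) Tools shortcut in the ChatInput hunk writes with `'tools'`. The `as string` cast above suggests the atom can be `undefined`; its real definition is outside this section, so this is an assumed sketch with an illustrative default:

```ts
import { atom } from 'jotai'

// Assumed shape: undefined until a tab is selected, hence the `as string`
// cast where the value is passed to the tab list. The default is illustrative.
export const activeTabThreadRightPanelAtom = atom<string | undefined>('model')
```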
    diff --git a/web/screens/Thread/index.tsx b/web/screens/Thread/index.tsx index ca8aab333..ef125e924 100644 --- a/web/screens/Thread/index.tsx +++ b/web/screens/Thread/index.tsx @@ -1,6 +1,9 @@ import ThreadLeftPanel from '@/screens/Thread/ThreadLeftPanel' import ThreadCenterPanel from './ThreadCenterPanel' +import ModalCleanThread from './ThreadLeftPanel/ModalCleanThread' +import ModalDeleteThread from './ThreadLeftPanel/ModalDeleteThread' +import ModalEditTitleThread from './ThreadLeftPanel/ModalEditTitleThread' import ThreadRightPanel from './ThreadRightPanel' const ThreadScreen = () => { @@ -9,6 +12,11 @@ const ThreadScreen = () => { + + {/* Showing variant modal action for thread screen */} + + +
    ) } diff --git a/web/styles/main.scss b/web/styles/main.scss index 89a64b8cb..8e952af5c 100644 --- a/web/styles/main.scss +++ b/web/styles/main.scss @@ -1,5 +1,4 @@ // Tailwind - @import 'tailwindcss/base'; @import 'tailwindcss/components'; @import 'tailwindcss/utilities'; @@ -8,6 +7,8 @@ @import './base/global.scss'; +@import './vendor/katex.scss'; + @import './components/loader.scss'; @import './components/code-block.scss'; @import './components/message.scss'; diff --git a/web/styles/vendor/katex-fonts.scss b/web/styles/vendor/katex-fonts.scss new file mode 100644 index 000000000..a1b3499bc --- /dev/null +++ b/web/styles/vendor/katex-fonts.scss @@ -0,0 +1,71 @@ +$font-folder: 'katex-fonts' !default; +$use-woff2: true !default; +$use-woff: true !default; +$use-ttf: true !default; + +@function generate-src($family, $family-suffix) { + $src: null; + @if $use-woff2 { + $src: append( + $src, + url('#{$font-folder}/KaTeX_#{$family}-#{$family-suffix}.woff2') + format('woff2'), + comma + ); + } + + @return $src; +} + +@function generate-suffix($weight, $style) { + $suffix: null; + + @if $weight == normal and $style == normal { + $suffix: 'Regular'; + } + @if $weight == normal and $style == italic { + $suffix: 'Italic'; + } + @if $weight == bold and $style == normal { + $suffix: 'Bold'; + } + @if $weight == bold and $style == italic { + $suffix: 'BoldItalic'; + } + + @return $suffix; +} + +@mixin font-face($family, $weight, $style) { + $suffix: generate-suffix($weight, $style); + $src: generate-src($family, $suffix); + + @font-face { + font-family: 'KaTeX_#{$family}'; + src: $src; + font-weight: $weight; + font-style: $style; + font-display: swap; + } +} + +@include font-face('AMS', normal, normal); +@include font-face('Caligraphic', bold, normal); +@include font-face('Caligraphic', normal, normal); +@include font-face('Fraktur', bold, normal); +@include font-face('Fraktur', normal, normal); +@include font-face('Main', bold, normal); +@include font-face('Main', bold, italic); +@include font-face('Main', normal, italic); +@include font-face('Main', normal, normal); +@include font-face('Math', bold, italic); +@include font-face('Math', normal, italic); +@include font-face('SansSerif', bold, normal); +@include font-face('SansSerif', normal, italic); +@include font-face('SansSerif', normal, normal); +@include font-face('Script', normal, normal); +@include font-face('Size1', normal, normal); +@include font-face('Size2', normal, normal); +@include font-face('Size3', normal, normal); +@include font-face('Size4', normal, normal); +@include font-face('Typewriter', normal, normal); diff --git a/web/styles/vendor/katex-fonts/KaTeX_AMS-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_AMS-Regular.woff2 new file mode 100644 index 000000000..0acaaff03 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_AMS-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Caligraphic-Bold.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Caligraphic-Bold.woff2 new file mode 100644 index 000000000..f390922ec Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Caligraphic-Bold.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Caligraphic-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Caligraphic-Regular.woff2 new file mode 100644 index 000000000..75344a1f9 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Caligraphic-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Fraktur-Bold.woff2 
b/web/styles/vendor/katex-fonts/KaTeX_Fraktur-Bold.woff2 new file mode 100644 index 000000000..395f28bea Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Fraktur-Bold.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Fraktur-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Fraktur-Regular.woff2 new file mode 100644 index 000000000..735f6948d Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Fraktur-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Main-Bold.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Main-Bold.woff2 new file mode 100644 index 000000000..ab2ad21da Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Main-Bold.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Main-BoldItalic.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Main-BoldItalic.woff2 new file mode 100644 index 000000000..5931794de Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Main-BoldItalic.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Main-Italic.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Main-Italic.woff2 new file mode 100644 index 000000000..b50920e13 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Main-Italic.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Main-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Main-Regular.woff2 new file mode 100644 index 000000000..eb24a7ba2 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Main-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Math-BoldItalic.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Math-BoldItalic.woff2 new file mode 100644 index 000000000..29657023a Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Math-BoldItalic.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Math-Italic.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Math-Italic.woff2 new file mode 100644 index 000000000..215c143fd Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Math-Italic.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Bold.woff2 b/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Bold.woff2 new file mode 100644 index 000000000..cfaa3bda5 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Bold.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Italic.woff2 b/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Italic.woff2 new file mode 100644 index 000000000..349c06dc6 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Italic.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Regular.woff2 new file mode 100644 index 000000000..a90eea85f Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_SansSerif-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Script-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Script-Regular.woff2 new file mode 100644 index 000000000..b3048fc11 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Script-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Size1-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Size1-Regular.woff2 new file mode 100644 index 000000000..c5a8462fb Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Size1-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Size2-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Size2-Regular.woff2 new file 
mode 100644 index 000000000..e1bccfe24 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Size2-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Size3-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Size3-Regular.woff2 new file mode 100644 index 000000000..249a28662 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Size3-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Size4-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Size4-Regular.woff2 new file mode 100644 index 000000000..680c13085 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Size4-Regular.woff2 differ diff --git a/web/styles/vendor/katex-fonts/KaTeX_Typewriter-Regular.woff2 b/web/styles/vendor/katex-fonts/KaTeX_Typewriter-Regular.woff2 new file mode 100644 index 000000000..771f1af70 Binary files /dev/null and b/web/styles/vendor/katex-fonts/KaTeX_Typewriter-Regular.woff2 differ diff --git a/web/styles/vendor/katex.scss b/web/styles/vendor/katex.scss new file mode 100644 index 000000000..3d7b06688 --- /dev/null +++ b/web/styles/vendor/katex.scss @@ -0,0 +1,671 @@ +/* stylelint-disable font-family-no-missing-generic-family-keyword */ +@import './katex-fonts.scss'; + +// The mu unit is defined as 1/18 em +$mu: calc(1em / 18); + +// The version is dynamically set from package.json via webpack.common.js +$version: '' !default; + +.katex { + font: + normal 1.21em KaTeX_Main, + Times New Roman, + serif; + line-height: 1.2; + + // Protect elements inside .katex from inheriting text-indent. + text-indent: 0; + + // Prevent a rendering bug that misplaces \vec in Chrome. + text-rendering: auto; + + * { + // Prevent background resetting on elements in Windows's high-contrast + // mode, while still allowing background/foreground setting on root .katex + -ms-high-contrast-adjust: none !important; + + // Insulate fraction bars and rules from CSS that sets border-color. + border-color: currentColor; + } + + .katex-version::after { + content: $version; + } + + .katex-mathml { + /* Accessibility hack to only show to screen readers + Found at: http://a11yproject.com/posts/how-to-hide-content/ */ + position: absolute; + clip: rect(1px, 1px, 1px, 1px); + padding: 0; + border: 0; + height: 1px; + width: 1px; + overflow: hidden; + } + + .katex-html { + /* \newline is an empty block at top level, between .base elements */ + > .newline { + display: block; + } + } + + .base { + position: relative; + display: inline-block; + white-space: nowrap; + + // Fix width of containers of negative spaces, working around Chrome bug. + width: min-content; + } + + .strut { + display: inline-block; + } + + // Text font weights + .textbf { + font-weight: bold; + } + + // Text font shapes. + .textit { + font-style: italic; + } + + // Text font families. + .textrm { + font-family: KaTeX_Main; + } + + .textsf { + font-family: KaTeX_SansSerif; + } + + .texttt { + font-family: KaTeX_Typewriter; + } + + // Math fonts. 
+ .mathnormal { + font-family: KaTeX_Math; + font-style: italic; + } + + .mathit { + font-family: KaTeX_Main; + font-style: italic; + } + + .mathrm { + font-style: normal; + } + + .mathbf { + font-family: KaTeX_Main; + font-weight: bold; + } + + .boldsymbol { + font-family: KaTeX_Math; + font-weight: bold; + font-style: italic; + } + + .amsrm { + font-family: KaTeX_AMS; + } + + .mathbb, + .textbb { + font-family: KaTeX_AMS; + } + + .mathcal { + font-family: KaTeX_Caligraphic; + } + + .mathfrak, + .textfrak { + font-family: KaTeX_Fraktur; + } + + .mathboldfrak, + .textboldfrak { + font-family: KaTeX_Fraktur; + font-weight: bold; + } + + .mathtt { + font-family: KaTeX_Typewriter; + } + + .mathscr, + .textscr { + font-family: KaTeX_Script; + } + + .mathsf, + .textsf { + font-family: KaTeX_SansSerif; + } + + .mathboldsf, + .textboldsf { + font-family: KaTeX_SansSerif; + font-weight: bold; + } + + .mathitsf, + .textitsf { + font-family: KaTeX_SansSerif; + font-style: italic; + } + + .mainrm { + font-family: KaTeX_Main; + font-style: normal; + } + + // This value is also used in fontMetrics.js, if you change it make sure the + // values match. + $ptperem: 10; + $nulldelimiterspace: calc(1.2em / $ptperem); + + $muspace: 0.055556em; // 1mu + $thinspace: 0.16667em; // 3mu + $mediumspace: 0.22222em; // 4mu + $thickspace: 0.27778em; // 5mu + + .vlist-t { + display: inline-table; + table-layout: fixed; + border-collapse: collapse; + } + + .vlist-r { + display: table-row; + } + + .vlist { + display: table-cell; + vertical-align: bottom; + position: relative; + + > span { + display: block; + height: 0; + position: relative; + + > span { + display: inline-block; + } + + > .pstrut { + overflow: hidden; + width: 0; + } + } + } + + .vlist-t2 { + margin-right: -2px; + } + + .vlist-s { + // This cell solves Safari rendering problems. It has text content, so + // its baseline is used for the table. A very small font avoids line-box + // issues; absolute units prevent user font-size overrides from breaking + // rendering. Safari refuses to make the box zero-width, so we give it + // a known width and compensate with negative right margin on the + // inline-table. To prevent the "width: min-content" Chrome workaround + // from shrinking this box, we also set min-width. + display: table-cell; + vertical-align: bottom; + font-size: 1px; + width: 2px; + min-width: 2px; + } + + .vbox { + display: inline-flex; + flex-direction: column; + align-items: baseline; + } + + .hbox { + display: inline-flex; + flex-direction: row; + width: 100%; + } + + .thinbox { + display: inline-flex; + flex-direction: row; + width: 0; + max-width: 0; // necessary for Safari + } + + .msupsub { + text-align: left; + } + + .mfrac { + > span > span { + text-align: center; + } + + .frac-line { + display: inline-block; + width: 100%; + border-bottom-style: solid; + } + } + + // Prevent Chrome from disappearing frac-lines, rules, etc. 
+ .mfrac .frac-line, + .overline .overline-line, + .underline .underline-line, + .hline, + .hdashline, + .rule { + min-height: 1px; + } + + .mspace { + display: inline-block; + } + + .llap, + .rlap, + .clap { + width: 0; + position: relative; + + > .inner { + position: absolute; + } + + > .fix { + display: inline-block; + } + } + + .llap > .inner { + right: 0; + } + + .rlap > .inner, + .clap > .inner { + left: 0; + } + + .clap > .inner > span { + margin-left: -50%; + margin-right: 50%; + } + + .rule { + display: inline-block; + border: solid 0; + position: relative; + } + + .overline .overline-line, + .underline .underline-line, + .hline { + display: inline-block; + width: 100%; + border-bottom-style: solid; + } + + .hdashline { + display: inline-block; + width: 100%; + border-bottom-style: dashed; + } + + .sqrt { + > .root { + /* These values are taken from the definition of `\r@@t`, + `\mkern 5mu` and `\mkern -10mu`. */ + margin-left: calc(5 * $mu); + margin-right: calc(-10 * $mu); + } + } + + .sizing, + .fontsize-ensurer { + $sizes: 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.2, 1.44, 1.728, 2.074, 2.488; + + @for $from from 1 through length($sizes) { + @for $to from 1 through length($sizes) { + &.reset-size#{$from}.size#{$to} { + /* stylelint-disable-next-line */ + font-size: calc((nth($sizes, $to) / nth($sizes, $from)) * 1em); + } + } + } + } + + .delimsizing { + &.size1 { + font-family: KaTeX_Size1; + } + &.size2 { + font-family: KaTeX_Size2; + } + &.size3 { + font-family: KaTeX_Size3; + } + &.size4 { + font-family: KaTeX_Size4; + } + + &.mult { + .delim-size1 > span { + font-family: KaTeX_Size1; + } + + .delim-size4 > span { + font-family: KaTeX_Size4; + } + } + } + + .nulldelimiter { + display: inline-block; + width: $nulldelimiterspace; + } + + .delimcenter { + position: relative; + } + + .op-symbol { + position: relative; + + &.small-op { + font-family: KaTeX_Size1; + } + + &.large-op { + font-family: KaTeX_Size2; + } + } + + .op-limits { + > .vlist-t { + text-align: center; + } + } + + .accent { + > .vlist-t { + text-align: center; + } + + .accent-body { + position: relative; // so that 'left' can shift the accent + } + + // Accents that are not of the accent-full class have zero width + // (do not contribute to the width of the final symbol). + .accent-body:not(.accent-full) { + width: 0; + } + } + + .overlay { + display: block; + } + + .mtable { + .vertical-separator { + display: inline-block; + // margin and border-right are set in JavaScript + min-width: 1px; // Prevent Chrome from omitting a line. + } + + .arraycolsep { + display: inline-block; + } + + .col-align-c > .vlist-t { + text-align: center; + } + + .col-align-l > .vlist-t { + text-align: left; + } + + .col-align-r > .vlist-t { + text-align: right; + } + } + + .svg-align { + text-align: left; + } + + svg { + display: block; + position: absolute; // absolute relative to parent + width: 100%; + height: inherit; + + // We want to inherit colors from our environment + fill: currentColor; + stroke: currentColor; + + // But path elements should not have an outline by default + // that would make them bigger than we expect. + path { + stroke: none; + } + + // And we don't want to inherit any other style properties + // that could affect SVG rendering without affecting font + // rendering. So we reset these properties to their default + // values for every element. 
+ // See https://www.w3.org/TR/SVG/painting.html + fill-rule: nonzero; + fill-opacity: 1; + stroke-width: 1; + stroke-linecap: butt; + stroke-linejoin: miter; + stroke-miterlimit: 4; + stroke-dasharray: none; + stroke-dashoffset: 0; + stroke-opacity: 1; + } + + img { + border-style: none; + min-width: 0; + min-height: 0; + max-width: none; + max-height: none; + } + + // Define CSS for image whose width will match its span width. + .stretchy { + width: 100%; + display: block; + position: relative; + overflow: hidden; + + &::before, + &::after { + content: ''; + } + } + + // Hide the long tail of a stretchy SVG. + .hide-tail { + width: 100%; // necessary only to get IE to work properly + position: relative; // ditto + overflow: hidden; // This line applies to all browsers. + } + + .halfarrow-left { + position: absolute; + left: 0; + width: 50.2%; + overflow: hidden; + } + + .halfarrow-right { + position: absolute; + right: 0; + width: 50.2%; + overflow: hidden; + } + + .brace-left { + position: absolute; + left: 0; + width: 25.1%; + overflow: hidden; + } + + .brace-center { + position: absolute; + left: 25%; + width: 50%; + overflow: hidden; + } + + .brace-right { + position: absolute; + right: 0; + width: 25.1%; + overflow: hidden; + } + + // Lengthen the extensible arrows via padding. + .x-arrow-pad { + padding: 0 0.5em; + } + + .cd-arrow-pad { + padding: 0 0.55556em 0 0.27778em; // \;{#1}\;\; + } + + .x-arrow, + .mover, + .munder { + text-align: center; + } + + .boxpad { + padding: 0 0.3em; // \fboxsep = 3pt + } + + .fbox, + .fcolorbox { + box-sizing: border-box; + border: 0.04em solid; // \fboxrule = 0.4pt + } + + .cancel-pad { + padding: 0 0.2em; // ref: cancel package \advance\dimen@ 2\p@ % "+2" + } + + .cancel-lap { + margin-left: -0.2em; // \cancel does not affect horizontal spacing. + margin-right: -0.2em; // Apply negative margin to correct for 0.2em padding + } // inside the \cancel group. + + .sout { + border-bottom-style: solid; + border-bottom-width: 0.08em; + } + + .angl { + // from package actuarialangle, which is always used in a subscript. + box-sizing: border-box; + border-top: 0.049em solid; // defaultRuleThickness in scriptstyle + border-right: 0.049em solid; // ditto + margin-right: 0.03889em; // 1 mu + } + + .anglpad { + padding: 0 0.03889em; // pad 1mu left and right (in scriptstyle) + } + + .eqn-num::before { + counter-increment: katexEqnNo; + content: '(' counter(katexEqnNo) ')'; + } + + .mml-eqn-num::before { + counter-increment: mmlEqnNo; + content: '(' counter(mmlEqnNo) ')'; + } + + .mtr-glue { + width: 50%; + } + + .cd-vert-arrow { + display: inline-block; + position: relative; + } + + .cd-label-left { + display: inline-block; + position: absolute; + right: calc(50% + 0.3em); + text-align: left; + } + + .cd-label-right { + display: inline-block; + position: absolute; + left: calc(50% + 0.3em); + text-align: right; + } +} + +.katex-display { + display: block; + margin: 1em 0; + text-align: center; + + > .katex { + display: block; + text-align: center; + white-space: nowrap; + + > .katex-html { + display: block; + position: relative; + + > .tag { + position: absolute; + right: 0; + } + } + } +} + +// Left-justified tags (default is right-justified) +.katex-display.leqno > .katex > .katex-html > .tag { + left: 0; + right: auto; +} + +// Flush-left display math +.katex-display.fleqn > .katex { + text-align: left; + padding-left: 2em; +} + +// Automatic equation numbers for some environments. +// Use parallel counters for HTML and MathML. 
+body {
+  counter-reset: katexEqnNo mmlEqnNo;
+}
diff --git a/web/utils/datetime.ts b/web/utils/datetime.ts
index d139c95c2..e596f0841 100644
--- a/web/utils/datetime.ts
+++ b/web/utils/datetime.ts
@@ -1,17 +1,32 @@
 export const isToday = (timestamp: number) => {
   const today = new Date()
-  return today.setHours(0, 0, 0, 0) == new Date(timestamp).setHours(0, 0, 0, 0)
+  return today.setHours(0, 0, 0, 0) === new Date(timestamp).setHours(0, 0, 0, 0)
 }
 
 export const displayDate = (timestamp?: string | number | Date) => {
   if (!timestamp) return 'N/A'
 
-  let displayDate = new Date(timestamp).toLocaleString()
+  const date = new Date(timestamp)
+
+  let displayDate = `${date.toLocaleDateString(undefined, {
+    day: '2-digit',
+    month: 'short',
+    year: 'numeric',
+  })}, ${date.toLocaleTimeString(undefined, {
+    hour: '2-digit',
+    minute: '2-digit',
+    second: '2-digit',
+    hour12: true,
+  })}`
+
   if (typeof timestamp === 'number' && isToday(timestamp)) {
-    displayDate = new Date(timestamp).toLocaleTimeString(undefined, {
+    displayDate = date.toLocaleTimeString(undefined, {
       hour: '2-digit',
       minute: '2-digit',
+      second: '2-digit',
+      hour12: true,
     })
   }
+
   return displayDate
 }
diff --git a/web/utils/modelEngine.ts b/web/utils/modelEngine.ts
new file mode 100644
index 000000000..a12e9bb0e
--- /dev/null
+++ b/web/utils/modelEngine.ts
@@ -0,0 +1,66 @@
+import { InferenceEngine } from '@janhq/core'
+
+export const getLogoEngine = (engine: InferenceEngine) => {
+  switch (engine) {
+    case InferenceEngine.anthropic:
+      return 'images/ModelProvider/anthropic.svg'
+    case InferenceEngine.nitro:
+      return 'images/ModelProvider/nitro.svg'
+    case InferenceEngine.cortex_llamacpp:
+    case InferenceEngine.cortex_onnx:
+    case InferenceEngine.cortex_tensorrtllm:
+      return 'images/ModelProvider/cortex.svg'
+    case InferenceEngine.mistral:
+      return 'images/ModelProvider/mistral.svg'
+    case InferenceEngine.martian:
+      return 'images/ModelProvider/martian.svg'
+    case InferenceEngine.openrouter:
+      return 'images/ModelProvider/openRouter.svg'
+    case InferenceEngine.openai:
+      return 'images/ModelProvider/openai.svg'
+    case InferenceEngine.groq:
+      return 'images/ModelProvider/groq.svg'
+    case InferenceEngine.triton_trtllm:
+      return 'images/ModelProvider/triton_trtllm.svg'
+    case InferenceEngine.cohere:
+      return 'images/ModelProvider/cohere.svg'
+    case InferenceEngine.nvidia:
+      return 'images/ModelProvider/nvidia.svg'
+    default:
+      return undefined
+  }
+}
+
+export const localEngines = [
+  InferenceEngine.nitro,
+  InferenceEngine.nitro_tensorrt_llm,
+  InferenceEngine.cortex_llamacpp,
+  InferenceEngine.cortex_onnx,
+  InferenceEngine.cortex_tensorrtllm,
+]
+
+export const getTitleByEngine = (engine: InferenceEngine) => {
+  switch (engine) {
+    case InferenceEngine.nitro:
+      return 'Llama.cpp (Nitro)'
+    case InferenceEngine.cortex_llamacpp:
+      return 'Llama.cpp (Cortex)'
+    case InferenceEngine.cortex_onnx:
+      return 'Onnx (Cortex)'
+    case InferenceEngine.cortex_tensorrtllm:
+      return 'TensorRT-LLM (Cortex)'
+    case InferenceEngine.openai:
+      return 'OpenAI'
+    case InferenceEngine.openrouter:
+      return 'OpenRouter'
+    default:
+      return engine.charAt(0).toUpperCase() + engine.slice(1)
+  }
+}
+
+export const priorityEngine = [
+  InferenceEngine.cortex_llamacpp,
+  InferenceEngine.cortex_onnx,
+  InferenceEngine.cortex_tensorrtllm,
+  InferenceEngine.nitro,
+]
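Finally, a short usage sketch of the two utilities touched above: the reworked `displayDate` and the new `modelEngine` helpers. Import paths assume the web app's `@/` alias, and the printed strings are approximate since `toLocaleDateString`/`toLocaleTimeString` depend on the user's locale:

```ts
import { InferenceEngine } from '@janhq/core'

import { displayDate } from '@/utils/datetime'
import { getTitleByEngine, localEngines } from '@/utils/modelEngine'

// Non-numeric timestamps (and numbers from another day) get the full form,
// e.g. "12 Aug 2024, 09:03:27 PM"; a numeric timestamp from today collapses
// to the time-only form, e.g. "09:03:27 PM".
console.log(displayDate(new Date('2024-08-12T21:03:27')))
console.log(displayDate(Date.now()))

// Engine helpers used by the model dropdown and provider logos.
console.log(getTitleByEngine(InferenceEngine.cortex_llamacpp)) // "Llama.cpp (Cortex)"
console.log(localEngines.includes(InferenceEngine.openai)) // false
```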