diff --git a/extensions/model-extension/download.bat b/extensions/model-extension/download.bat
deleted file mode 100644
index de055cb80..000000000
--- a/extensions/model-extension/download.bat
+++ /dev/null
@@ -1,3 +0,0 @@
-@echo off
-set /p LLAMA_CPP_VERSION=<./scripts/version.txt
-.\node_modules\.bin\download https://github.com/ggerganov/llama.cpp/archive/refs/tags/%LLAMA_CPP_VERSION%.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf .\scripts\llama.cpp.tar.gz "llama.cpp-%LLAMA_CPP_VERSION%/convert.py" "llama.cpp-%LLAMA_CPP_VERSION%/convert-hf-to-gguf.py" "llama.cpp-%LLAMA_CPP_VERSION%/gguf-py" && cpx "./llama.cpp-%LLAMA_CPP_VERSION%/**" "scripts" && rimraf "./scripts/llama.cpp.tar.gz" && rimraf "./llama.cpp-%LLAMA_CPP_VERSION%"
\ No newline at end of file
diff --git a/extensions/model-extension/package.json b/extensions/model-extension/package.json
index 6bd8bbe5e..4a2c61b71 100644
--- a/extensions/model-extension/package.json
+++ b/extensions/model-extension/package.json
@@ -9,31 +9,25 @@
   "license": "AGPL-3.0",
   "scripts": {
     "build": "tsc --module commonjs && rollup -c rollup.config.ts --configPlugin @rollup/plugin-typescript --bundleConfigAsCjs",
-    "download:llama": "run-script-os",
-    "download:llama:linux": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz --wildcards '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
-    "download:llama:darwin": "LLAMA_CPP_VERSION=$(cat ./scripts/version.txt) && download https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_CPP_VERSION}.tar.gz -o . --filename ./scripts/llama.cpp.tar.gz && tar -xzf ./scripts/llama.cpp.tar.gz '*/convert.py' '*/convert-hf-to-gguf.py' '*/gguf-py' && cpx \"./llama.cpp-$LLAMA_CPP_VERSION/**\" \"scripts\" && rimraf \"./scripts/llama.cpp.tar.gz\" && rimraf \"./llama.cpp-$LLAMA_CPP_VERSION\"",
-    "download:llama:win32": "download.bat",
-    "build:publish:linux": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
-    "build:publish:darwin": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && ../../.github/scripts/auto-sign.sh && npm pack && cpx *.tgz ../../pre-install",
-    "build:publish:win32": "rimraf *.tgz --glob && yarn build && yarn download:llama && cpx \"scripts/**\" \"dist/scripts\" && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
-    "build:publish": "run-script-os"
+    "build:publish": "rimraf *.tgz --glob && yarn build && npm pack && cpx *.tgz ../../pre-install"
   },
   "devDependencies": {
-    "cpx": "^1.5.0",
-    "download-cli": "^1.1.1",
-    "rimraf": "^3.0.2",
-    "ts-loader": "^9.5.0",
-    "typescript": "5.3.3",
     "@rollup/plugin-commonjs": "^25.0.7",
     "@rollup/plugin-json": "^6.1.0",
     "@rollup/plugin-node-resolve": "^15.2.3",
     "@rollup/plugin-replace": "^5.0.5",
     "@rollup/plugin-typescript": "^11.1.6",
     "@types/pdf-parse": "^1.1.4",
+    "cpx": "^1.5.0",
+    "download-cli": "^1.1.1",
+    "rimraf": "^3.0.2",
     "rollup": "^2.38.5",
     "rollup-plugin-define": "^1.0.1",
     "rollup-plugin-sourcemaps": "^0.6.3",
-    "rollup-plugin-typescript2": "^0.36.0"
+    "rollup-plugin-typescript2": "^0.36.0",
+    "run-script-os": "^1.1.6",
+    "ts-loader": "^9.5.0",
+    "typescript": "5.3.3"
   },
   "files": [
     "dist/*",
@@ -41,8 +35,15 @@
     "README.md"
   ],
   "dependencies": {
-    "@janhq/core": "file:../../core",
     "@huggingface/gguf": "^0.0.11",
+    "@huggingface/jinja": "^0.3.0",
+    "@janhq/core": "file:../../core",
+    "hyllama": "^0.2.2",
     "python-shell": "^5.0.0"
-  }
+  },
+  "bundleDependencies": [
+    "hyllama",
+    "@huggingface/gguf",
+    "@huggingface/jinja"
+  ]
 }
diff --git a/extensions/model-extension/rollup.config.ts b/extensions/model-extension/rollup.config.ts
index aa22bd1f6..c3f3acc77 100644
--- a/extensions/model-extension/rollup.config.ts
+++ b/extensions/model-extension/rollup.config.ts
@@ -3,7 +3,7 @@ import sourceMaps from 'rollup-plugin-sourcemaps'
 import typescript from 'rollup-plugin-typescript2'
 import json from '@rollup/plugin-json'
 import replace from '@rollup/plugin-replace'
-
+import commonjs from '@rollup/plugin-commonjs'
 const settingJson = require('./resources/settings.json')
 const packageJson = require('./package.json')
 const defaultModelJson = require('./resources/default-model.json')
@@ -39,6 +39,39 @@ export default [
         browser: true,
       }),
+      // Resolve source maps to the original source
+      sourceMaps(),
+    ],
+  },
+  {
+    input: `src/node/index.ts`,
+    output: [
+      {
+        file: 'dist/node/index.cjs.js',
+        format: 'cjs',
+        sourcemap: true,
+        inlineDynamicImports: true,
+      },
+    ],
+    // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash')
+    external: ['@janhq/core/node'],
+    watch: {
+      include: 'src/node/**',
+    },
+    plugins: [
+      // Allow json resolution
+      json(),
+      // Compile TypeScript files
+      typescript({ useTsconfigDeclarationDir: true }),
+      // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs)
+      commonjs(),
+      // Allow node_modules resolution, so you can use 'external' to control
+      // which
external modules to include in the bundle + // https://github.com/rollup/rollup-plugin-node-resolve#usage + resolve({ + extensions: ['.ts', '.js', '.json'], + }), + // Resolve source maps to the original source sourceMaps(), ], diff --git a/extensions/model-extension/scripts/convert-hf-to-gguf.py b/extensions/model-extension/scripts/convert-hf-to-gguf.py deleted file mode 100755 index 0d4ea03b4..000000000 --- a/extensions/model-extension/scripts/convert-hf-to-gguf.py +++ /dev/null @@ -1,1720 +0,0 @@ -#!/usr/bin/env python3 - -from __future__ import annotations - -import argparse -import contextlib -import json -import os -import re -import sys -from enum import IntEnum -from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast - -import numpy as np -import torch - -if TYPE_CHECKING: - from torch import Tensor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf - -from convert import HfVocab - - -# check for any of the given keys in the dictionary and return the value of the first key found -def get_key_opts(d, keys): - for k in keys: - if k in d: - return d[k] - print(f"Could not find any of {keys}") - sys.exit() - - -###### MODEL DEFINITIONS ###### - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class Model: - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): - self.dir_model = dir_model - self.ftype = ftype - self.fname_out = fname_out - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.is_safetensors = self._is_model_safetensors() - self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") - self.part_names = self._get_part_names() - self.hparams = Model.load_hparams(self.dir_model) - self.model_arch = self._get_model_architecture() - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) - - def set_vocab(self): - self._set_vocab_gpt2() - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for part_name in self.part_names: - print(f"gguf: loading model part '{part_name}'") - ctx: ContextManager[Any] - if self.is_safetensors: - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) - else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) - - with ctx as model_part: - for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - yield name, data - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - )) - if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: - self.gguf_writer.add_context_length(n_ctx) - if (n_embd := self.hparams.get("hidden_size")) is not None: - self.gguf_writer.add_embedding_length(n_embd) - if (n_ff := self.hparams.get("intermediate_size")) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - if (n_head := self.hparams.get("num_attention_heads")) is not None: - self.gguf_writer.add_head_count(n_head) - if (n_head_kv := 
self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - - if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file() - self.gguf_writer.close() - - def write_vocab(self): - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - @staticmethod - def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.endswith(prefix): - num_parts += 1 - - return num_parts - - @staticmethod - def load_hparams(dir_model): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @staticmethod - def from_model_architecture(model_architecture): - if model_architecture == "GPTNeoXForCausalLM": - return GPTNeoXModel - if model_architecture == "BloomForCausalLM": - return BloomModel - if model_architecture == "MPTForCausalLM": - return MPTModel - if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return BaichuanModel - if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): - return FalconModel - if model_architecture == "GPTBigCodeForCausalLM": - return StarCoderModel - if model_architecture == "GPTRefactForCausalLM": - return RefactModel - if model_architecture == "PersimmonForCausalLM": - return PersimmonModel - if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return 
StableLMModel - if model_architecture == "QWenLMHeadModel": - return QwenModel - if model_architecture == "Qwen2ForCausalLM": - return Model - if model_architecture == "MixtralForCausalLM": - return MixtralModel - if model_architecture == "GPT2LMHeadModel": - return GPT2Model - if model_architecture == "PhiForCausalLM": - return Phi2Model - if model_architecture == "PlamoForCausalLM": - return PlamoModel - if model_architecture == "CodeShellForCausalLM": - return CodeShellModel - if model_architecture == "OrionForCausalLM": - return OrionModel - if model_architecture == "InternLM2ForCausalLM": - return InternLM2Model - if model_architecture == "MiniCPMForCausalLM": - return MiniCPMModel - return Model - - def _is_model_safetensors(self) -> bool: - return Model.count_model_parts(self.dir_model, ".safetensors") > 0 - - def _get_part_names(self): - if self.is_safetensors: - if self.num_parts == 1: # there's only one .safetensors file - return ("model.safetensors",) - return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) - - if self.num_parts == 1: # there's only one .bin file - return ("pytorch_model.bin",) - return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - - def _get_model_architecture(self) -> gguf.MODEL_ARCH: - arch = self.hparams["architectures"][0] - if arch == "GPTNeoXForCausalLM": - return gguf.MODEL_ARCH.GPTNEOX - if arch == "BloomForCausalLM": - return gguf.MODEL_ARCH.BLOOM - if arch == "MPTForCausalLM": - return gguf.MODEL_ARCH.MPT - if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return gguf.MODEL_ARCH.BAICHUAN - if arch in ("FalconForCausalLM", "RWForCausalLM"): - return gguf.MODEL_ARCH.FALCON - if arch == "GPTBigCodeForCausalLM": - return gguf.MODEL_ARCH.STARCODER - if arch == "GPTRefactForCausalLM": - return gguf.MODEL_ARCH.REFACT - if arch == "PersimmonForCausalLM": - return gguf.MODEL_ARCH.PERSIMMON - if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return gguf.MODEL_ARCH.STABLELM - if arch == "QWenLMHeadModel": - return gguf.MODEL_ARCH.QWEN - if arch == "Qwen2ForCausalLM": - return gguf.MODEL_ARCH.QWEN2 - if arch == "MixtralForCausalLM": - return gguf.MODEL_ARCH.LLAMA - if arch == "GPT2LMHeadModel": - return gguf.MODEL_ARCH.GPT2 - if arch == "PhiForCausalLM": - return gguf.MODEL_ARCH.PHI2 - if arch == "PlamoForCausalLM": - return gguf.MODEL_ARCH.PLAMO - if arch == "CodeShellForCausalLM": - return gguf.MODEL_ARCH.CODESHELL - if arch == "OrionForCausalLM": - return gguf.MODEL_ARCH.ORION - if arch == "InternLM2ForCausalLM": - return gguf.MODEL_ARCH.INTERNLM2 - if arch == "MiniCPMForCausalLM": - return gguf.MODEL_ARCH.MINICPM - - raise NotImplementedError(f'Architecture "{arch}" not supported!') - - def _set_vocab_gpt2(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode('utf-8') - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if 
tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_qwen(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.special_tokens - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode("utf-8") - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json - if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_sentencepiece(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) - - tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.id_to_piece(token_id) - text = piece.encode("utf-8") - score = tokenizer.get_score(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.is_unknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.is_control(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.is_unused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.is_byte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - 
scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_hf(self): - path = self.dir_model - added_tokens_path = self.dir_model - vocab = HfVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - -class GPTNeoXModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - -class BloomModel(Model): - def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams["n_layer"] - tensors = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - has_lm_head = True - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - for name, data_torch in tensors.items(): - if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): - has_lm_head = False - - name = re.sub(r'transformer\.', '', name) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if 
data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) - data = np.concatenate( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) - data = np.concatenate( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.bias") - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "word_embeddings.weight": - self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - -class MPTModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any 
unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - if "scales" in name: - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) - if new_name is not None: - new_name = new_name.replace("scales", "act.scales") - else: - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - self.gguf_writer.add_tensor("output.weight", data) - - -class OrionModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - print("gguf: can not find ctx length parameter.") - sys.exit() - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - - def write_tensors(self): - # Collect tensors from generator object - model_kv = dict(self.get_tensors()) - block_count = self.hparams["num_hidden_layers"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is 
None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - -class BaichuanModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - print("gguf: can not find ctx length parameter.") - sys.exit() - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def write_tensors(self): - # Collect tensors from generator object - model_kv = dict(self.get_tensors()) - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - for i in range(block_count): - if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: - print(f"Unpacking and permuting layer {i}") - model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ - self._reverse_hf_permute_part(w, 0, head_count, head_count) - model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ - self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) - model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ - self._reverse_hf_part(w, 2) - del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - 
- old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] 
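The W_pack handling above (slice the fused QKV projection into thirds, then undo the Hugging Face rotary interleaving on Q and K) is easier to follow on a toy tensor. The following is an illustrative numpy sketch, not code from the diff: the dimensions are made-up dummies, and it covers only the n_head == n_kv_head case, so the GQA adjustment in `_reverse_hf_permute` is omitted.

```python
# Standalone sketch of the W_pack unpacking done by the deleted BaichuanModel code above.
# Dimensions are hypothetical toy values, chosen only to make the reshapes visible.
import numpy as np

n_head = 4                      # toy head count (assumption)
head_dim = 6                    # toy head size, must be even for the rotary un-permute
hidden = n_head * head_dim

# Packed QKV projection: rows 0..hidden-1 are Q, the next hidden rows are K, the last are V.
w_pack = np.arange(3 * hidden * hidden, dtype=np.float32).reshape(3 * hidden, hidden)

def reverse_hf_permute(weights: np.ndarray, n_head: int) -> np.ndarray:
    """Undo the interleaving HF applies to rotary Q/K weights (n_head == n_kv_head case)."""
    return (
        weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
        .swapaxes(1, 2)
        .reshape(weights.shape)
    )

def unpack(w: np.ndarray, part: int) -> np.ndarray:
    """Take one third of the packed tensor: 0 = Q, 1 = K, 2 = V."""
    r = w.shape[0] // 3
    return w[r * part : r * part + r]

q = reverse_hf_permute(unpack(w_pack, 0), n_head)   # permuted back for GGUF
k = reverse_hf_permute(unpack(w_pack, 1), n_head)
v = unpack(w_pack, 2)                               # V is stored unpermuted

print(q.shape, k.shape, v.shape)                    # (24, 24) three times
```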
- - -class FalconModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_name("Falcon") - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - head_dim = self.hparams["hidden_size"] // n_head - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class StarCoderModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("StarCoder") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -class RefactModel(Model): - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("Refact") - # refact uses Alibi. So this is from config.json which might be used by training. - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - block_count = self.hparams["n_layer"] - - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - tensors = dict(self.get_tensors()) - for i in range(block_count): - if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] - del tensors[f"transformer.h.{i}.attn.kv.weight"] - if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w - del tensors[f"transformer.h.{i}.attn.q.weight"] - if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: - tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] - tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] - del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - - for name, data_torch in tensors.items(): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = 
tensor_map.get_name(name, try_suffixes=(".weight",)) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class PersimmonModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - head_count = self.hparams["num_attention_heads"] - head_count_kv = head_count - hidden_size = self.hparams["hidden_size"] - - self.gguf_writer.add_name('persimmon-8b-chat') - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - - # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller - # than the head size? - # ref: https://github.com/ggerganov/llama.cpp/pull/4889 - # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) - self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) - - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - - def set_vocab(self): - self._set_vocab_sentencepiece() - # self.gguf_writer.add_bos_token_id(71013) - # self.gguf_writer.add_eos_token_id(71013) - - def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - if name.endswith(".self_attention.rotary_emb.inv_freq"): - continue - old_dtype = data_torch.dtype - # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) 
- data = data_torch.to(torch.float32).squeeze().numpy() - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - n_dims = len(data.shape) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - -class StableLMModel(Model): - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab - self._set_vocab_qwen() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(1e-5) - - -class MixtralModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - -class MiniCPMModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def set_vocab(self): - self._set_vocab_hf() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - n_head = self.hparams.get("num_attention_heads") - n_kv_head = self.hparams.get("num_key_value_heads") - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = 
self._reverse_hf_permute(data_torch, n_head, n_kv_head) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class QwenModel(Model): - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - self._set_vocab_qwen() - - def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - - def write_tensors(self): - block_count = self.hparams["num_hidden_layers"] - model_kv = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we 
use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - -class GPT2Model(Model): - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_context_length(self.hparams["n_ctx"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): - continue - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - # note: GPT2 output is tied to (same as) wte in original model - if new_name == "token_embd.weight": - print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor("output.weight", data) - - -class Phi2Model(Model): - def set_gguf_parameters(self): - block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"]) - - rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"]) - n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"]) - n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"]) - - self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"])) - - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(4 * n_embd) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"])) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_add_bos_token(False) - - -class PlamoModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - hparams = self.hparams - block_count = hparams["num_hidden_layers"] - - self.gguf_writer.add_name("PLaMo") - self.gguf_writer.add_context_length(4096) # not in config.json - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - - def shuffle_attn_q_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(8, 5, 128, 5120) - data_torch = torch.permute(data_torch, (1, 0, 2, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def shuffle_attn_output_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(5120, 8, 5, 128) - data_torch = torch.permute(data_torch, (0, 2, 1, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - if "self_attn.rotary_emb.inv_freq" in name: - continue - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = 
self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class CodeShellModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("CodeShell") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(10000.0) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - tensors = dict(self.get_tensors()) - has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() - for name, data_torch in tensors.items(): - # we don't need these - if name.endswith((".attn.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "transformer.wte.weight": - self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - -class InternLM2Model(Model): - def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) - - sentencepiece_model = model.ModelProto() - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - - tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.id_to_piece(token_id) - text = piece.encode("utf-8") - score = tokenizer.get_score(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - print(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉" - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.is_unknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.is_control(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.is_unused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.is_byte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - old_eos = special_vocab.special_token_ids["eos"] - if "chat" in os.path.basename(self.dir_model.absolute()): - # For the chat model, we replace the eos with '<|im_end|>'. 
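The write_tensors() methods above (and the ones that follow) all repeat the same output-dtype rule before handing data to gguf_writer.add_tensor(). A minimal restatement of that rule as a standalone helper, purely for illustration (convert_dtype and the example tensor names are not part of the script):

import numpy as np

def convert_dtype(data: np.ndarray, name: str, ftype: int) -> np.ndarray:
    n_dims = len(data.shape)
    # ftype == 0 (f32 output): upcast any float16 tensor
    if ftype == 0 and data.dtype == np.float16:
        return data.astype(np.float32)
    # ftype == 1 (f16 output): 1-D tensors are still stored as float32
    if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
        return data.astype(np.float32)
    # ftype == 1 (f16 output): only 2-D *.weight tensors are downcast to float16
    if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return data.astype(np.float16)
    return data

print(convert_dtype(np.zeros((4, 4), np.float32), "transformer.h.0.attn.c_attn.weight", 1).dtype)  # float16
print(convert_dtype(np.zeros(4, np.float16), "transformer.h.0.ln_1.weight", 1).dtype)              # float32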
- special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ -in chat mode so that the conversation can end normally.") - - special_vocab.add_to_gguf(self.gguf_writer) - - def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') - im_end_list = tokenizer.encode('<|im_end|>') - assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) - if len(unused_145_list) == 1: - eos_token = unused_145_list[0] - if len(im_end_list) == 1: - eos_token = im_end_list[0] - return eos_token - - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - - def post_write_tensors(self, tensor_map, name, data_torch): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - def write_tensors(self): - from einops import rearrange - - num_heads = self.hparams.get("num_attention_heads") - num_kv_heads = self.hparams.get("num_key_value_heads") - hidden_size = self.hparams.get("hidden_size") - q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads - num_groups = num_heads // q_per_kv - - block_count = self.hparams["num_hidden_layers"] - model_kv = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] - qkv = data_torch - qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] - # The model weights of q and k equire additional reshape. - q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - v = rearrange(v, " o g n i -> o (g n i)").T - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) - else: - self.post_write_tensors(tensor_map, name, data_torch) - - -###### CONVERSION LOGIC ###### - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--awq-path", type=Path, default=None, - help="Path to scale awq cache file") - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "--outtype", type=str, choices=["f32", "f16"], default="f16", - help="output format - use f32 for float32, f16 for float16", - ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") - parser.add_argument( - "model", type=Path, - help="directory containing model file", - ) - - return parser.parse_args() - - -def main() -> None: - args = parse_args() - - dir_model = args.model - - if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" - dir_model = tmp_model_path - if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at 
{tmp_model_path}.") - - if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file=sys.stderr) - sys.exit(1) - - ftype_map = { - "f32": gguf.GGMLQuantizationType.F32, - "f16": gguf.GGMLQuantizationType.F16, - } - - if args.outfile is not None: - fname_out = args.outfile - else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' - - print(f"Loading model: {dir_model.name}") - - hparams = Model.load_hparams(dir_model) - - with torch.inference_mode(): - model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) - - print("Set model parameters") - model_instance.set_gguf_parameters() - - print("Set model tokenizer") - model_instance.set_vocab() - - if args.vocab_only: - print(f"Exporting model vocab to '{fname_out}'") - model_instance.write_vocab() - else: - print(f"Exporting model to '{fname_out}'") - model_instance.write() - - print(f"Model successfully exported to '{fname_out}'") - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/convert.py b/extensions/model-extension/scripts/convert.py deleted file mode 100755 index 323e8058d..000000000 --- a/extensions/model-extension/scripts/convert.py +++ /dev/null @@ -1,1478 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import concurrent.futures -import enum -import faulthandler -import functools -import itertools -import json -import math -import mmap -import os -import pickle -import re -import signal -import struct -import sys -import time -import zipfile -from abc import ABCMeta, abstractmethod -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from dataclasses import dataclass -from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar - -import numpy as np -from sentencepiece import SentencePieceProcessor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf - -if TYPE_CHECKING: - from typing import TypeAlias - -if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): - faulthandler.register(signal.SIGUSR1) - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -ARCH = gguf.MODEL_ARCH.LLAMA - -DEFAULT_CONCURRENCY = 8 - -# -# data types -# - - -@dataclass(frozen=True) -class DataType: - name: str - dtype: np.dtype[Any] - valid_conversions: list[str] - - def elements_to_bytes(self, n_elements: int) -> int: - return n_elements * self.dtype.itemsize - - -@dataclass(frozen=True) -class UnquantizedDataType(DataType): - pass - - -DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) -DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) -DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) -DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) - - -@dataclass(frozen=True) -class QuantizedDataType(DataType): - block_size: int - quantized_dtype: np.dtype[Any] - ggml_type: gguf.GGMLQuantizationType - - def quantize(self, arr: NDArray) -> NDArray: - raise NotImplementedError(f'Quantization for {self.name} not implemented') - - def elements_to_bytes(self, n_elements: int) -> int: - assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} 
for {self.name} with block size {self.block_size}' - return self.quantized_dtype.itemsize * (n_elements // self.block_size) - - -@dataclass(frozen=True) -class Q8_0QuantizedDataType(QuantizedDataType): - # Mini Q8_0 quantization in Python! - def quantize(self, arr: NDArray) -> NDArray: - assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}' - assert arr.dtype == np.float32, f'Bad array type {arr.dtype}' - n_blocks = arr.size // self.block_size - blocks = arr.reshape((n_blocks, self.block_size)) - # Much faster implementation of block quantization contributed by @Cebtenzzre - - def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: - d = abs(blocks).max(axis = 1) / np.float32(127) - with np.errstate(divide = 'ignore'): - qs = (blocks / d[:, None]).round() - qs[d == 0] = 0 - yield from zip(d, qs) - return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) - - -DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', - dtype = np.dtype(np.float32), valid_conversions = [], - ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, - quantized_dtype = np.dtype([('d', ' DataType: - dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self) - if dt is None: - raise ValueError(self) - # 1D tensors are always F32. - return dt if len(tensor.shape) > 1 else DT_F32 - - -GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { - GGMLFileType.AllF32 : DT_F32, - GGMLFileType.MostlyF16 : DT_F16, - GGMLFileType.MostlyQ8_0: DT_Q8_0, -} - -# -# hparams loading -# - - -@dataclass -class Params: - n_vocab: int - n_embd: int - n_layer: int - n_ctx: int - n_ff: int - n_head: int - n_head_kv: int - n_experts: int | None = None - n_experts_used: int | None = None - f_norm_eps: float | None = None - - rope_scaling_type: gguf.RopeScalingType | None = None - f_rope_freq_base: float | None = None - f_rope_scale: float | None = None - n_orig_ctx: int | None = None - rope_finetuned: bool | None = None - - ftype: GGMLFileType | None = None - - # path to the directory containing the model files - path_model: Path | None = None - - @staticmethod - def guessed(model: LazyModel) -> Params: - # try transformer naming first - n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape - - # try transformer naming first - if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) - elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming - n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) - else: - n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) - - if n_layer < 1: - raise Exception("failed to guess 'n_layer'. 
This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - - n_head = n_embd // 128 # guessed - n_mult = 256 # guessed - - # TODO: verify this - n_ff = int(2 * (4 * n_embd) / 3) - n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) - - return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, - n_ctx = -1, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head, - f_norm_eps = 1e-5, - ) - - @staticmethod - def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: - config = json.load(open(config_path)) - - rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None - rope_scaling = config.get("rope_scaling") - - if rope_scaling is not None and (typ := rope_scaling.get("type")): - rope_factor = rope_scaling.get("factor") - f_rope_scale = rope_factor - if typ == "linear": - rope_scaling_type = gguf.RopeScalingType.LINEAR - elif typ == "yarn": - rope_scaling_type = gguf.RopeScalingType.YARN - n_orig_ctx = rope_scaling['original_max_position_embeddings'] - rope_finetuned = rope_scaling['finetuned'] - else: - raise NotImplementedError(f'Unknown rope scaling type: {typ}') - - if "max_sequence_length" in config: - n_ctx = config["max_sequence_length"] - elif "max_position_embeddings" in config: - n_ctx = config["max_position_embeddings"] - else: - raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - - n_experts = None - n_experts_used = None - - if "num_local_experts" in config: - n_experts = config["num_local_experts"] - n_experts_used = config["num_experts_per_tok"] - - return Params( - n_vocab = config["vocab_size"], - n_embd = config["hidden_size"], - n_layer = config["num_hidden_layers"], - n_ctx = n_ctx, - n_ff = config["intermediate_size"], - n_head = (n_head := config["num_attention_heads"]), - n_head_kv = config.get("num_key_value_heads", n_head), - n_experts = n_experts, - n_experts_used = n_experts_used, - f_norm_eps = config["rms_norm_eps"], - f_rope_freq_base = config.get("rope_theta"), - rope_scaling_type = rope_scaling_type, - f_rope_scale = f_rope_scale, - n_orig_ctx = n_orig_ctx, - rope_finetuned = rope_finetuned, - ) - - # LLaMA v2 70B params.json - # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} - @staticmethod - def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: - config = json.load(open(config_path)) - - n_experts = None - n_experts_used = None - f_rope_freq_base = None - - # hack to determine LLaMA v1 vs v2 vs CodeLlama - if config.get("moe"): - # Mixtral - n_ctx = 32768 - elif config.get("rope_theta") == 1000000: - # CodeLlama - n_ctx = 16384 - elif config["norm_eps"] == 1e-05: - # LLaMA v2 - n_ctx = 4096 - else: - # LLaMA v1 - n_ctx = 2048 - - if "layers.0.feed_forward.w1.weight" in model: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] - - if config.get("moe"): - n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] - n_experts = config["moe"]["num_experts"] - n_experts_used = config["moe"]["num_experts_per_tok"] - f_rope_freq_base = 1e6 - - return Params( - n_vocab = model["tok_embeddings.weight"].shape[0], - n_embd = config["dim"], - n_layer = config["n_layers"], - n_ctx = n_ctx, - n_ff = n_ff, - n_head = (n_head := config["n_heads"]), - n_head_kv = config.get("n_kv_heads", 
n_head), - n_experts = n_experts, - n_experts_used = n_experts_used, - f_norm_eps = config["norm_eps"], - f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), - ) - - @staticmethod - def load(model_plus: ModelPlus) -> Params: - hf_config_path = model_plus.paths[0].parent / "config.json" - orig_config_path = model_plus.paths[0].parent / "params.json" - - if hf_config_path.exists(): - params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) - elif orig_config_path.exists(): - params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) - elif model_plus.format != 'none': - params = Params.guessed(model_plus.model) - else: - raise ValueError('Cannot guess params when model format is none') - - params.path_model = model_plus.paths[0].parent - - return params - - -# -# vocab -# - -class BpeVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - if isinstance(self.bpe_tokenizer.get('model'), dict): - self.vocab = self.bpe_tokenizer["model"]["vocab"] - else: - self.vocab = self.bpe_tokenizer - added_tokens: dict[str, int] - if fname_added_tokens is not None: - # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - added_tokens = dict( - (item['content'], item['id']) - for item in tokenizer_json.get('added_tokens', []) - # Added tokens here can be duplicates of the main vocabulary. 
- if item['content'] not in self.bpe_tokenizer) - - vocab_size: int = len(self.vocab) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") - - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_dict = added_tokens - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} - - for i, _ in enumerate(self.vocab): - yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.CONTROL - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.bpe_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} - - vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - - new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} - expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) - - if expected_new_ids != actual_new_ids: - raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") - - # Token pieces that were added to the base vocabulary. - self.added_tokens_dict = added_tokens - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.sentencepiece_tokenizer - for i in range(tokenizer.vocab_size()): - piece = tokenizer.id_to_piece(i) - text: bytes = piece.encode("utf-8") - score: float = tokenizer.get_score(i) - - toktype = gguf.TokenType.NORMAL - if tokenizer.is_unknown(i): - toktype = gguf.TokenType.UNKNOWN - if tokenizer.is_control(i): - toktype = gguf.TokenType.CONTROL - - # NOTE: I think added_tokens are user defined. 
- # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto - # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED - - if tokenizer.is_unused(i): - toktype = gguf.TokenType.UNUSED - if tokenizer.is_byte(i): - toktype = gguf.TokenType.BYTE - - yield text, score, toktype - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.sentencepiece_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class HfVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: - try: - from transformers import AutoTokenizer - except ImportError as e: - raise ImportError( - "To use HfVocab, please install the `transformers` package. " - "You can install it with `pip install transformers`." - ) from e - - print("fname_tokenizer:", fname_tokenizer) - # Allow the tokenizer to default to slow or fast versions. - # Explicitly set tokenizer to use local paths. - self.tokenizer = AutoTokenizer.from_pretrained( - fname_tokenizer, - cache_dir=fname_tokenizer, - local_files_only=True, - ) - - # Initialize lists and dictionaries for added tokens - self.added_tokens_list = [] - self.added_tokens_dict = dict() - self.added_tokens_ids = set() - - # Process added tokens - for tok, tokidx in sorted( - self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] - ): - # Only consider added tokens that are not in the base vocabulary - if tokidx >= self.tokenizer.vocab_size: - self.added_tokens_list.append(tok) - self.added_tokens_dict[tok] = tokidx - self.added_tokens_ids.add(tokidx) - - # Store special tokens and their IDs - self.specials = { - tok: self.tokenizer.get_vocab()[tok] - for tok in self.tokenizer.all_special_tokens - } - self.special_ids = set(self.tokenizer.all_special_ids) - - # Set vocabulary sizes - self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - reverse_vocab = { - id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() - } - - for token_id in range(self.vocab_size_base): - # Skip processing added tokens here - if token_id in self.added_tokens_ids: - continue - - # Convert token text to bytes - token_text = reverse_vocab[token_id].encode("utf-8") - - # Yield token text, score, and type - yield token_text, self.get_token_score(token_id), self.get_token_type( - token_id, token_text, self.special_ids # Reuse already stored special IDs - ) - - def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: - # Special case for byte tokens - if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - return gguf.TokenType.BYTE - - # Determine token type based on whether it's a special token - return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - - def get_token_score(self, token_id: int) -> float: - # Placeholder for actual logic to determine the token's score - # This needs to be implemented based on specific requirements - return -1000.0 # Default score - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in 
self.added_tokens_list: - if text in self.specials: - toktype = self.get_token_type(self.specials[text], b'', self.special_ids) - score = self.get_token_score(self.specials[text]) - else: - toktype = gguf.TokenType.USER_DEFINED - score = -1000.0 - - yield text.encode("utf-8"), score, toktype - - def has_newline_token(self): - return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.hf_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab" - - -# -# data loading -# TODO: reuse (probably move to gguf.py?) -# - - -def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - -class Tensor(metaclass=ABCMeta): - data_type: DataType - - @abstractmethod - def astype(self, data_type: DataType) -> Tensor: ... - @abstractmethod - def permute(self, n_head: int, n_head_kv: int) -> Tensor: ... - @abstractmethod - def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ... - @abstractmethod - def part(self, n_part: int) -> UnquantizedTensor: ... - @abstractmethod - def to_ggml(self) -> GGMLCompatibleTensor: ... - - -def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: - assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}" - fp32_arr = bf16_arr.astype(np.uint32) << 16 - return fp32_arr.view(np.float32) - - -class UnquantizedTensor(Tensor): - def __init__(self, ndarray: NDArray) -> None: - assert isinstance(ndarray, np.ndarray) - self.ndarray = ndarray - self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] - - def astype(self, data_type: DataType) -> Tensor: - dtype = data_type.dtype - if self.data_type == DT_BF16: - self.ndarray = bf16_to_fp32(self.ndarray) - return UnquantizedTensor(self.ndarray.astype(dtype)) - - def to_ggml(self) -> UnquantizedTensor: - return self - - def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: - r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - - def part(self, n_part: int) -> UnquantizedTensor: - r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) - - def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor: - return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv)) - - -def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray: - tensor = lazy_tensor.load() - assert isinstance(tensor, UnquantizedTensor) - - # double-check: - actual_shape = list(tensor.ndarray.shape) - assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape) - if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype: - if convert: - tensor.ndarray = tensor.ndarray.astype(expected_dtype) - else: - raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}') - - return tensor.ndarray - - -GGMLCompatibleTensor = UnquantizedTensor - 
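The bf16_to_fp32() helper just above relies on bfloat16 being the top 16 bits of an IEEE float32, so widening each value to uint32 and shifting it left by 16 yields a valid float32 bit pattern. A small round-trip check (the truncation line is only for this demo; in the converter the bf16 bit patterns arrive from torch checkpoints as raw uint16):

import numpy as np

def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
    # widen each uint16 bfloat16 pattern into the high half of a uint32,
    # then reinterpret the bits as float32
    return (bf16_arr.astype(np.uint32) << 16).view(np.float32)

x = np.array([1.0, 3.14159, -0.0078125], dtype=np.float32)
bf16 = (x.view(np.uint32) >> 16).astype(np.uint16)  # crude truncation to bf16 bit patterns
print(bf16_to_fp32(bf16))  # -> [1.0, 3.140625, -0.0078125]: close, since bf16 keeps only 7 mantissa bits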
- -@dataclass -class LazyTensor: - _load: Callable[[], Tensor] - shape: list[int] - data_type: DataType - description: str - - def load(self) -> Tensor: - ret = self._load() - # Should be okay if it maps to the same numpy type? - assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ - (self.data_type, ret.data_type, self.description) - return ret - - def astype(self, data_type: DataType) -> LazyTensor: - self.validate_conversion_to(data_type) - - def load() -> Tensor: - return self.load().astype(data_type) - return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') - - def validate_conversion_to(self, data_type: DataType) -> None: - if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions: - raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') - - -LazyModel: TypeAlias = 'dict[str, LazyTensor]' - - -@dataclass -class ModelPlus: - model: LazyModel - paths: list[Path] # Where this was read from. - format: Literal['ggml', 'torch', 'safetensors', 'none'] - vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. - - -def merge_sharded(models: list[LazyModel]) -> LazyModel: - # Original LLaMA models have each file contain one part of each tensor. - # Use a dict instead of a set to preserve order. - names = {name: None for model in models for name in model} - - def convert(name: str) -> LazyTensor: - lazy_tensors: list[LazyTensor] = [model[name] for model in models] - if len(lazy_tensors) == 1: - # only one file; don't go through this procedure since there might - # be quantized tensors - return lazy_tensors[0] - if len(lazy_tensors[0].shape) == 1: - # the tensor is just duplicated in every file - return lazy_tensors[0] - if name.startswith('tok_embeddings.') or \ - name.endswith('.attention.wo.weight') or \ - name.endswith('.feed_forward.w2.weight'): - # split by columns - axis = 1 - else: - # split by rows - axis = 0 - concatenated_shape = list(lazy_tensors[0].shape) - concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) - - def load() -> UnquantizedTensor: - ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] - concatenated: NDArray = np.concatenate(ndarrays, axis=axis) - return UnquantizedTensor(concatenated) - description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' - return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) - return {name: convert(name) for name in names} - - -def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: - formats = set(mp.format for mp in models_plus) - assert len(formats) == 1, "different formats?" - format = formats.pop() - paths = [path for mp in models_plus for path in mp.paths] - # Use the first non-None vocab, if any. - try: - vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None) - except StopIteration: - vocab = None - - if any("model.embed_tokens.weight" in mp.model for mp in models_plus): - # Transformers models put different tensors in different files, but - # don't split individual tensors between files. 
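The merge_sharded() function above reassembles tensors that the original multi-file LLaMA checkpoints split across shards: token-embedding, attention-output and feed-forward w2 weights are split by columns, everything else by rows. A toy illustration of that axis choice (merge_axis is not a function in the script):

import numpy as np

def merge_axis(name: str) -> int:
    if name.startswith('tok_embeddings.') or \
       name.endswith('.attention.wo.weight') or \
       name.endswith('.feed_forward.w2.weight'):
        return 1  # shards hold column slices, so concatenate along columns
    return 0      # otherwise shards hold row slices

parts = [np.ones((4, 2)), np.ones((4, 2))]
print(np.concatenate(parts, axis=merge_axis('layers.0.attention.wo.weight')).shape)  # (4, 4)
print(np.concatenate(parts, axis=merge_axis('layers.0.attention.wq.weight')).shape)  # (8, 2)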
- model: LazyModel = {} - for mp in models_plus: - model.update(mp.model) - else: - model = merge_sharded([mp.model for mp in models_plus]) - - return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types - - -def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: - def load() -> Tensor: - return lazy_tensor.load().permute(n_head, n_head_kv) - return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) - - -def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: - def load() -> Tensor: - return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) - s = lazy_tensor.shape.copy() - s[0] = s[0] // 3 - return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) - - -def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: - def load() -> Tensor: - return lazy_tensor.load().part(n_part) - s = lazy_tensor.shape.copy() - s[0] = s[0] // 3 - return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) - - -# Functionality that simulates `torch.load` but where individual tensors are -# only loaded into memory on demand, not all at once. -# PyTorch can't do this natively as of time of writing: -# - https://github.com/pytorch/pytorch/issues/64327 -# This allows us to de-shard without multiplying RAM usage, and also -# conveniently drops the PyTorch dependency (though we still need numpy). - - -@dataclass -class LazyStorageKind: - data_type: DataType - - -@dataclass -class LazyStorage: - load: Callable[[int, int], NDArray] - kind: LazyStorageKind - description: str - - -class LazyUnpickler(pickle.Unpickler): - def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): - super().__init__(fp) - self.data_base_path = data_base_path - self.zip_file = zip_file - - def persistent_load(self, pid: Any) -> Any: - assert pid[0] == 'storage' - assert isinstance(pid[1], LazyStorageKind) - data_type = pid[1].data_type - filename_stem = pid[2] - filename = f'{self.data_base_path}/{filename_stem}' - info = self.zip_file.getinfo(filename) - - def load(offset: int, elm_count: int) -> NDArray: - dtype = data_type.dtype - fp = self.zip_file.open(info) - fp.seek(offset * dtype.itemsize) - size = elm_count * dtype.itemsize - data = fp.read(size) - assert len(data) == size - return np.frombuffer(data, dtype) - description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' - return LazyStorage(load=load, kind=pid[1], description=description) - - @staticmethod - def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, - requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: - assert isinstance(storage, LazyStorage) - - def load() -> UnquantizedTensor: - elm_count = stride[0] * size[0] - return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) - description = f'pickled storage_offset={storage_offset} in {storage.description}' - return LazyTensor(load, list(size), storage.kind.data_type, description) - - @staticmethod - def rebuild_from_type_v2(func, new_type, args, state): - return func(*args) - - CLASSES: dict[tuple[str, str], Any] = { - # getattr used here as a workaround for mypy not being smart enough to determine - # the staticmethods have a __func__ attribute. 
- ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), - ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), - ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), - ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), - ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), - ('torch', 'IntStorage'): LazyStorageKind(DT_I32), - ('torch', 'Tensor'): LazyTensor, - } - - def find_class(self, module: str, name: str) -> Any: - if not module.startswith('torch'): - return super().find_class(module, name) - return self.CLASSES[(module, name)] - - -def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: - zf = zipfile.ZipFile(outer_fp) - pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] - assert len(pickle_paths) == 1, pickle_paths - pickle_fp = zf.open(pickle_paths[0], 'r') - unpickler = LazyUnpickler(pickle_fp, - data_base_path=pickle_paths[0][:-4], - zip_file=zf) - model = unpickler.load() - if 'model' in model: model = model['model'] - as_dict = dict(model.items()) - return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) - - -def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: - header_size, = struct.unpack(' LazyTensor: - data_type = SAFETENSORS_DATA_TYPES[info['dtype']] - numpy_dtype = data_type.dtype - shape: list[int] = info['shape'] - begin, end = info['data_offsets'] - assert 0 <= begin <= end <= len(byte_buf) - assert end - begin == math.prod(shape) * numpy_dtype.itemsize - buf = byte_buf[begin:end] - - def load() -> UnquantizedTensor: - return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) - description = f'safetensors begin={begin} end={end} type={data_type} path={path}' - return LazyTensor(load, shape, data_type, description) - model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'} - return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) - - -def must_read(fp: IO[bytes], length: int) -> bytes: - ret = fp.read(length) - if len(ret) < length: - raise Exception("unexpectedly reached end of file") - return ret - - -@functools.lru_cache(maxsize=None) -def lazy_load_file(path: Path) -> ModelPlus: - fp = open(path, 'rb') - first8 = fp.read(8) - fp.seek(0) - if first8[:2] == b'PK': - # A zip file, i.e. PyTorch format - return lazy_load_torch_file(fp, path) - elif struct.unpack(' Iterable[Out]: - '''Parallel map, but with backpressure. If the caller doesn't call `next` - fast enough, this will stop calling `func` at some point rather than - letting results pile up in memory. Specifically, there is a max of one - output value buffered per thread.''' - if concurrency < 2: - yield from map(func, iterable) - # Not reached. 
- iterable = iter(iterable) - executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor] - if use_processpool_executor: - executor_class = ProcessPoolExecutor - else: - executor_class = ThreadPoolExecutor - with executor_class(max_workers=max_workers) as executor: - futures: list[concurrent.futures.Future[Out]] = [] - done = False - for _ in range(concurrency): - try: - futures.append(executor.submit(func, next(iterable))) - except StopIteration: - done = True - break - - while futures: - result = futures.pop(0).result() - while not done and len(futures) < concurrency: - try: - futures.append(executor.submit(func, next(iterable))) - except StopIteration: - done = True - break - yield result - - -def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: - # Handle special case where the model's vocab size is not set - if params.n_vocab == -1: - raise ValueError( - f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?" - ) - - # Check for a vocab size mismatch - if params.n_vocab == vocab.vocab_size: - print("Ignoring added_tokens.json since model matches vocab size without it.") - return - - if pad_vocab and params.n_vocab > vocab.vocab_size: - pad_count = params.n_vocab - vocab.vocab_size - print( - f"Padding vocab with {pad_count} token(s) - through " - ) - for i in range(1, pad_count + 1): - vocab.added_tokens_dict[f""] = -1 - vocab.added_tokens_list.append(f"") - vocab.vocab_size = params.n_vocab - return - - msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})." - if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20: - msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." - if vocab.vocab_size < params.n_vocab: - msg += " Add the --pad-vocab option and try again." 
- - raise Exception(msg) - - -class OutputFile: - def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: - self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - - def add_meta_arch(self, params: Params) -> None: - name = "LLaMA" - - # TODO: better logic to determine model name - if params.n_ctx == 4096: - name = "LLaMA v2" - elif params.path_model is not None: - name = str(params.path_model.parent).split('/')[-1] - - self.gguf.add_name (name) - self.gguf.add_context_length (params.n_ctx) - self.gguf.add_embedding_length (params.n_embd) - self.gguf.add_block_count (params.n_layer) - self.gguf.add_feed_forward_length (params.n_ff) - self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) - self.gguf.add_head_count (params.n_head) - self.gguf.add_head_count_kv (params.n_head_kv) - - if params.n_experts: - self.gguf.add_expert_count(params.n_experts) - - if params.n_experts_used: - self.gguf.add_expert_used_count(params.n_experts_used) - - if params.f_norm_eps: - self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) - else: - raise ValueError('f_norm_eps is None') - - if params.f_rope_freq_base is not None: - self.gguf.add_rope_freq_base(params.f_rope_freq_base) - - if params.rope_scaling_type: - assert params.f_rope_scale is not None - self.gguf.add_rope_scaling_type(params.rope_scaling_type) - self.gguf.add_rope_scaling_factor(params.f_rope_scale) - - if params.n_orig_ctx is not None: - self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) - - if params.rope_finetuned is not None: - self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) - - if params.ftype is not None: - self.gguf.add_file_type(params.ftype) - - def handle_tokenizer_model(self, vocab: Vocab) -> str: - # Map the vocab types to the supported tokenizer models - tokenizer_model = { - SentencePieceVocab: "llama", - HfVocab: "llama", - BpeVocab: "gpt2", - }.get(type(vocab)) - - # Block if vocab type is not predefined - if tokenizer_model is None: - raise ValueError("Unknown vocab type: Not supported") - - return tokenizer_model - - def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: - tokens = [] - scores = [] - toktypes = [] - - # NOTE: `all_tokens` returns the base vocabulary and added tokens - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - return tokens, scores, toktypes - - def add_meta_vocab(self, vocab: Vocab) -> None: - # Handle the tokenizer model - tokenizer_model = self.handle_tokenizer_model(vocab) - - # Ensure that tokenizer_model is added to the GGUF model - self.gguf.add_tokenizer_model(tokenizer_model) - - # Extract model vocabulary for model conversion - tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) - - # Add extracted token information for model conversion - self.gguf.add_token_list(tokens) - self.gguf.add_token_scores(scores) - self.gguf.add_token_types(toktypes) - - def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None: - svocab.add_to_gguf(self.gguf) - - def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: - n_elements = int(np.prod(tensor.shape)) - raw_dtype = getattr(tensor.data_type, 'ggml_type', None) - data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype - data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - self.gguf.add_tensor_info(name, 
tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) - - def write_meta(self) -> None: - self.gguf.write_header_to_file() - self.gguf.write_kv_data_to_file() - - def write_tensor_info(self) -> None: - self.gguf.write_ti_data_to_file() - - def close(self) -> None: - self.gguf.close() - - @staticmethod - def write_vocab_only( - fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, - ) -> None: - check_vocab_size(params, vocab, pad_vocab = pad_vocab) - - of = OutputFile(fname_out, endianess=endianess) - - # meta data - of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) - - of.write_meta() - - of.close() - - @staticmethod - def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]: - name, lazy_tensor = item - tensor = lazy_tensor.load().to_ggml() - return (lazy_tensor.data_type, tensor.ndarray) - - @staticmethod - def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: - dt, arr = item - if not isinstance(dt, QuantizedDataType): - return arr - return dt.quantize(arr) - - @staticmethod - def write_all( - fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, - ) -> None: - check_vocab_size(params, vocab, pad_vocab=pad_vocab) - - of = OutputFile(fname_out, endianess=endianess) - - # meta data - of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) - - # tensor info - for name, lazy_tensor in model.items(): - of.add_tensor_info(name, lazy_tensor) - - of.write_meta() - of.write_tensor_info() - - # tensor data - ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) - if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = bounded_parallel_map( - OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, - use_processpool_executor=True, - ) - else: - ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) - - start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): - elapsed = time.time() - start - size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) - padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" - ) - of.gguf.write_tensor_data(ndarray) - - of.close() - - -def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type - - if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): - return GGMLFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): - return GGMLFileType.MostlyF16 - if output_type_str == "q8_0": - return GGMLFileType.MostlyQ8_0 - - name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} - - raise Exception(f"Unexpected combination of types: {name_to_type}") - - -def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: - return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) - for (name, tensor) in model.items()} - - -def convert_model_names(model: LazyModel, params: Params) -> LazyModel: 
- tmap = gguf.TensorNameMap(ARCH, params.n_layer) - should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) - - tmp = model - - # HF models permut or pack some of the tensors, so we need to undo that - for i in itertools.count(): - if f"model.layers.{i}.self_attn.q_proj.weight" in model: - print(f"Permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) - # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] - elif f"model.layers.{i}.self_attn.W_pack.weight" in model: - print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) - tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) - del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] - else: - break - - out: LazyModel = {} - for name, lazy_tensor in model.items(): - tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) - if name_new is None: - raise Exception(f"Unexpected tensor name: {name}") - - if tensor_type in should_skip: - print(f"skipping tensor {name_new}") - continue - - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") - out[name_new] = lazy_tensor - - return out - - -def nth_multifile_path(path: Path, n: int) -> Path | None: - '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return - the nth path in the model. - ''' - # Support the following patterns: - patterns: list[tuple[str, str]] = [ - # - x.00.pth, x.01.pth, etc. - (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), - # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. - (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), - # x.bin, x.bin.1, etc. - (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') - ] - for regex, replacement in patterns: - if re.search(regex, path.name): - new_path = path.with_name(re.sub(regex, replacement, path.name)) - if new_path.exists(): - return new_path - return None - - -def find_multifile_paths(path: Path) -> list[Path]: - '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return - the whole list of paths in the model. - ''' - ret: list[Path] = [] - for i in itertools.count(): - nth_path = nth_multifile_path(path, i) - if nth_path is None: - break - ret.append(nth_path) - if not ret: - # No matches. This should only happen if the file was named, e.g., - # foo.0, and there was no file named foo. Oh well, try to process it - # as a single file. 
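nth_multifile_path() above locates sibling shards by rewriting the file name with one of the listed patterns. A tiny demonstration of the '-NNNNN-of-...' rewrite (nth_shard_name is an illustrative wrapper and, unlike the real helper, does not check that the candidate file actually exists):

import re

def nth_shard_name(name: str, n: int) -> str:
    return re.sub(r'-[0-9]{5}-of-(.*)$', rf'-{n:05}-of-\1', name)

print(nth_shard_name("pytorch_model-00001-of-00003.bin", 2))  # pytorch_model-00002-of-00003.bin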
- return [path] - return ret - - -def load_some_model(path: Path) -> ModelPlus: - '''Load a model of any supported format.''' - # Be extra-friendly and accept either a file or a directory: - if path.is_dir(): - # Check if it's a set of safetensors files first - globs = ["model-00001-of-*.safetensors", "model.safetensors"] - files = [file for glob in globs for file in path.glob(glob)] - if not files: - # Try the PyTorch patterns too, with lower priority - globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] - files = [file for glob in globs for file in path.glob(glob)] - if not files: - raise Exception(f"Can't find model in directory {path}") - if len(files) > 1: - raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") - path = files[0] - - paths = find_multifile_paths(path) - models_plus: list[ModelPlus] = [] - for path in paths: - print(f"Loading model file {path}") - models_plus.append(lazy_load_file(path)) - - model_plus = merge_multifile_models(models_plus) - return model_plus - - -class VocabFactory: - def __init__(self, path: Path): - self.path = path - self.files: dict[str, Path | None] = { - "tokenizer.model": None, - "vocab.json": None, - "tokenizer.json": None, - } - self._detect_files() - - def _detect_files(self): - for file in self.files.keys(): - file_path = self.path / file - parent_file_path = self.path.parent / file - if file_path.exists(): - self.files[file] = file_path - elif parent_file_path.exists(): - self.files[file] = parent_file_path - print(f"Found vocab files: {self.files}") - - def _select_file(self, vocabtype: str | None) -> Path: - if vocabtype in ["spm", "bpe"]: - for file_key in self.files.keys(): - if (file := self.files[file_key]) is not None: - return file - raise FileNotFoundError(f"{vocabtype} vocab not found.") - if vocabtype == "hfft": - # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file - return self.path - raise ValueError(f"Unsupported vocabulary type {vocabtype}") - - def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: - load_merges = vocabtype == "bpe" - n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None - return gguf.SpecialVocab( - model_parent_path, - load_merges=load_merges, - special_token_types=None, # Predetermined or passed as a parameter - n_vocab=n_vocab, - ) - - def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: - path = self._select_file(vocabtype) - print(f"Loading vocab file '{path}', type '{vocabtype}'") - - added_tokens_path = path.parent / "added_tokens.json" - vocab: Vocab - if vocabtype == "bpe": - vocab = BpeVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - elif vocabtype == "spm": - vocab = SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - elif vocabtype == "hfft": - vocab = HfVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") - # FIXME: Respect --vocab-dir? 
- special_vocab = self._create_special_vocab( - vocab, - vocabtype, - model_parent_path, - ) - return vocab, special_vocab - - -def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: - namestr = { - GGMLFileType.AllF32: "f32", - GGMLFileType.MostlyF16: "f16", - GGMLFileType.MostlyQ8_0:"q8_0", - }[file_type] - ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" - if ret in model_paths: - sys.stderr.write( - f"Error: Default output path ({ret}) would overwrite the input. " - "Please explicitly specify a path using --outfile.\n") - sys.exit(1) - return ret - - -def do_dump_model(model_plus: ModelPlus) -> None: - print(f"model_plus.paths = {model_plus.paths!r}") - print(f"model_plus.format = {model_plus.format!r}") - print(f"model_plus.vocab = {model_plus.vocab!r}") - for name, lazy_tensor in model_plus.model.items(): - print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") - - -def main(args_in: list[str] | None = None) -> None: - output_choices = ["f32", "f16"] - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # We currently only support Q8_0 output on little endian systems. - output_choices.append("q8_0") - vocab_types = ["spm", "bpe", "hfft"] - parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") - parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") - parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) - parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") - parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") - - args = parser.parse_args(args_in) - if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" - if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at {tmp_model_path}.") - args.model = tmp_model_path - - if args.dump_single: - model_plus = 
lazy_load_file(args.model) - do_dump_model(model_plus) - return - - if not args.vocab_only: - model_plus = load_some_model(args.model) - else: - model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) - - if args.dump: - do_dump_model(model_plus) - return - endianess = gguf.GGUFEndian.LITTLE - if args.big_endian: - endianess = gguf.GGUFEndian.BIG - - params = Params.load(model_plus) - if params.n_ctx == -1: - if args.ctx is None: - raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n" - "Please specify one with --ctx:\n" - " - LLaMA v1: --ctx 2048\n" - " - LLaMA v2: --ctx 4096\n") - params.n_ctx = args.ctx - - if args.outtype: - params.ftype = { - "f32": GGMLFileType.AllF32, - "f16": GGMLFileType.MostlyF16, - "q8_0": GGMLFileType.MostlyQ8_0, - }[args.outtype] - - print(f"params = {params}") - - model_parent_path = model_plus.paths[0].parent - vocab_path = Path(args.vocab_dir or args.model or model_parent_path) - vocab_factory = VocabFactory(vocab_path) - vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) - - if args.vocab_only: - if not args.outfile: - raise ValueError("need --outfile if using --vocab-only") - outfile = args.outfile - OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, - endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") - return - - if model_plus.vocab is not None and args.vocab_dir is None: - vocab = model_plus.vocab - - print(f"Vocab info: {vocab}") - print(f"Special vocab info: {special_vocab}") - - model = model_plus.model - model = convert_model_names(model, params) - ftype = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, ftype) - - params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") - - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, - concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/gguf-py/LICENSE b/extensions/model-extension/scripts/gguf-py/LICENSE deleted file mode 100644 index 76f67efdc..000000000 --- a/extensions/model-extension/scripts/gguf-py/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 Georgi Gerganov - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
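For reference, the argument parser above defines the entire CLI surface of the removed `convert.py`. A minimal sketch of how it could be driven, with placeholder paths and example flag values only (not a prescribed invocation):

```python
# Hedged sketch: invoke the removed convert.py script; all paths are placeholders.
import subprocess

subprocess.run(
    [
        "python", "convert.py", "path/to/hf-model",   # directory containing the HF checkpoint
        "--outtype", "f16",                           # f32 | f16 | q8_0 (q8_0: little-endian only)
        "--vocab-type", "spm",                        # spm | bpe | hfft
        "--outfile", "ggml-model-f16.gguf",           # otherwise derived from the input path
    ],
    check=True,
)
```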
diff --git a/extensions/model-extension/scripts/gguf-py/README.md b/extensions/model-extension/scripts/gguf-py/README.md deleted file mode 100644 index 22d7ffa52..000000000 --- a/extensions/model-extension/scripts/gguf-py/README.md +++ /dev/null @@ -1,81 +0,0 @@ -## gguf - -This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) -(GGML Universal File) format. - -See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) -as an example for its usage. - -## Installation -```sh -pip install gguf -``` - -## API Examples/Simple Tools - -[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. - -[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console. - -[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key. - -[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files. - -## Development -Maintainers who participate in development of this package are advised to install it in editable mode: - -```sh -cd /path/to/llama.cpp/gguf-py - -pip install --editable . -``` - -**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`. -In this case, upgrade Pip to the latest: - -```sh -pip install --upgrade pip -``` - -## Automatic publishing with CI - -There's a GitHub workflow to make a release automatically upon creation of tags in a specified format. - -1. Bump the version in `pyproject.toml`. -2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number. - -```sh -git tag -a gguf-v1.0.0 -m "Version 1.0 release" -``` - -3. Push the tags. - -```sh -git push origin --tags -``` - -## Manual publishing -If you want to publish the package manually for any reason, you need to have `twine` and `build` installed: - -```sh -pip install build twine -``` - -Then, follow these steps to release a new version: - -1. Bump the version in `pyproject.toml`. -2. Build the package: - -```sh -python -m build -``` - -3. Upload the generated distribution archives: - -```sh -python -m twine upload dist/* -``` - -## TODO -- [ ] Add tests -- [ ] Include conversion scripts as command line entry points in this package. 
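The README above points at `scripts/gguf-dump.py` for inspecting a file's metadata; the same idea can be sketched directly against the package's reader API (an `example.gguf` path is assumed here, and `GGUFReader`, `fields`, and `tensors` are as defined in `gguf_reader.py` later in this diff):

```python
# Hedged sketch: dump GGUF metadata and tensor info with the removed gguf package.
from gguf import GGUFReader

reader = GGUFReader("example.gguf")                  # placeholder path
arch = reader.get_field("general.architecture")      # returns a ReaderField or None
print("architecture:", arch.types if arch is not None else "missing")

for name, field in reader.fields.items():            # key/value metadata fields
    print(name, field.types)
for tensor in reader.tensors:                        # tensor summaries
    print(tensor.name, tensor.tensor_type.name, tensor.shape, tensor.n_bytes)
```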
diff --git a/extensions/model-extension/scripts/gguf-py/examples/writer.py b/extensions/model-extension/scripts/gguf-py/examples/writer.py deleted file mode 100755 index f39eed1af..000000000 --- a/extensions/model-extension/scripts/gguf-py/examples/writer.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -import sys -from pathlib import Path - -import numpy as np - -# Necessary to load the local gguf package -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf import GGUFWriter # noqa: E402 - - -# Example usage: -def writer_example() -> None: - # Example usage with a file - gguf_writer = GGUFWriter("example.gguf", "llama") - - gguf_writer.add_architecture() - gguf_writer.add_block_count(12) - gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer - gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float - gguf_writer.add_custom_alignment(64) - - tensor1 = np.ones((32,), dtype=np.float32) * 100.0 - tensor2 = np.ones((64,), dtype=np.float32) * 101.0 - tensor3 = np.ones((96,), dtype=np.float32) * 102.0 - - gguf_writer.add_tensor("tensor1", tensor1) - gguf_writer.add_tensor("tensor2", tensor2) - gguf_writer.add_tensor("tensor3", tensor3) - - gguf_writer.write_header_to_file() - gguf_writer.write_kv_data_to_file() - gguf_writer.write_tensors_to_file() - - gguf_writer.close() - - -if __name__ == '__main__': - writer_example() diff --git a/extensions/model-extension/scripts/gguf-py/gguf/__init__.py b/extensions/model-extension/scripts/gguf-py/gguf/__init__.py deleted file mode 100644 index 110ab342c..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .constants import * -from .gguf_reader import * -from .gguf_writer import * -from .tensor_mapping import * -from .vocab import * diff --git a/extensions/model-extension/scripts/gguf-py/gguf/constants.py b/extensions/model-extension/scripts/gguf-py/gguf/constants.py deleted file mode 100644 index 1cfd41c0b..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/constants.py +++ /dev/null @@ -1,665 +0,0 @@ -from __future__ import annotations - -import sys -from enum import Enum, IntEnum, auto -from typing import Any - -# -# constants -# - -GGUF_MAGIC = 0x46554747 # "GGUF" -GGUF_VERSION = 3 -GGUF_DEFAULT_ALIGNMENT = 32 - -# -# metadata keys -# - - -class Keys: - class General: - ARCHITECTURE = "general.architecture" - QUANTIZATION_VERSION = "general.quantization_version" - ALIGNMENT = "general.alignment" - NAME = "general.name" - AUTHOR = "general.author" - URL = "general.url" - DESCRIPTION = "general.description" - LICENSE = "general.license" - SOURCE_URL = "general.source.url" - SOURCE_HF_REPO = "general.source.huggingface.repository" - FILE_TYPE = "general.file_type" - - class LLM: - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - BLOCK_COUNT = "{arch}.block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" - USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - - class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" - LAYERNORM_RMS_EPS = 
"{arch}.attention.layer_norm_rms_epsilon" - - class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - - class Tokenizer: - MODEL = "tokenizer.ggml.model" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" - - -# -# recommended mapping of model tensor names for storage in gguf -# - - -class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - PERSIMMON = auto() - REFACT = auto() - BERT = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - PHI2 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - - -class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE_INP = auto() - FFN_NORM = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - - -MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.PERSIMMON: "persimmon", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", -} - -TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - 
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", -} - -MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { - MODEL_ARCH.LLAMA: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_GATE_INP, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, - ], - MODEL_ARCH.GPTNEOX: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.FALCON: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_NORM_2, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.BAICHUAN: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.STARCODER: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.BERT: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.TOKEN_TYPES, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.MPT: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_ACT, - ], - MODEL_ARCH.GPTJ: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.PERSIMMON: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.ATTN_Q_NORM, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.REFACT: [ - 
MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.BLOOM: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.TOKEN_EMBD_NORM, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.STABLELM: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.QWEN: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.QWEN2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.PLAMO: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.GPT2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.PHI2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.CODESHELL: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.POS_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.ORION: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.INTERNLM2: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - 
MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ], - MODEL_ARCH.MINICPM: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_GATE_INP, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, - ], - # TODO -} - -# tensors that will not be serialized -MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { - MODEL_ARCH.LLAMA: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.BAICHUAN: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.PERSIMMON: [ - MODEL_TENSOR.ROPE_FREQS, - ], - MODEL_ARCH.QWEN: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.CODESHELL: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], - MODEL_ARCH.ORION: [ - MODEL_TENSOR.ROPE_FREQS, - MODEL_TENSOR.ATTN_ROT_EMBD, - ], -} - -# -# types -# - - -class TokenType(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' - - -class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 - - -class GGUFEndian(IntEnum): - LITTLE = 0 - BIG = 1 - - -class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 - FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 - UINT64 = 10 - INT64 = 11 - FLOAT64 = 12 - - @staticmethod - def get_type(val: Any) -> GGUFValueType: - if isinstance(val, (str, bytes, bytearray)): - return GGUFValueType.STRING - elif isinstance(val, list): - return GGUFValueType.ARRAY - elif isinstance(val, float): - return GGUFValueType.FLOAT32 - elif isinstance(val, bool): - return GGUFValueType.BOOL - elif isinstance(val, int): - return GGUFValueType.INT32 - # TODO: need help with 64-bit types in Python - else: - print("Unknown type:", type(val)) - sys.exit() - - -# Note: Does not support GGML_QKK_64 -QK_K = 256 -# Items here are (block size, type size) -GGML_QUANT_SIZES = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), - GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), -} - - -# Aliases for backward compatibility. 
- -# general -KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE -KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION -KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT -KEY_GENERAL_NAME = Keys.General.NAME -KEY_GENERAL_AUTHOR = Keys.General.AUTHOR -KEY_GENERAL_URL = Keys.General.URL -KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION -KEY_GENERAL_LICENSE = Keys.General.LICENSE -KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL -KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO -KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE - -# LLM -KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH -KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH -KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT -KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH -KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL -KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT - -# attention -KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT -KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV -KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS -KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV -KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS -KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS - -# RoPE -KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR -KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED - -# tokenization -KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL -KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST -KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE -KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES -KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES -KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID -KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID -KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID -KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID -KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON -KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf.py deleted file mode 100644 index 651a81eb8..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/gguf.py +++ /dev/null @@ -1,15 +0,0 @@ -# This file left for compatibility. If you want to use the GGUF API from Python -# then don't import gguf/gguf.py directly. If you're looking for examples, see the -# examples/ directory for gguf-py - -import importlib -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent)) - -# Compatibility for people trying to import gguf/gguf.py directly instead of as a package. -importlib.invalidate_caches() -import gguf # noqa: E402 - -importlib.reload(gguf) diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py deleted file mode 100644 index 5b6d4ba6b..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/gguf_reader.py +++ /dev/null @@ -1,264 +0,0 @@ -# -# GGUF file reading/modification support. For API usage information, -# please see the files scripts/ for some fairly simple examples. 
-# -from __future__ import annotations - -import os -from collections import OrderedDict -from typing import Any, Literal, NamedTuple, TypeVar, Union - -import numpy as np -import numpy.typing as npt - -if __name__ == "__main__": - import sys - from pathlib import Path - - # Allow running file in package as a script. - sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf.constants import ( - GGML_QUANT_SIZES, - GGUF_DEFAULT_ALIGNMENT, - GGUF_MAGIC, - GGUF_VERSION, - GGMLQuantizationType, - GGUFValueType, -) - - -READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] - - -class ReaderField(NamedTuple): - # Offset to start of this field. - offset: int - - # Name of the field (not necessarily from file data). - name: str - - # Data parts. Some types have multiple components, such as strings - # that consist of a length followed by the string data. - parts: list[npt.NDArray[Any]] = [] - - # Indexes into parts that we can call the actual data. For example - # an array of strings will be populated with indexes to the actual - # string data. - data: list[int] = [-1] - - types: list[GGUFValueType] = [] - - -class ReaderTensor(NamedTuple): - name: str - tensor_type: GGMLQuantizationType - shape: npt.NDArray[np.uint32] - n_elements: int - n_bytes: int - data_offset: int - data: npt.NDArray[Any] - field: ReaderField - - -class GGUFReader: - # I - same as host, S - swapped - byte_order: Literal['I' | 'S'] = 'I' - alignment: int = GGUF_DEFAULT_ALIGNMENT - - # Note: Internal helper, API may change. - gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = { - GGUFValueType.UINT8: np.uint8, - GGUFValueType.INT8: np.int8, - GGUFValueType.UINT16: np.uint16, - GGUFValueType.INT16: np.int16, - GGUFValueType.UINT32: np.uint32, - GGUFValueType.INT32: np.int32, - GGUFValueType.FLOAT32: np.float32, - GGUFValueType.UINT64: np.uint64, - GGUFValueType.INT64: np.int64, - GGUFValueType.FLOAT64: np.float64, - GGUFValueType.BOOL: np.bool_, - } - - def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'): - self.data = np.memmap(path, mode = mode) - offs = 0 - if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC: - raise ValueError('GGUF magic invalid') - offs += 4 - temp_version = self._get(offs, np.uint32) - if temp_version[0] & 65535 == 0: - # If we get 0 here that means it's (probably) a GGUF file created for - # the opposite byte order of the machine this script is running on. 
- self.byte_order = 'S' - temp_version = temp_version.newbyteorder(self.byte_order) - version = temp_version[0] - if version not in READER_SUPPORTED_VERSIONS: - raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle') - self.fields: OrderedDict[str, ReaderField] = OrderedDict() - self.tensors: list[ReaderTensor] = [] - offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32])) - temp_counts = self._get(offs, np.uint64, 2) - offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64])) - offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64])) - tensor_count, kv_count = temp_counts - offs = self._build_fields(offs, kv_count) - offs, tensors_fields = self._build_tensors_fields(offs, tensor_count) - new_align = self.fields.get('general.alignment') - if new_align is not None: - if new_align.types != [GGUFValueType.UINT32]: - raise ValueError('Bad type for general.alignment field') - self.alignment = new_align.parts[-1][0] - padding = offs % self.alignment - if padding != 0: - offs += self.alignment - padding - self._build_tensors(offs, tensors_fields) - - _DT = TypeVar('_DT', bound = npt.DTypeLike) - - # Fetch a key/value metadata field by key. - def get_field(self, key: str) -> Union[ReaderField, None]: - return self.fields.get(key, None) - - # Fetch a tensor from the list by index. - def get_tensor(self, idx: int) -> ReaderTensor: - return self.tensors[idx] - - def _get( - self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None, - ) -> npt.NDArray[Any]: - count = int(count) - itemsize = int(np.empty([], dtype = dtype).itemsize) - end_offs = offset + itemsize * count - return ( - self.data[offset:end_offs] - .view(dtype = dtype)[:count] - .newbyteorder(override_order or self.byte_order) - ) - - def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: - if field.name in self.fields: - raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') - self.fields[field.name] = field - return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts) - - def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]: - slen = self._get(offset, np.uint64) - return slen, self._get(offset + 8, np.uint8, slen[0]) - - def _get_field_parts( - self, orig_offs: int, raw_type: int, - ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]: - offs = orig_offs - types: list[GGUFValueType] = [] - gtype = GGUFValueType(raw_type) - types.append(gtype) - # Handle strings. - if gtype == GGUFValueType.STRING: - sparts: list[npt.NDArray[Any]] = list(self._get_str(offs)) - size = sum(int(part.nbytes) for part in sparts) - return size, sparts, [1], types - # Check if it's a simple scalar type. - nptype = self.gguf_scalar_to_np.get(gtype) - if nptype is not None: - val = self._get(offs, nptype) - return int(val.nbytes), [val], [0], types - # Handle arrays. 
- if gtype == GGUFValueType.ARRAY: - raw_itype = self._get(offs, np.uint32) - offs += int(raw_itype.nbytes) - alen = self._get(offs, np.uint64) - offs += int(alen.nbytes) - aparts: list[npt.NDArray[Any]] = [raw_itype, alen] - data_idxs: list[int] = [] - for idx in range(alen[0]): - curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0]) - if idx == 0: - types += curr_types - idxs_offs = len(aparts) - aparts += curr_parts - data_idxs += (idx + idxs_offs for idx in curr_idxs) - offs += curr_size - return offs - orig_offs, aparts, data_idxs, types - # We can't deal with this one. - raise ValueError('Unknown/unhandled field type {gtype}') - - def _get_tensor(self, orig_offs: int) -> ReaderField: - offs = orig_offs - name_len, name_data = self._get_str(offs) - offs += int(name_len.nbytes + name_data.nbytes) - n_dims = self._get(offs, np.uint32) - offs += int(n_dims.nbytes) - dims = self._get(offs, np.uint64, n_dims[0]) - offs += int(dims.nbytes) - raw_dtype = self._get(offs, np.uint32) - offs += int(raw_dtype.nbytes) - offset_tensor = self._get(offs, np.uint64) - offs += int(offset_tensor.nbytes) - return ReaderField( - orig_offs, - str(bytes(name_data), encoding = 'utf-8'), - [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor], - [1, 3, 4, 5], - ) - - def _build_fields(self, offs: int, count: int) -> int: - for _ in range(count): - orig_offs = offs - kv_klen, kv_kdata = self._get_str(offs) - offs += int(kv_klen.nbytes + kv_kdata.nbytes) - raw_kv_type = self._get(offs, np.uint32) - offs += int(raw_kv_type.nbytes) - parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type] - idxs_offs = len(parts) - field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0]) - parts += field_parts - self._push_field(ReaderField( - orig_offs, - str(bytes(kv_kdata), encoding = 'utf-8'), - parts, - [idx + idxs_offs for idx in field_idxs], - field_types, - ), skip_sum = True) - offs += field_size - return offs - - def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]: - tensor_fields = [] - for _ in range(count): - field = self._get_tensor(offs) - offs += sum(int(part.nbytes) for part in field.parts) - tensor_fields.append(field) - return offs, tensor_fields - - def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: - tensors = [] - for field in fields: - _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts - ggml_type = GGMLQuantizationType(raw_dtype[0]) - n_elems = np.prod(dims) - block_size, type_size = GGML_QUANT_SIZES[ggml_type] - n_bytes = n_elems * type_size // block_size - data_offs = int(start_offs + offset_tensor[0]) - item_type: npt.DTypeLike - if ggml_type == GGMLQuantizationType.F32: - item_count = n_elems - item_type = np.float32 - elif ggml_type == GGMLQuantizationType.F16: - item_count = n_elems - item_type = np.float16 - else: - item_count = n_bytes - item_type = np.uint8 - tensors.append(ReaderTensor( - name = str(bytes(name_data), encoding = 'utf-8'), - tensor_type = ggml_type, - shape = dims, - n_elements = n_elems, - n_bytes = n_bytes, - data_offset = data_offs, - data = self._get(data_offs, item_type, item_count), - field = field, - )) - self.tensors = tensors diff --git a/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py b/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py deleted file mode 100644 index 16808196e..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/gguf_writer.py +++ /dev/null 
@@ -1,427 +0,0 @@
-from __future__ import annotations
-
-import os
-import shutil
-import struct
-import tempfile
-from enum import Enum, auto
-from io import BufferedWriter
-from typing import IO, Any, Sequence
-
-import numpy as np
-
-from .constants import (
-    GGUF_DEFAULT_ALIGNMENT,
-    GGUF_MAGIC,
-    GGUF_VERSION,
-    GGMLQuantizationType,
-    GGUFEndian,
-    GGUFValueType,
-    Keys,
-    RopeScalingType,
-    TokenType,
-)
-
-
-class WriterState(Enum):
-    EMPTY = auto()
-    HEADER = auto()
-    KV_DATA = auto()
-    TI_DATA = auto()
-
-
-class GGUFWriter:
-    fout: BufferedWriter
-    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: list[np.ndarray[Any, Any]]
-    _simple_value_packing = {
-        GGUFValueType.UINT8: "B",
-        GGUFValueType.INT8: "b",
-        GGUFValueType.UINT16: "H",
-        GGUFValueType.INT16: "h",
-        GGUFValueType.UINT32: "I",
-        GGUFValueType.INT32: "i",
-        GGUFValueType.FLOAT32: "f",
-        GGUFValueType.UINT64: "Q",
-        GGUFValueType.INT64: "q",
-        GGUFValueType.FLOAT64: "d",
-        GGUFValueType.BOOL: "?",
-    }
-
-    def __init__(
-        self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True,
-        endianess: GGUFEndian = GGUFEndian.LITTLE,
-    ):
-        self.fout = open(path, "wb")
-        self.arch = arch
-        self.endianess = endianess
-        self.offset_tensor = 0
-        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
-        self.kv_data = bytearray()
-        self.kv_data_count = 0
-        self.ti_data = bytearray()
-        self.ti_data_count = 0
-        self.use_temp_file = use_temp_file
-        self.temp_file = None
-        self.tensors = []
-        print("gguf: This GGUF file is for {0} Endian only".format(
-            "Big" if self.endianess == GGUFEndian.BIG else "Little",
-        ))
-        self.state = WriterState.EMPTY
-
-        self.add_architecture()
-
-    def write_header_to_file(self) -> None:
-        if self.state is not WriterState.EMPTY:
-            raise ValueError(f'Expected output file to be empty, got {self.state}')
-
-        self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
-        self._write_packed("I", GGUF_VERSION)
-        self._write_packed("Q", self.ti_data_count)
-        self._write_packed("Q", self.kv_data_count)
-        self.flush()
-        self.state = WriterState.HEADER
-
-    def write_kv_data_to_file(self) -> None:
-        if self.state is not WriterState.HEADER:
-            raise ValueError(f'Expected output file to contain the header, got {self.state}')
-
-        self.fout.write(self.kv_data)
-        self.flush()
-        self.state = WriterState.KV_DATA
-
-    def write_ti_data_to_file(self) -> None:
-        if self.state is not WriterState.KV_DATA:
-            raise ValueError(f'Expected output file to contain KV data, got {self.state}')
-
-        self.fout.write(self.ti_data)
-        self.flush()
-        self.state = WriterState.TI_DATA
-
-    def add_key(self, key: str) -> None:
-        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
-
-    def add_uint8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT8)
-
-    def add_int8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT8)
-
-    def add_uint16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT16)
-
-    def add_int16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT16)
-
-    def add_uint32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT32)
-
-    def add_int32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT32)
-
-    def add_float32(self, key: str, val: float) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.FLOAT32)
-
-    def add_uint64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT64)
-
-    def add_int64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT64)
-
-    def add_float64(self, key: str, val: float) -> None:
-        self.add_key(key)
-        
self.add_val(val, GGUFValueType.FLOAT64) - - def add_bool(self, key: str, val: bool) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.BOOL) - - def add_string(self, key: str, val: str) -> None: - if not val: - return - self.add_key(key) - self.add_val(val, GGUFValueType.STRING) - - def add_array(self, key: str, val: Sequence[Any]) -> None: - if not isinstance(val, Sequence): - raise ValueError("Value must be a sequence for array type") - - self.add_key(key) - self.add_val(val, GGUFValueType.ARRAY) - - def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None: - if vtype is None: - vtype = GGUFValueType.get_type(val) - - if add_vtype: - self.kv_data += self._pack("I", vtype) - self.kv_data_count += 1 - - pack_fmt = self._simple_value_packing.get(vtype) - if pack_fmt is not None: - self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL) - elif vtype == GGUFValueType.STRING: - encoded_val = val.encode("utf8") if isinstance(val, str) else val - self.kv_data += self._pack("Q", len(encoded_val)) - self.kv_data += encoded_val - elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: - ltype = GGUFValueType.get_type(val[0]) - if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): - raise ValueError("All items in a GGUF array should be of the same type") - self.kv_data += self._pack("I", ltype) - self.kv_data += self._pack("Q", len(val)) - for item in val: - self.add_val(item, add_vtype=False) - else: - raise ValueError("Invalid GGUF metadata value type or value") - - @staticmethod - def ggml_pad(x: int, n: int) -> int: - return ((x + n - 1) // n) * n - - def add_tensor_info( - self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], - tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None, - ) -> None: - if self.state is not WriterState.EMPTY: - raise ValueError(f'Expected output file to be empty, got {self.state}') - - if raw_dtype is None and tensor_dtype not in (np.float32, np.float16): - raise ValueError("Only F32 and F16 tensors are supported for now") - - encoded_name = name.encode("utf8") - self.ti_data += self._pack("Q", len(encoded_name)) - self.ti_data += encoded_name - n_dims = len(tensor_shape) - self.ti_data += self._pack("I", n_dims) - for i in range(n_dims): - self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) - if raw_dtype is None: - dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 - else: - dtype = raw_dtype - self.ti_data += self._pack("I", dtype) - self.ti_data += self._pack("Q", self.offset_tensor) - self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) - self.ti_data_count += 1 - - def add_tensor( - self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, - raw_dtype: GGMLQuantizationType | None = None, - ) -> None: - if self.endianess == GGUFEndian.BIG: - tensor.byteswap(inplace=True) - if self.use_temp_file and self.temp_file is None: - fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) - fp.seek(0) - self.temp_file = fp - - shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape - self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype) - - if self.temp_file is None: - self.tensors.append(tensor) - return - - tensor.tofile(self.temp_file) - self.write_padding(self.temp_file, tensor.nbytes) - - def write_padding(self, fp: 
IO[bytes], n: int, align: int | None = None) -> None: - pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n - if pad != 0: - fp.write(bytes([0] * pad)) - - def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: - if self.state is not WriterState.TI_DATA: - raise ValueError(f'Expected output file to contain tensor info, got {self.state}') - - if self.endianess == GGUFEndian.BIG: - tensor.byteswap(inplace=True) - self.write_padding(self.fout, self.fout.tell()) - tensor.tofile(self.fout) - self.write_padding(self.fout, tensor.nbytes) - - def write_tensors_to_file(self) -> None: - self.write_ti_data_to_file() - - self.write_padding(self.fout, self.fout.tell()) - - if self.temp_file is None: - while True: - try: - tensor = self.tensors.pop(0) - except IndexError: - break - tensor.tofile(self.fout) - self.write_padding(self.fout, tensor.nbytes) - return - - self.temp_file.seek(0) - - shutil.copyfileobj(self.temp_file, self.fout) - self.flush() - self.temp_file.close() - - def flush(self) -> None: - self.fout.flush() - - def close(self) -> None: - self.fout.close() - - def add_architecture(self) -> None: - self.add_string(Keys.General.ARCHITECTURE, self.arch) - - def add_author(self, author: str) -> None: - self.add_string(Keys.General.AUTHOR, author) - - def add_tensor_data_layout(self, layout: str) -> None: - self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) - - def add_url(self, url: str) -> None: - self.add_string(Keys.General.URL, url) - - def add_description(self, description: str) -> None: - self.add_string(Keys.General.DESCRIPTION, description) - - def add_source_url(self, url: str) -> None: - self.add_string(Keys.General.SOURCE_URL, url) - - def add_source_hf_repo(self, repo: str) -> None: - self.add_string(Keys.General.SOURCE_HF_REPO, repo) - - def add_file_type(self, ftype: int) -> None: - self.add_uint32(Keys.General.FILE_TYPE, ftype) - - def add_name(self, name: str) -> None: - self.add_string(Keys.General.NAME, name) - - def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None: - self.add_uint32( - Keys.General.QUANTIZATION_VERSION, quantization_version) - - def add_custom_alignment(self, alignment: int) -> None: - self.data_alignment = alignment - self.add_uint32(Keys.General.ALIGNMENT, alignment) - - def add_context_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) - - def add_embedding_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) - - def add_block_count(self, length: int) -> None: - self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) - - def add_feed_forward_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) - - def add_parallel_residual(self, use: bool) -> None: - self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) - - def add_head_count(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) - - def add_head_count_kv(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) - - def add_key_length(self, length: int) -> None: - self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length) - - def add_value_length(self, length: int) -> None: - self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) - - def add_max_alibi_bias(self, 
bias: float) -> None: - self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) - - def add_clamp_kqv(self, value: float) -> None: - self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) - - def add_expert_count(self, count: int) -> None: - self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) - - def add_expert_used_count(self, count: int) -> None: - self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count) - - def add_layer_norm_eps(self, value: float) -> None: - self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) - - def add_layer_norm_rms_eps(self, value: float) -> None: - self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) - - def add_rope_dimension_count(self, count: int) -> None: - self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) - - def add_rope_freq_base(self, value: float) -> None: - self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) - - def add_rope_scaling_type(self, value: RopeScalingType) -> None: - self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value) - - def add_rope_scaling_factor(self, value: float) -> None: - self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) - - def add_rope_scaling_orig_ctx_len(self, value: int) -> None: - self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) - - def add_rope_scaling_finetuned(self, value: bool) -> None: - self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) - - def add_tokenizer_model(self, model: str) -> None: - self.add_string(Keys.Tokenizer.MODEL, model) - - def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: - self.add_array(Keys.Tokenizer.LIST, tokens) - - def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: - self.add_array(Keys.Tokenizer.MERGES, merges) - - def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None: - self.add_array(Keys.Tokenizer.TOKEN_TYPE, types) - - def add_token_scores(self, scores: Sequence[float]) -> None: - self.add_array(Keys.Tokenizer.SCORES, scores) - - def add_bos_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.BOS_ID, id) - - def add_eos_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.EOS_ID, id) - - def add_unk_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.UNK_ID, id) - - def add_sep_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SEP_ID, id) - - def add_pad_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PAD_ID, id) - - def add_add_bos_token(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_BOS, value) - - def add_add_eos_token(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_EOS, value) - - def add_add_space_prefix(self, value: bool) -> None: - self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) - - def add_chat_template(self, value: str) -> None: - self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - - def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: - pack_prefix = '' - if not skip_pack_prefix: - pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>' - return struct.pack(f'{pack_prefix}{fmt}', value) - - def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: - self.fout.write(self._pack(fmt, value, skip_pack_prefix)) diff --git 
a/extensions/model-extension/scripts/gguf-py/gguf/py.typed b/extensions/model-extension/scripts/gguf-py/gguf/py.typed deleted file mode 100644 index e69de29bb..000000000 diff --git a/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py b/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py deleted file mode 100644 index 4f16d8504..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/tensor_mapping.py +++ /dev/null @@ -1,332 +0,0 @@ -from __future__ import annotations - -from typing import Sequence - -from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES - - -class TensorNameMap: - mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { - # Token embeddings - MODEL_TENSOR.TOKEN_EMBD: ( - "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen - "transformer.word_embeddings", # falcon - "word_embeddings", # bloom - "model.embed_tokens", # llama-hf - "tok_embeddings", # llama-pth - "embeddings.word_embeddings", # bert - "language_model.embedding.word_embeddings", # persimmon - "wte", # gpt2 - "transformer.embd.wte", # phi2 - "model.tok_embeddings", # internlm2 - ), - - # Token type embeddings - MODEL_TENSOR.TOKEN_TYPES: ( - "embeddings.token_type_embeddings", # bert - ), - - # Normalization of token embeddings - MODEL_TENSOR.TOKEN_EMBD_NORM: ( - "word_embeddings_layernorm", # bloom - ), - - # Position embeddings - MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", # gpt2 - "embeddings.position_embeddings", # bert - "wpe", # gpt2 - ), - - # Output - MODEL_TENSOR.OUTPUT: ( - "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen - "output", # llama-pth bloom internlm2 - "word_embeddings_for_head", # persimmon - "lm_head.linear", # phi2 - ), - - # Output norm - MODEL_TENSOR.OUTPUT_NORM: ( - "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 gpt-j falcon - "model.norm", # llama-hf baichuan internlm2 - "norm", # llama-pth - "embeddings.LayerNorm", # bert - "transformer.norm_f", # mpt - "ln_f", # refact bloom qwen gpt2 - "language_model.encoder.final_layernorm", # persimmon - "model.final_layernorm", # persimmon - "lm_head.ln", # phi2 - ), - - # Rope frequencies - MODEL_TENSOR.ROPE_FREQS: ( - "rope.freqs", # llama-pth - ), - } - - block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { - # Attention norm - MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "h.{bid}.input_layernorm", # bloom - "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf - "layers.{bid}.attention_norm", # llama-pth - "encoder.layer.{bid}.attention.output.LayerNorm", # bert - "language_model.encoder.layers.{bid}.input_layernorm", # persimmon - "model.layers.{bid}.ln1", # yi - "h.{bid}.ln_1", # gpt2 - "transformer.h.{bid}.ln", # phi2 - "model.layers.layers.{bid}.norm", # plamo - "model.layers.{bid}.attention_norm", # internlm2 - ), - - # Attention norm 2 - MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b - ), - - # Attention query-key-value - MODEL_TENSOR.ATTN_QKV: ( - "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox - "transformer.h.{bid}.attn.c_attn", # gpt2 qwen - "transformer.blocks.{bid}.attn.Wqkv", # mpt - "transformer.h.{bid}.self_attention.query_key_value", # falcon - "h.{bid}.self_attention.query_key_value", # bloom - 
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon - "model.layers.{bid}.self_attn.query_key_value", # persimmon - "h.{bid}.attn.c_attn", # gpt2 - "transformer.h.{bid}.mixer.Wqkv", # phi2 - ), - - # Attention query - MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf - "layers.{bid}.attention.wq", # llama-pth - "encoder.layer.{bid}.attention.self.query", # bert - "transformer.h.{bid}.attn.q_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.q_proj", # plamo - "model.layers.{bid}.attention.wq" # internlm2 - ), - - # Attention key - MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf - "layers.{bid}.attention.wk", # llama-pth - "encoder.layer.{bid}.attention.self.key", # bert - "transformer.h.{bid}.attn.k_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.k_proj", # plamo - "model.layers.{bid}.attention.wk" # internlm2 - ), - - # Attention value - MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf - "layers.{bid}.attention.wv", # llama-pth - "encoder.layer.{bid}.attention.self.value", # bert - "transformer.h.{bid}.attn.v_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.v_proj", # plamo - "model.layers.{bid}.attention.wv" # internlm2 - ), - - # Attention output - MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf - "layers.{bid}.attention.wo", # llama-pth - "encoder.layer.{bid}.attention.output.dense", # bert - "transformer.h.{bid}.attn.out_proj", # gpt-j - "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon - "model.layers.{bid}.self_attn.dense", # persimmon - "h.{bid}.attn.c_proj", # gpt2 - "transformer.h.{bid}.mixer.out_proj", # phi2 - "model.layers.layers.{bid}.self_attn.o_proj", # plamo - "model.layers.{bid}.attention.wo", # internlm2 - ), - - # Rotary embeddings - MODEL_TENSOR.ATTN_ROT_EMBD: ( - "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf - "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth - "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo - "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell - ), - - # Feed-forward norm - MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen - "h.{bid}.post_attention_layernorm", # bloom - "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf - "layers.{bid}.ffn_norm", # llama-pth - "encoder.layer.{bid}.output.LayerNorm", # bert - "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon - "model.layers.{bid}.ln2", # yi - "h.{bid}.ln_2", # gpt2 - "model.layers.{bid}.ffn_norm", # internlm2 - ), - - MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral - ), - - # Feed-forward up - MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 - "transformer.blocks.{bid}.ffn.up_proj", # mpt - "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon - "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact - "layers.{bid}.feed_forward.w3", # llama-pth - 
"encoder.layer.{bid}.intermediate.dense", # bert - "transformer.h.{bid}.mlp.fc_in", # gpt-j - "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "transformer.h.{bid}.mlp.w1", # qwen - "h.{bid}.mlp.c_fc", # gpt2 - "transformer.h.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.fc1", # phi2 - "model.layers.layers.{bid}.mlp.up_proj", # plamo - "model.layers.{bid}.feed_forward.w3", # internlm2 - ), - - MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral - ), - - # AWQ-activation gate - MODEL_TENSOR.FFN_ACT: ( - "transformer.blocks.{bid}.ffn.act", # mpt - ), - - # Feed-forward gate - MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - ), - - MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral - ), - - # Feed-forward down - MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.ffn.down_proj", # mpt - "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon - "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf - "layers.{bid}.feed_forward.w2", # llama-pth - "encoder.layer.{bid}.output.dense", # bert - "transformer.h.{bid}.mlp.fc_out", # gpt-j - "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "h.{bid}.mlp.c_proj", # gpt2 - "transformer.h.{bid}.mlp.fc2", # phi2 - "model.layers.{bid}.mlp.fc2", # phi2 - "model.layers.layers.{bid}.mlp.down_proj", # plamo - "model.layers.{bid}.feed_forward.w2", # internlm2 - ), - - MODEL_TENSOR.FFN_DOWN_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral - ), - - MODEL_TENSOR.ATTN_Q_NORM: ( - "language_model.encoder.layers.{bid}.self_attention.q_layernorm", - "model.layers.{bid}.self_attn.q_layernorm", # persimmon - ), - - MODEL_TENSOR.ATTN_K_NORM: ( - "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - "model.layers.{bid}.self_attn.k_layernorm", # persimmon - ), - - MODEL_TENSOR.ROPE_FREQS: ( - "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon - ), - } - - mapping: dict[str, tuple[MODEL_TENSOR, str]] - - def __init__(self, arch: MODEL_ARCH, n_blocks: int): - self.mapping = {} - for tensor, keys in self.mappings_cfg.items(): - if tensor not in MODEL_TENSORS[arch]: - continue - tensor_name = TENSOR_NAMES[tensor] - self.mapping[tensor_name] = (tensor, tensor_name) - for key in keys: - self.mapping[key] = (tensor, tensor_name) - for bid in range(n_blocks): - for tensor, keys in self.block_mappings_cfg.items(): - if tensor not in MODEL_TENSORS[arch]: - continue - # TODO: make this configurable - n_experts = 8 - for xid in range(n_experts): - tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) - self.mapping[tensor_name] = (tensor, tensor_name) - for key in keys: - key = key.format(bid = bid, xid = xid) - self.mapping[key] = (tensor, tensor_name) - - def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> 
tuple[MODEL_TENSOR, str] | None: - result = self.mapping.get(key) - if result is not None: - return result - for suffix in try_suffixes: - if key.endswith(suffix): - result = self.mapping.get(key[:-len(suffix)]) - if result is not None: - return result[0], result[1] + suffix - return None - - def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) - if result is None: - return None - return result[1] - - def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: - result = self.get_type_and_name(key, try_suffixes = try_suffixes) - if result is None: - return None - return result[0] - - def __getitem__(self, key: str) -> str: - try: - return self.mapping[key][1] - except KeyError: - raise KeyError(key) - - def __contains__(self, key: str) -> bool: - return key in self.mapping - - def __repr__(self) -> str: - return repr(self.mapping) - - -def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap: - return TensorNameMap(arch, n_blocks) diff --git a/extensions/model-extension/scripts/gguf-py/gguf/vocab.py b/extensions/model-extension/scripts/gguf-py/gguf/vocab.py deleted file mode 100644 index cd1942975..000000000 --- a/extensions/model-extension/scripts/gguf-py/gguf/vocab.py +++ /dev/null @@ -1,185 +0,0 @@ -from __future__ import annotations - -import json -import os -import sys -from pathlib import Path -from typing import Any, Callable - -from .gguf_writer import GGUFWriter - - -class SpecialVocab: - merges: list[str] - add_special_token: dict[str, bool] - special_token_ids: dict[str, int] - chat_template: str | None - - def __init__( - self, path: str | os.PathLike[str], load_merges: bool = False, - special_token_types: tuple[str, ...] 
| None = None, - n_vocab: int | None = None, - ): - self.special_token_ids = {} - self.add_special_token = {} - self.n_vocab = n_vocab - self.load_merges = load_merges - self.merges = [] - self.chat_template = None - if special_token_types is not None: - self.special_token_types = special_token_types - else: - self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad') - self._load(Path(path)) - - def __repr__(self) -> str: - return ''.format( - len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset", - ) - - def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: - if self.merges: - if not quiet: - print(f'gguf: Adding {len(self.merges)} merge(s).') - gw.add_token_merges(self.merges) - elif self.load_merges: - print( - 'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.', - file = sys.stderr, - ) - for typ, tokid in self.special_token_ids.items(): - id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) - if id_handler is None: - print( - f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', - file = sys.stderr, - ) - continue - if not quiet: - print(f'gguf: Setting special token type {typ} to {tokid}') - id_handler(tokid) - for typ, value in self.add_special_token.items(): - add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) - if add_handler is None: - print( - f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping', - file = sys.stderr, - ) - continue - if not quiet: - print(f'gguf: Setting add_{typ}_token to {value}') - add_handler(value) - if self.chat_template is not None: - if not quiet: - print(f'gguf: Setting chat_template to {self.chat_template}') - gw.add_chat_template(self.chat_template) - - def _load(self, path: Path) -> None: - self._try_load_from_tokenizer_json(path) - self._try_load_from_config_json(path) - if self.load_merges and not self.merges: - self._try_load_merges_txt(path) - - def _try_load_merges_txt(self, path: Path) -> bool: - merges_file = path / 'merges.txt' - if not merges_file.is_file(): - return False - with open(merges_file, 'r', encoding = 'utf-8') as fp: - first_line = next(fp, '').strip() - if not first_line.startswith('#'): - fp.seek(0) - line_num = 0 - else: - line_num = 1 - merges = [] - for line in fp: - line_num += 1 - line = line.strip() - if not line: - continue - parts = line.split(None, 3) - if len(parts) != 2: - print( - f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring', - file = sys.stderr, - ) - continue - merges.append(f'{parts[0]} {parts[1]}') - self.merges = merges - return True - - def _set_special_token(self, typ: str, tid: Any) -> None: - if not isinstance(tid, int): - return - if tid < 0: - raise ValueError(f'invalid value for special token type {typ}: {tid}') - if self.n_vocab is None or tid < self.n_vocab: - if typ in self.special_token_ids: - return - self.special_token_ids[typ] = tid - return - print( - f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping', - file = sys.stderr, - ) - - def _try_load_from_tokenizer_json(self, path: Path) -> bool: - tokenizer_file = path / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, encoding = 'utf-8') as f: - tokenizer = json.load(f) - if self.load_merges: - merges = tokenizer.get('model', {}).get('merges') - if isinstance(merges, list) and merges and isinstance(merges[0], str): - 
self.merges = merges - added_tokens = tokenizer.get('added_tokens', {}) - else: - added_tokens = {} - tokenizer_config_file = path / 'tokenizer_config.json' - if not tokenizer_config_file.is_file(): - return True - with open(tokenizer_config_file, encoding = 'utf-8') as f: - tokenizer_config = json.load(f) - chat_template = tokenizer_config.get('chat_template') - if chat_template is None or isinstance(chat_template, str): - self.chat_template = chat_template - else: - print( - f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring', - file = sys.stderr - ) - for typ in self.special_token_types: - add_entry = tokenizer_config.get(f'add_{typ}_token') - if isinstance(add_entry, bool): - self.add_special_token[typ] = add_entry - if not added_tokens: - # We will need this to get the content for the token, so if it's empty - # may as well just give up. - continue - entry = tokenizer_config.get(f'{typ}_token') - if isinstance(entry, str): - tc_content = entry - elif isinstance(entry, dict): - entry_content = entry.get('content') - if not isinstance(entry_content, str): - continue - tc_content = entry_content - else: - continue - # We only need the first match here. - maybe_token_id = next( - (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content), - None, - ) - self._set_special_token(typ, maybe_token_id) - return True - - def _try_load_from_config_json(self, path: Path) -> bool: - config_file = path / 'config.json' - if not config_file.is_file(): - return False - with open(config_file, encoding = 'utf-8') as f: - config = json.load(f) - for typ in self.special_token_types: - self._set_special_token(typ, config.get(f'{typ}_token_id')) - return True diff --git a/extensions/model-extension/scripts/gguf-py/pyproject.toml b/extensions/model-extension/scripts/gguf-py/pyproject.toml deleted file mode 100644 index 9789c2c87..000000000 --- a/extensions/model-extension/scripts/gguf-py/pyproject.toml +++ /dev/null @@ -1,35 +0,0 @@ -[tool.poetry] -name = "gguf" -version = "0.7.0" -description = "Read and write ML models in GGUF for GGML" -authors = ["GGML "] -packages = [ - {include = "gguf"}, - {include = "gguf/py.typed"}, - {include = "scripts"}, -] -readme = "README.md" -homepage = "https://ggml.ai" -repository = "https://github.com/ggerganov/llama.cpp" -keywords = ["ggml", "gguf", "llama.cpp"] -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] - -[tool.poetry.dependencies] -python = ">=3.8" -numpy = ">=1.17" - -[tool.poetry.dev-dependencies] -pytest = "^5.2" - -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" - -[tool.poetry.scripts] -gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" -gguf-dump = "scripts:gguf_dump_entrypoint" -gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" diff --git a/extensions/model-extension/scripts/gguf-py/scripts/__init__.py b/extensions/model-extension/scripts/gguf-py/scripts/__init__.py deleted file mode 100644 index 77132db7a..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import os - -from importlib import import_module - - -os.environ["NO_LOCAL_GGUF"] = "TRUE" - -gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main -gguf_dump_entrypoint = import_module("scripts.gguf-dump").main -gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main - -del 
import_module, os diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py b/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py deleted file mode 100755 index 10a16ad06..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/gguf-convert-endian.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import os -import sys -from pathlib import Path - -import numpy as np - -# Necessary to load the local gguf package -if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): - sys.path.insert(0, str(Path(__file__).parent.parent)) - -import gguf - - -def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # Host is little endian - host_endian = "little" - swapped_endian = "big" - else: - # Sorry PDP or other weird systems that don't use BE or LE. - host_endian = "big" - swapped_endian = "little" - if reader.byte_order == "S": - file_endian = swapped_endian - else: - file_endian = host_endian - order = host_endian if args.order == "native" else args.order - print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") - if file_endian == order: - print(f"* File is already {order.upper()} endian. Nothing to do.") - sys.exit(0) - print("* Checking tensors for conversion compatibility") - for tensor in reader.tensors: - if tensor.tensor_type not in ( - gguf.GGMLQuantizationType.F32, - gguf.GGMLQuantizationType.F16, - gguf.GGMLQuantizationType.Q8_0, - ): - raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") - print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") - if args.dry_run: - return - print("\n*** Warning *** Warning *** Warning **") - print("* This conversion process may damage the file. Ensure you have a backup.") - if order != host_endian: - print("* Requested endian differs from host, you will not be able to load the model on this machine.") - print("* The file will be modified immediately, so if conversion fails or is interrupted") - print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") - response = input("YES, I am sure> ") - if response != "YES": - print("You didn't enter YES. Okay then, see ya!") - sys.exit(0) - print(f"\n* Converting fields ({len(reader.fields)})") - for idx, field in enumerate(reader.fields.values()): - print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") - for part in field.parts: - part.byteswap(inplace=True) - print(f"\n* Converting tensors ({len(reader.tensors)})") - for idx, tensor in enumerate(reader.tensors): - print( - f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, " - f"elements={tensor.n_elements}... ", - end="", - ) - tensor_type = tensor.tensor_type - for part in tensor.field.parts: - part.byteswap(inplace=True) - if tensor_type != gguf.GGMLQuantizationType.Q8_0: - tensor.data.byteswap(inplace=True) - print() - continue - # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes - block_size = 34 - n_blocks = len(tensor.data) // block_size - for block_num in range(n_blocks): - block_offs = block_num * block_size - # I know I said f16, but it doesn't matter here - any simple 16 bit type works. 
- delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - if block_num % 100000 == 0: - print(f"[{(n_blocks - block_num) // 1000}K]", end="") - sys.stdout.flush() - print() - print("* Completion") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Convert GGUF file byte order") - parser.add_argument( - "model", type=str, - help="GGUF format model filename", - ) - parser.add_argument( - "order", type=str, choices=['big', 'little', 'native'], - help="Requested byte order", - ) - parser.add_argument( - "--dry-run", action="store_true", - help="Don't actually change anything", - ) - args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') - reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+') - convert_byteorder(reader, args) - - -if __name__ == "__main__": - main() diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py b/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py deleted file mode 100755 index dbf891508..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/gguf-dump.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import os -import sys -from pathlib import Path -from typing import Any - -import numpy as np - -# Necessary to load the local gguf package -if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): - sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf import GGUFReader, GGUFValueType # noqa: E402 - - -def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: - host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG' - if reader.byte_order == 'S': - file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE' - else: - file_endian = host_endian - return (host_endian, file_endian) - - -# For more information about what field.parts and field.data represent, -# please see the comments in the modify_gguf.py example. 
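For orientation: the gguf-dump.py utility being deleted in this hunk is superseded by reading GGUF headers directly in Node. A rough TypeScript stand-in, reusing the same hyllama call and partial-read pattern that the new src/node/index.ts introduces later in this diff, might look like the sketch below. The function name dumpGGUFMetadata, the console output format, and the assumption that 10 MB always covers the header are illustrative and not part of this change.

```typescript
// Sketch only: a TypeScript analogue of the removed gguf-dump.py, built on the
// same hyllama call used by the new src/node/index.ts in this diff.
// Assumption: the first 10 MB of the file contains the complete GGUF header.
import { closeSync, openSync, readSync } from 'fs'

export async function dumpGGUFMetadata(ggufPath: string): Promise<void> {
  const { ggufMetadata } = await import('hyllama')

  // Read only the beginning of the file; GGUF metadata lives in the header.
  const fd = openSync(ggufPath, 'r')
  const buffer = new Uint8Array(10_000_000)
  readSync(fd, buffer, 0, 10_000_000, 0)
  closeSync(fd)

  const { metadata } = ggufMetadata(buffer.buffer)

  // Print each key/value pair, truncating long values such as token lists.
  for (const [key, value] of Object.entries(metadata)) {
    const text = String(value)
    console.log(`${key} = ${text.length > 80 ? text.slice(0, 80) + '...' : text}`)
  }
}
```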
-def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: - host_endian, file_endian = get_file_host_endian(reader) - print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') - print(f'\n* Dumping {len(reader.fields)} key/value pair(s)') - for n, field in enumerate(reader.fields.values(), 1): - if not field.types: - pretty_type = 'N/A' - elif field.types[0] == GGUFValueType.ARRAY: - nest_count = len(field.types) - 1 - pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count - else: - pretty_type = str(field.types[-1].name) - print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '') - if len(field.types) == 1: - curr_type = field.types[0] - if curr_type == GGUFValueType.STRING: - print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '') - elif field.types[0] in reader.gguf_scalar_to_np: - print(' = {0}'.format(field.parts[-1][0]), end = '') - print() - if args.no_tensors: - return - print(f'\n* Dumping {len(reader.tensors)} tensor(s)') - for n, tensor in enumerate(reader.tensors, 1): - prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape))) - print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') - - -def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: - import json - host_endian, file_endian = get_file_host_endian(reader) - metadata: dict[str, Any] = {} - tensors: dict[str, Any] = {} - result = { - "filename": args.model, - "endian": file_endian, - "metadata": metadata, - "tensors": tensors, - } - for idx, field in enumerate(reader.fields.values()): - curr: dict[str, Any] = { - "index": idx, - "type": field.types[0].name if field.types else 'UNKNOWN', - "offset": field.offset, - } - metadata[field.name] = curr - if field.types[:1] == [GGUFValueType.ARRAY]: - curr["array_types"] = [t.name for t in field.types][1:] - if not args.json_array: - continue - itype = field.types[-1] - if itype == GGUFValueType.STRING: - curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] - else: - curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()] - elif field.types[0] == GGUFValueType.STRING: - curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8") - else: - curr["value"] = field.parts[-1].tolist()[0] - if not args.no_tensors: - for idx, tensor in enumerate(reader.tensors): - tensors[tensor.name] = { - "index": idx, - "shape": tensor.shape.tolist(), - "type": tensor.tensor_type.name, - "offset": tensor.field.offset, - } - json.dump(result, sys.stdout) - - -def main() -> None: - parser = argparse.ArgumentParser(description="Dump GGUF file metadata") - parser.add_argument("model", type=str, help="GGUF format model filename") - parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") - parser.add_argument("--json", action="store_true", help="Produce JSON output") - parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") - args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - if not args.json: - print(f'* Loading: {args.model}') - reader = GGUFReader(args.model, 'r') - if args.json: - dump_metadata_json(reader, args) - else: - dump_metadata(reader, args) - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py 
b/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py deleted file mode 100755 index 3ebdfa898..000000000 --- a/extensions/model-extension/scripts/gguf-py/scripts/gguf-set-metadata.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import os -import sys -from pathlib import Path - -# Necessary to load the local gguf package -if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): - sys.path.insert(0, str(Path(__file__).parent.parent)) - -from gguf import GGUFReader # noqa: E402 - - -def minimal_example(filename: str) -> None: - reader = GGUFReader(filename, 'r+') - field = reader.fields['tokenizer.ggml.bos_token_id'] - if field is None: - return - part_index = field.data[0] - field.parts[part_index][0] = 2 # Set tokenizer.ggml.bos_token_id to 2 - # - # So what's this field.data thing? It's helpful because field.parts contains - # _every_ part of the GGUF field. For example, tokenizer.ggml.bos_token_id consists - # of: - # - # Part index 0: Key length (27) - # Part index 1: Key data ("tokenizer.ggml.bos_token_id") - # Part index 2: Field type (4, the id for GGUFValueType.UINT32) - # Part index 3: Field value - # - # Note also that each part is an NDArray slice, so even a part that - # is only a single value like the key length will be a NDArray of - # the key length type (numpy.uint32). - # - # The .data attribute in the Field is a list of relevant part indexes - # and doesn't contain internal GGUF details like the key length part. - # In this case, .data will be [3] - just the part index of the - # field value itself. - - -def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: - field = reader.get_field(args.key) - if field is None: - print(f'! Field {repr(args.key)} not found', file = sys.stderr) - sys.exit(1) - # Note that field.types is a list of types. This is because the GGUF - # format supports arrays. For example, an array of UINT32 would - # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32] - handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None - if handler is None: - print( - f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}', - file = sys.stderr, - ) - sys.exit(1) - current_value = field.parts[field.data[0]][0] - new_value = handler(args.value) - print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') - if current_value == new_value: - print(f'- Key {repr(args.key)} already set to requested value {current_value}') - sys.exit(0) - if args.dry_run: - sys.exit(0) - if not args.force: - print('*** Warning *** Warning *** Warning **') - print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') - print('* Enter exactly YES if you are positive you want to proceed:') - response = input('YES, I am sure> ') - if response != 'YES': - print("You didn't enter YES. Okay then, see ya!") - sys.exit(0) - field.parts[field.data[0]][0] = new_value - print('* Field changed. 
Successful completion.') - - -def main() -> None: - parser = argparse.ArgumentParser(description="Set a simple value in GGUF file metadata") - parser.add_argument("model", type=str, help="GGUF format model filename") - parser.add_argument("key", type=str, help="Metadata key to set") - parser.add_argument("value", type=str, help="Metadata value to set") - parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything") - parser.add_argument("--force", action="store_true", help="Change the field without confirmation") - args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') - reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+') - set_metadata(reader, args) - - -if __name__ == '__main__': - main() diff --git a/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py b/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py deleted file mode 100644 index 0adeb7d55..000000000 --- a/extensions/model-extension/scripts/gguf-py/tests/test_gguf.py +++ /dev/null @@ -1,7 +0,0 @@ -import gguf # noqa: F401 - -# TODO: add tests - - -def test_write_gguf() -> None: - pass diff --git a/extensions/model-extension/scripts/install_deps.py b/extensions/model-extension/scripts/install_deps.py deleted file mode 100644 index 2dfabed07..000000000 --- a/extensions/model-extension/scripts/install_deps.py +++ /dev/null @@ -1,14 +0,0 @@ -import subprocess -import sys - -deps = [ - 'numpy~=1.24.4', - 'sentencepiece~=0.1.98', - 'transformers>=4.35.2,<5.0.0', - 'gguf>=0.1.0', - 'protobuf>=4.21.0,<5.0.0', - 'torch~=2.1.1', - 'packaging>=20.0', - 'tiktoken~=0.5.0' -] -subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', '--force-reinstall', *deps]) diff --git a/extensions/model-extension/scripts/version.txt b/extensions/model-extension/scripts/version.txt deleted file mode 100644 index f743d6c4a..000000000 --- a/extensions/model-extension/scripts/version.txt +++ /dev/null @@ -1 +0,0 @@ -b2106 \ No newline at end of file diff --git a/extensions/model-extension/src/index.ts b/extensions/model-extension/src/index.ts index 7561ee6ed..a8977e07e 100644 --- a/extensions/model-extension/src/index.ts +++ b/extensions/model-extension/src/index.ts @@ -19,8 +19,6 @@ import { DownloadRequest, executeOnMain, HuggingFaceRepoData, - Quantization, - log, getFileSize, AllQuantizations, ModelEvent, @@ -565,6 +563,19 @@ export default class JanModelExtension extends ModelExtension { } const defaultModel = (await this.getDefaultModel()) as Model + const metadata = await executeOnMain( + NODE, + 'retrieveGGUFMetadata', + await joinPath([ + await getJanDataFolderPath(), + 'models', + dirName, + binaryFileName, + ]) + ) + + const eos_id = metadata['tokenizer.ggml.eos_token_id'] + if (!defaultModel) { console.error('Unable to find default model') return @@ -581,8 +592,18 @@ export default class JanModelExtension extends ModelExtension { filename: binaryFileName, }, ], + parameters: { + ...defaultModel.parameters, + stop: [metadata['tokenizer.ggml.tokens'][eos_id] ?? ''], + }, settings: { ...defaultModel.settings, + prompt_template: + metadata.parsed_chat_template ?? + defaultModel.settings.prompt_template, + ctx_len: + metadata['llama.context_length'] ?? defaultModel.settings.ctx_len, + ngl: (metadata['llama.block_count'] ?? 
32) + 1, llama_model_path: binaryFileName, }, created: Date.now(), @@ -657,6 +678,13 @@ export default class JanModelExtension extends ModelExtension { return } + const metadata = await executeOnMain( + NODE, + 'retrieveGGUFMetadata', + modelBinaryPath + ) + const eos_id = metadata['tokenizer.ggml.eos_token_id'] + const binaryFileName = await baseName(modelBinaryPath) const model: Model = { @@ -669,8 +697,19 @@ export default class JanModelExtension extends ModelExtension { filename: binaryFileName, }, ], + parameters: { + ...defaultModel.parameters, + stop: [metadata['tokenizer.ggml.tokens'][eos_id] ?? ''], + }, + settings: { ...defaultModel.settings, + prompt_template: + metadata.parsed_chat_template ?? + defaultModel.settings.prompt_template, + ctx_len: + metadata['llama.context_length'] ?? defaultModel.settings.ctx_len, + ngl: (metadata['llama.block_count'] ?? 32) + 1, llama_model_path: binaryFileName, }, created: Date.now(), @@ -826,218 +865,4 @@ export default class JanModelExtension extends ModelExtension { importedModels ) } - - private getGgufFileList( - repoData: HuggingFaceRepoData, - selectedQuantization: Quantization - ): string[] { - return repoData.siblings - .map((file) => file.rfilename) - .filter((file) => file.indexOf(selectedQuantization) !== -1) - .filter((file) => file.endsWith('.gguf')) - } - - private getFileList(repoData: HuggingFaceRepoData): string[] { - // SafeTensors first, if not, then PyTorch - const modelFiles = repoData.siblings - .map((file) => file.rfilename) - .filter((file) => - JanModelExtension._safetensorsRegexs.some((regex) => regex.test(file)) - ) - if (modelFiles.length === 0) { - repoData.siblings.forEach((file) => { - if ( - JanModelExtension._pytorchRegexs.some((regex) => - regex.test(file.rfilename) - ) - ) { - modelFiles.push(file.rfilename) - } - }) - } - - const vocabFiles = [ - 'tokenizer.model', - 'vocab.json', - 'tokenizer.json', - ].filter((file) => - repoData.siblings.some((sibling) => sibling.rfilename === file) - ) - - const etcFiles = repoData.siblings - .map((file) => file.rfilename) - .filter( - (file) => - (file.endsWith('.json') && !vocabFiles.includes(file)) || - file.endsWith('.txt') || - file.endsWith('.py') || - file.endsWith('.tiktoken') - ) - - return [...modelFiles, ...vocabFiles, ...etcFiles] - } - - private async getModelDirPath(repoID: string): Promise { - const modelName = repoID.split('/').slice(1).join('/') - return joinPath([await getJanDataFolderPath(), 'models', modelName]) - } - - private async getConvertedModelPath(repoID: string): Promise { - const modelName = repoID.split('/').slice(1).join('/') - const modelDirPath = await this.getModelDirPath(repoID) - return joinPath([modelDirPath, modelName + '.gguf']) - } - - private async getQuantizedModelPath( - repoID: string, - quantization: Quantization - ): Promise { - const modelName = repoID.split('/').slice(1).join('/') - const modelDirPath = await this.getModelDirPath(repoID) - return joinPath([ - modelDirPath, - modelName + `-${quantization.toLowerCase()}.gguf`, - ]) - } - private getCtxLength(config: { - max_sequence_length?: number - max_position_embeddings?: number - n_ctx?: number - }): number { - if (config.max_sequence_length) return config.max_sequence_length - if (config.max_position_embeddings) return config.max_position_embeddings - if (config.n_ctx) return config.n_ctx - return 2048 - } - - /** - * Converts a Hugging Face model to GGUF. - * @param repoID - The repo ID of the model to convert. 
- * @returns A promise that resolves when the conversion is complete. - */ - async convert(repoID: string): Promise { - if (this.interrupted) return - const modelDirPath = await this.getModelDirPath(repoID) - const modelOutPath = await this.getConvertedModelPath(repoID) - if (!(await fs.existsSync(modelDirPath))) { - throw new Error('Model dir not found') - } - if (await fs.existsSync(modelOutPath)) return - - await executeOnMain(NODE, 'installDeps') - if (this.interrupted) return - - try { - await executeOnMain( - NODE, - 'convertHf', - modelDirPath, - modelOutPath + '.temp' - ) - } catch (err) { - log(`[Conversion]::Debug: Error using hf-to-gguf.py, trying convert.py`) - - let ctx = 2048 - try { - const config = await fs.readFileSync( - await joinPath([modelDirPath, 'config.json']), - 'utf8' - ) - const configParsed = JSON.parse(config) - ctx = this.getCtxLength(configParsed) - configParsed.max_sequence_length = ctx - await fs.writeFileSync( - await joinPath([modelDirPath, 'config.json']), - JSON.stringify(configParsed, null, 2) - ) - } catch (err) { - log(`${err}`) - // ignore missing config.json - } - - const bpe = await fs.existsSync( - await joinPath([modelDirPath, 'vocab.json']) - ) - - await executeOnMain( - NODE, - 'convert', - modelDirPath, - modelOutPath + '.temp', - { - ctx, - bpe, - } - ) - } - await executeOnMain( - NODE, - 'renameSync', - modelOutPath + '.temp', - modelOutPath - ) - - for (const file of await fs.readdirSync(modelDirPath)) { - if ( - modelOutPath.endsWith(file) || - (file.endsWith('config.json') && !file.endsWith('_config.json')) - ) - continue - await fs.unlinkSync(await joinPath([modelDirPath, file])) - } - } - - /** - * Quantizes a GGUF model. - * @param repoID - The repo ID of the model to quantize. - * @param quantization - The quantization to use. - * @returns A promise that resolves when the quantization is complete. - */ - async quantize(repoID: string, quantization: Quantization): Promise { - if (this.interrupted) return - const modelDirPath = await this.getModelDirPath(repoID) - const modelOutPath = await this.getQuantizedModelPath(repoID, quantization) - if (!(await fs.existsSync(modelDirPath))) { - throw new Error('Model dir not found') - } - if (await fs.existsSync(modelOutPath)) return - - await executeOnMain( - NODE, - 'quantize', - await this.getConvertedModelPath(repoID), - modelOutPath + '.temp', - quantization - ) - await executeOnMain( - NODE, - 'renameSync', - modelOutPath + '.temp', - modelOutPath - ) - - await fs.unlinkSync(await this.getConvertedModelPath(repoID)) - } - - /** - * Cancels the convert of current Hugging Face model. - * @param repoID - The repository ID to cancel. - * @param repoData - The repository data to cancel. - * @returns {Promise} A promise that resolves when the download has been cancelled. 
- */ - async cancelConvert( - repoID: string, - repoData: HuggingFaceRepoData - ): Promise { - this.interrupted = true - const modelDirPath = await this.getModelDirPath(repoID) - const files = this.getFileList(repoData) - for (const file of files) { - const filePath = file - const localPath = await joinPath([modelDirPath, filePath]) - await abortDownload(localPath) - } - - executeOnMain(NODE, 'killProcesses') - } } diff --git a/extensions/model-extension/src/node/index.ts b/extensions/model-extension/src/node/index.ts index 991548e00..2b498f424 100644 --- a/extensions/model-extension/src/node/index.ts +++ b/extensions/model-extension/src/node/index.ts @@ -1,182 +1,47 @@ -import { PythonShell } from 'python-shell' -import { spawn, ChildProcess } from 'child_process' -import { resolve as presolve, join as pjoin } from 'path' -import { log, Quantization } from '@janhq/core/node' -import { statSync } from 'fs' -export { renameSync } from 'fs' +import { closeSync, openSync, readSync } from 'fs' +import { Template } from '@huggingface/jinja' +/** + * This is to retrieve the metadata from a GGUF file + * It uses hyllama and jinja from @huggingface module + */ +export const retrieveGGUFMetadata = async (ggufPath: string) => { + try { + const { ggufMetadata } = await import('hyllama') + // Read first 10mb of gguf file + const fd = openSync(ggufPath, 'r') + const buffer = new Uint8Array(10_000_000) + readSync(fd, buffer, 0, 10_000_000, 0) + closeSync(fd) -let pythonShell: PythonShell | undefined = undefined -let quantizeProcess: ChildProcess | undefined = undefined + // Parse metadata and tensor info + const { metadata } = ggufMetadata(buffer.buffer) -export const getSize = (path: string): number => statSync(path).size - -export const killProcesses = () => { - if (pythonShell) { - pythonShell.kill() - pythonShell = undefined - } - if (quantizeProcess) { - quantizeProcess.kill() - quantizeProcess = undefined + const template = new Template(metadata['tokenizer.chat_template']) + const eos_id = metadata['tokenizer.ggml.eos_token_id'] + const bos_id = metadata['tokenizer.ggml.bos_token_id'] + const eos_token = metadata['tokenizer.ggml.tokens'][eos_id] + const bos_token = metadata['tokenizer.ggml.tokens'][bos_id] + // Parse jinja template + const renderedTemplate = template.render({ + add_generation_prompt: true, + eos_token, + bos_token, + messages: [ + { + role: 'system', + content: '{system_message}', + }, + { + role: 'user', + content: '{prompt}', + }, + ], + }) + return { + ...metadata, + parsed_chat_template: renderedTemplate, + } + } catch (e) { + console.log('[MODEL_EXT]', e) } } - -export const getQuantizeExecutable = (): string => { - let binaryFolder = pjoin(__dirname, '..', 'bin') // Current directory by default - let binaryName = 'quantize' - /** - * The binary folder is different for each platform. 
- */ - if (process.platform === 'win32') { - binaryFolder = pjoin(binaryFolder, 'win') - binaryName = 'quantize.exe' - } else if (process.platform === 'darwin') { - /** - * For MacOS: mac-universal both Silicon and InteL - */ - binaryFolder = pjoin(binaryFolder, 'mac-universal') - } else { - binaryFolder = pjoin(binaryFolder, 'linux-cpu') - } - return pjoin(binaryFolder, binaryName) -} - -export const installDeps = (): Promise => { - return new Promise((resolve, reject) => { - const _pythonShell = new PythonShell( - presolve(__dirname, '..', 'scripts', 'install_deps.py') - ) - _pythonShell.on('message', (message) => { - log(`[Install Deps]::Debug: ${message}`) - }) - _pythonShell.on('stderr', (stderr) => { - log(`[Install Deps]::Error: ${stderr}`) - }) - _pythonShell.on('error', (err) => { - pythonShell = undefined - log(`[Install Deps]::Error: ${err}`) - reject(err) - }) - _pythonShell.on('close', () => { - const exitCode = _pythonShell.exitCode - pythonShell = undefined - log( - `[Install Deps]::Debug: Deps installation exited with code: ${exitCode}` - ) - exitCode === 0 ? resolve() : reject(exitCode) - }) - }) -} - -export const convertHf = async ( - modelDirPath: string, - outPath: string -): Promise => { - return await new Promise((resolve, reject) => { - const _pythonShell = new PythonShell( - presolve(__dirname, '..', 'scripts', 'convert-hf-to-gguf.py'), - { - args: [modelDirPath, '--outfile', outPath], - } - ) - pythonShell = _pythonShell - _pythonShell.on('message', (message) => { - log(`[Conversion]::Debug: ${message}`) - }) - _pythonShell.on('stderr', (stderr) => { - log(`[Conversion]::Error: ${stderr}`) - }) - _pythonShell.on('error', (err) => { - pythonShell = undefined - log(`[Conversion]::Error: ${err}`) - reject(err) - }) - _pythonShell.on('close', () => { - const exitCode = _pythonShell.exitCode - pythonShell = undefined - if (exitCode !== 0) { - log(`[Conversion]::Debug: Conversion exited with code: ${exitCode}`) - reject(exitCode) - } else { - resolve() - } - }) - }) -} - -export const convert = async ( - modelDirPath: string, - outPath: string, - { ctx, bpe }: { ctx?: number; bpe?: boolean } -): Promise => { - const args = [modelDirPath, '--outfile', outPath] - if (ctx) { - args.push('--ctx') - args.push(ctx.toString()) - } - if (bpe) { - args.push('--vocab-type') - args.push('bpe') - } - return await new Promise((resolve, reject) => { - const _pythonShell = new PythonShell( - presolve(__dirname, '..', 'scripts', 'convert.py'), - { - args, - } - ) - _pythonShell.on('message', (message) => { - log(`[Conversion]::Debug: ${message}`) - }) - _pythonShell.on('stderr', (stderr) => { - log(`[Conversion]::Error: ${stderr}`) - }) - _pythonShell.on('error', (err) => { - pythonShell = undefined - log(`[Conversion]::Error: ${err}`) - reject(err) - }) - _pythonShell.on('close', () => { - const exitCode = _pythonShell.exitCode - pythonShell = undefined - if (exitCode !== 0) { - log(`[Conversion]::Debug: Conversion exited with code: ${exitCode}`) - reject(exitCode) - } else { - resolve() - } - }) - }) -} - -export const quantize = async ( - modelPath: string, - outPath: string, - quantization: Quantization -): Promise => { - return await new Promise((resolve, reject) => { - const quantizeExecutable = getQuantizeExecutable() - const _quantizeProcess = spawn(quantizeExecutable, [ - modelPath, - outPath, - quantization, - ]) - quantizeProcess = _quantizeProcess - - _quantizeProcess.stdout?.on('data', (data) => { - log(`[Quantization]::Debug: ${data}`) - }) - 
_quantizeProcess.stderr?.on('data', (data) => { - log(`[Quantization]::Error: ${data}`) - }) - - _quantizeProcess.on('close', (code) => { - if (code !== 0) { - log(`[Quantization]::Debug: Quantization exited with code: ${code}`) - reject(code) - } else { - resolve() - } - }) - }) -}
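Taken together, the src/index.ts and src/node/index.ts changes above replace the Python convert/quantize pipeline with direct GGUF header inspection. A minimal sketch of the renderer-side call pattern that src/index.ts now follows is shown below; only executeOnMain, retrieveGGUFMetadata, and the metadata keys come from this diff, while the NODE constant is assumed to be defined elsewhere in the extension and the helper name deriveModelSettings plus the defaults shape are illustrative.

```typescript
// Sketch only: mirrors how src/index.ts consumes the new retrieveGGUFMetadata
// entry point. The helper name, the `defaults` shape, and the fallback values
// are illustrative assumptions; the metadata keys come from this diff.
import { executeOnMain } from '@janhq/core'

declare const NODE: string // assumption: module path constant defined elsewhere in the extension

export async function deriveModelSettings(ggufPath: string, defaults: any) {
  const metadata = await executeOnMain(NODE, 'retrieveGGUFMetadata', ggufPath)

  // The EOS token doubles as the default stop sequence.
  const eosId = metadata['tokenizer.ggml.eos_token_id']
  const stop = [metadata['tokenizer.ggml.tokens']?.[eosId] ?? '']

  return {
    parameters: { ...defaults.parameters, stop },
    settings: {
      ...defaults.settings,
      // Prefer the chat template embedded in the GGUF header, when present.
      prompt_template:
        metadata.parsed_chat_template ?? defaults.settings.prompt_template,
      ctx_len: metadata['llama.context_length'] ?? defaults.settings.ctx_len,
      // Offload all transformer blocks plus the output layer.
      ngl: (metadata['llama.block_count'] ?? 32) + 1,
    },
  }
}
```

Note that retrieveGGUFMetadata swallows its own errors (its catch block only logs), so when a file lacks tokenizer.chat_template or cannot be parsed the call can resolve to undefined; a caller along the lines of the sketch above would likely want to guard for that before indexing into the result.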