This reverts commit ad6fbea22df6deaba31e146dddb456e4a5d5dd75 Revert "chore: add engine logo from local instead of metadata logo (#3363)" This reverts commit ad6fbea22df6deaba31e146dddb456e4a5d5dd75. Revert "fix: LaTex formula render issue (#3353)" This reverts commit 3b2c84c4fee61b886c883c68801be3bc5a8584ad. Revert "chore: minor ui improvement (#3352)" This reverts commit 6dd387db2b5b9890f19d0c3505cf9cb770fd492f. Revert "fix: failed to relaunch app to update (#3351)" This reverts commit fcaf98a2fa4e674799602e8093914bcc04ced153. Revert "chore: add back GPU information to system monitoring bar (#3350)" This reverts commit 03455a91807c7af6c6325901997c6d7231d2cd0d. Revert "fix: empty model page not shown when delete all threads and models (#3343)" This reverts commit 9e29fcd69eb9085843896686806fd453a1285723. Revert "feat: allow user configure remote model from my model (#3348)" This reverts commit fdab8af057f80cf1ccaae0dc42c4e5161925f51e. Revert "chore: ui fix button outline for configure cloud model (#3347)" This reverts commit fe8ed1f26dc86ead92ffea4f36e2989caf7dad88. Revert "feat: move icon create new thread into top panel (#3346)" This reverts commit 46cb1b45b997181e2188f8dafb2fc0d0cc12ddcd. Revert "chore(UI): update experience model dropdown (#3342)" This reverts commit 8b44613015a907dc491113aeb99c963080424892. Revert "Chore/simple bug template and correct a copy (#3344)" This reverts commit 23cd5fd3979e7529811045da5c4912369bcc7532. Revert "chore(ui): fix alignment loader starter screen (#3338)" This reverts commit e9f5d2f837ce323b0851ea04cded913ab433388c. Revert "Increase retry upload to R2 to 5 times (#3337)" This reverts commit dcfb497934edc795955d971b6d391ee1e6309a03. Revert "fix: broken jan build - add log trace (jan.log) (#3336)" This reverts commit 77422c3a7ed240909942ac0d8c4b259af8d87a28. Revert "chore: disable quick ask (#3334)" This reverts commit 6e4b6b09ae009149f262d86d5b19bb8096267c19. 
Revert "fix: update legacy path (#3328)" This reverts commit 5eb112142c6431cfe0cdf11ce28810ca650a5427. Revert "chore: add cortex version (#3318)" This reverts commit 60587649c56a1f24272e763f25aa5b4042f7719a. Revert "fix: broken app due to incorrect api path (#3316)" This reverts commit 3de4eab2a0dfbf9f593d73b9dde6bca1d9df2279. Revert "feat: modal waiting cortex (#3306)" This reverts commit 1f5168d4af9080b867c19d334c398bf32e4f54b8. Revert "fix: refresh should not create new thread (#3314)" This reverts commit 624d07703c50ea332ed4eeac9dc3a26bc8190d08. Revert "fix: avoid lose title threads (#3307)" This reverts commit a4f5fda104c2d1e01ea72798f055e5b4e3cfd616. Revert "feat: change data folder (#3309)" This reverts commit b43242b9b24352c7f90995eccab753dede679616. Revert "feat: embed cortex into jan as a js module (#3305)" This reverts commit b348110fb73bd5f13c69f1b915168687dea776d0. Revert "fix: migration item in setting detail omit buttons (#3298)" This reverts commit 709204b2bc9d9ed08e2245cbb084482f5908ab3a. Revert "fix: merge gpu arch and os tensorrt models (#3299)" This reverts commit aa7dbdc9fa701debeee28d9c7eb4af6258685321. Revert "chore: update cortex new version (#3300)" This reverts commit 602097909d38b4874db8b9f19a729c65a0ac9619. Revert "fix: engine logo on model dropdown (#3291)" This reverts commit 8eb8611c28f6c4cdf1ab142a6e18c82bcc4c2073. Revert "fix: icon setting can close and open right panel (#3295)" This reverts commit be31e9315e2df5c483de3f46bd37740d277cfccd. Revert "fix: error while importing local model is not shown (#3294)" This reverts commit 26be941e8426462e1e3a28e5b9bf1f834f462f82. Revert "fix: add lower case quantization support (#3293)" This reverts commit 3135ccc27e894a4056f882cd25f0bf7e10e56f49. Revert "fix: onnx can't be selected in download model modal (#3283)" This reverts commit 2521e1db518e9e01493e89dcc98c181ccd2b48a2. Revert "feat: add chunk count (#3290)" This reverts commit bad481bf05aa38edcf553e1273f5d692a65c9225. 
Revert "fix: RAM always show 0% (#3287)" This reverts commit 2201e6c5f87538b953503937fe6b135fe1aa2d94. Revert "fix: remote engine should not allow reinit (#3284)" This reverts commit 98abff0da3467c090618233db12a25bfa4c1db69. Revert "chore": update minor UI (#3281)" This reverts commit 105a9aa1a1830648a32ae285f751b4078c8ac2b2. Revert "chore: update z-index tooltip (#3280)" This reverts commit 5a81865508c205ed8c54df209092553a0c40054f. Revert "feat: add nvidia engine (#3279)" This reverts commit 8372f30f0ee99606b123351e7bb62636c62c8b23. Revert "fix: migration wrong directory (#3278)" This reverts commit 7fb1354287677f577070ccb065ed3a5f9e5b9882. Revert "fix: clearer app loading prompt (#3275)" This reverts commit 44a6401000334b79b225ab6fd6afb79f9da4bd51. Revert "fix: allow user to reinit engine from settings page (#3277)" This reverts commit 57cf3c7b3d5bface785763d06813906ba6eab7c9. Revert "feat: enable copy over instructions (#3266)" This reverts commit 2074511067201f0addb9d274cc90d1e782f2bc1d. Revert "chore: toast message on model import fail with reason (#3276)" This reverts commit 3bebdfe67e1571c7414065a36d16eb5941115ee0. Revert "fix: should not let second instance terminate cortex (#3274)" This reverts commit d074a5a445b73ca195a49814a935300f9e895aaa. Revert "chore: remnove focus button (#3272)" This reverts commit 07fa79e71a401becdbc0f474c27b860654a8bd62. Revert "chore: update hub search result (#3273)" This reverts commit 10b4a9087af709d147b34f6c3ee63d2d3b75c77a. Revert "chore: temporary hidden import model (#3270)" This reverts commit db5d8aba454fd4cc1e07253ca4805d4b1b3e7fb2. Revert "fix: set cortex data folder path when starting jan (#3252)" This reverts commit 91c77eda78ecd251d480e58b853fe7b261f6de50. Revert "fix: remote model added manually does not shown in model drop down (#3261)" This reverts commit 224ca3f7cc25b2577ab123829907964b78b78aa8. 
Revert "feat: add more options for cortex popup (#3236)" This reverts commit 5e06ed8a122aaed9d68fbd04ce42b65bf8987e58. Revert "feat: manage cloud models from threads screen (#3223)" This reverts commit 37a3c4f844419e66cfe3f2a9ff79ba688538241f. Revert "chore: check the legacy incompatible message type (#3248)" This reverts commit c10caf8d7f1f9cf68551e41de5d54cd4450cf44a. Revert "chore: minor copy for grammar (#3235)" This reverts commit f0f23078f31f58e01ba27787d6926f5c1eb2ff0b. Revert "fix: add back normalize message function (#3234)" This reverts commit 83579df3a40ff61eac25975da8295fceaec679dc. Revert "chore: update conditional starter screen after cortex load (#3227)" This reverts commit 4d3a97f1dca9e6c3ea746586e8607541f2d1c0b3. Revert "fix: broken status parse due to empty category (#3233)" This reverts commit 68714eeaf9212a6fdacd5c6a48d8691db9cc99eb. Revert "feat: make scroll area type auto for make default visible scrollbar (#3220)" This reverts commit 13428d60e7d3ea6a24c0df8871ea13e2dec0d5fd. Revert "fix: update new api from cortex to support 0.5.0 (#3221)" This reverts commit ec9b5bf682a8676e132a08075b6ae03cf9e23132. Revert "feat: new starter screen (#3217)" This reverts commit e8ee694abd33b34112d2c7d09f8c03370c2d22cc. Revert "bump-cortex-0.5.0-1 (#3218)" This reverts commit 5369da78f5b83b1c8761cb48820ccf3111728a90. Revert "Deprecate Docker and K8s (#3219)" This reverts commit 7611a05c44982d07465bec57658d5bf965f30ad5. Revert "chore: set container max width for chat message and new hub screen (#3213)" This reverts commit 007daa71616268b0e741e7a890b319401e49a81e. Revert "feat: integrating cortex (#3001)" This reverts commit 101268f6f36df96b62982a9eeb8581ebe103a909.
666 lines
20 KiB
Python
666 lines
20 KiB
Python
from __future__ import annotations

import sys
from enum import Enum, IntEnum, auto
from typing import Any

#
# constants
#

# 0x46554747 is the four ASCII bytes "GGUF" when the integer is stored
# little-endian (0x47 'G', 0x47 'G', 0x55 'U', 0x46 'F').
GGUF_MAGIC = 0x46554747  # "GGUF"
# NOTE(review): presumably the format version emitted by the writer -- confirm.
GGUF_VERSION = 3
# NOTE(review): presumably the alignment used when "general.alignment" is
# not set -- confirm against the reader/writer.
GGUF_DEFAULT_ALIGNMENT = 32

#
# metadata keys
#


class Keys:
    """Namespace of metadata key strings, grouped by topic.

    Keys that contain ``{arch}`` are templates rather than final key names.
    NOTE(review): they are presumably completed with
    ``key.format(arch=<architecture name>)`` by the caller -- confirm.
    """

    class General:
        """Keys under the ``general.`` prefix."""

        ARCHITECTURE = "general.architecture"
        QUANTIZATION_VERSION = "general.quantization_version"
        ALIGNMENT = "general.alignment"
        NAME = "general.name"
        AUTHOR = "general.author"
        URL = "general.url"
        DESCRIPTION = "general.description"
        LICENSE = "general.license"
        SOURCE_URL = "general.source.url"
        SOURCE_HF_REPO = "general.source.huggingface.repository"
        FILE_TYPE = "general.file_type"

    class LLM:
        """Per-architecture keys directly under the ``{arch}.`` prefix."""

        CONTEXT_LENGTH = "{arch}.context_length"
        EMBEDDING_LENGTH = "{arch}.embedding_length"
        BLOCK_COUNT = "{arch}.block_count"
        FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
        USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
        TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
        EXPERT_COUNT = "{arch}.expert_count"
        EXPERT_USED_COUNT = "{arch}.expert_used_count"

    class Attention:
        """Per-architecture keys under the ``{arch}.attention.`` prefix."""

        HEAD_COUNT = "{arch}.attention.head_count"
        HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
        MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
        CLAMP_KQV = "{arch}.attention.clamp_kqv"
        KEY_LENGTH = "{arch}.attention.key_length"
        VALUE_LENGTH = "{arch}.attention.value_length"
        LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
        LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

    class Rope:
        """Per-architecture keys under the ``{arch}.rope.`` prefix."""

        DIMENSION_COUNT = "{arch}.rope.dimension_count"
        FREQ_BASE = "{arch}.rope.freq_base"
        SCALING_TYPE = "{arch}.rope.scaling.type"
        SCALING_FACTOR = "{arch}.rope.scaling.factor"
        SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
        SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"

    class Tokenizer:
        """Keys under the ``tokenizer.`` prefix (not architecture-specific)."""

        MODEL = "tokenizer.ggml.model"
        LIST = "tokenizer.ggml.tokens"
        TOKEN_TYPE = "tokenizer.ggml.token_type"
        SCORES = "tokenizer.ggml.scores"
        MERGES = "tokenizer.ggml.merges"
        BOS_ID = "tokenizer.ggml.bos_token_id"
        EOS_ID = "tokenizer.ggml.eos_token_id"
        UNK_ID = "tokenizer.ggml.unknown_token_id"
        # NOTE(review): "seperator" is misspelled, but the string is the key
        # itself; correcting it would change the key that is written/read.
        # Confirm with every reader before touching it.
        SEP_ID = "tokenizer.ggml.seperator_token_id"
        PAD_ID = "tokenizer.ggml.padding_token_id"
        ADD_BOS = "tokenizer.ggml.add_bos_token"
        ADD_EOS = "tokenizer.ggml.add_eos_token"
        ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
        HF_JSON = "tokenizer.huggingface.json"
        RWKV = "tokenizer.rwkv.world"
        CHAT_TEMPLATE = "tokenizer.chat_template"


#
# recommended mapping of model tensor names for storage in gguf
#


class MODEL_ARCH(IntEnum):
    """Identifiers for the model architectures known to this module.

    Values come from ``auto()``, i.e. 1, 2, 3, ... in declaration order, so
    inserting or reordering members changes the integer value of every
    member that follows.
    """

    LLAMA = auto()
    FALCON = auto()
    BAICHUAN = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()
    STARCODER = auto()
    PERSIMMON = auto()
    REFACT = auto()
    BERT = auto()
    BLOOM = auto()
    STABLELM = auto()
    QWEN = auto()
    QWEN2 = auto()
    PHI2 = auto()
    PLAMO = auto()
    CODESHELL = auto()
    ORION = auto()
    INTERNLM2 = auto()
    MINICPM = auto()


class MODEL_TENSOR(IntEnum):
    """Identifiers for the kinds of tensor a model may contain.

    Values come from ``auto()`` (1, 2, 3, ... in declaration order), so the
    declaration order below determines each member's integer value.
    """

    TOKEN_EMBD = auto()
    TOKEN_EMBD_NORM = auto()
    TOKEN_TYPES = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE_INP = auto()
    FFN_NORM = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_ACT = auto()
    FFN_GATE_EXP = auto()
    FFN_DOWN_EXP = auto()
    FFN_UP_EXP = auto()
    ATTN_Q_NORM = auto()
    ATTN_K_NORM = auto()


# Lower-case string name for each MODEL_ARCH member.
# NOTE(review): presumably the value substituted for "{arch}" in the key
# templates and stored under "general.architecture" -- confirm with callers.
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLAMA: "llama",
    MODEL_ARCH.FALCON: "falcon",
    MODEL_ARCH.BAICHUAN: "baichuan",
    MODEL_ARCH.GPT2: "gpt2",
    MODEL_ARCH.GPTJ: "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",
    MODEL_ARCH.MPT: "mpt",
    MODEL_ARCH.STARCODER: "starcoder",
    MODEL_ARCH.PERSIMMON: "persimmon",
    MODEL_ARCH.REFACT: "refact",
    MODEL_ARCH.BERT: "bert",
    MODEL_ARCH.BLOOM: "bloom",
    MODEL_ARCH.STABLELM: "stablelm",
    MODEL_ARCH.QWEN: "qwen",
    MODEL_ARCH.QWEN2: "qwen2",
    MODEL_ARCH.PHI2: "phi2",
    MODEL_ARCH.PLAMO: "plamo",
    MODEL_ARCH.CODESHELL: "codeshell",
    MODEL_ARCH.ORION: "orion",
    MODEL_ARCH.INTERNLM2: "internlm2",
    MODEL_ARCH.MINICPM: "minicpm",
}

# Serialized name (or name template) for each MODEL_TENSOR member.
# Entries starting with "blk." are templates containing "{bid}", and the
# *_EXP entries additionally contain "{xid}".
# NOTE(review): presumably {bid} is the block/layer index and {xid} the
# expert index, filled in with str.format -- confirm with callers.
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
    MODEL_TENSOR.TOKEN_TYPES: "token_types",
    MODEL_TENSOR.POS_EMBD: "position_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT: "output",
    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
    MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
    MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
}

# For each architecture, the list of tensor kinds associated with it.
# Architectures declared in MODEL_ARCH but absent here have no entry yet
# (see the trailing TODO).
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
    MODEL_ARCH.GPTNEOX: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.FALCON: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_NORM_2,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BAICHUAN: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.STARCODER: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BERT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_TYPES,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.MPT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_ACT,
    ],
    MODEL_ARCH.GPTJ: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.PERSIMMON: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.ATTN_Q_NORM,
        MODEL_TENSOR.ATTN_K_NORM,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.REFACT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BLOOM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_EMBD_NORM,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.STABLELM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.QWEN: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.QWEN2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.PLAMO: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.GPT2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.PHI2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.ORION: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.INTERNLM2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.MINICPM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
    # TODO
}

# tensors that will not be serialized
# Keyed by architecture; architectures without an entry skip nothing here.
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.BAICHUAN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.PERSIMMON: [
        MODEL_TENSOR.ROPE_FREQS,
    ],
    MODEL_ARCH.QWEN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.ORION: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
}

#
# types
#


class TokenType(IntEnum):
    """Integer codes classifying vocabulary tokens (explicit values 1-6)."""

    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


class RopeScalingType(Enum):
    """String-valued names of the RoPE scaling modes.

    A plain ``Enum`` (not ``IntEnum``): members do not compare equal to their
    string values; use ``.value`` to obtain the string.
    """

    NONE = 'none'
    LINEAR = 'linear'
    YARN = 'yarn'


class GGMLQuantizationType(IntEnum):
    """Integer codes for tensor data / quantization formats.

    The numbering is explicit and not contiguous: the values 4 and 5 are not
    assigned to any member.
    """

    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15


class GGUFEndian(IntEnum):
    """Byte-order selector: ``LITTLE`` is 0, ``BIG`` is 1."""

    LITTLE = 0
    BIG = 1


class GGUFValueType(IntEnum):
    """Integer codes for the value types a metadata entry can hold."""

    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9
    UINT64 = 10
    INT64 = 11
    FLOAT64 = 12

    @staticmethod
    def get_type(val: Any) -> GGUFValueType:
        """Pick the value type used to store the Python object ``val``.

        Mapping (first match wins):
          * ``str`` / ``bytes`` / ``bytearray`` -> ``STRING``
          * ``list``                            -> ``ARRAY``
          * ``float``                           -> ``FLOAT32``
          * ``bool``                            -> ``BOOL``
          * ``int``                             -> ``INT32``

        Raises:
            ValueError: if ``val`` is none of the types above. This branch
                used to print the message and call ``sys.exit()`` with no
                argument, which ended the whole process with a *success*
                exit status; raising lets the caller see and handle the
                failure.
        """
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        if isinstance(val, list):
            return GGUFValueType.ARRAY
        if isinstance(val, float):
            return GGUFValueType.FLOAT32
        # bool must be tested before int: bool is a subclass of int, so the
        # int check alone would classify True/False as INT32.
        if isinstance(val, bool):
            return GGUFValueType.BOOL
        if isinstance(val, int):
            # Every int maps to INT32 and every float to FLOAT32, whatever
            # the magnitude.
            # TODO: need help with 64-bit types in Python
            return GGUFValueType.INT32
        raise ValueError(f"Unknown type: {type(val)}")


# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
# The *_K formats use QK_K as their block size; the other quantized formats
# use 32, and F32/F16 use 1.
# NOTE(review): "type size" is presumably the size of one block in bytes --
# confirm against the consumer of this table.
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32: (1, 4),
    GGMLQuantizationType.F16: (1, 2),
    GGMLQuantizationType.Q4_0: (32, 2 + 16),
    GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
    GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
    GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
    GGMLQuantizationType.Q8_0: (32, 2 + 32),
    GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
    GGMLQuantizationType.Q2_K: (QK_K, 2 + 2 + QK_K // 16 + QK_K // 4),
    GGMLQuantizationType.Q3_K: (QK_K, 2 + QK_K // 4 + QK_K // 8 + 12),
    GGMLQuantizationType.Q4_K: (QK_K, 2 + 2 + QK_K // 2 + 12),
    GGMLQuantizationType.Q5_K: (QK_K, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    GGMLQuantizationType.Q6_K: (QK_K, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.Q8_K: (QK_K, 4 + QK_K + QK_K // 8),
}


# Aliases for backward compatibility.
# Each KEY_* name below is bound to the same string object as the
# corresponding Keys.<Group>.<NAME> attribute.

# general
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
KEY_GENERAL_NAME = Keys.General.NAME
KEY_GENERAL_AUTHOR = Keys.General.AUTHOR
KEY_GENERAL_URL = Keys.General.URL
KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
KEY_GENERAL_LICENSE = Keys.General.LICENSE
KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE

# LLM
KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT

# attention
KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV
KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS

# RoPE
KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED

# tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV