diff --git a/packages/gguf/src/gguf.ts b/packages/gguf/src/gguf.ts
index 07873ea57..e28eb3fbb 100644
--- a/packages/gguf/src/gguf.ts
+++ b/packages/gguf/src/gguf.ts
@@ -3,6 +3,7 @@ import { GGUFValueType } from "./types";
 
 export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
 export { GGUFValueType, GGMLQuantizationType } from "./types";
+export { QUANT_DESCRIPTIONS } from "./quant-descriptions";
 
 export const RE_GGUF_FILE = /\.gguf$/;
 export const RE_GGUF_SHARD_FILE = /-(\d{5})-of-(\d{5})\.gguf$/;
diff --git a/packages/gguf/src/quant-descriptions.ts b/packages/gguf/src/quant-descriptions.ts
new file mode 100644
index 000000000..fe4f4a8e9
--- /dev/null
+++ b/packages/gguf/src/quant-descriptions.ts
@@ -0,0 +1,98 @@
+import { GGMLQuantizationType } from "./types";
+
+export const QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string; src_url?: string }> = {
+	[GGMLQuantizationType.F32]: {
+		txt: "32-bit standard IEEE 754 single-precision floating-point number.",
+		src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format",
+	},
+	[GGMLQuantizationType.F16]: {
+		txt: "16-bit standard IEEE 754 half-precision floating-point number.",
+		src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format",
+	},
+	[GGMLQuantizationType.Q4_0]: {
+		txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not widely used today).",
+		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
+	},
+	[GGMLQuantizationType.Q4_1]: {
+		txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not widely used today).",
+		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
+	},
+	[GGMLQuantizationType.Q5_0]: {
+		txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not widely used today).",
+		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
+	},
+	[GGMLQuantizationType.Q5_1]: {
+		txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not widely used today).",
+		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
+	},
+	[GGMLQuantizationType.Q8_0]: {
+		txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not widely used today).",
+		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
+	},
+	[GGMLQuantizationType.Q8_1]: {
+		txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not widely used today).",
+		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
+	},
+	[GGMLQuantizationType.Q2_K]: {
+		txt: `2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.5625 bits-per-weight.`,
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
+	},
+	[GGMLQuantizationType.Q3_K]: {
+		txt: `3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting in 3.4375 bits-per-weight.`,
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
+	},
+	[GGMLQuantizationType.Q4_K]: {
+		txt: `4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.`,
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
+	},
+	[GGMLQuantizationType.Q5_K]: {
+		txt: `5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.`,
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
+	},
+	[GGMLQuantizationType.Q6_K]: {
+		txt: `6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.`,
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
+	},
+	[GGMLQuantizationType.Q8_K]: {
+		txt: `8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.`,
+		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
+	},
+	[GGMLQuantizationType.IQ2_XXS]: {
+		txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
+		src_url:
+			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
+	},
+	[GGMLQuantizationType.IQ2_XS]: {
+		txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
+		src_url:
+			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
+	},
+	[GGMLQuantizationType.IQ3_XXS]: {
+		txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
+		src_url:
+			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
+	},
+	[GGMLQuantizationType.IQ1_S]: {
+		txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
+		src_url:
+			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
+	},
+	[GGMLQuantizationType.IQ4_NL]: {
+		txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
+	},
+	[GGMLQuantizationType.IQ3_S]: {
+		txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
+		src_url:
+			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
+	},
+	[GGMLQuantizationType.IQ2_S]: {
+		txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
+		src_url:
+			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
+	},
+	[GGMLQuantizationType.IQ4_XS]: {
+		txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
+		src_url:
+			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
+	},
+};
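
For reviewers, a minimal sketch of how the new export could be consumed, assuming the package's `gguf()` parser resolves to the `GGUFParseOutput` shape re-exported above (with `tensorInfos` entries carrying a `GGMLQuantizationType` in `dtype`); the model URL is an illustrative placeholder, not part of this PR:

```ts
import { gguf, QUANT_DESCRIPTIONS } from "@huggingface/gguf";

// Illustrative placeholder: any remote .gguf file URL works here.
const url = "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_K_M.gguf";

const { tensorInfos } = await gguf(url);

// Print a human-readable description of each tensor's quantization scheme.
for (const tensor of tensorInfos) {
	const { txt, src_url } = QUANT_DESCRIPTIONS[tensor.dtype];
	console.log(`${tensor.name}: ${txt}${src_url ? ` (see ${src_url})` : ""}`);
}
```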
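
The legacy block formats described above share one affine pattern, so a worked example may help check the wording. This is a hypothetical illustration only, simplified from llama.cpp's `block_q4_1` layout (a scale, a minimum, then 32 packed 4-bit values); it is not code from this PR:

```ts
// Hypothetical, simplified view of one 32-weight Q4_1 block.
interface BlockQ4_1 {
	blockScale: number; // per-block scale (f16 in the real format)
	blockMinimum: number; // per-block minimum (f16 in the real format)
	quants: Uint8Array; // 16 bytes = 32 nibbles, each a 4-bit q in [0, 15]
}

// Applies the weight formula from the Q4_1 description:
// w = q * block_scale + block_minimum
function dequantizeBlockQ4_1(block: BlockQ4_1): Float32Array {
	const weights = new Float32Array(32);
	for (let i = 0; i < 16; i++) {
		const byte = block.quants[i];
		weights[i] = (byte & 0x0f) * block.blockScale + block.blockMinimum; // low nibble -> first half
		weights[i + 16] = (byte >> 4) * block.blockScale + block.blockMinimum; // high nibble -> second half
	}
	return weights;
}
```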