Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use eetq kernel from the hub #3029

Merged
merged 2 commits into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,6 @@ COPY server/Makefile-awq Makefile
# Build the awq kernels
RUN . .venv/bin/activate && make build-awq

# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Compile the eetq kernels via the Makefile's build-eetq target
RUN . .venv/bin/activate && make build-eetq

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
Expand Down Expand Up @@ -216,8 +209,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
Expand Down
7 changes: 4 additions & 3 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/eetq-0.0.1";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {
Expand Down
4 changes: 2 additions & 2 deletions nix/server.nix
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
awq-inference-engine,
causal-conv1d,
compressed-tensors,
eetq,
einops,
exllamav2,
flashinfer,
Expand Down Expand Up @@ -36,6 +35,7 @@
py-cpuinfo,
pydantic,
quantization,
quantization-eetq,
safetensors,
tokenizers,
torch,
Expand Down Expand Up @@ -80,7 +80,6 @@ buildPythonPackage {

dependencies = [
awq-inference-engine
eetq
causal-conv1d
compressed-tensors
einops
Expand Down Expand Up @@ -111,6 +110,7 @@ buildPythonPackage {
py-cpuinfo
pydantic
quantization
quantization-eetq
safetensors
sentencepiece
tokenizers
Expand Down
1 change: 0 additions & 1 deletion server/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ include Makefile-flash-att
include Makefile-flash-att-v2
include Makefile-vllm
include Makefile-awq
include Makefile-eetq
include Makefile-selective-scan
include Makefile-lorax-punica
include Makefile-exllamav2
Expand Down
198 changes: 198 additions & 0 deletions server/hf-kernels.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6736,5 +6736,203 @@
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
}
]
},
{
"repo_id": "kernels-community/quantization-eetq",
"sha": "a80ce846d6270ddddeee109523ed947f594f246b",
"files": [
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "9c191845fb7acbd7ea6bae36ce8c237b168557e1"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_v7rnpcck3kry4.abi3.so",
"blob_id": "9edc9126b9ec8ce4f47a8e6688a5f0329c905329"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "ccec58b06a2282da51356fe5d04dd1e2757ce80c"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_zcfiojfkx55be.abi3.so",
"blob_id": "ea27fb040515267ec631cec5545b878da680e7cc"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "bb409419898138ffa9ade9ba505a167a067ea378"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_btymam4x7xvs6.abi3.so",
"blob_id": "0395dd048ccf10ed020a77fa04bcb026ba369d73"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "f250a00832d2044f7bbb87557a1c878d9c8dd24d"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_yy3p6bsf622sq.abi3.so",
"blob_id": "c98d156835e442b039d38a82e9f111036750329c"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "b5259247e8fb3ed9429cf005a525edc8bcae4903"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_imijtykkseqze.abi3.so",
"blob_id": "c46908ce00d02376ae8e18efebb7fee55afbc3ac"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "79f8d42700ad34b9b46e6e328f90885d1ee9beab"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_4qerj3t7ddiry.abi3.so",
"blob_id": "9ba519d2fd4e347b784c21f4c171cbbab57c7774"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "805ec785b7f5196f78dfe77b6cd7c2603c02490e"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_j23ltbqvrnixg.abi3.so",
"blob_id": "77d53c16e57c658e8f9caa37b0084c4a3a7ffda1"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "7b590a5a6ede67e0ae13f97dbd7a82a4674e1b23"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_p5neqtnhdgxv2.abi3.so",
"blob_id": "e3e5fbd8ce3232b6e9a7c3077eab9665b95bef49"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "0be7ffcb2e9590899683a197b977ec0b39ca7cb7"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_idk3dezy35dfk.abi3.so",
"blob_id": "61aa67cbe7ce810bf9792e6e8f19219c757ff181"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "998eba3eddd0520769a2b4ecb3402c024bde44ea"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_fpjoxzd7nm2qa.abi3.so",
"blob_id": "31d835db1d0348e3f35c23e6a8f2532fd7e9fea7"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "6d5320b05b03f2f3ddfd299d6e2a72aa6116264f"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_k7mlunxe2ye4s.abi3.so",
"blob_id": "1946e4c2fab63243d051012cb12e19895828145f"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "9b15d85f44e4223ce1f16df987feafd6640dcc62"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_7m7hz3sbwkaio.abi3.so",
"blob_id": "eb1536ccd1dfa2655ea7de4445aa3c6790f3a0ae"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
}
]
}
]
1 change: 1 addition & 0 deletions server/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ build-backend = "setuptools.build_meta"
"kernels-community/paged-attention" = ">=0.0.2"
"kernels-community/moe" = ">=0.1.1"
"kernels-community/quantization" = ">=0.0.3"
"kernels-community/quantization-eetq" = ">=0.0.1"

[project.scripts]
text-generation-server = "text_generation_server.cli:app"
Expand Down
10 changes: 7 additions & 3 deletions server/text_generation_server/layers/eetq.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from dataclasses import dataclass

import torch
from EETQ import quant_weights, w8_a16_gemm
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.weights import UnquantizedWeight

quantization_eetq = load_kernel(
module="quantization_eetq", repo_id="kernels-community/quantization-eetq"
)


@dataclass
class EETQWeight(UnquantizedWeight):
Expand Down Expand Up @@ -31,13 +35,13 @@ def __init__(
if weight.dtype != torch.float16:
weight = weight.to(dtype=torch.float16)
weight = torch.t(weight).contiguous().cpu()
weight, scale = quant_weights(weight, torch.int8, False)
weight, scale = quantization_eetq.quant_weights(weight, torch.int8, False)

self.weight = weight.cuda(device)
self.scale = scale.cuda(device)
self.bias = bias.cuda(device) if bias is not None else None

def forward(self, input: torch.Tensor) -> torch.Tensor:
output = w8_a16_gemm(input, self.weight, self.scale)
output = quantization_eetq.w8_a16_gemm(input, self.weight, self.scale)
output = output + self.bias if self.bias is not None else output
return output
Loading