Skip to content

Commit

Permalink
Use eetq kernel from the hub
Browse files Browse the repository at this point in the history
  • Loading branch information
danieldk committed Feb 17, 2025
1 parent cfd4fbb commit 38a1987
Show file tree
Hide file tree
Showing 8 changed files with 214 additions and 20 deletions.
9 changes: 0 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,6 @@ COPY server/Makefile-awq Makefile
# Build specific version of transformers
RUN . .venv/bin/activate && make build-awq

# Build eetq kernels
FROM kernel-builder AS eetq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-eetq Makefile
# Build specific version of transformers
RUN . .venv/bin/activate && make build-eetq

# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src
Expand Down Expand Up @@ -216,8 +209,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
Expand Down
7 changes: 4 additions & 3 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/eetq-0.0.1";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {
Expand Down
4 changes: 2 additions & 2 deletions nix/server.nix
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
awq-inference-engine,
causal-conv1d,
compressed-tensors,
eetq,
einops,
exllamav2,
flashinfer,
Expand Down Expand Up @@ -36,6 +35,7 @@
py-cpuinfo,
pydantic,
quantization,
quantization-eetq,
safetensors,
tokenizers,
torch,
Expand Down Expand Up @@ -80,7 +80,6 @@ buildPythonPackage {

dependencies = [
awq-inference-engine
eetq
causal-conv1d
compressed-tensors
einops
Expand Down Expand Up @@ -111,6 +110,7 @@ buildPythonPackage {
py-cpuinfo
pydantic
quantization
quantization-eetq
safetensors
sentencepiece
tokenizers
Expand Down
1 change: 0 additions & 1 deletion server/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ include Makefile-flash-att
include Makefile-flash-att-v2
include Makefile-vllm
include Makefile-awq
include Makefile-eetq
include Makefile-selective-scan
include Makefile-lorax-punica
include Makefile-exllamav2
Expand Down
200 changes: 199 additions & 1 deletion server/hf-kernels.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6736,5 +6736,203 @@
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
}
]
},
{
"repo_id": "kernels-community/quantization-eetq",
"sha": "a80ce846d6270ddddeee109523ed947f594f246b",
"files": [
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "9c191845fb7acbd7ea6bae36ce8c237b168557e1"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_v7rnpcck3kry4.abi3.so",
"blob_id": "9edc9126b9ec8ce4f47a8e6688a5f0329c905329"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "ccec58b06a2282da51356fe5d04dd1e2757ce80c"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_zcfiojfkx55be.abi3.so",
"blob_id": "ea27fb040515267ec631cec5545b878da680e7cc"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "bb409419898138ffa9ade9ba505a167a067ea378"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_btymam4x7xvs6.abi3.so",
"blob_id": "0395dd048ccf10ed020a77fa04bcb026ba369d73"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "f250a00832d2044f7bbb87557a1c878d9c8dd24d"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_yy3p6bsf622sq.abi3.so",
"blob_id": "c98d156835e442b039d38a82e9f111036750329c"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "b5259247e8fb3ed9429cf005a525edc8bcae4903"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_imijtykkseqze.abi3.so",
"blob_id": "c46908ce00d02376ae8e18efebb7fee55afbc3ac"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "79f8d42700ad34b9b46e6e328f90885d1ee9beab"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_4qerj3t7ddiry.abi3.so",
"blob_id": "9ba519d2fd4e347b784c21f4c171cbbab57c7774"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "805ec785b7f5196f78dfe77b6cd7c2603c02490e"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_j23ltbqvrnixg.abi3.so",
"blob_id": "77d53c16e57c658e8f9caa37b0084c4a3a7ffda1"
},
{
"filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "7b590a5a6ede67e0ae13f97dbd7a82a4674e1b23"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_p5neqtnhdgxv2.abi3.so",
"blob_id": "e3e5fbd8ce3232b6e9a7c3077eab9665b95bef49"
},
{
"filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "0be7ffcb2e9590899683a197b977ec0b39ca7cb7"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_idk3dezy35dfk.abi3.so",
"blob_id": "61aa67cbe7ce810bf9792e6e8f19219c757ff181"
},
{
"filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "998eba3eddd0520769a2b4ecb3402c024bde44ea"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_fpjoxzd7nm2qa.abi3.so",
"blob_id": "31d835db1d0348e3f35c23e6a8f2532fd7e9fea7"
},
{
"filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "6d5320b05b03f2f3ddfd299d6e2a72aa6116264f"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_k7mlunxe2ye4s.abi3.so",
"blob_id": "1946e4c2fab63243d051012cb12e19895828145f"
},
{
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/__init__.py",
"blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_ops.py",
"blob_id": "9b15d85f44e4223ce1f16df987feafd6640dcc62"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_7m7hz3sbwkaio.abi3.so",
"blob_id": "eb1536ccd1dfa2655ea7de4445aa3c6790f3a0ae"
},
{
"filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
"blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
}
]
}
]
1 change: 1 addition & 0 deletions server/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ build-backend = "setuptools.build_meta"
"kernels-community/paged-attention" = ">=0.0.2"
"kernels-community/moe" = ">=0.1.1"
"kernels-community/quantization" = ">=0.0.3"
"kernels-community/quantization-eetq" = ">=0.0.1"

[project.scripts]
text-generation-server = "text_generation_server.cli:app"
Expand Down
10 changes: 7 additions & 3 deletions server/text_generation_server/layers/eetq.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from dataclasses import dataclass

import torch
from EETQ import quant_weights, w8_a16_gemm
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.weights import UnquantizedWeight

quantization_eetq = load_kernel(
module="quantization_eetq", repo_id="kernels-community/quantization-eetq"
)


@dataclass
class EETQWeight(UnquantizedWeight):
Expand Down Expand Up @@ -31,13 +35,13 @@ def __init__(
if weight.dtype != torch.float16:
weight = weight.to(dtype=torch.float16)
weight = torch.t(weight).contiguous().cpu()
weight, scale = quant_weights(weight, torch.int8, False)
weight, scale = quantization_eetq.quant_weights(weight, torch.int8, False)

self.weight = weight.cuda(device)
self.scale = scale.cuda(device)
self.bias = bias.cuda(device) if bias is not None else None

def forward(self, input: torch.Tensor) -> torch.Tensor:
output = w8_a16_gemm(input, self.weight, self.scale)
output = quantization_eetq.w8_a16_gemm(input, self.weight, self.scale)
output = output + self.bias if self.bias is not None else output
return output

0 comments on commit 38a1987

Please sign in to comment.