Support KIMI K2 Thinking int4 checkpoint PTQ #669
Changes from all commits
998856b
3aebac7
95ee275
a09d86f
09c12af
295bbb7
e8b7fc6
bfb9a14
55c7224
be81219
dc33337
aa7f8be
e6ffcce
4200d3c
```diff
@@ -1,3 +1,4 @@
+compressed-tensors==0.12.0
 fire
 flash-attn>=2.6.0
 rouge_score>=0.1.2
```
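The pin matters because the new plugin imports `compressed_tensors` at runtime. A trivial sketch (not part of the PR) for confirming the installed version matches the requirement:

```python
# Sanity check that the pinned compressed-tensors version is installed
# (the 0.12.0 pin comes from the requirements change above).
import importlib.metadata

print(importlib.metadata.version("compressed-tensors"))  # expected: 0.12.0
```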
```diff
@@ -22,6 +22,8 @@
 from typing import TYPE_CHECKING

 import torch
+from torch import Tensor
+from torch.nn.functional import linear

 try:
     from torch.distributed.tensor import Shard
```
```diff
@@ -665,6 +667,32 @@ def top_k(self, value):
         self.router.moe_top_k = value


+class _QuantCompressedLinear(QuantModule):
+    def _setup(self):
+        self.input_quantizer = TensorQuantizer()
+        self.weight_quantizer = TensorQuantizer()
+
+    def forward(self, input: Tensor) -> Tensor:
+        from compressed_tensors.quantization import QuantizationStatus
+
+        if self.quantization_status == QuantizationStatus.COMPRESSED:
+            weight_data = self.compressor.decompress_module(self)
+        else:
+            weight_data = self.weight
+
+        return linear(self.input_quantizer(input), self.weight_quantizer(weight_data), self.bias)
+
+    def unpack_weight(self):
+        from compressed_tensors.quantization import QuantizationStatus
+
+        if self.quantization_status == QuantizationStatus.COMPRESSED:
+            self.weight = nn.Parameter(self.compressor.decompress_module(self), requires_grad=False)
+            if hasattr(self, "weight_packed"):
+                del self.weight_packed
+            if hasattr(self, "weight_scale"):
+                del self.weight_scale
+
+
 try:
     from transformers.models.llama4.modeling_llama4 import Llama4TextExperts, Llama4TextMoe
```
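For readers skimming the diff: `forward` fake-quantizes the decompressed weight on the fly, while `unpack_weight` materializes it as a plain parameter. A minimal sketch of how the latter might be applied across a model before export; the helper name and traversal are illustrative, not part of this PR:

```python
import torch.nn as nn


def unpack_compressed_weights(model: nn.Module) -> None:
    """Hypothetical helper: materialize decompressed weights on every module
    that exposes the unpack_weight() method added in this PR."""
    for module in model.modules():
        if hasattr(module, "unpack_weight") and callable(module.unpack_weight):
            module.unpack_weight()
```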
```diff
@@ -740,6 +768,16 @@ def top_k(self, value):
 except ImportError:
     pass

+try:
+    from compressed_tensors.linear.compressed_linear import CompressedLinear
+
+    if CompressedLinear not in QuantModuleRegistry:
+        QuantModuleRegistry.register({CompressedLinear: "hf.CompressedLinear"})(
+            _QuantCompressedLinear
+        )
+except ImportError:
+    pass
+
 try:
     from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
         Qwen3VLMoeTextExperts,
```

Review thread on the `from compressed_tensors.linear.compressed_linear import CompressedLinear` line:

**Contributor:** Should we add compressed-tensors as an optional dependency?

**Author:** @kevalmorabia97 @realAsma what do you think?

**Collaborator:** If a user is quantizing a model with CompressedLinear, wouldn't they already have compressed-tensors installed?

**Contributor:** Can we move this to a separate file?

**Contributor:** This is a good point. +1

**Author:** Not right now.

**Author:** How strong do you feel about it? Right now I feel this still falls under the hf plugins, as it's part of the HF invocation.
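As context for the registry change above: once `CompressedLinear` maps to `_QuantCompressedLinear`, a checkpoint stored in the compressed-tensors format can be loaded through `transformers` and passed to the usual ModelOpt PTQ entry point. A rough sketch under that assumption; the checkpoint path, calibration prompts, and the choice of `NVFP4_DEFAULT_CFG` are placeholders, not taken from this PR:

```python
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "path/to/compressed-int4-checkpoint"  # placeholder path, not from this PR
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)


def forward_loop(m):
    # Tiny placeholder calibration loop; a real run would iterate over a calibration set.
    for prompt in ["What is post-training quantization?"]:
        inputs = tokenizer(prompt, return_tensors="pt").to(m.device)
        m(**inputs)


# CompressedLinear layers are swapped for _QuantCompressedLinear via the registry,
# so calibration observes the decompressed weights.
model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop=forward_loop)
```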