quic · quic-morteza · Mar 25, 2025 · Apr 7, 2025 · Apr 8, 2025 · ochougul
@@ -107,7 +107,7 @@ class FP8DeQuantLinearToLinearTransform(ModuleMutatorTransform):
     @classmethod
     def mutate(cls, original_module, parent_module):
         #  -- de-quantizing the weights --
-        dequant_weights = original_module.weight.to(torch.float32) * original_module.weight_scale
+        dequant_weights = original_module.weight.to(torch.float32)  # * original_module.weight_scale
         dequant_linear_layer = nn.Linear(
             original_module.in_features, original_module.out_features, bias=original_module.bias is not None
         )

@@ -121,7 +121,7 @@ def for_fp8_layer(cls, in_features, out_features, activation_quantization_strate
     def forward(self, x):
         # Only inference supported
         with torch.no_grad():
-            dequantized_weights = self.weight.to(torch.float32) * self.weight_scale
+            dequantized_weights = self.weight.to(torch.float32)  # * self.weight_scale
             out = torch.matmul(x.float(), dequantized_weights.T)
             out = out + self.bias if self.bias is not None else out
 

@@ -53,6 +53,10 @@ def duplicate_weights_for_linear_layer(
         layer.weight.data = torch.repeat_interleave(
             layer.weight.data.view(orig_kv_heads, head_dim, hidden_size), repeat, 0
         ).view(new_kv_heads * head_dim, hidden_size)
+        if layer.bias is not None:
+            layer.bias.data = torch.repeat_interleave(layer.bias.data.view(orig_kv_heads, head_dim), repeat, 0).view(
+                new_kv_heads * head_dim
+            )
 
 
 def main(args):