Commit e2de15f

Add 8-bit quantization support and release 7B model (lm-sys#252)
1 parent d245e37 commit e2de15f

4 files changed: 140 additions, 12 deletions

README.md (15 additions, 10 deletions)

@@ -52,7 +52,7 @@ We release [Vicuna](https://vicuna.lmsys.org/) weights as delta weights to compl
 You can add our delta to the original LLaMA weights to obtain the Vicuna weights. Instructions:
 
 1. Get the original LLaMA weights in the huggingface format by following the instructions [here](https://huggingface.co/docs/transformers/main/model_doc/llama).
-2. Use the following scripts to get Vicuna weights by applying our delta. It will automatically download delta weights from our Hugging Face account.
+2. Use the following scripts to get Vicuna weights by applying our delta. They will automatically download delta weights from our Hugging Face account.
 
 **NOTE**:
 Our released weights are only compatible with the latest main branch of huggingface/transformers.
@@ -68,12 +68,18 @@ python3 -m fastchat.model.apply_delta \
 ```
 
 ### Vicuna-7B
-Coming soon.
+This conversion command needs around 30 GB of CPU RAM.
+```bash
+python3 -m fastchat.model.apply_delta \
+    --base /path/to/llama-7b \
+    --target /output/path/to/vicuna-7b \
+    --delta lmsys/vicuna-7b-delta-v0
+```
 
 ## Inference with Command Line Interface
 
 ### Single GPU
-The command below requires around 28GB of GPU memory for Vicuna-13B.
+The command below requires around 28GB of GPU memory for Vicuna-13B and 14GB of GPU memory for Vicuna-7B.
 ```
 python3 -m fastchat.serve.cli --model-name /path/to/vicuna/weights
 ```
@@ -85,22 +91,21 @@ python3 -m fastchat.serve.cli --model-name /path/to/vicuna/weights --num-gpus 2
 ```
 
 ### CPU Only
-This runs on the CPU only and does not require GPU. It requires around 60GB of CPU memory for Vicuna-13B.
+This runs on the CPU only and does not require GPU. It requires around 60GB of CPU memory for Vicuna-13B and around 30GB of CPU memory for Vicuna-7B.
 ```
 python3 -m fastchat.serve.cli --model-name /path/to/vicuna/weights --device cpu
 ```
 
 ### Metal Backend (Mac computers with Apple silicon or AMD GPUs)
+Use `--device mps` to enable GPU acceleration on Mac computers and use `--load-8bit` to turn on 8-bit compression.
 ```
-python3 -m fastchat.serve.cli --model-name /path/to/vicuna/weights --device mps
+python3 -m fastchat.serve.cli --model-name /path/to/vicuna/weights --device mps --load-8bit
 ```
 
 ### Others (Quantization, Low-end Devices, and More Platforms)
-
-You can load in 8-bit mode to reduce GPU memory usage with slightly degraded model quality.
-It is tested on a single 4090 and requires around 18GB of GPU memory for Vicuna-13B.
-Note that this mode only works on a single GPU.
-You are also required to install `bitsandbytes` according to the printed messages.
+If you do not have enough memory, you can enable 8-bit compression by adding `--load-8bit` to commands above.
+It works with CPU, GPU, and Metal.
+This can reduce the memory usage by around half with slightly degraded model quality.
 
 ```
 python3 -m fastchat.serve.cli --model-name /path/to/vicuna/weights --load-8bit
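As a rough sanity check on the memory figures quoted above, here is an editorial back-of-envelope sketch (not part of the commit; `weight_gb` is a made-up helper): fp16 stores 2 bytes per parameter and 8-bit compression roughly 1, so the weights alone account for most of the quoted usage, with activations and the KV cache adding the rest.

```python
# Back-of-envelope estimate of weight memory for the numbers in the README.
# Weights only; activations and the KV cache add a few extra GB on top.
def weight_gb(params_billion: float, bytes_per_param: float) -> float:
    return params_billion * 1e9 * bytes_per_param / 1024**3

for name, params in [("Vicuna-13B", 13), ("Vicuna-7B", 7)]:
    print(f"{name}: fp16 ~{weight_gb(params, 2):.0f} GB, "
          f"8-bit ~{weight_gb(params, 1):.0f} GB")
# Vicuna-13B: fp16 ~24 GB, 8-bit ~12 GB
# Vicuna-7B: fp16 ~13 GB, 8-bit ~7 GB
```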

fastchat/serve/cli.py (8 additions, 1 deletion)

@@ -9,6 +9,7 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
 
 from fastchat.conversation import conv_templates, SeparatorStyle
+from fastchat.serve.compression import compress_module
 from fastchat.serve.monkey_patch_non_inplace import replace_llama_attn_with_non_inplace_operations
 
 
@@ -32,8 +33,8 @@ def load_model(model_name, device, num_gpus, load_8bit=False):
                     "max_memory": {i: "13GiB" for i in range(num_gpus)},
                 })
     elif device == "mps":
-        # Avoid bugs in mps backend by not using in-place operations.
         kwargs = {"torch_dtype": torch.float16}
+        # Avoid bugs in mps backend by not using in-place operations.
         replace_llama_attn_with_non_inplace_operations()
     else:
         raise ValueError(f"Invalid device: {device}")
@@ -48,6 +49,12 @@ def load_model(model_name, device, num_gpus, load_8bit=False):
     elif device == "mps":
         model.to("mps")
 
+    if (device == "mps" or device == "cpu") and load_8bit:
+        compress_module(model)
+
+    if args.debug:
+        print(model)
+
     return model, tokenizer
 
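For context, here is a small standalone sketch of what the new branch in `load_model` does when `--load-8bit` is combined with `--device cpu` or `--device mps`. It is illustrative only and not part of the commit; `TinyMLP` is a made-up module, while `compress_module` and `CLinear` come from the new `fastchat/serve/compression.py` shown below. `compress_module` walks the module tree and swaps `nn.Linear` layers for the quantized `CLinear` wrapper, which stores 8-bit group-quantized weights and dequantizes them on the fly in `forward`.

```python
# Illustrative sketch, not part of the commit: apply compress_module to a toy
# module the same way load_model() now does for cpu/mps with load_8bit=True.
import torch
import torch.nn as nn

from fastchat.serve.compression import compress_module


class TinyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(512, 512)
        self.fc2 = nn.Linear(512, 256)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))


model = TinyMLP()
compress_module(model)                    # fc1 and fc2 are replaced in place by CLinear
print(type(model.fc1).__name__)           # CLinear
print(model(torch.randn(1, 512)).shape)   # torch.Size([1, 256]); weights are dequantized per call
```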

fastchat/serve/compression.py (new file, 116 additions)

```python
import dataclasses

import torch
from torch import Tensor
import torch.nn as nn
from torch.nn import functional as F


@dataclasses.dataclass
class CompressionConfig:
    """Group-wise quantization."""
    num_bits: int
    group_size: int
    group_dim: int
    symmetric: bool
    enabled: bool = True


default_compression_config = CompressionConfig(
    num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True)


class CLinear(nn.Module):
    def __init__(self, weight, bias):
        super().__init__()

        self.weight = compress(weight.data, default_compression_config)
        self.bias = bias

    def forward(self, input: Tensor) -> Tensor:
        weight = decompress(self.weight, default_compression_config)
        return F.linear(input, weight, self.bias)


def compress_module(module):
    for attr_str in dir(module):
        target_attr = getattr(module, attr_str)
        if type(target_attr) == torch.nn.Linear:
            setattr(module, attr_str, CLinear(target_attr.weight, target_attr.bias))
    for name, child in module.named_children():
        compress_module(child)


def compress(tensor, config):
    """Simulate group-wise quantization."""
    if not config.enabled:
        return tensor

    group_size, num_bits, group_dim, symmetric = (
        config.group_size, config.num_bits, config.group_dim, config.symmetric)
    assert num_bits <= 8

    original_shape = tensor.shape
    num_groups = (original_shape[group_dim] + group_size - 1) // group_size
    new_shape = (original_shape[:group_dim] + (num_groups, group_size) +
                 original_shape[group_dim+1:])

    # Pad
    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
    if pad_len != 0:
        pad_shape = original_shape[:group_dim] + (pad_len,) + original_shape[group_dim+1:]
        tensor = torch.cat([
            tensor,
            torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)],
            dim=group_dim)
    data = tensor.view(new_shape)

    # Quantize
    if symmetric:
        B = 2 ** (num_bits - 1) - 1
        scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
        data = data * scale
        data = data.clamp_(-B, B).round_().to(torch.int8)
        return data, scale, original_shape
    else:
        B = 2 ** num_bits - 1
        mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
        mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]

        scale = B / (mx - mn)
        data = data - mn
        data.mul_(scale)

        data = data.clamp_(0, B).round_().to(torch.uint8)
        return data, mn, scale, original_shape


def decompress(packed_data, config):
    """Simulate group-wise dequantization."""
    if not config.enabled:
        return packed_data

    group_size, num_bits, group_dim, symmetric = (
        config.group_size, config.num_bits, config.group_dim, config.symmetric)

    # Dequantize
    if symmetric:
        data, scale, original_shape = packed_data
        data = data / scale
    else:
        data, mn, scale, original_shape = packed_data
        data = data / scale
        data.add_(mn)

    # Unpad
    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
    if pad_len:
        padded_original_shape = (
            original_shape[:group_dim] +
            (original_shape[group_dim] + pad_len,) +
            original_shape[group_dim+1:])
        data = data.reshape(padded_original_shape)
        indices = [slice(0, x) for x in original_shape]
        return data[indices].contiguous()
    else:
        return data.view(original_shape)
```
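A quick roundtrip of `compress` and `decompress` with the default config (an illustrative check, not part of the commit): 8 bits, symmetric, groups of 256 along dim 1, so a 1024 x 4096 matrix is split into 16 groups per row and reconstructed to within about half a quantization step of each group.

```python
# Illustrative roundtrip, not part of the commit: quantize a random matrix with
# the default config and measure the reconstruction error.
import torch

from fastchat.serve.compression import (
    compress, decompress, default_compression_config)

w = torch.randn(1024, 4096)                       # dim 1 splits into 16 groups of 256
packed = compress(w, default_compression_config)  # (int8 data, per-group scale, original shape)
w_hat = decompress(packed, default_compression_config)

print(packed[0].dtype, packed[0].shape)           # torch.int8 torch.Size([1024, 16, 256])
print((w - w_hat).abs().max())                    # about (largest magnitude in a group) / 254
```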

pyproject.toml (1 addition, 1 deletion)

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "fschat"
-version = "0.1.6"
+version = "0.1.7"
 description = "An open platform for training, serving, and evaluating large language model based chatbots."
 readme = "README.md"
 requires-python = ">=3.8"
