[Test Run] Merge of tests using AutoModelForCausalLM - transformers latest release #1011

Closed
wants to merge 55 commits into from
92fbddc
add quantization then finetune -- run_compressed=False
horheynm Dec 9, 2024
299eed3
add test
horheynm Dec 9, 2024
aebae9a
Merge branch 'main' into quant-then-finetune
horheynm Dec 9, 2024
9ea94ed
clean up
horheynm Dec 9, 2024
b7a968e
update test_run_compressed
horheynm Dec 11, 2024
7067ad0
better var name
horheynm Dec 11, 2024
c47ca6a
fix conseq onehsot
horheynm Dec 11, 2024
4492e53
fix logic in is_model_quat_from_path
horheynm Dec 11, 2024
126d3d5
add decompress tests
horheynm Dec 11, 2024
e17c190
Merge branch 'main' into run_compressed-tests
horheynm Dec 11, 2024
5da01c4
fix test - use automodelforcausallm decompress
horheynm Dec 11, 2024
a264fc0
Merge branch 'main' into quant-then-finetune
horheynm Dec 11, 2024
569ef80
Merge remote-tracking branch 'origin/run_compressed-tests' into agg-test
horheynm Dec 16, 2024
32f8503
Merge remote-tracking branch 'origin/fix-test-conseq-oneshot' into ag…
horheynm Dec 16, 2024
5395623
Merge remote-tracking branch 'origin/fix-test_compress-tensors-utils'…
horheynm Dec 16, 2024
ac03b3b
agg test
horheynm Dec 16, 2024
e01f314
fix names
horheynm Dec 16, 2024
ecc6b86
Merge branch 'run_compressed-tests' into agg-test
horheynm Dec 16, 2024
2c73ff9
Merge branch 'main' into agg-test
horheynm Dec 16, 2024
2cd4809
Merge branch 'main' into run_compressed-tests
horheynm Dec 16, 2024
f36cbac
fix typo
horheynm Dec 17, 2024
12d3706
Merge branch 'run_compressed-tests' into agg-test
horheynm Dec 17, 2024
a851cf3
Merge branch 'agg-test' of github.com:vllm-project/llm-compressor int…
horheynm Dec 17, 2024
b4c8828
add compressedlinear vs linear generation
horheynm Dec 17, 2024
cfcd7e3
add run compressed forward
horheynm Dec 17, 2024
fd0745f
revert folder struct for config
horheynm Dec 17, 2024
c0a552a
Merge branch 'main' into fix-test_compress-tensors-utils
horheynm Dec 23, 2024
5ba651a
Merge branch 'main' into fix-test-conseq-oneshot
horheynm Dec 23, 2024
98d8ecf
Merge branch 'main' into run_compressed-tests
horheynm Dec 23, 2024
ee4c70d
comments
horheynm Dec 23, 2024
0d32d23
Merge branch 'main' into quant-then-finetune
horheynm Dec 23, 2024
74280d0
Merge branch 'fix-test-conseq-oneshot' into agg-test
horheynm Dec 23, 2024
d388da1
Merge remote-tracking branch 'origin/run_compressed-tests' into agg-test
horheynm Dec 23, 2024
bff7dcd
Merge remote-tracking branch 'origin/fix-test_compress-tensors-utils'…
horheynm Dec 23, 2024
794ea91
Merge remote-tracking branch 'origin/quant-then-finetune' into agg-test
horheynm Dec 23, 2024
db8dba9
revert to main gha flow
horheynm Dec 23, 2024
0e141d8
Merge branch 'main' into fix-test-conseq-oneshot
horheynm Dec 23, 2024
6383354
Merge branch 'main' into agg-test-main
horheynm Dec 23, 2024
72b5431
Merge branch 'main' into quant-then-finetune
horheynm Jan 9, 2025
58ce87f
Merge branch 'main' into agg-test-main
horheynm Jan 9, 2025
469ab99
transformers main
horheynm Jan 9, 2025
83eac37
change num calib samples to 16
horheynm Jan 9, 2025
2f1581b
Merge branch 'fix-test_compress-tensors-utils' of github.com:vllm-pro…
horheynm Jan 9, 2025
f1d4539
fix devices error
horheynm Jan 10, 2025
1f2a505
clear func name
horheynm Jan 10, 2025
8696be2
add clarity on loading ckpt and carrying out finetune on saved model
horheynm Jan 10, 2025
cd22e88
Merge branch 'quant-then-finetune' of github.com:vllm-project/llm-com…
horheynm Jan 10, 2025
8a986d4
Merge branch 'main' into fix-test-conseq-oneshot
horheynm Jan 10, 2025
9c66d05
Merge branch 'main' into fix-test_compress-tensors-utils
horheynm Jan 10, 2025
04ef903
Merge branch 'main' into run_compressed-tests
horheynm Jan 10, 2025
86bd2e2
Merge branch 'main' into run_compressed-tests
horheynm Jan 10, 2025
c2a6cea
Merge branch 'quant-then-finetune' into agg-test-main
horheynm Jan 10, 2025
3c593d5
Merge branch 'run_compressed-tests' into agg-test-main
horheynm Jan 10, 2025
af974c2
Merge branch 'fix-test-conseq-oneshot' into agg-test-main
horheynm Jan 10, 2025
bbaa544
Merge branch 'fix-test_compress-tensors-utils' into agg-test-main
horheynm Jan 10, 2025
10 changes: 10 additions & 0 deletions .github/workflows/test-check.yaml
@@ -80,6 +80,7 @@ jobs:
run: |
pytest -v tests/llmcompressor/pytorch


transformers-tests:
runs-on: gcp-k8s-vllm-l4-solo
steps:
@@ -100,6 +101,15 @@
pip3 install ./compressed-tensors/
- name: "Clean compressed-tensors directory"
run: rm -r compressed-tensors/
- uses: actions/checkout@v4
with:
repository: "huggingface/transformers"
path: "transformers"
- name: "⚙️ Install transformers from source"
run: |
pip3 uninstall -y transformers
pip3 install ./transformers
rm -rf transformers
- name: "🔬 Running transformers tests"
if: (success() || failure()) && steps.install.outcome == 'success'
run: |
19 changes: 11 additions & 8 deletions src/llmcompressor/pytorch/utils/sparsification.py
@@ -105,15 +105,18 @@ def params_quantized(self) -> int:
"""
:return: number of parameters across quantized layers
"""
return sum(
torch.numel(self.trainable_params[f"{name}.weight"])
+ (
torch.numel(self.trainable_params[f"{name}.bias"])
if hasattr(layer, "bias") and layer.bias is not None
else 0
num_params = 0
for name, layer in get_quantized_layers(self.module):
num_param = torch.numel(
self.trainable_params.get(f"{name}.weight", torch.tensor([]))
)
for (name, layer) in get_quantized_layers(self.module)
)
if num_param is None:
logger.warning(f"{name} is not recognized in trainable_params")
continue
if hasattr(layer, "bias") and layer.bias is not None:
num_params += layer.bias

return num_params

@property
def params_quantized_percent(self) -> float:
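The counting loop above can be sketched independently of torch: given a mapping from parameter names to shape tuples and the list of quantized layers, the total is the product of each present weight shape plus any bias. The helper name `count_quantized_params` and the shape-tuple convention here are illustrative, not part of the library.

```python
from math import prod


def count_quantized_params(trainable_params, quantized_layers):
    """Sum parameter counts across quantized layers.

    trainable_params: dict mapping "name.weight" / "name.bias" -> shape tuple
    quantized_layers: iterable of (name, has_bias) pairs
    """
    num_params = 0
    for name, has_bias in quantized_layers:
        weight_shape = trainable_params.get(f"{name}.weight")
        if weight_shape is None:
            # parameter not tracked: skip instead of raising KeyError
            continue
        num_params += prod(weight_shape)
        if has_bias:
            num_params += prod(trainable_params[f"{name}.bias"])
    return num_params
```

This mirrors the skip-on-missing behavior of the patched property: a layer whose weight is not present in `trainable_params` contributes nothing rather than raising a `KeyError`.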
13 changes: 12 additions & 1 deletion src/llmcompressor/transformers/finetune/text_generation.py
@@ -30,6 +30,7 @@
PreTrainedModel,
set_seed,
)
from transformers.utils.quantization_config import CompressedTensorsConfig

from llmcompressor.core import pre_initialize_structure, reset_session
from llmcompressor.pytorch.model_load.helpers import (
@@ -52,7 +53,10 @@
from llmcompressor.transformers.sparsification.sparse_model import (
get_shared_processor_src,
)
from llmcompressor.transformers.utils.helpers import detect_last_checkpoint
from llmcompressor.transformers.utils.helpers import (
detect_last_checkpoint,
is_model_quantized_from_path,
)
from llmcompressor.typing import Processor
from llmcompressor.utils.fsdp.helpers import is_fsdp_model

Expand Down Expand Up @@ -224,6 +228,13 @@ def initialize_model_from_path(
"trust_remote_code": model_args.trust_remote_code_model,
}
# this calls from_pretrained under the hood so should be FSDP safe

# quantized models must be decompressed to carry out oneshot/train/etc.
if is_model_quantized_from_path(model_path):
model_kwargs["quantization_config"] = CompressedTensorsConfig(
run_compressed=False
)

model = AutoModelForCausalLM.from_pretrained(
model_path,
**model_kwargs,
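The load-time branch added above reduces to: inspect the checkpoint config, and if it declares a `quantization_config`, request decompressed weights. A minimal sketch of that decision, using a plain dict as a stand-in for `CompressedTensorsConfig(run_compressed=False)` and a hypothetical `build_model_kwargs` helper:

```python
from types import SimpleNamespace


def build_model_kwargs(config, base_kwargs):
    # Mirrors the check in initialize_model_from_path: a checkpoint whose
    # config carries quantization_config is loaded with decompression
    # requested, so oneshot/train operate on dense weights.
    kwargs = dict(base_kwargs)
    if config is not None and hasattr(config, "quantization_config"):
        # stand-in for CompressedTensorsConfig(run_compressed=False)
        kwargs["quantization_config"] = {"run_compressed": False}
    return kwargs


# a quantized checkpoint config vs. a dense one
quantized = SimpleNamespace(quantization_config={"format": "int-quantized"})
dense = SimpleNamespace()
```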
227 changes: 226 additions & 1 deletion src/llmcompressor/transformers/utils/helpers.py
@@ -4,9 +4,13 @@
"""

import os
from typing import TYPE_CHECKING, Optional
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Tuple, Union

import requests
from huggingface_hub import HUGGINGFACE_CO_URL_HOME, hf_hub_download
from loguru import logger
from transformers import AutoConfig
from transformers.trainer_utils import get_last_checkpoint

if TYPE_CHECKING:
@@ -15,6 +19,7 @@
__all__ = [
"RECIPE_FILE_NAME",
"detect_last_checkpoint",
"is_model_quantized_from_path",
]

RECIPE_FILE_NAME = "recipe.yaml"
@@ -54,3 +59,223 @@ def detect_last_checkpoint(
)

return last_checkpoint


def is_model_quantized_from_path(path: str) -> bool:
    """
    Determine whether the model at the given path is quantized,
    based on the presence of a quantization_config in its config
    """
    config = AutoConfig.from_pretrained(path)
    return config is not None and hasattr(config, "quantization_config")


def resolve_recipe(
model_path: Union[str, Path],
recipe: Union[str, Path, None] = None,
) -> Union[str, None]:
"""
Resolve the recipe to apply to the model.
:param recipe: the recipe to apply to the model.
It can be one of the following:
- None
This means that no recipe is explicitly requested, allowing
the model to potentially infer an appropriate pre-existing
recipe from the model_path
- a path to a recipe file
This can be a string or Path object pointing
to a recipe file. If the specified recipe file
differs from the pre-existing recipe for that model
(stored in the model_path), the function will log a warning
- the name of a recipe file (e.g. "recipe.yaml")
The recipe file is assumed to be stored in the model_path
- a string containing the recipe
Needs to adhere to the SparseML recipe format

:param model_path: the path to the model to load.
It can be one of the following:
- a path to the model directory
- a path to the model file
- a Hugging Face model id

:return: the resolved recipe
"""

if recipe is None:
return infer_recipe_from_model_path(model_path)

elif os.path.isfile(recipe):
# recipe is a path to a recipe file
return resolve_recipe_file(recipe, model_path)

elif os.path.isfile(os.path.join(model_path, recipe)):
# recipe is a name of a recipe file
recipe = os.path.join(model_path, recipe)
return resolve_recipe_file(recipe, model_path)

elif isinstance(recipe, str):
# recipe is a string containing the recipe
logger.debug(
"Applying the recipe string directly to the model, without "
"checking for a potential existing recipe in the model_path."
)
return recipe

logger.info(
"No recipe requested and no default recipe "
f"found in {model_path}. Skipping recipe resolution."
)
return None
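The resolution order above can be condensed to: explicit recipe file, then recipe file name under model_path, then inline recipe string, then nothing. A simplified sketch covering the local-path cases (the function name `resolve_recipe_sketch` is illustrative; it omits the Hugging Face Hub fallback):

```python
import os

RECIPE_FILE_NAME = "recipe.yaml"


def resolve_recipe_sketch(model_path, recipe=None):
    if recipe is None:
        # no recipe requested: fall back to a recipe shipped with the model
        candidate = os.path.join(model_path, RECIPE_FILE_NAME)
        return candidate if os.path.isfile(candidate) else None
    if os.path.isfile(recipe):
        return recipe  # explicit path to a recipe file
    joined = os.path.join(model_path, str(recipe))
    if os.path.isfile(joined):
        return joined  # file name relative to the model directory
    if isinstance(recipe, str):
        return recipe  # treat as an inline recipe string
    return None
```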


def infer_recipe_from_model_path(model_path: Union[str, Path]) -> Optional[str]:
"""
Infer the recipe from the model_path.
:param model_path: the path to the model to load.
It can be one of the following:
- a path to the model directory
- a path to the model file
- a Hugging Face model id
:return: the path to the recipe file if found, None otherwise
"""
model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path

if os.path.isdir(model_path) or os.path.isfile(model_path):
# model_path is a local path to the model directory or model file
# attempting to find the recipe in the model_directory
model_path = (
os.path.dirname(model_path) if os.path.isfile(model_path) else model_path
)
recipe = os.path.join(model_path, RECIPE_FILE_NAME)
if os.path.isfile(recipe):
logger.info(f"Found recipe in the model_path: {recipe}")
return recipe
logger.debug(f"No recipe found in the model_path: {model_path}")
return None

recipe = recipe_from_huggingface_model_id(model_path)[0]

if recipe is None:
logger.info("Failed to infer the recipe from the model_path")
return recipe


def recipe_from_huggingface_model_id(
model_path: str, RECIPE_FILE_NAME: str = RECIPE_FILE_NAME
) -> Tuple[Optional[str], bool]:
"""
Attempts to download the recipe from the huggingface model id.

:param model_path: Assumed to be the huggingface model id.
If it is not, this function will return None.
:param RECIPE_FILE_NAME: The name of the recipe file to download.
Defaults to RECIPE_FILE_NAME.
:return: tuple:
- the path to the recipe file if found, None otherwise
- True if model_path is a valid huggingface model id, False otherwise
"""
model_id = os.path.join(HUGGINGFACE_CO_URL_HOME, model_path)
request = requests.get(model_id)
if request.status_code != 200:
logger.debug(
"model_path is not a valid huggingface model id. "
"Skipping recipe resolution."
)
return None, False

logger.info(
"model_path is a huggingface model id. "
"Attempting to download recipe from "
f"{HUGGINGFACE_CO_URL_HOME}"
)
try:
recipe = hf_hub_download(repo_id=model_path, filename=RECIPE_FILE_NAME)
logger.info(f"Found recipe: {RECIPE_FILE_NAME} for model id: {model_path}.")
except Exception as e:
logger.info(
f"Unable to to find recipe {RECIPE_FILE_NAME} "
f"for model id: {model_path}: {e}. "
"Skipping recipe resolution."
)
recipe = None
return recipe, True


def resolve_recipe_file(
requested_recipe: Union[str, Path], model_path: Union[str, Path]
) -> Union[str, Path, None]:
"""
Given the requested recipe and the model_path, return the path to the recipe file.

:param requested_recipe: the full path to the recipe file
:param model_path: the path to the model to load.
It can be one of the following:
- a path to the model directory
- a path to the model file
- Hugging face model id
:return: the path to the recipe file if found, None otherwise
"""
# preprocess arguments so that they are all strings
requested_recipe = (
requested_recipe.as_posix()
if isinstance(requested_recipe, Path)
else requested_recipe
)
model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path
model_path = (
os.path.dirname(model_path) if os.path.isfile(model_path) else model_path
)

if not os.path.isdir(model_path):
default_recipe, model_exists = recipe_from_huggingface_model_id(model_path)
if not model_exists:
raise ValueError(f"Unrecognized model_path: {model_path}")

if not default_recipe == requested_recipe and default_recipe is not None:
logger.warning(
f"Attempting to apply recipe: {requested_recipe} "
f"to the model at: {model_path}, "
f"but the model already has a recipe: {default_recipe}. "
f"Using {requested_recipe} instead."
)
return requested_recipe

# pathway for model_path that is a directory
default_recipe = os.path.join(model_path, RECIPE_FILE_NAME)
default_recipe_exists = os.path.isfile(default_recipe)
default_and_request_recipes_identical = default_recipe_exists and os.path.samefile(
default_recipe, requested_recipe
)

if (
default_recipe_exists
and requested_recipe
and not default_and_request_recipes_identical
):
logger.warning(
f"Attempting to apply recipe: {requested_recipe} "
f"to the model located in {model_path}, "
f"but the model already has a recipe stored as {default_recipe}. "
f"Using {requested_recipe} instead."
)

elif not default_recipe_exists and requested_recipe:
logger.warning(
f"Attempting to apply {requested_recipe} "
f"to the model located in {model_path}."
"However, it is expected that the model "
f"has its target recipe stored as {default_recipe}."
"Applying any recipe before the target recipe may "
"result in unexpected behavior."
f"Applying {requested_recipe} nevertheless."
)

elif default_recipe_exists:
logger.info(f"Using the default recipe: {requested_recipe}")

return requested_recipe
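One subtlety in the directory pathway: `os.path.samefile` raises `FileNotFoundError` when either path is missing, so the identity comparison is only safe after confirming both files exist. A guarded version, with a hypothetical helper name:

```python
import os


def recipes_identical(default_recipe: str, requested_recipe: str) -> bool:
    # os.path.samefile raises FileNotFoundError when either path is
    # missing, so guard on existence before comparing the two paths
    return (
        os.path.isfile(default_recipe)
        and os.path.isfile(requested_recipe)
        and os.path.samefile(default_recipe, requested_recipe)
    )
```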
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w4a16-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a16-dense"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a8-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed