Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/llmcompressor/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,11 @@ def format_calibration_data(
f"the provided dataset only has {safe_calibration_samples}. "
)

if safe_calibration_samples == 0:
raise ValueError(
"Dataset is empty. Cannot create a calibration dataloader with 0 samples."
)

if do_shuffle:
tokenized_dataset = tokenized_dataset.shuffle()
tokenized_calibration = tokenized_dataset.select(range(safe_calibration_samples))
Expand Down
36 changes: 34 additions & 2 deletions src/llmcompressor/entrypoints/oneshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import os
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

from loguru import logger
from torch.utils.data import DataLoader
Expand Down Expand Up @@ -253,8 +253,15 @@ def oneshot(
preprocessing_num_workers: Optional[int] = None,
min_tokens_per_module: Optional[float] = None,
calibrate_moe_context: bool = False,
pipeline: str = "independent",
tracing_ignore: Optional[List[str]] = None,
raw_kwargs: Optional[Dict[str, Any]] = None,
preprocessing_func: Optional[Callable] = None,
max_train_samples: Optional[int] = None,
remove_columns: Optional[List[str]] = None,
dvc_data_repository: Optional[str] = None,
quantization_aware_calibration: bool = True,
# Miscellaneous arguments
sequential_targets: Optional[List[str]] = None,
output_dir: Optional[str] = None,
log_dir: Optional[str] = None,
**kwargs,
Expand Down Expand Up @@ -324,6 +331,16 @@ def oneshot(
during forward pass in calibration. When False, quantization is disabled
during forward pass in calibration. Default is set to True.

:param pipeline: The pipeline configuration to use for calibration. Options include
'independent', 'sequential', or 'layer_sequential'.
:param tracing_ignore: List of module names to ignore during tracing.
:param raw_kwargs: Dictionary of raw keyword arguments passed to the function.
:param preprocessing_func: Optional callable for preprocessing the dataset.
:param max_train_samples: Maximum number of training samples to use.
:param remove_columns: List of column names to remove from the dataset.
:param dvc_data_repository: Path to the DVC data repository, if applicable.
:param sequential_targets: List of sequential targets for calibration.

# Miscellaneous arguments
:param output_dir: Path to save the output model after calibration.
Nothing is saved if None.
Expand All @@ -333,10 +350,25 @@ def oneshot(
:return: The calibrated PreTrainedModel
"""

if sequential_targets and pipeline == "independent":
raise ValueError(
"Invalid configuration: "
"'sequential_targets' cannot be used with 'independent' pipeline. "
"Please use 'sequential' or 'layer_sequential' pipeline when specifying "
"sequential_targets."
)

# pass all args directly into Oneshot
if raw_kwargs is None:
raw_kwargs = {}

local_args = {
k: v for k, v in locals().items() if k not in ("local_args", "kwargs")
}
local_args = {
k: v for k, v in locals().items() if k not in ("local_args", "kwargs")
}

one_shot = Oneshot(**local_args, **kwargs)
one_shot()

Expand Down
44 changes: 38 additions & 6 deletions tests/llmcompressor/transformers/oneshot/test_api_inputs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from llmcompressor import oneshot
from tests.llmcompressor.transformers.oneshot.dataset_processing import get_data_utils
Expand Down Expand Up @@ -42,15 +47,42 @@ def wrapped_preprocess_func(sample):
dataset_config_name=config.get("dataset_config_name"),
)

args["pipeline"] = config.get("pipeline", "independent")
args["sequential_targets"] = config.get("sequential_targets", None)
args["tracing_ignore"] = config.get("tracing_ignore", [])
args["raw_kwargs"] = config.get("raw_kwargs", {})
args["preprocessing_func"] = config.get("preprocessing_func", lambda x: x)
args["max_train_samples"] = config.get("max_train_samples", 50)
args["remove_columns"] = config.get("remove_columns", None)
args["dvc_data_repository"] = config.get("dvc_data_repository", None)
args["splits"] = config.get("splits", {"calibration": "train[:50]"})
args["log_dir"] = config.get("log_dir", "sparse_logs")

return args


@pytest.mark.smoke
@pytest.mark.integration
def test_one_shot_inputs(one_shot_args, tmp_path):
oneshot(
**one_shot_args,
output_dir=tmp_path,
num_calibration_samples=10,
pad_to_max_length=False,
)
logger.info(f"Dataset type: {type(one_shot_args.get('dataset'))}")
if isinstance(one_shot_args.get("dataset"), str):
logger.info(f"Dataset name: {one_shot_args.get('dataset')}")
logger.info(f"Dataset config: {one_shot_args.get('dataset_config_name')}")
try:
# Call oneshot with all parameters as flat arguments
oneshot(
**one_shot_args,
output_dir=tmp_path,
num_calibration_samples=10,
pad_to_max_length=False,
)

except ValueError as e:
if "num_samples should be a positive integer value" in str(
e
) or "Dataset is empty. Cannot create a calibration dataloader" in str(e):
logger.warning(f"Dataset is empty: {one_shot_args.get('dataset')}")
pytest.skip(f"Dataset is empty: {one_shot_args.get('dataset')}")
else:
raise # Re-raise other ValueError exceptions