NVIDIA-NeMo · andreatgretel · Feb 18, 2026 · Feb 3, 2026 · Feb 3, 2026 · Feb 4, 2026
@@ -93,6 +93,58 @@ This gives you direct access to all `ModelFacade` capabilities: custom parsers,
 | `generator_function` | Callable | Yes | Decorated function |
 | `generation_strategy` | GenerationStrategy | No | `CELL_BY_CELL` or `FULL_COLUMN` |
 | `generator_params` | BaseModel | No | Typed params passed to function |
+| `allow_resize` | bool | No | Allow the output row count to differ from the input (1:N expansion or N:1 retraction); defaults to `False` |
+
+### Resizing (1:N and N:1)
+
+**FULL_COLUMN:** Set `allow_resize=True` and return a DataFrame with more or fewer rows than the input:
+
+```python
+@dd.custom_column_generator(
+    required_columns=["topic"],
+    side_effect_columns=["variation_id"],
+)
+def expand_topics(df: pd.DataFrame, generator_params: None, models: dict) -> pd.DataFrame:
+    rows = []
+    for _, row in df.iterrows():
+        for i in range(3):  # Generate 3 variations per input
+            rows.append({
+                "topic": row["topic"],
+                "question": f"Question {i+1} about {row['topic']}",
+                "variation_id": i,
+            })
+    return pd.DataFrame(rows)
+
+dd.CustomColumnConfig(
+    name="question",
+    generator_function=expand_topics,
+    generation_strategy=dd.GenerationStrategy.FULL_COLUMN,
+    allow_resize=True,
+)
+```
+
+**CELL_BY_CELL:** With `allow_resize=True`, your function may return a single row (`dict`) or multiple rows (`list[dict]`). Return `[]` to drop that input row.
+
+```python
+@dd.custom_column_generator(required_columns=["id"])
+def expand_row(row: dict) -> list[dict]:
+    return [
+        {**row, "variant": "a"},
+        {**row, "variant": "b"},
+    ]
+
+dd.CustomColumnConfig(
+    name="variant",
+    generator_function=expand_row,
+    generation_strategy=dd.GenerationStrategy.CELL_BY_CELL,
+    allow_resize=True,
+)
+```
+
+Use cases:
+
+- **Expansion (1:N)**: Generate multiple variations per input
+- **Retraction (N:1)**: Filter, aggregate, or deduplicate records (FULL_COLUMN), or drop individual rows by returning `[]` for them (CELL_BY_CELL)
 
 ## Multi-Turn Example
 

@@ -82,6 +82,17 @@ class IndexMultiplierColumnConfig(SingleColumnConfig):
 - `required_columns` lists any columns this generator depends on (empty if none)
 - `side_effect_columns` lists any additional columns this generator produces beyond the primary column (empty if none)
 
+**If your plugin can expand or retract the number of rows (1:N or N:1):** set `allow_resize=True` in the config class so the pipeline updates batch bookkeeping correctly. For example:
+
+```python
+class MyColumnConfig(SingleColumnConfig):
+    column_type: Literal["my-plugin"] = "my-plugin"
+    allow_resize: bool = True  # required when output row count can differ from input
+    # ...
+```
+
+The default is `False`; only set it to `True` when your `generate` method can return more or fewer rows than it receives.
+
 ### Step 3: Create the implementation class
 
 The implementation class defines the actual business logic of the plugin. For column generator plugins, inherit from `ColumnGeneratorFullColumn` or `ColumnGeneratorCellByCell` and implement the `generate` method.

@@ -0,0 +1,108 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Example: Chaining expand -> retract -> expand resize operations.
+
+Pipeline: 5 topics -> 15 questions (3 per topic) -> 10 medium/hard questions (drop easy)
+          -> 30 answer variants (3 per question)
+"""
+
+from __future__ import annotations
+
+import data_designer.config as dd
+from data_designer.interface import DataDesigner
+from data_designer.lazy_heavy_imports import pd
+
+
+# Step 1: Expand — 1:N, generate 3 questions per topic
+@dd.custom_column_generator(required_columns=["topic"], side_effect_columns=["question_id", "difficulty"])
+def expand_to_questions(df: pd.DataFrame) -> pd.DataFrame:
+    rows = []
+    for _, row in df.iterrows():
+        for i in range(3):
+            rows.append(
+                {
+                    "topic": row["topic"],
+                    "question": f"Q{i + 1} about {row['topic']}?",
+                    "question_id": i,
+                    "difficulty": ["easy", "medium", "hard"][i],
+                }
+            )
+    return pd.DataFrame(rows)
+
+
+# Step 2: Retract — N:1, keep only medium/hard questions
+@dd.custom_column_generator(required_columns=["difficulty"])
+def filter_non_easy(df: pd.DataFrame) -> pd.DataFrame:
+    return df[df["difficulty"] != "easy"].copy().assign(filtered=True)
+
+
+# Step 3: Expand again — 1:N, generate 3 answer variants per surviving question
+@dd.custom_column_generator(required_columns=["question"], side_effect_columns=["variant"])
+def expand_to_answers(df: pd.DataFrame) -> pd.DataFrame:
+    rows = []
+    for _, row in df.iterrows():
+        for v in range(3):
+            rows.append({**row.to_dict(), "answer": f"Answer v{v} to: {row['question']}", "variant": v})
+    return pd.DataFrame(rows)
+
+
+def main() -> None:
+    data_designer = DataDesigner()
+    config_builder = dd.DataDesignerConfigBuilder()
+
+    # Seed: 5 topics
+    config_builder.add_column(
+        dd.SamplerColumnConfig(
+            name="topic",
+            sampler_type=dd.SamplerType.CATEGORY,
+            params=dd.CategorySamplerParams(values=["Python", "ML", "Data", "Stats", "SQL"]),
+        )
+    )
+
+    # Expand: 5 topics -> 15 questions
+    config_builder.add_column(
+        dd.CustomColumnConfig(
+            name="question",
+            generator_function=expand_to_questions,
+            generation_strategy=dd.GenerationStrategy.FULL_COLUMN,
+            allow_resize=True,
+        )
+    )
+
+    # Retract: 15 -> 10 (drop "easy" questions)
+    config_builder.add_column(
+        dd.CustomColumnConfig(
+            name="filtered",
+            generator_function=filter_non_easy,
+            generation_strategy=dd.GenerationStrategy.FULL_COLUMN,
+            allow_resize=True,
+        )
+    )
+
+    # Expand again: 10 -> 30 answer variants
+    config_builder.add_column(
+        dd.CustomColumnConfig(
+            name="answer",
+            generator_function=expand_to_answers,
+            generation_strategy=dd.GenerationStrategy.FULL_COLUMN,
+            allow_resize=True,
+        )
+    )
+
+    # Preview (single batch)
+    preview = data_designer.preview(config_builder=config_builder, num_records=5)
+    print(f"Preview: 5 topics -> {len(preview.dataset)} answer variants")
+    print(preview.dataset[["topic", "difficulty", "question", "variant", "answer"]].to_string())
+    print()
+
+    # Build (multiple batches: 10 records with buffer_size=3 -> 4 batches)
+    data_designer.set_run_config(dd.RunConfig(buffer_size=3))
+    results = data_designer.create(config_builder=config_builder, num_records=10)
+    df = results.load_dataset()
+    print(f"Build: 10 topics (4 batches of 3+3+3+1) -> {len(df)} answer variants")
+    print(df[["topic", "difficulty", "question", "variant"]].to_string())
+
+
+if __name__ == "__main__":
+    main()
@@ -37,6 +37,7 @@ class SingleColumnConfig(ConfigBase, ABC):
 
     name: str
     drop: bool = False
+    allow_resize: bool = False
     column_type: str
 
     @staticmethod

@@ -517,19 +517,24 @@ def test_sampler_column_config_discriminated_union_wrong_params_type():
         )
 
 
-def test_default_column_emoji_for_custom_column_type() -> None:
-    """Ensure the base get_column_emoji implementation is used when not overridden."""
+class StubColumnConfig(SingleColumnConfig):
+    column_type: Literal["stub"] = "stub"
+
+    @property
+    def required_columns(self) -> list[str]:
+        return []
 
-    class StubColumnConfigWithoutEmoji(SingleColumnConfig):
-        column_type: Literal["stub-without-emoji"] = "stub-without-emoji"
-        value: str
+    @property
+    def side_effect_columns(self) -> list[str]:
+        return []
 
-        @property
-        def required_columns(self) -> list[str]:
-            return []
 
-        @property
-        def side_effect_columns(self) -> list[str]:
-            return []
+def test_default_column_emoji_for_custom_column_type() -> None:
+    """Ensure the base get_column_emoji implementation is used when not overridden."""
+    assert StubColumnConfig.get_column_emoji() == "🎨"
+
 
-    assert StubColumnConfigWithoutEmoji.get_column_emoji() == "🎨"
+def test_allow_resize_inherited_by_subclasses() -> None:
+    """Subclasses inherit allow_resize from SingleColumnConfig."""
+    assert StubColumnConfig(name="test").allow_resize is False
+    assert StubColumnConfig(name="test", allow_resize=True).allow_resize is True
@@ -43,8 +43,11 @@ def get_generation_strategy(self) -> GenerationStrategy:
         """Return strategy based on config."""
         return self.config.generation_strategy
 
-    def generate(self, data: dict | pd.DataFrame) -> dict | pd.DataFrame:
-        """Generate column value(s) for a row (dict) or batch (DataFrame)."""
+    def generate(self, data: dict | pd.DataFrame) -> dict | pd.DataFrame | list[dict]:
+        """Generate column value(s) for a row (dict) or batch (DataFrame).
+
+        For cell_by_cell with allow_resize=True, may return dict or list[dict] (0, 1, or N rows).
+        """
         is_full_column = self.config.generation_strategy == GenerationStrategy.FULL_COLUMN
         is_dataframe = not isinstance(data, dict)
 
@@ -62,7 +65,7 @@ def generate(self, data: dict | pd.DataFrame) -> dict | pd.DataFrame:
 
         return self._generate(data, is_dataframe)
 
-    def _generate(self, data: dict | pd.DataFrame, is_dataframe: bool) -> dict | pd.DataFrame:
+    def _generate(self, data: dict | pd.DataFrame, is_dataframe: bool) -> dict | pd.DataFrame | list[dict]:
         """Unified generation logic for both strategies."""
         # Get columns/keys using unified accessor
         get_keys = (lambda d: set(d.columns)) if is_dataframe else (lambda d: set(d.keys()))
@@ -93,7 +96,23 @@ def _generate(self, data: dict | pd.DataFrame, is_dataframe: bool) -> dict | pd.
                 f"Custom generator function failed for column '{self.config.name}': {e}"
             ) from e
 
-        # Validate return type
+        # Cell-by-cell with allow_resize: accept dict or list[dict]
+        if not is_dataframe and self.config.allow_resize:
+            if isinstance(result, dict):
+                return self._validate_output(result, keys_before, is_dataframe)
+            if isinstance(result, list):
+                if not all(isinstance(r, dict) for r in result):
+                    raise CustomColumnGenerationError(
+                        f"Custom generator for column '{self.config.name}' with allow_resize must return "
+                        "dict or list[dict]; list elements must be dicts."
+                    )
+                return [self._validate_cell_output(r, keys_before) for r in result]
+            raise CustomColumnGenerationError(
+                f"Custom generator for column '{self.config.name}' with allow_resize must return "
+                f"dict or list[dict], got {type(result).__name__}"
+            )
+
+        # Validate return type for non-resize paths
         if not isinstance(result, expected_type):
             raise CustomColumnGenerationError(
                 f"Custom generator for column '{self.config.name}' must return a {type_name}, "
@@ -102,6 +121,38 @@ def _generate(self, data: dict | pd.DataFrame, is_dataframe: bool) -> dict | pd.
 
         return self._validate_output(result, keys_before, is_dataframe)
 
+    def _validate_cell_output(self, row: dict, keys_before: set[str]) -> dict:
+        """Validate a single row output (dict) for cell_by_cell; strip undeclared columns."""
+        expected_new = {self.config.name} | set(self.config.side_effect_columns)
+        result_keys = set(row.keys())
+
+        if self.config.name not in result_keys:
+            raise CustomColumnGenerationError(
+                f"Custom generator for column '{self.config.name}' did not create the expected column. "
+                f"The generator_function must add a column named '{self.config.name}' to the row."
+            )
+        missing = set(self.config.side_effect_columns) - result_keys
+        if missing:
+            raise CustomColumnGenerationError(
+                f"Custom generator for column '{self.config.name}' did not create declared side_effect_columns: "
+                f"{sorted(missing)}. Declared side_effect_columns must be added to the row."
+            )
+        removed = keys_before - result_keys
+        if removed:
+            raise CustomColumnGenerationError(
+                f"Custom generator for column '{self.config.name}' removed pre-existing columns: "
+                f"{sorted(removed)}. The generator_function must not remove any existing columns."
+            )
+        undeclared = (result_keys - keys_before) - expected_new
+        if undeclared:
+            logger.warning(
+                f"⚠️ Custom generator for column '{self.config.name}' created undeclared columns: "
+                f"{sorted(undeclared)}. These columns will be removed. "
+                f"To keep additional columns, declare them in @custom_column_generator(side_effect_columns=[...])."
+            )
+            row = {k: v for k, v in row.items() if k not in undeclared}
+        return row
+
     def _validate_output(
         self, result: dict | pd.DataFrame, keys_before: set[str], is_dataframe: bool
     ) -> dict | pd.DataFrame:
@@ -147,8 +198,7 @@ def _validate_output(
             if is_dataframe:
                 result = result.drop(columns=list(undeclared))
             else:
-                for key in undeclared:
-                    del result[key]
+                result = {k: v for k, v in result.items() if k not in undeclared}
 
         return result
 
@@ -199,3 +249,5 @@ def log_pre_generation(self) -> None:
             logger.info(f"{LOG_INDENT}model_aliases: {self.config.model_aliases}")
         if self.config.generator_params:
             logger.info(f"{LOG_INDENT}generator_params: {self.config.generator_params}")
+        if self.config.allow_resize:
+            logger.info(f"{LOG_INDENT}allow_resize: {self.config.allow_resize}")