64 changes: 64 additions & 0 deletions demo/data_designer_demo_processors/README.md
@@ -0,0 +1,64 @@
# Demo Processor Plugins for Data Designer

Two example processor plugins demonstrating the Data Designer plugin system.

## Processors

### RegexFilterProcessor (`regex-filter`)

Filters rows by matching a regex pattern against a specified column. Runs at the `process_before_batch` hook.

```python
config_builder.add_processor(
    RegexFilterProcessorConfig(
        name="filter_english",
        column="language",
        pattern="^(en|english)$",
        invert=False,
    )
)
```

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `column` | `str` | required | Column to match against |
| `pattern` | `str` | required | Regex pattern |
| `invert` | `bool` | `False` | Keep non-matching rows instead |
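
For illustration, `invert` simply flips which rows are kept. A minimal pandas sketch of the same matching logic the processor applies internally (hypothetical data):

```python
import re

import pandas as pd

# Hypothetical batch the processor might receive
batch = pd.DataFrame({"language": ["en", "es", "english", "fr"]})

pattern = re.compile("^(en|english)$")
mask = batch["language"].astype(str).apply(lambda v: bool(pattern.search(v)))

kept = batch[mask].reset_index(drop=True)      # invert=False: keep matches ("en", "english")
dropped = batch[~mask].reset_index(drop=True)  # invert=True would keep these rows instead ("es", "fr")
```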

### SemanticDedupProcessor (`semantic-dedup`)

Removes near-duplicate rows using embedding cosine similarity. Runs at the `process_after_generation` hook.

```python
config_builder.add_processor(
    SemanticDedupProcessorConfig(
        name="dedup",
        column="generated_text",
        similarity_threshold=0.9,
    )
)
```

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `column` | `str` | required | Column to compute embeddings on |
| `similarity_threshold` | `float` | `0.9` | Cosine similarity threshold |
| `model_name` | `str` | `all-MiniLM-L6-v2` | Sentence-transformers model |
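
The threshold is a cosine similarity; because the processor L2-normalizes the embeddings, cosine similarity reduces to a plain dot product. A tiny numpy sketch with made-up unit vectors:

```python
import numpy as np

# Made-up, already L2-normalized embedding vectors
a = np.array([0.6, 0.8])
b = np.array([0.8, 0.6])

similarity = float(np.dot(a, b))  # 0.96 -> above a 0.9 threshold, so the later row would be dropped
```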

## Installation

```bash
uv pip install -e demo/data_designer_demo_processors
```

## Entry Points

```toml
[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"
```
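
After installation, a quick way to confirm the plugins are discoverable is to list the entry points with the standard library (a sanity-check sketch, assuming Python 3.10+ as required by the package):

```python
from importlib.metadata import entry_points

# Prints whatever is registered under the data_designer.plugins group
for ep in entry_points(group="data_designer.plugins"):
    print(f"{ep.name} -> {ep.value}")
```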
128 changes: 128 additions & 0 deletions demo/data_designer_demo_processors/demo_processor_plugins.py
@@ -0,0 +1,128 @@
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# kernelspec:
# display_name: .venv
# language: python
# name: python3
# ---

# %% [markdown]
# # Processor Plugins Demo
#
# Demonstrates the `regex-filter` and `semantic-dedup` processor plugins
# with a simple topic-explanation pipeline.

# %%
import pandas as pd
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig
from data_designer_demo_processors.semantic_dedup.config import SemanticDedupProcessorConfig

import data_designer.config as dd
from data_designer.interface import DataDesigner

# %% [markdown]
# ### Setup

# %%
MODEL_ALIAS = "openai-text"

data_designer = DataDesigner()

model_configs = [
    dd.ModelConfig(
        alias=MODEL_ALIAS,
        model="nvidia/nemotron-3-nano-30b-a3b",
        provider="nvidia",
        inference_parameters=dd.ChatCompletionInferenceParams(
            temperature=1.0,
            max_tokens=512,
            extra_body={"chat_template_kwargs": {"enable_thinking": False}},
        ),
    )
]

# %% [markdown]
# ### Seed data
#
# A small seed dataset with topics and languages. The regex filter will keep only the English rows.

# %%
seed_data = pd.DataFrame(
    {
        "topic": [
            "machine learning",
            "aprendizaje automático",
            "cloud computing",
            "computación en la nube",
            "quantum computing",
            "web development",
            "desarrollo web",
            "cybersecurity",
        ],
        "language": ["en", "es", "en", "es", "en", "en", "es", "en"],
    }
)
print(f"Seed data: {len(seed_data)} rows")
seed_data

# %% [markdown]
# ### Build the config

# %%
config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

config_builder.with_seed_dataset(dd.DataFrameSeedSource(df=seed_data))

config_builder.add_column(
    dd.LLMTextColumnConfig(
        name="explanation",
        prompt="Write a short (2-3 sentence) explanation of {{ topic }}. Be concise and informative.",
        model_alias=MODEL_ALIAS,
    )
)

# %% [markdown]
# ### Add processor plugins
#
# 1. **RegexFilter** (process_before_batch): keep only English rows
# 2. **SemanticDedup** (process_after_generation): remove near-duplicate explanations

# %%
config_builder.add_processor(
    RegexFilterProcessorConfig(
        name="english_only",
        column="language",
        pattern="^en$",
    )
)

config_builder.add_processor(
    SemanticDedupProcessorConfig(
        name="dedup_explanations",
        column="explanation",
        similarity_threshold=0.9,
    )
)

data_designer.validate(config_builder)

# %% [markdown]
# ### Preview

# %%
preview = data_designer.preview(config_builder, num_records=4)
preview.dataset

# %% [markdown]
# ### Full run

# %%
results = data_designer.create(config_builder, num_records=10, dataset_name="processor-plugins-demo")
dataset = results.load_dataset()
print(f"Final dataset: {len(dataset)} rows")
dataset
38 changes: 38 additions & 0 deletions demo/data_designer_demo_processors/pyproject.toml
@@ -0,0 +1,38 @@
[project]
name = "data-designer-demo-processors"
version = "0.1.0"
description = "Demo processor plugins for Data Designer"
requires-python = ">=3.10"
dependencies = [
    "data-designer",
    "sentence-transformers",
]

[tool.uv.sources]
data-designer = { path = "../../packages/data-designer" }

[dependency-groups]
dev = [
    "pytest>=9.0.2,<10",
]

[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"

[tool.pytest.ini_options]
testpaths = ["tests"]
env = [
    "DISABLE_DATA_DESIGNER_PLUGINS=false",
]

[tool.uv]
package = true
required-version = ">=0.7.10"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/data_designer_demo_processors"]
Contributor Author commented:
Will remove all these demos! (promise!)

Empty file.
16 changes: 16 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/regex_filter/config.py
@@ -0,0 +1,16 @@
from __future__ import annotations

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class RegexFilterProcessorConfig(ProcessorConfig):
"""Filters rows by regex pattern on a specified column."""

processor_type: Literal["regex-filter"] = "regex-filter"
column: str = Field(description="Column to match against.")
pattern: str = Field(description="Regex pattern to match.")
invert: bool = Field(default=False, description="If True, keep rows that do NOT match.")
27 changes: 27 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/regex_filter/impl.py
@@ -0,0 +1,27 @@
from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING

from data_designer.engine.processing.processors.base import Processor
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig

if TYPE_CHECKING:
import pandas as pd

logger = logging.getLogger(__name__)


class RegexFilterProcessor(Processor[RegexFilterProcessorConfig]):
"""Filters batch rows based on a regex pattern."""

def process_before_batch(self, data: pd.DataFrame) -> pd.DataFrame:
compiled = re.compile(self.config.pattern)
mask = data[self.config.column].astype(str).apply(lambda v: bool(compiled.search(v)))
if self.config.invert:
mask = ~mask
before = len(data)
result = data[mask].reset_index(drop=True)
logger.info(f"🔍 RegexFilter: {before} → {len(result)} rows (column={self.config.column!r})")
return result
7 changes: 7 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/regex_filter/plugin.py
@@ -0,0 +1,7 @@
from data_designer.plugins.plugin import Plugin, PluginType

regex_filter_plugin = Plugin(
    config_qualified_name="data_designer_demo_processors.regex_filter.config.RegexFilterProcessorConfig",
    impl_qualified_name="data_designer_demo_processors.regex_filter.impl.RegexFilterProcessor",
    plugin_type=PluginType.PROCESSOR,
)
16 changes: 16 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/semantic_dedup/config.py
@@ -0,0 +1,16 @@
from __future__ import annotations

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class SemanticDedupProcessorConfig(ProcessorConfig):
"""Removes semantically similar rows using embedding similarity."""

processor_type: Literal["semantic-dedup"] = "semantic-dedup"
column: str = Field(description="Column to compute embeddings on.")
similarity_threshold: float = Field(default=0.9, description="Cosine similarity threshold for deduplication.")
model_name: str = Field(default="all-MiniLM-L6-v2", description="Sentence-transformers model name.")
51 changes: 51 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/semantic_dedup/impl.py
@@ -0,0 +1,51 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import numpy as np
from sentence_transformers import SentenceTransformer

from data_designer.engine.processing.processors.base import Processor
from data_designer_demo_processors.semantic_dedup.config import SemanticDedupProcessorConfig

if TYPE_CHECKING:
import pandas as pd

logger = logging.getLogger(__name__)


def _suppress_transformers_logging() -> None:
    import transformers.utils.logging as tf_logging

    tf_logging.set_verbosity_error()
    tf_logging.disable_progress_bar()


class SemanticDedupProcessor(Processor[SemanticDedupProcessorConfig]):
"""Removes near-duplicate rows based on embedding cosine similarity."""

def _initialize(self) -> None:
_suppress_transformers_logging()
self._model = SentenceTransformer(self.config.model_name)

def process_after_generation(self, data: pd.DataFrame) -> pd.DataFrame:
texts = data[self.config.column].astype(str).tolist()
if len(texts) <= 1:
return data

embeddings = self._model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
sim_matrix = np.dot(embeddings, embeddings.T)

keep = set(range(len(texts)))
for i in range(len(texts)):
if i not in keep:
continue
for j in range(i + 1, len(texts)):
if j in keep and sim_matrix[i, j] >= self.config.similarity_threshold:
keep.discard(j)

before = len(data)
result = data.iloc[sorted(keep)].reset_index(drop=True)
logger.info(f"🧹 SemanticDedup: {before} → {len(result)} rows (threshold={self.config.similarity_threshold})")
return result
7 changes: 7 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/semantic_dedup/plugin.py
@@ -0,0 +1,7 @@
from data_designer.plugins.plugin import Plugin, PluginType

semantic_dedup_plugin = Plugin(
    config_qualified_name="data_designer_demo_processors.semantic_dedup.config.SemanticDedupProcessorConfig",
    impl_qualified_name="data_designer_demo_processors.semantic_dedup.impl.SemanticDedupProcessor",
    plugin_type=PluginType.PROCESSOR,
)
Empty file.