64 changes: 64 additions & 0 deletions demo/data_designer_demo_processors/README.md
@@ -0,0 +1,64 @@
# Demo Processor Plugins for Data Designer

Two example processor plugins demonstrating the Data Designer plugin system.

## Processors

### RegexFilterProcessor (`regex-filter`)

Filters rows by matching a regex pattern against a specified column. Runs at the `process_before_batch` hook.

```python
config_builder.add_processor(
    RegexFilterProcessorConfig(
        name="filter_english",
        column="language",
        pattern="^(en|english)$",
        invert=False,
    )
)
```

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `column` | `str` | required | Column to match against |
| `pattern` | `str` | required | Regex pattern |
| `invert` | `bool` | `False` | Keep non-matching rows instead |
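
For illustration, `invert` simply flips which rows are kept. A minimal pandas sketch of the same matching logic the processor applies internally (hypothetical data):

```python
import re

import pandas as pd

# Hypothetical batch the processor might receive
batch = pd.DataFrame({"language": ["en", "es", "english", "fr"]})

pattern = re.compile("^(en|english)$")
mask = batch["language"].astype(str).apply(lambda v: bool(pattern.search(v)))

kept = batch[mask].reset_index(drop=True)      # invert=False: keep matches ("en", "english")
dropped = batch[~mask].reset_index(drop=True)  # invert=True would keep these rows instead ("es", "fr")
```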

### SemanticDedupProcessor (`semantic-dedup`)

Removes near-duplicate rows using embedding cosine similarity. Runs at the `process_after_generation` hook.

```python
config_builder.add_processor(
    SemanticDedupProcessorConfig(
        name="dedup",
        column="generated_text",
        similarity_threshold=0.9,
    )
)
```

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `column` | `str` | required | Column to compute embeddings on |
| `similarity_threshold` | `float` | `0.9` | Cosine similarity threshold |
| `model_name` | `str` | `all-MiniLM-L6-v2` | Sentence-transformers model |
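
The threshold is a cosine similarity; because the processor L2-normalizes the embeddings, cosine similarity reduces to a plain dot product. A tiny numpy sketch with made-up unit vectors:

```python
import numpy as np

# Made-up, already L2-normalized embedding vectors
a = np.array([0.6, 0.8])
b = np.array([0.8, 0.6])

similarity = float(np.dot(a, b))  # 0.96 -> above a 0.9 threshold, so the later row would be dropped
```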

## Installation

```bash
uv pip install -e demo/data_designer_demo_processors
```

## Entry Points

```toml
[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"
```
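
After installation, a quick way to confirm the plugins are discoverable is to list the entry points with the standard library (a sanity-check sketch, assuming Python 3.10+ as required by the package):

```python
from importlib.metadata import entry_points

# Prints whatever is registered under the data_designer.plugins group
for ep in entry_points(group="data_designer.plugins"):
    print(f"{ep.name} -> {ep.value}")
```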
128 changes: 128 additions & 0 deletions demo/data_designer_demo_processors/demo_processor_plugins.py
@@ -0,0 +1,128 @@
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# kernelspec:
# display_name: .venv
# language: python
# name: python3
# ---

# %% [markdown]
# # Processor Plugins Demo
#
# Demonstrates the `regex-filter` and `semantic-dedup` processor plugins
# with a simple topic-explanation pipeline.

# %%
import pandas as pd
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig
from data_designer_demo_processors.semantic_dedup.config import SemanticDedupProcessorConfig

import data_designer.config as dd
from data_designer.interface import DataDesigner

# %% [markdown]
# ### Setup

# %%
MODEL_ALIAS = "openai-text"

data_designer = DataDesigner()

model_configs = [
    dd.ModelConfig(
        alias=MODEL_ALIAS,
        model="nvidia/nemotron-3-nano-30b-a3b",
        provider="nvidia",
        inference_parameters=dd.ChatCompletionInferenceParams(
            temperature=1.0,
            max_tokens=512,
            extra_body={"chat_template_kwargs": {"enable_thinking": False}},
        ),
    )
]

# %% [markdown]
# ### Seed data
#
# A small seed dataset with topics and languages. The regex filter will keep only the English rows.

# %%
seed_data = pd.DataFrame(
    {
        "topic": [
            "machine learning",
            "aprendizaje automático",
            "cloud computing",
            "computación en la nube",
            "quantum computing",
            "web development",
            "desarrollo web",
            "cybersecurity",
        ],
        "language": ["en", "es", "en", "es", "en", "en", "es", "en"],
    }
)
print(f"Seed data: {len(seed_data)} rows")
seed_data

# %% [markdown]
# ### Build the config

# %%
config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

config_builder.with_seed_dataset(dd.DataFrameSeedSource(df=seed_data))

config_builder.add_column(
    dd.LLMTextColumnConfig(
        name="explanation",
        prompt="Write a short (2-3 sentence) explanation of {{ topic }}. Be concise and informative.",
        model_alias=MODEL_ALIAS,
    )
)

# %% [markdown]
# ### Add processor plugins
#
# 1. **RegexFilter** (process_before_batch): keep only English rows
# 2. **SemanticDedup** (process_after_generation): remove near-duplicate explanations

# %%
config_builder.add_processor(
    RegexFilterProcessorConfig(
        name="english_only",
        column="language",
        pattern="^en$",
    )
)

config_builder.add_processor(
    SemanticDedupProcessorConfig(
        name="dedup_explanations",
        column="explanation",
        similarity_threshold=0.9,
    )
)

data_designer.validate(config_builder)

# %% [markdown]
# ### Preview

# %%
preview = data_designer.preview(config_builder, num_records=4)
preview.dataset

# %% [markdown]
# ### Full run

# %%
results = data_designer.create(config_builder, num_records=10, dataset_name="processor-plugins-demo")
dataset = results.load_dataset()
print(f"Final dataset: {len(dataset)} rows")
dataset
38 changes: 38 additions & 0 deletions demo/data_designer_demo_processors/pyproject.toml
@@ -0,0 +1,38 @@
[project]
name = "data-designer-demo-processors"
version = "0.1.0"
description = "Demo processor plugins for Data Designer"
requires-python = ">=3.10"
dependencies = [
    "data-designer",
    "sentence-transformers",
]

[tool.uv.sources]
data-designer = { path = "../../packages/data-designer" }

[dependency-groups]
dev = [
    "pytest>=9.0.2,<10",
]

[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"

[tool.pytest.ini_options]
testpaths = ["tests"]
env = [
    "DISABLE_DATA_DESIGNER_PLUGINS=false",
]

[tool.uv]
package = true
required-version = ">=0.7.10"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/data_designer_demo_processors"]
Contributor Author commented:
Will remove all these demos! (promise!)

Empty file.
16 changes: 16 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/regex_filter/config.py
@@ -0,0 +1,16 @@
from __future__ import annotations

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class RegexFilterProcessorConfig(ProcessorConfig):
"""Filters rows by regex pattern on a specified column."""

processor_type: Literal["regex-filter"] = "regex-filter"
column: str = Field(description="Column to match against.")
pattern: str = Field(description="Regex pattern to match.")
invert: bool = Field(default=False, description="If True, keep rows that do NOT match.")
27 changes: 27 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/regex_filter/impl.py
@@ -0,0 +1,27 @@
from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING

from data_designer.engine.processing.processors.base import Processor
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig

if TYPE_CHECKING:
import pandas as pd

logger = logging.getLogger(__name__)


class RegexFilterProcessor(Processor[RegexFilterProcessorConfig]):
"""Filters batch rows based on a regex pattern."""

def process_before_batch(self, data: pd.DataFrame) -> pd.DataFrame:
compiled = re.compile(self.config.pattern)
mask = data[self.config.column].astype(str).apply(lambda v: bool(compiled.search(v)))
if self.config.invert:
mask = ~mask
before = len(data)
result = data[mask].reset_index(drop=True)
logger.info(f"🔍 RegexFilter: {before} → {len(result)} rows (column={self.config.column!r})")
return result
7 changes: 7 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/regex_filter/plugin.py
@@ -0,0 +1,7 @@
from data_designer.plugins.plugin import Plugin, PluginType

regex_filter_plugin = Plugin(
    config_qualified_name="data_designer_demo_processors.regex_filter.config.RegexFilterProcessorConfig",
    impl_qualified_name="data_designer_demo_processors.regex_filter.impl.RegexFilterProcessor",
    plugin_type=PluginType.PROCESSOR,
)
16 changes: 16 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/semantic_dedup/config.py
@@ -0,0 +1,16 @@
from __future__ import annotations

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class SemanticDedupProcessorConfig(ProcessorConfig):
"""Removes semantically similar rows using embedding similarity."""

processor_type: Literal["semantic-dedup"] = "semantic-dedup"
column: str = Field(description="Column to compute embeddings on.")
similarity_threshold: float = Field(default=0.9, description="Cosine similarity threshold for deduplication.")
model_name: str = Field(default="all-MiniLM-L6-v2", description="Sentence-transformers model name.")
51 changes: 51 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/semantic_dedup/impl.py
@@ -0,0 +1,51 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import numpy as np
from sentence_transformers import SentenceTransformer

from data_designer.engine.processing.processors.base import Processor
from data_designer_demo_processors.semantic_dedup.config import SemanticDedupProcessorConfig

if TYPE_CHECKING:
import pandas as pd

logger = logging.getLogger(__name__)


def _suppress_transformers_logging() -> None:
    import transformers.utils.logging as tf_logging

    tf_logging.set_verbosity_error()
    tf_logging.disable_progress_bar()


class SemanticDedupProcessor(Processor[SemanticDedupProcessorConfig]):
"""Removes near-duplicate rows based on embedding cosine similarity."""

def _initialize(self) -> None:
_suppress_transformers_logging()
self._model = SentenceTransformer(self.config.model_name)

def process_after_generation(self, data: pd.DataFrame) -> pd.DataFrame:
texts = data[self.config.column].astype(str).tolist()
if len(texts) <= 1:
return data

embeddings = self._model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
sim_matrix = np.dot(embeddings, embeddings.T)

keep = set(range(len(texts)))
for i in range(len(texts)):
if i not in keep:
continue
for j in range(i + 1, len(texts)):
if j in keep and sim_matrix[i, j] >= self.config.similarity_threshold:
keep.discard(j)

before = len(data)
result = data.iloc[sorted(keep)].reset_index(drop=True)
logger.info(f"🧹 SemanticDedup: {before} → {len(result)} rows (threshold={self.config.similarity_threshold})")
return result
7 changes: 7 additions & 0 deletions demo/data_designer_demo_processors/src/data_designer_demo_processors/semantic_dedup/plugin.py
@@ -0,0 +1,7 @@
from data_designer.plugins.plugin import Plugin, PluginType

semantic_dedup_plugin = Plugin(
    config_qualified_name="data_designer_demo_processors.semantic_dedup.config.SemanticDedupProcessorConfig",
    impl_qualified_name="data_designer_demo_processors.semantic_dedup.impl.SemanticDedupProcessor",
    plugin_type=PluginType.PROCESSOR,
)
Empty file.