Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 52 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,12 @@ help:
@echo " update-license-headers - Add license headers to all files"
@echo ""
@echo "⚡ Performance:"
@echo " perf-import - Profile import time and show summary"
@echo " perf-import CLEAN=1 - Clean cache, then profile import time"
@echo " perf-import NOFILE=1 - Profile without writing to file (for CI)"
@echo " perf-import - Profile pure import time and show summary"
@echo " perf-import CLEAN=1 - Clean cache, then profile pure import time"
@echo " perf-import NOFILE=1 - Profile pure import without writing to file (for CI)"
@echo " perf-import-runtime - Profile runtime init time (constructors included)"
@echo " bench-cli-startup - Benchmark CLI startup (isolated venv)"
@echo " bench-cli-startup-verbose - Benchmark CLI startup with import trace"
@echo ""
@echo "🚀 Publish:"
@echo " publish VERSION=X.Y.Z - Publish all packages to PyPI"
Expand Down Expand Up @@ -481,9 +484,9 @@ perf-import:
ifdef CLEAN
@$(MAKE) clean-pycache
endif
@echo "⚡ Profiling import time for data_designer.config and data_designer.interface.DataDesigner..."
@echo "⚡ Profiling pure import time for data_designer.config and DataDesigner symbol..."
ifdef NOFILE
@PERF_OUTPUT=$$(uv run python -X importtime -c "import data_designer.config as dd; from data_designer.interface import DataDesigner; DataDesigner(); dd.DataDesignerConfigBuilder()" 2>&1); \
@PERF_OUTPUT=$$(uv run python -X importtime -c "import data_designer.config as dd; from data_designer.interface import DataDesigner" 2>&1); \
echo "$$PERF_OUTPUT"; \
echo ""; \
echo "Summary:"; \
Expand All @@ -495,7 +498,7 @@ ifdef NOFILE
echo "$$PERF_OUTPUT" | grep "import time:" | sort -rn -k5 | head -10 | awk '{printf "%-12.3f %-12.3f %s", $$3/1000000, $$5/1000000, $$7; for(i=8;i<=NF;i++) printf " %s", $$i; printf "\n"}'
else
@PERF_FILE="perf_import_$$(date +%Y%m%d_%H%M%S).txt"; \
uv run python -X importtime -c "import data_designer.config as dd; from data_designer.interface import DataDesigner; DataDesigner(); dd.DataDesignerConfigBuilder()" > "$$PERF_FILE" 2>&1; \
uv run python -X importtime -c "import data_designer.config as dd; from data_designer.interface import DataDesigner" > "$$PERF_FILE" 2>&1; \
echo "📊 Import profile saved to $$PERF_FILE"; \
echo ""; \
echo "Summary:"; \
Expand All @@ -507,6 +510,46 @@ else
grep "import time:" "$$PERF_FILE" | sort -rn -k5 | head -10 | awk '{printf "%-12.3f %-12.3f %s", $$3/1000000, $$5/1000000, $$7; for(i=8;i<=NF;i++) printf " %s", $$i; printf "\n"}'
endif

# Profile import time INCLUDING runtime initialization: unlike `perf-import`,
# the probed command also constructs DataDesigner() and
# DataDesignerConfigBuilder(), so imports triggered lazily by the constructors
# show up in the `python -X importtime` trace.
#
# Usage:
#   make perf-import-runtime            - save full trace to a timestamped file
#   make perf-import-runtime NOFILE=1   - print trace to stdout only (for CI)
#   make perf-import-runtime CLEAN=1    - drop __pycache__ first for a cold run
perf-import-runtime:
ifdef CLEAN
	@$(MAKE) clean-pycache
endif
	@echo "⚡ Profiling runtime initialization time (DataDesigner + DataDesignerConfigBuilder constructors)..."
ifdef NOFILE
# NOFILE=1 branch: capture the trace in a shell variable and echo it, leaving
# no artifact on disk. Times from importtime are microseconds; awk divides by
# 1e6 to report seconds. `sort -rn -k5` ranks by cumulative time (column 5).
	@PERF_OUTPUT=$$(uv run python -X importtime -c "import data_designer.config as dd; from data_designer.interface import DataDesigner; DataDesigner(); dd.DataDesignerConfigBuilder()" 2>&1); \
	echo "$$PERF_OUTPUT"; \
	echo ""; \
	echo "Summary:"; \
	echo "$$PERF_OUTPUT" | tail -1 | awk '{printf "  Total: %.3fs\n", $$5/1000000}'; \
	echo ""; \
	echo "💡 Top 10 slowest imports:"; \
	printf "%-12s %-12s %s\n" "Self (s)" "Cumulative (s)" "Module"; \
	printf "%-12s %-12s %s\n" "--------" "--------------" "------"; \
	echo "$$PERF_OUTPUT" | grep "import time:" | sort -rn -k5 | head -10 | awk '{printf "%-12.3f %-12.3f %s", $$3/1000000, $$5/1000000, $$7; for(i=8;i<=NF;i++) printf " %s", $$i; printf "\n"}'
else
# Default branch: same probe and summary, but the full trace is persisted to a
# timestamped file so runs can be compared over time.
	@PERF_FILE="perf_import_runtime_$$(date +%Y%m%d_%H%M%S).txt"; \
	uv run python -X importtime -c "import data_designer.config as dd; from data_designer.interface import DataDesigner; DataDesigner(); dd.DataDesignerConfigBuilder()" > "$$PERF_FILE" 2>&1; \
	echo "📊 Runtime import profile saved to $$PERF_FILE"; \
	echo ""; \
	echo "Summary:"; \
	tail -1 "$$PERF_FILE" | awk '{printf "  Total: %.3fs\n", $$5/1000000}'; \
	echo ""; \
	echo "💡 Top 10 slowest imports:"; \
	printf "%-12s %-12s %s\n" "Self (s)" "Cumulative (s)" "Module"; \
	printf "%-12s %-12s %s\n" "--------" "--------------" "------"; \
	grep "import time:" "$$PERF_FILE" | sort -rn -k5 | head -10 | awk '{printf "%-12.3f %-12.3f %s", $$3/1000000, $$5/1000000, $$7; for(i=8;i<=NF;i++) printf " %s", $$i; printf "\n"}'
endif

# Extra arguments forwarded to the benchmark script; override on the command
# line, e.g. `make bench-cli-startup BENCH_CLI_ARGS="--runs 5"`.
BENCH_CLI_ARGS ?=

# Benchmark CLI startup time in an isolated venv (details live in the script).
bench-cli-startup:
	@echo "⚡ Benchmarking CLI startup time (isolated venv)..."
	uv run python scripts/benchmarks/benchmark_cli_startup.py $(BENCH_CLI_ARGS)

# Same benchmark with --verbose, which additionally emits an import trace to
# help pinpoint where startup time is spent.
bench-cli-startup-verbose:
	@echo "⚡ Benchmarking CLI startup time (isolated + import trace)..."
	uv run python scripts/benchmarks/benchmark_cli_startup.py --verbose $(BENCH_CLI_ARGS)

# ==============================================================================
# PUBLISH
# ==============================================================================
Expand Down Expand Up @@ -576,7 +619,8 @@ clean-test-coverage:
# PHONY TARGETS
# ==============================================================================

.PHONY: build build-config build-engine build-interface \
.PHONY: bench-cli-startup bench-cli-startup-verbose \
build build-config build-engine build-interface \
check-all check-all-fix check-config check-engine check-interface \
check-license-headers \
clean clean-dist clean-notebooks clean-pycache clean-test-coverage \
Expand All @@ -586,7 +630,7 @@ clean-test-coverage:
generate-colab-notebooks help \
install install-dev install-dev-notebooks install-dev-recipes \
lint lint-config lint-engine lint-fix lint-fix-config lint-fix-engine lint-fix-interface lint-interface \
perf-import publish serve-docs-locally show-versions \
perf-import perf-import-runtime publish serve-docs-locally show-versions \
health-checks \
test test-config test-config-isolated test-e2e test-engine test-engine-isolated \
test-interface test-interface-isolated test-isolated \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@
SeedConfig,
)
from data_designer.config.seed_source import ( # noqa: F401
DataFrameSeedSource,
HuggingFaceSeedSource,
LocalFileSeedSource,
)
from data_designer.config.seed_source_dataframe import DataFrameSeedSource # noqa: F401
from data_designer.config.utils.code_lang import CodeLang # noqa: F401
from data_designer.config.utils.info import InfoType # noqa: F401
from data_designer.config.utils.trace_type import TraceType # noqa: F401
Expand Down Expand Up @@ -196,7 +196,7 @@
"SamplingStrategy": (_MOD_SEED, "SamplingStrategy"),
"SeedConfig": (_MOD_SEED, "SeedConfig"),
# seed_source
"DataFrameSeedSource": (_MOD_SEED_SOURCE, "DataFrameSeedSource"),
"DataFrameSeedSource": (f"{_MOD_BASE}.seed_source_dataframe", "DataFrameSeedSource"),
"HuggingFaceSeedSource": (_MOD_SEED_SOURCE, "HuggingFaceSeedSource"),
"LocalFileSeedSource": (_MOD_SEED_SOURCE, "LocalFileSeedSource"),
# utils
Expand Down Expand Up @@ -224,7 +224,10 @@ def __getattr__(name: str) -> object:
if name in _LAZY_IMPORTS:
module_path, attr_name = _LAZY_IMPORTS[name]
module = importlib.import_module(module_path)
return getattr(module, attr_name)
attr = getattr(module, attr_name)
# Cache so subsequent accesses find a real attribute and skip __getattr__.
globals()[name] = attr
return attr

raise AttributeError(f"module 'data_designer.config' has no attribute {name!r}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from data_designer.config.sampler_params import SamplerType
from data_designer.config.utils.constants import EPSILON
from data_designer.config.utils.numerical_helpers import is_float, is_int, prepare_number_for_reporting
from data_designer.lazy_heavy_imports import pd
from data_designer.plugin_manager import PluginManager

if TYPE_CHECKING:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
SamplingStrategy,
SeedConfig,
)
from data_designer.config.seed_source import DataFrameSeedSource
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
from data_designer.config.seed_source_types import SeedSourceT
from data_designer.config.utils.constants import DEFAULT_REPR_HTML_STYLE, REPR_HTML_TEMPLATE
from data_designer.config.utils.info import ConfigBuilderInfo
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from data_designer.config.models import ModelConfig, ModelProvider
from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS
from data_designer.config.utils.info import InterfaceInfo
from data_designer.lazy_heavy_imports import pd

if TYPE_CHECKING:
import pandas as pd
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
from abc import ABC, abstractmethod
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar
from typing import Annotated, Any, Generic, Literal, TypeVar

from pydantic import BaseModel, Field, field_validator, model_validator
from typing_extensions import Self, TypeAlias

import data_designer.lazy_heavy_imports as lazy
from data_designer.config.base import ConfigBase
from data_designer.config.errors import InvalidConfigError
from data_designer.config.utils.constants import (
Expand All @@ -22,10 +23,6 @@
MIN_TOP_P,
)
from data_designer.config.utils.io_helpers import smart_load_yaml
from data_designer.lazy_heavy_imports import np

if TYPE_CHECKING:
import numpy as np

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -192,7 +189,7 @@ def sample(self) -> float:
Returns:
A float value sampled from the manual distribution.
"""
return float(np.random.choice(self.params.values, p=self.params.weights))
return float(lazy.np.random.choice(self.params.values, p=self.params.weights))


class UniformDistributionParams(ConfigBase):
Expand Down Expand Up @@ -233,7 +230,7 @@ def sample(self) -> float:
Returns:
A float value sampled from the uniform distribution.
"""
return float(np.random.uniform(low=self.params.low, high=self.params.high, size=1)[0])
return float(lazy.np.random.uniform(low=self.params.low, high=self.params.high, size=1)[0])


DistributionT: TypeAlias = UniformDistribution | ManualDistribution
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.dataset_metadata import DatasetMetadata
from data_designer.config.utils.visualization import WithRecordSamplerMixin
from data_designer.lazy_heavy_imports import pd

if TYPE_CHECKING:
import pandas as pd
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING, Literal
from typing import Literal

from pydantic import Field, field_validator, model_validator
from typing_extensions import Self, TypeAlias

import data_designer.lazy_heavy_imports as lazy
from data_designer.config.base import ConfigBase
from data_designer.config.utils.constants import (
AVAILABLE_LOCALES,
Expand All @@ -17,10 +18,6 @@
MAX_AGE,
MIN_AGE,
)
from data_designer.lazy_heavy_imports import pd

if TYPE_CHECKING:
import pandas as pd


class SamplerType(str, Enum):
Expand Down Expand Up @@ -118,7 +115,7 @@ class DatetimeSamplerParams(ConfigBase):
@classmethod
def _validate_param_is_datetime(cls, value: str) -> str:
try:
pd.to_datetime(value)
lazy.pd.to_datetime(value)
except ValueError:
raise ValueError(f"Invalid datetime format: {value}")
return value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@
from abc import ABC
from typing import TYPE_CHECKING, Literal

from pydantic import BaseModel, ConfigDict, Field, field_validator
from pydantic.json_schema import SkipJsonSchema
from pydantic import BaseModel, Field, field_validator
from typing_extensions import Self

from data_designer.config.utils.io_helpers import (
VALID_DATASET_FILE_EXTENSIONS,
validate_dataset_file_path,
validate_path_contains_files_of_type,
)
from data_designer.lazy_heavy_imports import pd

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -67,18 +65,3 @@ class HuggingFaceSeedSource(SeedSource):
)
token: str | None = None
endpoint: str = "https://huggingface.co"


class DataFrameSeedSource(SeedSource):
seed_type: Literal["df"] = "df"

model_config = ConfigDict(arbitrary_types_allowed=True)

df: SkipJsonSchema[pd.DataFrame] = Field(
...,
exclude=True,
description=(
"DataFrame to use directly as the seed dataset. NOTE: if you need to write a Data Designer config, "
"you must use `LocalFileSeedSource` instead, since DataFrame objects are not serializable."
),
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import Literal

# Keep direct pandas import: Pydantic resolves DataFrame at module load,
# and this also preserves IDE typing/autocomplete.
import pandas as pd
from pydantic import ConfigDict, Field
from pydantic.json_schema import SkipJsonSchema

from data_designer.config.seed_source import SeedSource


class DataFrameSeedSource(SeedSource):
    """Seed source backed by an in-memory pandas DataFrame.

    Lives in its own module so that importing the other seed-source configs
    does not force an eager pandas import (pandas must be imported here
    because Pydantic resolves the ``pd.DataFrame`` annotation at
    class-definition time).

    This source is intentionally non-serializable: the DataFrame is excluded
    from model dumps (``exclude=True``) and from the JSON schema
    (``SkipJsonSchema``). Use ``LocalFileSeedSource`` for configs that must be
    written to disk.
    """

    # Discriminator value selecting this seed-source variant.
    seed_type: Literal["df"] = "df"

    # pandas.DataFrame is not a Pydantic-native type; allow it explicitly.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Required field (``...``); excluded from serialization and schema output.
    df: SkipJsonSchema[pd.DataFrame] = Field(
        ...,
        exclude=True,
        description=(
            "DataFrame to use directly as the seed dataset. NOTE: if you need to write a Data Designer config, "
            "you must use `LocalFileSeedSource` instead, since DataFrame objects are not serializable."
        ),
    )
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from pydantic import Field
from typing_extensions import TypeAlias

from data_designer.config.seed_source import DataFrameSeedSource, HuggingFaceSeedSource, LocalFileSeedSource
from data_designer.config.seed_source import HuggingFaceSeedSource, LocalFileSeedSource
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
from data_designer.plugin_manager import PluginManager

plugin_manager = PluginManager()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
import pytest
import yaml

import data_designer.lazy_heavy_imports as lazy
from data_designer.config.analysis.column_statistics import GeneralColumnStatistics
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.column_configs import SamplerColumnConfig
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.data_designer_config import DataDesignerConfig
from data_designer.config.models import ChatCompletionInferenceParams, ModelConfig, ModelProvider
from data_designer.lazy_heavy_imports import pd

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -172,7 +172,7 @@ def stub_complete_builder(stub_data_designer_builder_config_str: str) -> DataDes

@pytest.fixture
def stub_dataframe() -> pd.DataFrame:
return pd.DataFrame(
return lazy.pd.DataFrame(
{
"name": ["John", "Jane", "Jim", "Jill", "Mike", "Mary", "Mark", "Martha", "Alex", "Alice", "Bob", "Bella"],
"age": [25, 30, 35, 40, 45, 50, 55, 60, 22, 28, 65, 38],
Expand Down Expand Up @@ -255,8 +255,8 @@ def stub_dataframe() -> pd.DataFrame:
def stub_dataset_tar_file():
with tempfile.TemporaryDirectory() as temp_dir:
# Create valid parquet files with actual data
df1 = pd.DataFrame({"id": ["1", "2"], "name": ["test", "sample"]})
df2 = pd.DataFrame({"id": ["3", "4"], "name": ["data", "example"]})
df1 = lazy.pd.DataFrame({"id": ["1", "2"], "name": ["test", "sample"]})
df2 = lazy.pd.DataFrame({"id": ["3", "4"], "name": ["data", "example"]})

# Write parquet files
os.makedirs(temp_dir + "/dataset", exist_ok=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Helper utilities for working with images."""
Expand All @@ -9,15 +9,11 @@
import io
import re
from pathlib import Path
from typing import TYPE_CHECKING

import requests

import data_designer.lazy_heavy_imports as lazy
from data_designer.config.models import ImageFormat
from data_designer.lazy_heavy_imports import Image

if TYPE_CHECKING:
from PIL import Image

# Magic bytes for image format detection
IMAGE_FORMAT_MAGIC_BYTES = {
Expand Down Expand Up @@ -136,7 +132,7 @@ def detect_image_format(image_bytes: bytes) -> ImageFormat:

# Fallback to PIL for robust detection
try:
img = Image.open(io.BytesIO(image_bytes))
img = lazy.Image.open(io.BytesIO(image_bytes))
format_str = img.format.lower() if img.format else None
if format_str in _PIL_FORMAT_TO_IMAGE_FORMAT:
return _PIL_FORMAT_TO_IMAGE_FORMAT[format_str]
Expand Down Expand Up @@ -263,7 +259,7 @@ def validate_image(image_path: Path) -> None:
ValueError: If image is corrupted or unreadable
"""
try:
with Image.open(image_path) as img:
with lazy.Image.open(image_path) as img:
img.verify()
except Exception as e:
raise ValueError(f"Image validation failed: {e}") from e
Loading