Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,19 @@ values never echoed into logs or transcripts:
```

Manual hook setup and limitations: [examples/claude_code_hook/](examples/claude_code_hook/).

- **LiteLLM guardrail** (`DataFogGuardrail`): redacts or blocks PII in
requests and responses at the gateway, for any LiteLLM-proxied provider.
In-process (~31µs per request), no sidecar service. Setup:
[examples/litellm_guardrail/](examples/litellm_guardrail/).

Both default to the high-precision entity set (`EMAIL`, `PHONE`,
`CREDIT_CARD`, `SSN`); noisier types are opt-in.
`CREDIT_CARD`, `SSN`); noisier types are opt-in. Known-safe values can be
exempted with an allowlist: `scan(text, allowlist=[...])` for exact values,
`allowlist_patterns=[...]` for full-match regexes (e.g. `^\d{10}$` to stop
unix timestamps matching as phone numbers) — available in both adapters and
the API. Presidio-style entity names (`EMAIL_ADDRESS`, `PHONE_NUMBER`,
`US_SSN`) are accepted as aliases for easy migration.

## Installation

Expand Down Expand Up @@ -137,7 +143,7 @@ Use the engine that matches your accuracy and dependency constraints:

- `regex`:
- Fastest and always available.
- Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DOB`, `ZIP`.
- Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DATE`, `ZIP_CODE` (`DOB` and `ZIP` are accepted as input aliases).
- Use `locales=["de"]` for German structured IDs such as `DE_VAT_ID`, `DE_IBAN`, `DE_TAX_ID`, `DE_POSTAL_CODE`, and passport or residence permit numbers.
- `spacy`:
- Requires `pip install datafog[nlp]`.
Expand Down
30 changes: 28 additions & 2 deletions datafog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,28 @@ def scan(
engine: str = "regex",
entity_types: list[str] | None = None,
locales: list[str] | None = None,
allowlist: list[str] | None = None,
allowlist_patterns: list[str] | None = None,
) -> ScanResult:
"""
v5-preview scan entrypoint.

Defaults to the lightweight regex engine so the core install works without
optional dependency fallback warnings.

``allowlist`` exempts exact entity texts (your own support address, doc
placeholders); ``allowlist_patterns`` exempts entities whose full text
matches a regex (e.g. ``^\\d{10}$`` so unix timestamps stop matching as
phone numbers).
"""
return _scan(text=text, engine=engine, entity_types=entity_types, locales=locales)
return _scan(
text=text,
engine=engine,
entity_types=entity_types,
locales=locales,
allowlist=allowlist,
allowlist_patterns=allowlist_patterns,
)


def redact(
Expand All @@ -171,12 +185,17 @@ def redact(
strategy: str = "token",
preset: str | None = None,
locales: list[str] | None = None,
allowlist: list[str] | None = None,
allowlist_patterns: list[str] | None = None,
) -> RedactResult:
"""
v5-preview redaction entrypoint.

If entities are provided, redact those spans. Otherwise, scan text first
using the selected engine and redact the detected entities.
using the selected engine and redact the detected entities. ``allowlist``
and ``allowlist_patterns`` exempt findings from redaction (exact text and
full-text regex match respectively); they apply to the scan path and are
rejected when explicit ``entities`` are supplied.
"""
if preset is not None:
try:
Expand All @@ -186,6 +205,11 @@ def redact(
raise ValueError(f"preset must be one of: {allowed}") from exc

if entities is not None:
if allowlist or allowlist_patterns:
raise ValueError(
"allowlist/allowlist_patterns cannot be combined with explicit "
"entities; filter the entities before calling redact"
)
return _redact_entities(text=text, entities=entities, strategy=strategy)

return _scan_and_redact(
Expand All @@ -194,6 +218,8 @@ def redact(
entity_types=entity_types,
strategy=strategy,
locales=locales,
allowlist=allowlist,
allowlist_patterns=allowlist_patterns,
)


Expand Down
91 changes: 90 additions & 1 deletion datafog/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import hashlib
import re
import warnings
from dataclasses import dataclass
from functools import lru_cache
Expand All @@ -23,6 +24,9 @@
"SOCIAL_SECURITY_NUMBER": "SSN",
"CREDIT_CARD_NUMBER": "CREDIT_CARD",
"DATE_OF_BIRTH": "DATE",
# Presidio-compatible aliases, so configs migrate without renames.
"EMAIL_ADDRESS": "EMAIL",
"US_SSN": "SSN",
}

ALL_ENTITY_TYPES = {
Expand Down Expand Up @@ -277,6 +281,74 @@ def _filter_entity_types(
return [entity for entity in entities if entity.type in allowed]


# Python's re module backtracks; a quantified group containing another
# quantifier (e.g. ``(a+)+``) can take exponential time on adversarial
# input, and entity text can be attacker-influenced (LLM messages, tool
# output). Reject that construct outright rather than matching under it.
#
# This regex is only the first pass: it inspects one flat group at a time,
# so by itself it cannot see a quantifier hidden one group deeper
# (``((a+))+``). ``_has_nested_quantifier`` adds a depth-aware scan on top.
_NESTED_QUANTIFIER = re.compile(
    r"\((?:[^()\\]|\\.)*(?<!\\)[+*}](?:[^()\\]|\\.)*\)\s*[+*{]"
)
MAX_ALLOWLIST_PATTERN_LENGTH = 512
# Entities longer than this skip pattern matching (fail-safe: the finding
# is kept, never suppressed) so match time stays bounded.
MAX_PATTERN_SUBJECT_LENGTH = 512


def _skip_character_class(raw: str, start: int) -> int:
    """Return the index just past the ``[...]`` class that opens at ``start``."""
    index = start + 1
    end = len(raw)
    if index < end and raw[index] == "^":
        index += 1
    # A ``]`` directly after ``[`` or ``[^`` is a literal, not the terminator.
    if index < end and raw[index] == "]":
        index += 1
    while index < end:
        if raw[index] == "\\":
            index += 2
            continue
        if raw[index] == "]":
            return index + 1
        index += 1
    return end


def _has_nested_quantifier(raw: str) -> bool:
    """Return True if ``raw`` quantifies a group that itself holds a quantifier."""
    if _NESTED_QUANTIFIER.search(raw):
        return True

    # One flag per currently open group: "a quantifier was seen inside it",
    # including inside any of its already-closed child groups.
    open_groups: list[bool] = []
    index = 0
    end = len(raw)
    while index < end:
        char = raw[index]
        if char == "\\":
            # Escaped character (including ``\\``): never a metacharacter.
            index += 2
            continue
        if char == "[":
            # Inside a class, parentheses and ``+*{}`` are literals.
            index = _skip_character_class(raw, index)
            continue
        if char == "(":
            open_groups.append(False)
        elif char == ")" and open_groups:
            inner_quantified = open_groups.pop()
            lookahead = index + 1
            while lookahead < end and raw[lookahead].isspace():
                lookahead += 1
            if (
                inner_quantified
                and lookahead < end
                and raw[lookahead] in "+*{"
            ):
                return True
            if inner_quantified and open_groups:
                open_groups[-1] = True
        elif char in "+*}" and open_groups:
            open_groups[-1] = True
        index += 1
    return False


def _compile_allowlist_patterns(
    allowlist_patterns: Optional[list[str]],
) -> list["re.Pattern[str]"]:
    """Validate and compile operator-supplied allowlist regexes.

    Args:
        allowlist_patterns: Regex source strings, or ``None``/empty for no
            patterns.

    Returns:
        The compiled patterns, in input order (empty list for no patterns).

    Raises:
        TypeError: If ``allowlist_patterns`` is a bare ``str``/``bytes``
            (which would otherwise be iterated character by character) or
            an entry is not a ``str``. A ``bytes`` pattern compiles fine but
            can only fail later, when matched against ``str`` entity text,
            so it is rejected here to keep validation fail-fast.
        ValueError: If an entry is longer than
            ``MAX_ALLOWLIST_PATTERN_LENGTH``, contains a quantified group
            with a nested quantifier, or is not a valid regex.
    """
    if isinstance(allowlist_patterns, (str, bytes)):
        raise TypeError(
            "allowlist_patterns must be a list of regex strings, "
            "not a single string"
        )
    compiled = []
    for raw in allowlist_patterns or []:
        if not isinstance(raw, str):
            raise TypeError(
                "allowlist_patterns entries must be str, "
                f"got {type(raw).__name__}"
            )
        if len(raw) > MAX_ALLOWLIST_PATTERN_LENGTH:
            raise ValueError(
                "allowlist_patterns entries must be at most "
                f"{MAX_ALLOWLIST_PATTERN_LENGTH} characters"
            )
        if _has_nested_quantifier(raw):
            raise ValueError(
                "allowlist_patterns contains a quantified group with a nested "
                f"quantifier ({raw!r}), which risks catastrophic backtracking; "
                "rewrite the pattern without nesting quantifiers"
            )
        try:
            compiled.append(re.compile(raw))
        except re.error as exc:
            # The re.error text is already embedded in the message, so the
            # chained context is suppressed to keep the traceback short.
            raise ValueError(
                f"allowlist_patterns contains an invalid regex: {raw!r} ({exc})"
            ) from None
    return compiled


def _apply_allowlist(
    entities: list[Entity],
    allowlist: Optional[list[str]],
    allowlist_patterns: Optional[list[str]],
) -> list[Entity]:
    """Drop entities exempted by an exact-value allowlist or a pattern.

    Matching semantics, deliberately strict for a security boundary:
    exact values are case-sensitive with no Unicode normalization, and
    patterns must fullmatch the entity text, so a partial match never
    suppresses a finding. Allowlist entries and patterns are operator
    configuration; treat them like code and never accept them from end
    users.

    Args:
        entities: Detected entities; each is compared by its ``text``.
        allowlist: Exact entity texts to exempt, or ``None``.
        allowlist_patterns: Regex sources; an entity whose whole text
            matches any of them is exempt, or ``None``.

    Returns:
        ``entities`` itself when both allowlists are empty, otherwise a new
        list holding only the entities that are not exempt.

    Raises:
        TypeError: If ``allowlist`` is a bare ``str``/``bytes``. ``set()``
            would silently split it into single characters and the intended
            value would never be exempted.
        ValueError: Propagated from ``_compile_allowlist_patterns`` for a
            rejected or invalid pattern.
    """
    if not allowlist and not allowlist_patterns:
        return entities
    if isinstance(allowlist, (str, bytes)):
        raise TypeError(
            "allowlist must be a list of exact values, not a single string"
        )
    exact = set(allowlist or [])
    patterns = _compile_allowlist_patterns(allowlist_patterns)

    def _is_exempt(text: str) -> bool:
        if text in exact:
            return True
        # Over-long subjects skip pattern matching entirely so match time
        # stays bounded; the finding is kept (fail-safe).
        if len(text) > MAX_PATTERN_SUBJECT_LENGTH:
            return False
        return any(pattern.fullmatch(text) for pattern in patterns)

    return [entity for entity in entities if not _is_exempt(entity.text)]


def _needs_ner(entity_types: Optional[list[str]]) -> bool:
if entity_types is None:
return True
Expand All @@ -289,14 +361,25 @@ def scan(
engine: str = "smart",
entity_types: Optional[list[str]] = None,
locales: Optional[list[str]] = None,
allowlist: Optional[list[str]] = None,
allowlist_patterns: Optional[list[str]] = None,
) -> ScanResult:
"""Scan text for PII entities."""
"""Scan text for PII entities.

``allowlist`` exempts exact entity texts (e.g. your own support email);
``allowlist_patterns`` exempts entities whose full text matches a regex
(e.g. ``^\\d{10}$`` to stop unix timestamps matching as phone numbers).
"""
if not isinstance(text, str):
raise TypeError("text must be a string")

if engine not in {"regex", "spacy", "gliner", "smart"}:
raise ValueError("engine must be one of: regex, spacy, gliner, smart")

# Validate patterns up front so config errors fail fast even when the
# text contains no entities.
_compile_allowlist_patterns(allowlist_patterns)

regex_entities = _regex_entities(
text,
entity_types=entity_types,
Expand All @@ -305,6 +388,7 @@ def scan(

if engine == "regex":
filtered = _filter_entity_types(regex_entities, entity_types)
filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns)
return ScanResult(
entities=_dedupe_entities(filtered), text=text, engine_used="regex"
)
Expand Down Expand Up @@ -367,6 +451,7 @@ def scan(
)

filtered = _filter_entity_types(combined, entity_types)
filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns)
deduped = _dedupe_entities(filtered)
return ScanResult(
entities=deduped,
Expand Down Expand Up @@ -437,12 +522,16 @@ def scan_and_redact(
entity_types: Optional[list[str]] = None,
strategy: str = "token",
locales: Optional[list[str]] = None,
allowlist: Optional[list[str]] = None,
allowlist_patterns: Optional[list[str]] = None,
) -> RedactResult:
"""Convenience wrapper: scan then redact."""
scan_result = scan(
text=text,
engine=engine,
entity_types=entity_types,
locales=locales,
allowlist=allowlist,
allowlist_patterns=allowlist_patterns,
)
return redact(text=text, entities=scan_result.entities, strategy=strategy)
46 changes: 41 additions & 5 deletions datafog/integrations/claude_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
- ``DATAFOG_HOOK_ENTITIES``: comma-separated entity types to detect.
Defaults to the high-precision set; noisy-in-code types (IP_ADDRESS,
DOB, ZIP) must be opted into.
- ``DATAFOG_HOOK_ALLOWLIST``: comma-separated exact values to exempt
(your own support address, documentation placeholders).
- ``DATAFOG_HOOK_ALLOWLIST_PATTERNS``: comma-separated regexes; findings
whose full text matches are exempt (note: a pattern containing a comma
cannot be expressed here).

Failure policy: fail open. A hook bug must never brick a Claude Code
session, so any unexpected error exits non-blocking with no output.
Expand Down Expand Up @@ -59,6 +64,11 @@ def _action(env: Mapping[str, str]) -> str:
return action if action in VALID_ACTIONS else "ask"


def _csv_env(env: Mapping[str, str], name: str) -> list[str]:
    """Read ``env[name]`` as a comma-separated list.

    Each item is stripped of surrounding whitespace and empty items are
    dropped; a missing variable yields an empty list.
    """
    items: list[str] = []
    for piece in env.get(name, "").split(","):
        trimmed = piece.strip()
        if trimmed:
            items.append(trimmed)
    return items


def _iter_strings(value: Any) -> Iterator[str]:
"""Yield every string embedded in a JSON-like structure.

Expand All @@ -76,7 +86,12 @@ def _iter_strings(value: Any) -> Iterator[str]:
stack.extend(current)


def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]:
def _scan_findings(
value: Any,
entity_types: list[str],
allowlist: list[str] | None = None,
allowlist_patterns: list[str] | None = None,
) -> dict[str, int]:
"""Scan all strings in ``value``; return counts per entity type."""
import datafog

Expand All @@ -87,7 +102,13 @@ def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]:
break
chunk = text[: min(MAX_SCAN_CHARS, total_budget)]
total_budget -= len(chunk)
result = datafog.scan(chunk, engine="regex", entity_types=entity_types)
result = datafog.scan(
chunk,
engine="regex",
entity_types=entity_types,
allowlist=allowlist or None,
allowlist_patterns=allowlist_patterns or None,
)
for entity in result.entities:
counts[entity.type] = counts.get(entity.type, 0) + 1
return counts
Expand All @@ -104,7 +125,12 @@ def _emit(event: str, fields: dict[str, Any]) -> str:


def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str:
counts = _scan_findings(payload.get("tool_input"), _entity_types(env))
counts = _scan_findings(
payload.get("tool_input"),
_entity_types(env),
allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
)
if not counts:
return ""
tool = payload.get("tool_name", "tool")
Expand All @@ -119,7 +145,12 @@ def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str:


def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str:
counts = _scan_findings(payload.get("prompt"), _entity_types(env))
counts = _scan_findings(
payload.get("prompt"),
_entity_types(env),
allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
)
if not counts:
return ""
context = (
Expand All @@ -130,7 +161,12 @@ def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str:


def _handle_post_tool_use(payload: dict, env: Mapping[str, str]) -> str:
counts = _scan_findings(payload.get("tool_response"), _entity_types(env))
counts = _scan_findings(
payload.get("tool_response"),
_entity_types(env),
allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
)
if not counts:
return ""
tool = payload.get("tool_name", "tool")
Expand Down
Loading