DataFog · sidmohan0 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/README.md b/README.md
@@ -29,13 +29,19 @@ values never echoed into logs or transcripts:
   ```
 
   Manual hook setup and limitations: [examples/claude_code_hook/](examples/claude_code_hook/).
+
 - **LiteLLM guardrail** (`DataFogGuardrail`): redacts or blocks PII in
   requests and responses at the gateway, for any LiteLLM-proxied provider.
   In-process (~31µs per request), no sidecar service. Setup:
   [examples/litellm_guardrail/](examples/litellm_guardrail/).
 
 Both default to the high-precision entity set (`EMAIL`, `PHONE`,
-`CREDIT_CARD`, `SSN`); noisier types are opt-in.
+`CREDIT_CARD`, `SSN`); noisier types are opt-in. Known-safe values can be
+exempted with an allowlist: `scan(text, allowlist=[...])` for exact values,
+`allowlist_patterns=[...]` for full-match regexes (e.g. `^\d{10}$` to stop
+unix timestamps matching as phone numbers) — available in both adapters and
+the API. Presidio-style entity names (`EMAIL_ADDRESS`, `PHONE_NUMBER`,
+`US_SSN`) are accepted as aliases for easy migration.
 
 ## Installation
 
@@ -137,7 +143,7 @@ Use the engine that matches your accuracy and dependency constraints:
 
 - `regex`:
   - Fastest and always available.
-  - Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DOB`, `ZIP`.
+  - Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DATE`, `ZIP_CODE` (`DOB` and `ZIP` are accepted as input aliases).
   - Use `locales=["de"]` for German structured IDs such as `DE_VAT_ID`, `DE_IBAN`, `DE_TAX_ID`, `DE_POSTAL_CODE`, and passport or residence permit numbers.
 - `spacy`:
   - Requires `pip install datafog[nlp]`.

diff --git a/datafog/__init__.py b/datafog/__init__.py
@@ -153,14 +153,28 @@ def scan(
     engine: str = "regex",
     entity_types: list[str] | None = None,
     locales: list[str] | None = None,
+    allowlist: list[str] | None = None,
+    allowlist_patterns: list[str] | None = None,
 ) -> ScanResult:
     """
     v5-preview scan entrypoint.
 
     Defaults to the lightweight regex engine so the core install works without
     optional dependency fallback warnings.
+
+    ``allowlist`` exempts exact entity texts (your own support address, doc
+    placeholders); ``allowlist_patterns`` exempts entities whose full text
+    matches a regex (e.g. ``^\\d{10}$`` so unix timestamps stop matching as
+    phone numbers).
     """
-    return _scan(text=text, engine=engine, entity_types=entity_types, locales=locales)
+    return _scan(
+        text=text,
+        engine=engine,
+        entity_types=entity_types,
+        locales=locales,
+        allowlist=allowlist,
+        allowlist_patterns=allowlist_patterns,
+    )
 
 
 def redact(
@@ -171,12 +185,17 @@ def redact(
     strategy: str = "token",
     preset: str | None = None,
     locales: list[str] | None = None,
+    allowlist: list[str] | None = None,
+    allowlist_patterns: list[str] | None = None,
 ) -> RedactResult:
     """
     v5-preview redaction entrypoint.
 
     If entities are provided, redact those spans. Otherwise, scan text first
-    using the selected engine and redact the detected entities.
+    using the selected engine and redact the detected entities. ``allowlist``
+    and ``allowlist_patterns`` exempt findings from redaction (exact text and
+    full-text regex match respectively); they apply to the scan path and are
+    rejected when explicit ``entities`` are supplied.
     """
     if preset is not None:
         try:
@@ -186,6 +205,11 @@ def redact(
             raise ValueError(f"preset must be one of: {allowed}") from exc
 
     if entities is not None:
+        if allowlist or allowlist_patterns:
+            raise ValueError(
+                "allowlist/allowlist_patterns cannot be combined with explicit "
+                "entities; filter the entities before calling redact"
+            )
         return _redact_entities(text=text, entities=entities, strategy=strategy)
 
     return _scan_and_redact(
@@ -194,6 +218,8 @@ def redact(
         entity_types=entity_types,
         strategy=strategy,
         locales=locales,
+        allowlist=allowlist,
+        allowlist_patterns=allowlist_patterns,
     )
 
 

diff --git a/datafog/engine.py b/datafog/engine.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import hashlib
+import re
 import warnings
 from dataclasses import dataclass
 from functools import lru_cache
@@ -23,6 +24,9 @@
     "SOCIAL_SECURITY_NUMBER": "SSN",
     "CREDIT_CARD_NUMBER": "CREDIT_CARD",
     "DATE_OF_BIRTH": "DATE",
+    # Presidio-compatible aliases, so configs migrate without renames.
+    "EMAIL_ADDRESS": "EMAIL",
+    "US_SSN": "SSN",
 }
 
 ALL_ENTITY_TYPES = {
@@ -277,6 +281,74 @@ def _filter_entity_types(
     return [entity for entity in entities if entity.type in allowed]
 
 
+# Python's re module backtracks; a quantified group containing another
+# quantifier (e.g. ``(a+)+``) can go exponential on attacker-influenced
+# entity text (LLM messages, tool output). Reject that construct outright
+# rather than match under it (heuristic: ``((a+))+`` is not caught).
+_NESTED_QUANTIFIER = re.compile(
+    r"\((?:[^()\\]|\\.)*(?<!\\)[+*}](?:[^()\\]|\\.)*\)\s*[+*{]"
+)
+MAX_ALLOWLIST_PATTERN_LENGTH = 512
+# Entities longer than this skip pattern matching (fail-safe: a pattern
+# can never suppress them) so match time stays bounded.
+MAX_PATTERN_SUBJECT_LENGTH = 512
+
+
+def _compile_allowlist_patterns(
+    allowlist_patterns: Optional[list[str]],
+) -> list["re.Pattern[str]"]:
+    compiled = []
+    for raw in allowlist_patterns or []:
+        if len(raw) > MAX_ALLOWLIST_PATTERN_LENGTH:
+            raise ValueError(
+                "allowlist_patterns entries must be at most "
+                f"{MAX_ALLOWLIST_PATTERN_LENGTH} characters"
+            )
+        if _NESTED_QUANTIFIER.search(raw):
+            raise ValueError(
+                "allowlist_patterns contains a quantified group with a nested "
+                f"quantifier ({raw!r}), which risks catastrophic backtracking; "
+                "rewrite the pattern without nesting quantifiers"
+            )
+        try:
+            compiled.append(re.compile(raw))
+        except re.error as exc:
+            raise ValueError(
+                f"allowlist_patterns contains an invalid regex: {raw!r} ({exc})"
+            ) from None
+    return compiled
+
+
+def _apply_allowlist(
+    entities: list[Entity],
+    allowlist: Optional[list[str]],
+    allowlist_patterns: Optional[list[str]],
+) -> list[Entity]:
+    """Drop entities matched by the exact or pattern allowlist.
+
+    Matching semantics, deliberately strict for a security boundary:
+    exact values are case-sensitive with no Unicode normalization, and
+    patterns must fullmatch the entity text, so a partial match never
+    suppresses a finding. Allowlist entries and patterns are operator
+    configuration; treat them like code and never accept them from end
+    users.
+    """
+    if not allowlist and not allowlist_patterns:
+        return entities
+    exact = set(allowlist or [])
+    patterns = _compile_allowlist_patterns(allowlist_patterns)
+    return [
+        entity
+        for entity in entities
+        if entity.text not in exact
+        and not any(
+            pattern.fullmatch(entity.text)
+            for pattern in patterns
+            if len(entity.text) <= MAX_PATTERN_SUBJECT_LENGTH
+        )
+    ]
+
+
 def _needs_ner(entity_types: Optional[list[str]]) -> bool:
     if entity_types is None:
         return True
@@ -289,14 +361,25 @@ def scan(
     engine: str = "smart",
     entity_types: Optional[list[str]] = None,
     locales: Optional[list[str]] = None,
+    allowlist: Optional[list[str]] = None,
+    allowlist_patterns: Optional[list[str]] = None,
 ) -> ScanResult:
-    """Scan text for PII entities."""
+    """Scan text for PII entities.
+
+    ``allowlist`` exempts exact entity texts (e.g. your own support email);
+    ``allowlist_patterns`` exempts entities whose full text matches a regex
+    (e.g. ``^\\d{10}$`` to stop unix timestamps matching as phone numbers).
+    """
     if not isinstance(text, str):
         raise TypeError("text must be a string")
 
     if engine not in {"regex", "spacy", "gliner", "smart"}:
         raise ValueError("engine must be one of: regex, spacy, gliner, smart")
 
+    # Validate patterns up front so config errors fail fast even when the
+    # text contains no entities.
+    _compile_allowlist_patterns(allowlist_patterns)
+
     regex_entities = _regex_entities(
         text,
         entity_types=entity_types,
@@ -305,6 +388,7 @@ def scan(
 
     if engine == "regex":
         filtered = _filter_entity_types(regex_entities, entity_types)
+        filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns)
         return ScanResult(
             entities=_dedupe_entities(filtered), text=text, engine_used="regex"
         )
@@ -367,6 +451,7 @@ def scan(
                 )
 
     filtered = _filter_entity_types(combined, entity_types)
+    filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns)
     deduped = _dedupe_entities(filtered)
     return ScanResult(
         entities=deduped,
@@ -437,12 +522,16 @@ def scan_and_redact(
     entity_types: Optional[list[str]] = None,
     strategy: str = "token",
     locales: Optional[list[str]] = None,
+    allowlist: Optional[list[str]] = None,
+    allowlist_patterns: Optional[list[str]] = None,
 ) -> RedactResult:
     """Convenience wrapper: scan then redact."""
     scan_result = scan(
         text=text,
         engine=engine,
         entity_types=entity_types,
         locales=locales,
+        allowlist=allowlist,
+        allowlist_patterns=allowlist_patterns,
     )
     return redact(text=text, entities=scan_result.entities, strategy=strategy)
diff --git a/datafog/integrations/claude_code.py b/datafog/integrations/claude_code.py
@@ -16,6 +16,11 @@
 - ``DATAFOG_HOOK_ENTITIES``: comma-separated entity types to detect.
   Defaults to the high-precision set; noisy-in-code types (IP_ADDRESS,
   DOB, ZIP) must be opted into.
+- ``DATAFOG_HOOK_ALLOWLIST``: comma-separated exact values to exempt
+  (your own support address, documentation placeholders).
+- ``DATAFOG_HOOK_ALLOWLIST_PATTERNS``: comma-separated regexes; findings
+  whose full text matches are exempt (note: a pattern containing a comma,
+  including ``{m,n}`` quantifiers, cannot be expressed here).
 
 Failure policy: fail open. A hook bug must never brick a Claude Code
 session, so any unexpected error exits non-blocking with no output.
@@ -59,6 +64,11 @@ def _action(env: Mapping[str, str]) -> str:
     return action if action in VALID_ACTIONS else "ask"
 
 
+def _csv_env(env: Mapping[str, str], name: str) -> list[str]:
+    raw = env.get(name, "")
+    return [item.strip() for item in raw.split(",") if item.strip()]
+
+
 def _iter_strings(value: Any) -> Iterator[str]:
     """Yield every string embedded in a JSON-like structure.
 
@@ -76,7 +86,12 @@ def _iter_strings(value: Any) -> Iterator[str]:
             stack.extend(current)
 
 
-def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]:
+def _scan_findings(
+    value: Any,
+    entity_types: list[str],
+    allowlist: list[str] | None = None,
+    allowlist_patterns: list[str] | None = None,
+) -> dict[str, int]:
     """Scan all strings in ``value``; return counts per entity type."""
     import datafog
 
@@ -87,7 +102,13 @@ def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]:
             break
         chunk = text[: min(MAX_SCAN_CHARS, total_budget)]
         total_budget -= len(chunk)
-        result = datafog.scan(chunk, engine="regex", entity_types=entity_types)
+        result = datafog.scan(
+            chunk,
+            engine="regex",
+            entity_types=entity_types,
+            allowlist=allowlist or None,
+            allowlist_patterns=allowlist_patterns or None,
+        )
         for entity in result.entities:
             counts[entity.type] = counts.get(entity.type, 0) + 1
     return counts
@@ -104,7 +125,12 @@ def _emit(event: str, fields: dict[str, Any]) -> str:
 
 
 def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str:
-    counts = _scan_findings(payload.get("tool_input"), _entity_types(env))
+    counts = _scan_findings(
+        payload.get("tool_input"),
+        _entity_types(env),
+        allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
+        allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
+    )
     if not counts:
         return ""
     tool = payload.get("tool_name", "tool")
@@ -119,7 +145,12 @@ def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str:
 
 
 def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str:
-    counts = _scan_findings(payload.get("prompt"), _entity_types(env))
+    counts = _scan_findings(
+        payload.get("prompt"),
+        _entity_types(env),
+        allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
+        allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
+    )
     if not counts:
         return ""
     context = (
@@ -130,7 +161,12 @@ def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str:
 
 
 def _handle_post_tool_use(payload: dict, env: Mapping[str, str]) -> str:
-    counts = _scan_findings(payload.get("tool_response"), _entity_types(env))
+    counts = _scan_findings(
+        payload.get("tool_response"),
+        _entity_types(env),
+        allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
+        allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
+    )
     if not counts:
         return ""
     tool = payload.get("tool_name", "tool")