Skip to content

Commit ef7c795

Browse files
Sid Mohan
Sid Mohan
authored and
Sid Mohan
committed
first test
1 parent 5e4ddcf commit ef7c795

File tree

6 files changed

+50
-673
lines changed

6 files changed

+50
-673
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,5 @@ build/
1616
node_modules
1717
datafog_debug.log
1818
sotu_2023.txt
19-
/examples/*
19+
/examples/*
20+
.DS_Store

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
setup(
1010
name="datafog",
11-
version="2.2.0",
11+
version="2.3.0b3",
1212
author="Sid Mohan",
1313
author_email="[email protected]",
1414
description="Scan, redact, and manage PII in your documents before they get uploaded to a Retrieval Augmented Generation (RAG) system.",

sotu_2023.txt

-667
This file was deleted.

src/datafog/__about__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# SSOT for the package version
2-
__version__ = "2.2.0"
2+
__version__ = "2.3.0b3"

src/datafog/pii_tools/PresidioEngine/__init__.py

+44-2
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,35 @@
1-
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
1+
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, RecognizerResult, PatternRecognizer, Pattern
22
from presidio_analyzer.nlp_engine import NlpEngineProvider
3-
3+
from typing import List, Optional, Tuple
4+
import logging
45
from .analyzer import CustomSpacyRecognizer
56

7+
# Module-level logger for the Presidio engine package, limited to ERROR and above.
# Logger.setLevel() returns None, so the level is set in a separate statement;
# chaining it onto getLogger() would bind `logger` to None.
logger = logging.getLogger("presidio-engine-init")
logger.setLevel(logging.ERROR)
68

79
# Helper methods
10+
def create_ad_hoc_deny_list_recognizer(
    deny_list: Optional[List[str]] = None,
) -> "Optional[PatternRecognizer]":
    """Build a one-off recognizer that flags every term in ``deny_list``.

    Args:
        deny_list: Terms to flag. ``None`` or an empty list means there is
            nothing to deny.

    Returns:
        A ``PatternRecognizer`` whose supported entity is ``GENERIC_PII``,
        or ``None`` when no terms were supplied.
    """
    # The type belongs in the annotation with ``None`` as the default. Using
    # ``Optional[List[str]]`` as the default *value* makes a no-argument call
    # receive a truthy typing object that slips past this guard.
    if not deny_list:
        return None

    return PatternRecognizer(supported_entity="GENERIC_PII", deny_list=deny_list)
20+
21+
22+
def create_ad_hoc_regex_recognizer(
    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
) -> Optional[PatternRecognizer]:
    """Wrap a single regular expression in a one-off ``PatternRecognizer``.

    Args:
        regex: Regular expression to match. An empty/falsy value disables
            the recognizer.
        entity_type: Entity name passed as the recognizer's supported entity.
        score: Score assigned to the pattern.
        context: Optional context words forwarded to the recognizer.

    Returns:
        The configured ``PatternRecognizer``, or ``None`` when ``regex`` is
        empty.
    """
    if regex:
        return PatternRecognizer(
            supported_entity=entity_type,
            patterns=[Pattern(name="Regex Pattern", regex=regex, score=score)],
            context=context,
        )
    return None
32+
833
def analyzer_engine():
934
"""Return AnalyzerEngine."""
1035

@@ -59,6 +84,23 @@ def scan(text, **kwargs):
5984
kwargs.setdefault("language", "en")
6085
kwargs.setdefault("score_threshold", 0.35)
6186
kwargs.setdefault("nlp_artifacts", None)
87+
kwargs.setdefault("entities", [])
88+
kwargs.setdefault("allow_list", [])
89+
kwargs.setdefault("deny_list", [])
90+
91+
"""Analyze input using Analyzer engine and input arguments (kwargs)."""
92+
if "entities" not in kwargs or "All" in kwargs["entities"]:
93+
kwargs["entities"] = None
94+
95+
if "deny_list" in kwargs and kwargs["deny_list"] is not None:
96+
ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"])
97+
kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
98+
del kwargs["deny_list"]
99+
100+
if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
101+
ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
102+
kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
103+
del kwargs["regex_params"]
62104

63105
# init analyzer instance
64106
analyzer = analyzer_engine()

src/datafog/pii_tools/PresidioEngine/analyzer.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33

44
from presidio_analyzer import AnalysisExplanation, LocalRecognizer, RecognizerResult
55

6-
logger = logging.getLogger("presidio-module")
6+
# Module-level logger for the custom spaCy recognizer, limited to ERROR and above.
# Logger.setLevel() returns None, so the level is set in a separate statement;
# chaining it onto getLogger() would bind `logger` to None.
logger = logging.getLogger("custom-spacy-recognizer")
logger.setLevel(logging.ERROR)
7+
78

89

910
class CustomSpacyRecognizer(LocalRecognizer):

0 commit comments

Comments
 (0)