Skip to content

Commit 470ac08

Browse files
authored
Merge pull request #7 from DataFog/v2.3.0b3
V2.3.0
2 parents 5e4ddcf + 6257657 commit 470ac08

File tree

10 files changed

+78
-872
lines changed

10 files changed

+78
-872
lines changed

.DS_Store

2 KB
Binary file not shown.

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,5 @@ build/
1616
node_modules
1717
datafog_debug.log
1818
sotu_2023.txt
19-
/examples/*
19+
/examples/*
20+
.DS_Store

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
setup(
1010
name="datafog",
11-
version="2.2.0",
11+
version="2.3.0",
1212
author="Sid Mohan",
1313
author_email="[email protected]",
1414
description="Scan, redact, and manage PII in your documents before they get uploaded to a Retrieval Augmented Generation (RAG) system.",

sotu_2023.txt

-667
This file was deleted.

src/datafog/__about__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# Single source of truth (SSOT) for the package version
2-
__version__ = "2.2.0"
2+
__version__ = "2.3.0"

src/datafog/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# datafog-python/src/datafog/__init__.py
22
import json
3+
import logging
34

45
import pandas as pd
56
import requests
@@ -8,6 +9,8 @@
89
from .__about__ import __version__
910
from .pii_tools import PresidioEngine
1011

12+
# Logger.setLevel() returns None, so bind the logger first and configure it
# afterwards; chaining the two calls would leave `logger` set to None.
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
13+
1114
__all__ = [
1215
"__version__",
1316
"PresidioEngine",

src/datafog/pii_tools/PresidioEngine/__init__.py

+70-1
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,69 @@
1-
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
1+
import logging
2+
from typing import List, Optional
3+
4+
from presidio_analyzer import (
5+
AnalyzerEngine,
6+
Pattern,
7+
PatternRecognizer,
8+
RecognizerRegistry,
9+
)
210
from presidio_analyzer.nlp_engine import NlpEngineProvider
311

412
from .analyzer import CustomSpacyRecognizer
513

14+
# Logger.setLevel() returns None, so bind the logger first and configure it
# afterwards; chaining the two calls would leave `logger` set to None.
logger = logging.getLogger("presidio-engine-init")
logger.setLevel(logging.ERROR)
15+
616

717
# Helper methods
18+
def create_ad_hoc_deny_list_recognizer(
19+
deny_list=Optional[List[str]],
20+
) -> Optional[PatternRecognizer]:
21+
if not deny_list:
22+
return None
23+
24+
deny_list_recognizer = PatternRecognizer(
25+
supported_entity="CUSTOM_PII", deny_list=deny_list
26+
)
27+
return deny_list_recognizer
28+
29+
30+
def create_ad_hoc_regex_recognizer(
    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
) -> Optional[PatternRecognizer]:
    """Build a one-off recognizer around a single regular expression.

    Args:
        regex: Regular expression to match; an empty value yields no recognizer.
        entity_type: Entity label the recognizer reports for matches.
        score: Score assigned to the pattern.
        context: Optional context words forwarded to ``PatternRecognizer``.

    Returns:
        A ``PatternRecognizer`` wrapping the pattern, or ``None`` when
        ``regex`` is empty.
    """
    if regex:
        return PatternRecognizer(
            supported_entity=entity_type,
            patterns=[Pattern(name="Regex Pattern", regex=regex, score=score)],
            context=context,
        )
    return None
40+
41+
842
def analyzer_engine():
943
"""Return AnalyzerEngine."""
1044

1145
spacy_recognizer = CustomSpacyRecognizer()
1246
configuration = {
1347
"nlp_engine_name": "spacy",
1448
"models": [{"lang_code": "en", "model_name": "en_spacy_pii_fast"}],
49+
"ner_model_configuration": {
50+
"model_to_presidio_entity_mapping": {
51+
"PER": "PERSON",
52+
"PERSON": "PERSON",
53+
"NORP": "NRP",
54+
"FAC": "FACILITY",
55+
"LOC": "LOCATION",
56+
"GPE": "LOCATION",
57+
"LOCATION": "LOCATION",
58+
"ORG": "ORGANIZATION",
59+
"ORGANIZATION": "ORGANIZATION",
60+
"DATE": "DATE_TIME",
61+
"TIME": "DATE_TIME",
62+
},
63+
"low_confidence_score_multiplier": 0.4,
64+
"low_score_entity_names": ["ORG", "ORGANIZATION"],
65+
"labels_to_ignore": ["DATE_TIME"],
66+
},
1567
}
1668

1769
# Create NLP engine based on configuration
@@ -59,6 +111,23 @@ def scan(text, **kwargs):
59111
kwargs.setdefault("language", "en")
60112
kwargs.setdefault("score_threshold", 0.35)
61113
kwargs.setdefault("nlp_artifacts", None)
114+
kwargs.setdefault("entities", [])
115+
kwargs.setdefault("allow_list", [])
116+
kwargs.setdefault("deny_list", [])
117+
118+
"""Analyze input using Analyzer engine and input arguments (kwargs)."""
119+
if "entities" not in kwargs or "All" in kwargs["entities"]:
120+
kwargs["entities"] = None
121+
122+
if "deny_list" in kwargs and kwargs["deny_list"] is not None:
123+
ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"])
124+
kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
125+
del kwargs["deny_list"]
126+
127+
if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
128+
ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
129+
kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
130+
del kwargs["regex_params"]
62131

63132
# init analyzer instance
64133
analyzer = analyzer_engine()

src/datafog/pii_tools/PresidioEngine/analyzer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from presidio_analyzer import AnalysisExplanation, LocalRecognizer, RecognizerResult
55

6-
logger = logging.getLogger("presidio-module")
6+
# Logger.setLevel() returns None, so bind the logger first and configure it
# afterwards; chaining the two calls would leave `logger` set to None.
logger = logging.getLogger("custom-spacy-recognizer")
logger.setLevel(logging.ERROR)
77

88

99
class CustomSpacyRecognizer(LocalRecognizer):

0 commit comments

Comments
 (0)