|
1 |
| -from presidio_analyzer import AnalyzerEngine, RecognizerRegistry |
| 1 | +from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, RecognizerResult, PatternRecognizer, Pattern |
2 | 2 | from presidio_analyzer.nlp_engine import NlpEngineProvider
|
3 |
| - |
| 3 | +from typing import List, Optional, Tuple |
| 4 | +import logging |
4 | 5 | from .analyzer import CustomSpacyRecognizer
|
5 | 6 |
|
# ``Logger.setLevel`` returns ``None``, so the level is configured in a
# separate statement; chaining the two calls would bind ``logger`` to ``None``.
logger = logging.getLogger("presidio-engine-init")
logger.setLevel(logging.ERROR)
6 | 8 |
|
7 | 9 | # Helper methods
|
def create_ad_hoc_deny_list_recognizer(
    deny_list: Optional[List[str]] = None,
) -> Optional[PatternRecognizer]:
    """Build a one-off recognizer from a list of denied terms.

    Args:
        deny_list: Terms handed to ``PatternRecognizer`` as its deny list.
            ``None`` or an empty list means there is nothing to match.

    Returns:
        A ``PatternRecognizer`` whose supported entity is ``GENERIC_PII``,
        or ``None`` when ``deny_list`` is empty or not provided.
    """
    # The parameter is annotated (``:``) rather than defaulted (``=``) to the
    # typing object: a typing object is truthy and would slip past this guard.
    if not deny_list:
        return None

    return PatternRecognizer(supported_entity="GENERIC_PII", deny_list=deny_list)
| 20 | + |
| 21 | + |
def create_ad_hoc_regex_recognizer(
    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
) -> Optional[PatternRecognizer]:
    """Build a one-off recognizer around a single regular expression.

    Args:
        regex: Regular expression handed to the ``Pattern``. An empty or
            ``None`` value means no recognizer is created.
        entity_type: Entity name the recognizer is registered for.
        score: Score assigned to the ``Pattern``.
        context: Optional context words forwarded to ``PatternRecognizer``.

    Returns:
        A ``PatternRecognizer`` holding one ``Pattern`` named
        ``"Regex Pattern"``, or ``None`` when ``regex`` is empty.
    """
    if not regex:
        return None

    pattern = Pattern(name="Regex Pattern", regex=regex, score=score)
    return PatternRecognizer(
        supported_entity=entity_type, patterns=[pattern], context=context
    )
| 32 | + |
8 | 33 | def analyzer_engine():
|
9 | 34 | """Return AnalyzerEngine."""
|
10 | 35 |
|
@@ -59,6 +84,23 @@ def scan(text, **kwargs):
|
59 | 84 | kwargs.setdefault("language", "en")
|
60 | 85 | kwargs.setdefault("score_threshold", 0.35)
|
61 | 86 | kwargs.setdefault("nlp_artifacts", None)
|
| 87 | + kwargs.setdefault("entities", []) |
| 88 | + kwargs.setdefault("allow_list", []) |
| 89 | + kwargs.setdefault("deny_list", []) |
| 90 | + |
| 91 | + """Analyze input using Analyzer engine and input arguments (kwargs).""" |
| 92 | + if "entities" not in kwargs or "All" in kwargs["entities"]: |
| 93 | + kwargs["entities"] = None |
| 94 | + |
| 95 | + if "deny_list" in kwargs and kwargs["deny_list"] is not None: |
| 96 | + ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"]) |
| 97 | + kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else [] |
| 98 | + del kwargs["deny_list"] |
| 99 | + |
| 100 | + if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0: |
| 101 | + ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"]) |
| 102 | + kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else [] |
| 103 | + del kwargs["regex_params"] |
62 | 104 |
|
63 | 105 | # init analyzer instance
|
64 | 106 | analyzer = analyzer_engine()
|
|
0 commit comments