1
1
import json
2
- from typing import Dict , List , Iterable , Union , Any
2
+ from typing import Any , Dict , Iterable , List , Union
3
+
3
4
from presidio_analyzer import AnalyzerEngine , BatchAnalyzerEngine , RecognizerResult
4
5
from presidio_anonymizer import AnonymizerEngine , BatchAnonymizerEngine
5
6
7
+
6
8
def presidio_batch_init():
    """Create the Presidio engines used for batch scanning and redaction.

    Returns:
        A 3-tuple ``(analyzer, batch_analyzer, batch_anonymizer)``: the
        ``AnalyzerEngine``, a ``BatchAnalyzerEngine`` wrapping that same
        analyzer, and a ``BatchAnonymizerEngine`` wrapping a newly created
        ``AnonymizerEngine``.
    """
    single_analyzer = AnalyzerEngine()
    # The batch analyzer shares the single-text analyzer built above.
    engines = (
        single_analyzer,
        BatchAnalyzerEngine(analyzer_engine=single_analyzer),
        BatchAnonymizerEngine(anonymizer_engine=AnonymizerEngine()),
    )
    return engines
11
13
14
+
12
15
def batch_scan(text: Dict[str, str], batch_analyzer: BatchAnalyzerEngine) -> List[str]:
    """Analyze every value of ``text`` and return one JSON string per result.

    Args:
        text: Mapping passed to ``batch_analyzer.analyze_dict``.
        batch_analyzer: Engine whose ``analyze_dict`` is called with
            ``language="en"``.

    Returns:
        A list of JSON documents, each with the keys ``"key"``, ``"value"``
        and ``"recognizer_results"`` (the latter produced by
        ``serialize_recognizer_results``).
    """
    encoded: List[str] = []
    for entry in batch_analyzer.analyze_dict(text, language="en"):
        payload = {
            "key": entry.key,
            "value": entry.value,
            "recognizer_results": serialize_recognizer_results(
                entry.recognizer_results
            ),
        }
        encoded.append(json.dumps(payload))
    return encoded
29
+
30
+
31
+ from typing import Dict , Iterator , List , Optional , Union
19
32
20
- from typing import Union , List , Dict , Optional , Iterator
21
33
from presidio_analyzer import RecognizerResult
22
34
23
def serialize_recognizer_results(
    recognizer_results: Union[
        List[RecognizerResult], List[List[RecognizerResult]], Iterator[RecognizerResult]
    ]
) -> Optional[
    Union[
        List[Dict[str, Union[str, int, float, None]]],
        List[List[Dict[str, Union[str, int, float, None]]]],
    ]
]:
    """Convert recognizer results into plain dictionaries.

    Args:
        recognizer_results: A list of results, a list of lists of results,
            or an iterator over either.

    Returns:
        A list of dicts (keys ``entity_type``, ``start``, ``end``, ``score``,
        ``analysis_explanation``) mirroring the nesting of the input, an
        empty list for empty input, or ``None`` when the input is not a
        list/iterator or its elements are not recognised.
    """
    # Materialise iterators so they go through the same list handling below;
    # recursing on each yielded item would turn single results into None.
    if isinstance(recognizer_results, Iterator):
        recognizer_results = list(recognizer_results)

    if not isinstance(recognizer_results, list):
        return None

    # "No findings" is an empty list, not None, so the JSON form stays
    # iterable for consumers.
    if not recognizer_results:
        return []

    first = recognizer_results[0]
    if isinstance(first, list) or isinstance(first, Iterator):
        return [serialize_recognizer_results(group) for group in recognizer_results]

    # Results are recognised by attribute rather than by
    # isinstance(..., RecognizerResult): this module imports a
    # `RecognizerResult` from both presidio_analyzer and
    # presidio_anonymizer.entities, so the global name may not refer to the
    # analyzer's class by the time this function is called.
    if hasattr(first, "entity_type"):
        return [
            {
                "entity_type": r.entity_type,
                "start": r.start,
                "end": r.end,
                "score": r.score,
                # NOTE(review): passed through unchanged; if this is a
                # non-None explanation object, json.dumps on the result
                # will presumably fail - confirm whether it needs converting.
                "analysis_explanation": getattr(r, "analysis_explanation", None),
            }
            for r in recognizer_results
        ]

    return None
39
-
40
- from typing import Union , List , Dict , Iterable , Any
64
+
65
+
66
+ from typing import Any , Dict , Iterable , List , Union
67
+
41
68
from presidio_anonymizer import AnonymizerEngine , BatchAnonymizerEngine
42
69
from presidio_anonymizer .entities import DictRecognizerResult , RecognizerResult
43
70
44
def _deserialize_recognizer_results(serialized):
    """Flatten serialized findings into a flat list of ``RecognizerResult``.

    Accepts ``None``, a single result dict, a flat list of result dicts, or
    arbitrarily nested lists of them; falsy entries are skipped.
    """
    if isinstance(serialized, dict):
        return [
            RecognizerResult(
                entity_type=serialized["entity_type"],
                start=serialized["start"],
                end=serialized["end"],
                score=serialized["score"],
            )
        ]
    if not serialized:
        return []
    flattened = []
    for item in serialized:
        flattened.extend(_deserialize_recognizer_results(item))
    return flattened


def batch_redact(
    input_data: Union[Dict[str, str], List[str]],
    results: List[str],
    anonymizer: BatchAnonymizerEngine,
    **kwargs
) -> Union[List[str], Dict[str, str]]:
    """Anonymize ``input_data`` using previously serialized analyzer results.

    Args:
        input_data: Either a dict or a list; selects ``anonymize_dict`` or
            ``anonymize_list`` on ``anonymizer``.
        results: JSON strings, each containing a ``"recognizer_results"``
            entry (and, for dict input, ``"key"`` and ``"value"``).
        anonymizer: Batch anonymizer that performs the redaction.
        **kwargs: Forwarded unchanged to the anonymizer call.

    Returns:
        Whatever ``anonymize_dict`` / ``anonymize_list`` returns.

    Raises:
        ValueError: If ``input_data`` is neither a dict nor a list.
    """
    if isinstance(input_data, dict):
        # Input is a dictionary, perform anonymize_dict
        analyzer_results = []
        for result in results:
            result_dict = json.loads(result)
            analyzer_results.append(
                DictRecognizerResult(
                    key=result_dict["key"],
                    value=result_dict["value"],
                    # The serialized findings may be null, a flat list of
                    # dicts, or nested lists; all are flattened here.
                    recognizer_results=_deserialize_recognizer_results(
                        result_dict["recognizer_results"]
                    ),
                )
            )
        return anonymizer.anonymize_dict(analyzer_results=analyzer_results, **kwargs)

    if isinstance(input_data, list):
        # NOTE(review): the setup lines of this branch were not visible in
        # the reviewed hunk; `texts = input_data` is assumed - confirm.
        texts = input_data
        analyzer_results = [
            _deserialize_recognizer_results(json.loads(result)["recognizer_results"])
            for result in results
        ]
        return anonymizer.anonymize_list(
            texts=texts, recognizer_results_list=analyzer_results, **kwargs
        )

    raise ValueError("Invalid input type. Expected Dict[str, str] or List[str].")
0 commit comments