Skip to content

Commit d599f4d

Browse files
Sid Mohan authored and Sid Mohan committed
v2.3.0
1 parent ef7c795 commit d599f4d

File tree

6 files changed

+22 -203 lines changed

.DS_Store

2 KB
Binary file not shown.

setup.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88

99
setup(
1010
name="datafog",
11-
version="2.3.0b3",
11+
version="2.3.0",
1212
author="Sid Mohan",
1313
author_email="[email protected]",
1414
description="Scan, redact, and manage PII in your documents before they get uploaded to a Retrieval Augmented Generation (RAG) system.",

src/datafog/__about__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
11
# SSOT for the package version
2-
__version__ = "2.3.0b3"
2+
__version__ = "2.3.0"

src/datafog/pii_tools/PresidioEngine/__init__.py

+20-1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ def create_ad_hoc_deny_list_recognizer(
1414
return None
1515

1616
deny_list_recognizer = PatternRecognizer(
17-
supported_entity="GENERIC_PII", deny_list=deny_list
17+
supported_entity="CUSTOM_PII", deny_list=deny_list
1818
)
1919
return deny_list_recognizer
2020

@@ -37,6 +37,25 @@ def analyzer_engine():
3737
configuration = {
3838
"nlp_engine_name": "spacy",
3939
"models": [{"lang_code": "en", "model_name": "en_spacy_pii_fast"}],
40+
"ner_model_configuration": {
41+
"model_to_presidio_entity_mapping": {
42+
"PER": "PERSON",
43+
"PERSON": "PERSON",
44+
"NORP": "NRP",
45+
"FAC": "FACILITY",
46+
"LOC": "LOCATION",
47+
"GPE": "LOCATION",
48+
"LOCATION": "LOCATION",
49+
"ORG": "ORGANIZATION",
50+
"ORGANIZATION": "ORGANIZATION",
51+
"DATE": "DATE_TIME",
52+
"TIME": "DATE_TIME",
53+
},
54+
"low_confidence_score_multiplier": 0.4,
55+
"low_score_entity_names": ["ORG", "ORGANIZATION"],
56+
"labels_to_ignore": ["DATE_TIME"],
57+
},
58+
4059
}
4160

4261
# Create NLP engine based on configuration

tests/files/output_files/output.md

-100
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,7 @@ PERSON
22
LOCATION
33
PERSON
44
PERSON
5-
DATE_TIME
65
NRP
7-
DATE_TIME
86
PERSON
97
NRP
108
NRP
@@ -16,22 +14,11 @@ PERSON
1614
LOCATION
1715
PERSON
1816
LOCATION
19-
DATE_TIME
20-
DATE_TIME
21-
DATE_TIME
22-
DATE_TIME
2317
NRP
24-
DATE_TIME
25-
DATE_TIME
26-
DATE_TIME
27-
DATE_TIME
28-
DATE_TIME
29-
DATE_TIME
3018
NRP
3119
LOCATION
3220
NRP
3321
NRP
34-
DATE_TIME
3522
NRP
3623
NRP
3724
NRP
@@ -40,35 +27,26 @@ NRP
4027
LOCATION
4128
LOCATION
4229
LOCATION
43-
DATE_TIME
4430
PERSON
4531
PERSON
46-
DATE_TIME
4732
PERSON
4833
NRP
49-
DATE_TIME
5034
LOCATION
5135
NRP
5236
NRP
5337
PERSON
5438
LOCATION
5539
LOCATION
5640
LOCATION
57-
DATE_TIME
58-
DATE_TIME
5941
NRP
60-
DATE_TIME
6142
NRP
6243
LOCATION
6344
LOCATION
64-
DATE_TIME
65-
DATE_TIME
6645
NRP
6746
LOCATION
6847
LOCATION
6948
LOCATION
7049
NRP
71-
DATE_TIME
7250
LOCATION
7351
LOCATION
7452
LOCATION
@@ -88,19 +66,13 @@ NRP
8866
LOCATION
8967
LOCATION
9068
LOCATION
91-
DATE_TIME
9269
LOCATION
93-
DATE_TIME
94-
DATE_TIME
9570
NRP
9671
NRP
9772
PERSON
98-
DATE_TIME
9973
PERSON
10074
PERSON
10175
PERSON
102-
DATE_TIME
103-
DATE_TIME
10476
LOCATION
10577
PERSON
10678
LOCATION
@@ -111,30 +83,20 @@ LOCATION
11183
NRP
11284
NRP
11385
PERSON
114-
DATE_TIME
11586
NRP
11687
NRP
117-
DATE_TIME
11888
LOCATION
11989
LOCATION
12090
PERSON
12191
NRP
12292
NRP
12393
NRP
124-
DATE_TIME
125-
DATE_TIME
126-
DATE_TIME
12794
LOCATION
128-
DATE_TIME
129-
DATE_TIME
13095
LOCATION
13196
NRP
132-
DATE_TIME
133-
DATE_TIME
13497
NRP
13598
PERSON
13699
NRP
137-
DATE_TIME
138100
LOCATION
139101
NRP
140102
LOCATION
@@ -146,64 +108,37 @@ LOCATION
146108
LOCATION
147109
PERSON
148110
NRP
149-
DATE_TIME
150111
LOCATION
151112
PERSON
152113
LOCATION
153-
DATE_TIME
154-
DATE_TIME
155114
NRP
156115
NRP
157-
DATE_TIME
158-
DATE_TIME
159-
DATE_TIME
160116
NRP
161-
DATE_TIME
162-
DATE_TIME
163117
LOCATION
164118
NRP
165119
NRP
166120
NRP
167121
NRP
168122
NRP
169-
DATE_TIME
170123
PERSON
171124
NRP
172125
NRP
173-
DATE_TIME
174-
DATE_TIME
175-
DATE_TIME
176126
NRP
177-
DATE_TIME
178-
DATE_TIME
179127
NRP
180128
NRP
181129
NRP
182-
DATE_TIME
183130
NRP
184131
LOCATION
185132
NRP
186-
DATE_TIME
187-
DATE_TIME
188-
DATE_TIME
189-
DATE_TIME
190-
DATE_TIME
191-
DATE_TIME
192133
PERSON
193-
DATE_TIME
194134
LOCATION
195-
DATE_TIME
196135
NRP
197136
NRP
198137
NRP
199138
PERSON
200139
LOCATION
201-
DATE_TIME
202140
NRP
203-
DATE_TIME
204-
DATE_TIME
205141
PERSON
206-
DATE_TIME
207142
LOCATION
208143
PERSON
209144
PERSON
@@ -217,19 +152,11 @@ PERSON
217152
PERSON
218153
PERSON
219154
LOCATION
220-
DATE_TIME
221-
DATE_TIME
222-
DATE_TIME
223155
PERSON
224-
DATE_TIME
225156
PERSON
226157
PERSON
227158
PERSON
228-
DATE_TIME
229-
DATE_TIME
230-
DATE_TIME
231159
NRP
232-
DATE_TIME
233160
LOCATION
234161
LOCATION
235162
LOCATION
@@ -240,18 +167,14 @@ PERSON
240167
PERSON
241168
PERSON
242169
NRP
243-
DATE_TIME
244-
DATE_TIME
245170
PERSON
246171
LOCATION
247172
LOCATION
248173
PERSON
249174
LOCATION
250-
DATE_TIME
251175
LOCATION
252176
PERSON
253177
NRP
254-
DATE_TIME
255178
NRP
256179
LOCATION
257180
LOCATION
@@ -260,31 +183,22 @@ PERSON
260183
LOCATION
261184
NRP
262185
LOCATION
263-
DATE_TIME
264-
DATE_TIME
265186
LOCATION
266187
LOCATION
267188
NRP
268-
DATE_TIME
269189
LOCATION
270-
DATE_TIME
271190
PERSON
272191
LOCATION
273192
LOCATION
274193
LOCATION
275194
LOCATION
276-
DATE_TIME
277195
LOCATION
278-
DATE_TIME
279-
DATE_TIME
280196
PERSON
281197
LOCATION
282198
LOCATION
283199
PERSON
284200
PERSON
285201
NRP
286-
DATE_TIME
287-
DATE_TIME
288202
PERSON
289203
NRP
290204
PERSON
@@ -294,43 +208,29 @@ LOCATION
294208
PERSON
295209
PERSON
296210
LOCATION
297-
DATE_TIME
298211
LOCATION
299-
DATE_TIME
300212
PERSON
301213
PERSON
302-
DATE_TIME
303214
LOCATION
304215
LOCATION
305216
NRP
306217
LOCATION
307218
LOCATION
308219
PERSON
309220
LOCATION
310-
DATE_TIME
311221
PERSON
312222
PERSON
313223
PERSON
314-
DATE_TIME
315224
PERSON
316225
LOCATION
317-
DATE_TIME
318226
NRP
319227
PERSON
320-
DATE_TIME
321-
DATE_TIME
322-
DATE_TIME
323-
DATE_TIME
324-
DATE_TIME
325228
PERSON
326229
LOCATION
327230
NRP
328231
NRP
329232
NRP
330-
DATE_TIME
331-
DATE_TIME
332233
NRP
333234
LOCATION
334-
DATE_TIME
335235
LOCATION
336236
LOCATION

0 commit comments

Comments
 (0)