Skip to content

Commit 2bae1f4

Browse files
augustak and magnurud authored
Randomize year (#31)
* Initial commit * Randomize years synth * Updated PikePDF version * Updates * Updated version --------- Co-authored-by: Magnus Rud <[email protected]>
1 parent c816320 commit 2bae1f4

File tree

3 files changed

+69
-2
lines changed

3 files changed

+69
-2
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import random
2+
import re
3+
import copy
4+
from datetime import datetime
5+
from typing import Dict, List
6+
7+
from synthetic.core.ground_truth import parse_labels_in_ground_truth
8+
from synthetic.pdf.synthesizer import PdfSynthesizer
9+
from synthetic.pdf.utils import Font
10+
11+
12+
# Courtesy of https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729
13+
def multireplace(string, replacements):
    """
    Apply every substitution in a replacement map to a string in a single pass.

    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :rtype: str

    """
    if replacements:
        # Try the longest keys first so that a short key cannot steal a match
        # from a longer key that starts with it: {'ab': 'AB', 'abc': 'ABC'}
        # applied to 'hey abc' has to give 'hey ABC', never 'hey ABc'
        ordered_keys = list(replacements)
        ordered_keys.sort(key=len, reverse=True)

        # One alternation of all (escaped) keys; the order of the alternatives
        # is what enforces the longest-first rule above
        pattern = re.compile('|'.join(re.escape(key) for key in ordered_keys))

        def substitute(found):
            # The matched text is always one of the keys of the map
            return replacements[found.group(0)]

        return pattern.sub(substitute, string)

    # Nothing to replace, hand the input back untouched
    return string
35+
36+
37+
class RandomizeYearsSynthesizer(PdfSynthesizer):
    """
    This synthesizer will only randomize the years in date fields

    The years are collected from the ground truth labels listed in DATE_FIELDS.
    Only the last two digits of a year are mapped, and the mapping is applied
    as plain text substitution, so every occurrence of those two digits in the
    text (and in every ground truth value) is rewritten, not just the dates.

    NOTE: when 2020 is one of the source years, the key '20' also matches the
    century prefix of other years ('2019' becomes '<xx>19'). post_year_map only
    repairs the doubled form produced from the text '2020' itself.
    """
    # Candidate replacement years; 2020 is deliberately left out
    # (presumably because both of its halves are '20' - TODO confirm)
    YEARS = [str(n) for n in range(2016, 2029) if n != 2020]
    # Ground truth labels whose values are parsed as '%Y-%m-%d' dates
    DATE_FIELDS = ['due_date', 'invoice_date']

    def __init__(self, ground_truth: List[dict], font_map: Dict[str, Font]):
        """
        Build the year substitution maps from the date fields of the ground truth.

        :param ground_truth: ground truth labels, forwarded to PdfSynthesizer
        :param font_map: font map, forwarded to PdfSynthesizer
        :raises ValueError: if a date field value is not formatted as '%Y-%m-%d'
        """
        super().__init__(ground_truth, font_map)
        # two-digit source year -> two-digit replacement year
        self.year_map = {}
        # doubled two-digit replacement (what '2020' turns into) -> full replacement year
        self.post_year_map = {}
        for label, value, match in parse_labels_in_ground_truth(ground_truth):
            if label not in self.DATE_FIELDS:
                continue
            year = str(datetime.strptime(value.strip(), '%Y-%m-%d').year)
            suffix = year[2:]
            # The map is keyed by the two-digit suffix, so the membership test
            # has to use the suffix too; testing the four-digit year never hit
            # and re-rolled the mapping for every repeated date field
            if suffix in self.year_map:
                continue
            dst_year = random.choice(self.YEARS)
            self.year_map[suffix] = dst_year[2:]
            if year == '2020':
                # Both halves of '2020' match the key '20', giving e.g. '2121';
                # the second pass turns that back into a real year ('2021')
                self.post_year_map[dst_year[2:] * 2] = dst_year

    def reset(self) -> None:
        """
        Draw a new random replacement year for every source year already mapped.
        """
        # No need to create a new substitution map, all we need is a new year for the date fields
        # post_year_map is only ever filled when 2020 was among the source years
        needs_fixup = bool(self.post_year_map)
        year_map = {}
        post_year_map = {}
        for suffix in self.year_map:
            dst_year = random.choice(self.YEARS)
            year_map[suffix] = dst_year[2:]
            if needs_fixup and suffix == '20':
                # Keep the '2020' repair entry in sync with the new replacement,
                # otherwise '2020' would come out as an unrepaired doubled suffix
                post_year_map[dst_year[2:] * 2] = dst_year
        self.year_map = year_map
        self.post_year_map = post_year_map

    def _replace_years(self, text: str) -> str:
        # Two passes: swap the two-digit years, then repair the doubled '2020' form
        return multireplace(multireplace(text, self.year_map), self.post_year_map)

    def modify_text(self, text: str, **kwargs):
        """
        Return the text with the mapped years substituted.

        :param text: text to rewrite
        :param kwargs: accepted but not used by this synthesizer
        """
        return self._replace_years(text)

    def create_new_ground_truth(self):
        """
        Return a deep copy of the ground truth with the years substituted in every value.

        The substitution is applied to all labels, not only the date fields,
        which is the same substitution modify_text applies to the text.
        """
        ground_truth = copy.deepcopy(self.ground_truth)
        for label, value, match in parse_labels_in_ground_truth(ground_truth):
            match.context.value.update({'value': self._replace_years(value)})
        return ground_truth

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ filetype>=1.0.13
66
jsonpath-ng>=1.5.3
77
numpy>=1.21.6
88
pdfminer.six>=20220319
9-
pikepdf>=5.1.2
9+
pikepdf>=7.2.0
1010
pyamg>=4.2.3
1111
scikit-image>=0.19.3
1212
scikit-learn>=1.0.2

synthetic/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
__maintainer_email__ = '[email protected]'
88
__title__ = 'lucidtech-synthetic'
99
__url__ = 'https://github.com/LucidtechAI/synthetic'
10-
__version__ = '0.4.6'
10+
__version__ = '0.5.0'

0 commit comments

Comments
 (0)