Randomize year (#31)

augustak · magnurud · web-flow · commit 2bae1f46eccb · 2023-06-07T13:35:18.000+02:00
* Initial commit

* Randomize years synth

* Updated PikePDF version

* Updates

* Updated version

---------

Co-authored-by: Magnus Rud <mag.aars@gmail.com>
diff --git a/examples/randomize-years/synthesizer.py b/examples/randomize-years/synthesizer.py
@@ -0,0 +1,67 @@
+import random
+import re
+import copy
+from datetime import datetime
+from typing import Dict, List
+
+from synthetic.core.ground_truth import parse_labels_in_ground_truth
+from synthetic.pdf.synthesizer import PdfSynthesizer
+from synthetic.pdf.utils import Font
+
+
+# Courtesy of https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729
+def multireplace(string, replacements):
+    """
+    Given a string and a replacement map, it returns the replaced string.
+
+    :param str string: string to execute replacements on
+    :param dict replacements: replacement dictionary {value to find: value to replace}
+    :rtype: str
+
+    """
+    if not replacements:
+        return string
+    # Place longer ones first to keep shorter substrings from matching
+    # where the longer ones should take place
+    # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against
+    # the string 'hey abc', it should produce 'hey ABC' and not 'hey ABc'
+    substrs = sorted(replacements, key=len, reverse=True)
+
+    # Create a big OR regex that matches any of the substrings to replace
+    regexp = re.compile('|'.join(map(re.escape, substrs)))
+
+    # For each match, look up the new string in the replacements
+    return regexp.sub(lambda match: replacements[match.group(0)], string)
+
+
+class RandomizeYearsSynthesizer(PdfSynthesizer):
+    """
+    This synthesizer will only randomize the years in date fields
+    """
+    YEARS = [str(n) for n in range(2016, 2029) if n != 2020]
+    DATE_FIELDS = ['due_date', 'invoice_date']
+    def __init__(self, ground_truth: List[dict], font_map: Dict[str, Font]):
+        super().__init__(ground_truth, font_map)
+        self.year_map = {}
+        self.post_year_map = {}
+        for label, value, match in parse_labels_in_ground_truth(ground_truth):
+            if label in self.DATE_FIELDS:
+                year = str(datetime.strptime(value.strip(), '%Y-%m-%d').year)
+                if year not in self.year_map:
+                    dst_year = random.choice(self.YEARS)
+                    self.year_map[year[2:]] = dst_year[2:]
+                    if year == '2020':
+                        self.post_year_map[dst_year[2:]*2] = dst_year
+
+    def reset(self):
+        # No need to create a new substitution map, all we need is a new year for the date fields
+        self.year_map = {year: random.choice(self.YEARS)[2:] for year in self.year_map}
+
+    def modify_text(self, text: str, **kwargs):
+        return multireplace(multireplace(text, self.year_map), self.post_year_map)
+
+    def create_new_ground_truth(self):
+        ground_truth = copy.deepcopy(self.ground_truth)
+        for label, value, match in parse_labels_in_ground_truth(ground_truth):
+            match.context.value.update({'value': multireplace(multireplace(value, self.year_map), self.post_year_map)})
+        return ground_truth
diff --git a/requirements.txt b/requirements.txt
@@ -6,7 +6,7 @@ filetype>=1.0.13
 jsonpath-ng>=1.5.3
 numpy>=1.21.6
 pdfminer.six>=20220319
-pikepdf>=5.1.2
+pikepdf>=7.2.0
 pyamg>=4.2.3
 scikit-image>=0.19.3
 scikit-learn>=1.0.2
diff --git a/synthetic/__version__.py b/synthetic/__version__.py
@@ -7,4 +7,4 @@
 __maintainer_email__ = 'august@lucidtech.ai'
 __title__ = 'lucidtech-synthetic'
 __url__ = 'https://github.com/LucidtechAI/synthetic'
-__version__ = '0.4.6'
+__version__ = '0.5.0'