VUB-HYDR · i-be-snek · Sep 5, 2024 · Aug 28, 2024 · Aug 28, 2024 · Aug 28, 2024
diff --git a/Database/scr/normalize_numbers.py b/Database/scr/normalize_numbers.py
@@ -1,4 +1,4 @@
-from math import floor, isnan
+from math import isnan
 from typing import Dict, List, Tuple, Union
 
 import regex
@@ -168,14 +168,14 @@ def __init__(self, nlp: spacy.language, locale_config: str):
             "no injuries",
             "no casualties",
             "no deaths",
-            "minimal",
             "no fatalities",
+        ]
+        self.unknown_phrases = [
+            "minimal",
             "negligible",
             "inconsequential",
             "minor",
             "limited",
-        ]
-        self.unknown_phrases = [
             "absent",
             "does not mention",
             "indefinite",
@@ -501,6 +501,25 @@ def _extract_simple_range(self, text: str) -> Tuple[float]:
                 except:
                     return None
 
+    def _get_scale(self, n_init: float | int):
+        """Return ``(n, scale)``, where ``n`` is ``n_init`` with integral floats cast to int.
+        ``scale`` is 10**(number of trailing zeros) for ints, e.g. 7000 -> 1000, 293 -> 1,
+        and 10**(-number of decimal digits in ``str(n)``) for floats, e.g. 3.65 -> 0.01."""
+        n = int(n_init) if isinstance(n_init, float) and n_init.is_integer() else n_init
+        abs_n = abs(n)
+        n_str = str(abs_n)
+
+        if isinstance(n, int):
+            # Count the trailing zeros, e.g. "7000" -> 3; NOTE(review): "0" -> 1, so zero gets scale 10
+            trailing_zeros = len(n_str) - len(n_str.rstrip("0"))
+            scale = 10**trailing_zeros
+
+        elif isinstance(n, float):
+            _, part_dec = n_str.split(".")  # NOTE(review): ValueError if str(n) lacks "." (e.g. "1e-07", "inf")
+            scale = 10 ** (-len(part_dec))
+
+        return n, scale  # NOTE(review): scale is unbound if n is neither int nor float
+
     def _extract_complex_range(self, text: str) -> Tuple[float, float] | None:
         phrases = {
             "approx": {"list": sorted(self.approximately, reverse=True)},
@@ -514,7 +533,9 @@ def _extract_complex_range(self, text: str) -> Tuple[float, float] | None:
             any_digit = "[\d,.]*"
             expression = "({any_digit})\s*({scales})*\s*({phrases})[,.]*\s*({any_digit})\s*({scales})*"
             expression = expression.format(
-                phrases="|".join(v["list"]), scales="|".join(self.scales), any_digit=any_digit
+                phrases="|".join(v["list"]),
+                scales="|".join(self.scales),
+                any_digit=any_digit,
             )
             matches = regex.findall(expression, text, flags=regex.IGNORECASE | regex.MULTILINE)
 
@@ -546,32 +567,20 @@ def _extract_complex_range(self, text: str) -> Tuple[float, float] | None:
                         if any([x in [y.lower() for y in text.split()] for x in self.family_synonyms])
                         else (1, 1)
                     )
-                    scale = pow(10, len(str(int(num))) - 1)
-                    multip = int(str(int(num))[0])
+                    n, scale = self._get_scale(num)
+
                     if k == "approx":
-                        return (
-                            floor(num * 0.95) * lower_mod,
-                            floor(num * 1.05) * upper_mod,
-                        )
+                        return ((max(0, n - scale) * lower_mod), (n + scale) * upper_mod)
                     if "over" in k:
                         inc = 0 if "inclusive" in k else 1
-                        return (
-                            (num + inc) * lower_mod,
-                            num + 5 if (scale == 1 and upper_mod == 1) else ((scale * (multip + 1)) - 1) * upper_mod,
-                        )
+                        return ((n + inc) * lower_mod, (n + scale + inc) * upper_mod)
+
                     if "under" in k:
                         inc = 0 if "inclusive" in k else 1
-                        if (num - (scale * multip)) / num > 0.08:
-                            _min, _max = (
-                                0 if (scale == 1 and multip == 1) else ((scale * multip) + 1) * upper_mod,
-                                (num - inc) * lower_mod,
-                            )
-                        else:
-                            _min, _max = (
-                                0 if (scale == 1 and multip == 1) else ((scale * (multip - 1)) + 1) * upper_mod,
-                                (num - inc) * lower_mod,
-                            )
-                        return (_min, _max)
+                        return (
+                            max(0, n - scale - inc) * lower_mod,
+                            max(0, n - inc) * upper_mod,
+                        )
 
     def _extract_approximate_quantifiers(self, text: str) -> Tuple[float, float] | None:
         one, ten, hun, tho, mil, bil, tri = (

diff --git a/tests/test_normalize_numbers.py b/tests/test_normalize_numbers.py
@@ -136,7 +136,7 @@ def test__check_for_approximation(self, test_input, expected):
             ("0 - 352", (0, 352)),
             ("23- 55", (23, 55)),
             ("24,501-61,672", (24501, 61672)),
-            # not meant to handle this case
+            # cases meant to fail
             (">=12", None),
             ("12", None),
             ("twelve and one hundred", None),
@@ -153,29 +153,29 @@ def test__extract_simple_range(self, test_input, expected):
             ("23mil", (23000000, 23000000, 0)),
             ("110 - 352", (110, 352, 1)),
             ("between 11 and 17 people were affected", (11, 17, 1)),
-            ("Nearly 300 homes were destroyed", (285, 315, 1)),
+            ("Nearly 300 homes were destroyed", (200, 400, 1)),
             ("$3.6 million", (3600000, 3600000, 0)),
             ("$35.63 million", (35630000, 35630000, 0)),
             ("$3.6 million", (3600000, 3600000, 0)),
-            ("Damage: At least $129 million", (129000000, 199999999, 1)),
-            ("At least 73", (73, 79, 1)),
+            ("Damage: At least $129 million", (129000000, 130000000, 1)),
+            ("At least 73", (73, 74, 1)),
             ("925000000", (925000000, 925000000, 0)),
             (925000000, (925000000, 925000000, 0)),
             (23.4, (23.4, 23.4, 0)),
-            ("More than 7010 were killed", (7011, 7999, 1)),
-            ("Less than 400", (301, 399, 1)),
+            ("More than 7010 were killed", (7011, 7021, 1)),
+            ("Less than 400", (299, 399, 1)),
             (
                 "a minimum of 410 billion",
-                (410000000000, 499999999999, 1),
+                (410000000000, 420000000000, 1),
             ),
-            ("603+", (603, 699, 1)),
-            (">=293", (293, 299, 1)),
-            ("~293", (278, 307, 1)),
-            (">= $27 million", (27000000, 29999999, 1)),
-            ("$27 million or more", (27000000, 29999999, 1)),
-            ("about A$500 million", (475000000, 525000000, 1)),
-            ("over US$500 million", (500000001, 599999999, 1)),
-            ("over USD 1.0 billion", (1000000001, 1999999999, 1)),
+            ("603+", (603, 604, 1)),
+            (">=293", (293, 294, 1)),
+            ("~293", (292, 294, 1)),
+            (">= $27 million", (27000000, 28000000, 1)),
+            ("$27 million or more", (27000000, 28000000, 1)),
+            ("about A$500 million", (400000000, 600000000, 1)),
+            ("over US$500 million", (500000001, 600000001, 1)),
+            ("over USD 1.0 billion", (1000000001, 2000000001, 1)),
             ("15 billion yuan", (15000000000, 15000000000, 0)),
             ("100 million pesos", (100000000, 100000000, 0)),
             ("between 20.2 and 30.4", (20.2, 30.4, 1)),
@@ -216,34 +216,34 @@ def test__extract_approximate_quantifiers(self, test_input, expected):
         "test_input, expected",
         [
             # approx
-            ("Almost 30", (28, 31)),  # rounded down! (floor(30*0.95), floor(30*1.05))
-            ("approximately 7000000000 dollars", (6650000000, 7350000000)),
-            ("Around 7000 homes were destroyed", (6650, 7350)),
-            ("roughly, 4 injuries had been reported", (3, 4)),  # rounded down! (floor(4*0.95), floor(4*1.05))
-            ("~45", (42, 47)),
+            ("Almost 30", (20, 40)),
+            ("approximately 7000000000 dollars", (6000000000, 8000000000)),
+            ("Around 7000 homes were destroyed", (6000, 8000)),
+            ("roughly, 4 injuries had been reported", (3, 5)),
+            ("~45", (44, 46)),
             # over
-            ("Greater than 300", (301, 399)),
-            ("The number of deaths certainly exceeded 66", (67, 69)),
-            ("more than 6 families were displaced", ((6 + 1) * 3, 6 * 5)),
-            ("at least 3600 were reported missing", (3600, 3999)),
-            ("no less than 55 injuries were reported in the media", (55, 59)),
-            ("> 45", (46, 49)),
-            (">=5", (5, 10)),  # created range by adding 5 since scale == 1
-            ("greater than or equal to 9", (9, 14)),  # created range by adding 5 since scale == 1
-            ("45+ deaths were reported by the news", (45, 49)),
-            ("311,000,000+ Euros", (311000000, 399999999)),
-            (">693 million", (693000001, 699999999)),
+            ("Greater than 300", (301, 401)),
+            ("The number of deaths certainly exceeded 66", (67, 68)),
+            ("more than 6 families were displaced", (7 * 3, 8 * 5)),  # 7*3, 7*5
+            ("at least 3600 were reported missing", (3600, 3700)),
+            ("no less than 55 injuries were reported in the media", (55, 56)),
+            ("> 45", (46, 47)),
+            (">=5", (5, 6)),
+            ("greater than or equal to 9", (9, 10)),
+            ("45+ deaths were reported by the news", (45, 46)),
+            ("311,000,000+ Euros", (311000000, 312000000)),
+            (">693 million", (693000001, 694000001)),
             # under
-            ("less than 230000000 dollars were paid out in insurance costs", (200000001, 229999999)),
-            ("No more than 23 million dollars", (20000001, 23000000)),
-            ("Up to 7 billion dollars", (6000000001, 7000000000)),
-            ("at most 3284 casualties were reported", (3001, 3284)),
-            ("Up to 7000000 dollars", (6000001, 7000000)),
-            ("Up to 7,000,000 dollars", (6000001, 7000000)),
-            ("less than 1", (0, 0)),
-            ("no more than 1 was injured", (0, 1)),
-            ("≤7000000", (6000001, 7000000)),
-            # cases this function does not handle; meant to raise BaseException
+            ("less than 230000000 dollars were paid out in insurance costs", (219999999, 229999999)),
+            ("No more than 23 million dollars", (22000000, 23000000)),
+            ("Up to 7 billion dollars", (6000000000, 7000000000)),
+            ("at most 3284 casualties were reported", (3283, 3284)),
+            ("Up to 7000000 dollars", (6000000, 7000000)),
+            ("Up to 7,000,000 dollars", (6000000, 7000000)),
+            ("less than 1", (0, 0)),  # 1,1
+            ("no more than 1 was injured", (0, 1)),  # 1,1
+            ("≤7000000", (6000000, 7000000)),
+            # cases meant to fail
             ("six families were displaced", None),
         ],
     )