🍻 Add drunken code to reset later
i-be-snek committed Jul 30, 2024
1 parent 767c709 commit 2fe2351
Showing 2 changed files with 203 additions and 29 deletions.
222 changes: 194 additions & 28 deletions Database/scr/normalize_numbers.py
@@ -60,7 +60,9 @@ def _preprocess(self, text: str):
regex.sub(
"[A-Za-z]+",
lambda ele: (
f" {ele[0]} " if ele[0] not in lookup["style_2"].keys() else f' {lookup["style_2"][ele[0]]} '
f" {ele[0]} "
if ele[0] not in lookup["style_2"].keys()
else f' {lookup["style_2"][ele[0]]} '
),
text,
),
@@ -69,7 +71,9 @@ def _preprocess(self, text: str):
# remove any iso-4217 currency codes
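# e.g. "USD 2 million in damage" -> "2 million in damage"; three-letter tokens that are not valid currency codes are kept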
text = regex.sub(
"[A-Z]{3}\s+",
lambda ele: ("" if self._check_currency(ele[0].strip()) == True else f" {ele[0]} "),
lambda ele: (
"" if self._check_currency(ele[0].strip()) == True else f" {ele[0]} "
),
text,
).strip()

@@ -120,7 +124,9 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
if len(regex.findall(r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b", text)) == 1:
other_scales = ["crore", "lakh", "crores", "lakhs"]
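# South Asian scale words handled separately: 1 lakh = 100,000 and 1 crore = 10,000,000 (e.g. "5 lakh" -> 500000.0)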
matches = regex.findall(
r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b\s+(?:" + "|".join(other_scales) + ")",
r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b\s+(?:"
+ "|".join(other_scales)
+ ")",
text,
)
if len(matches):
@@ -132,17 +138,26 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
"lakh": 1e5,
"lakhs": 1e5,
}
return [self.atof(numbers[0]) * other_scales_2_num[numbers[1]]]
return [
self.atof(numbers[0]) * other_scales_2_num[numbers[1]]
]
except BaseException:
raise BaseException

else:
try:
# try normalizing the numbers to words, then extract the numbers
# (e.g. "2 million" -> "two million" -> 2000000.0)
assert len(regex.findall(r"[0-9]+[,.]a?[0-9]*|[0-9]+", text)) == 1, BaseException
normalized_text = self._normalize_num(self.nlp(text), to_word=True)
number = text2num(normalized_text, lang=self.lang, relaxed=True)
assert (
len(regex.findall(r"[0-9]+[,.]a?[0-9]*|[0-9]+", text))
== 1
), BaseException
normalized_text = self._normalize_num(
self.nlp(text), to_word=True
)
number = text2num(
normalized_text, lang=self.lang, relaxed=True
)
except:
# handle decimals:
# if there is a decimal followed by a million/billion, etc
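# e.g. "1.5 million" should yield 1.5 * 1,000,000 = 1500000.0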
@@ -154,11 +169,24 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
"trillion",
]
if len(regex.findall(r"[0-9]+[.]{1}[0-9]+", text)) == 1:
numbers = [token.text for token in self.nlp(text) if token.like_num]
if len(numbers) == 2 and len(set(numbers[1].split(" ")).intersection(scales)) != 0:
numbers = [
token.text
for token in self.nlp(text)
if token.like_num
]
if (
len(numbers) == 2
and len(
set(numbers[1].split(" ")).intersection(scales)
)
!= 0
):
try:
return [
self.atof(numbers[0]) * text2num(numbers[1], lang=self.lang, relaxed=True)
self.atof(numbers[0])
* text2num(
numbers[1], lang=self.lang, relaxed=True
)
]
except BaseException:
raise BaseException
@@ -168,7 +196,9 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
raise BaseException()
return [number]

def _extract_numbers_from_tokens(self, doc: spacy.tokens.doc.Doc) -> List[float] | BaseException:
def _extract_numbers_from_tokens(
self, doc: spacy.tokens.doc.Doc
) -> List[float] | BaseException:
numbers = []
tmp_num = ""
num_ranges = []
@@ -203,9 +233,9 @@ def _normalize_num(doc, to_word=False) -> str:
new = ""
for token in doc:
# sometimes wrong tags are assigned, so we need to check both the tags and whether the token is a number by regex
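# the regex acts as a fallback for digit tokens (e.g. "1,200") that occasionally receive a non-CD tag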
if (token.tag_ in ["CD", "SYM"] and token.text not in ["<", ">", "<=", ">="]) or (
regex.match(r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b", token.text)
):
if (
token.tag_ in ["CD", "SYM"] and token.text not in ["<", ">", "<=", ">="]
) or (regex.match(r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b", token.text)):
try:
new += num2words(token.text) if to_word else token.text
if token.whitespace_:
@@ -250,14 +280,18 @@ def _extract_numbers_from_entities(
try:
return self._extract_numbers_from_tokens(doc)
except:
return self._extract_numbers_from_tokens(self.nlp(transcribed_text))
return self._extract_numbers_from_tokens(
self.nlp(transcribed_text)
)

try:
number = self.atof(transcribed_text)
numbers.append(number)
except:
try:
normalized_num = self._normalize_num(self.nlp(transcribed_text), to_word=False)
normalized_num = self._normalize_num(
self.nlp(transcribed_text), to_word=False
)
numbers.append(self.atof(normalized_num))
except BaseException:
raise BaseException
@@ -270,7 +304,9 @@ def _extract_spans(
):
return [(span["start"], span["end"]) for span in spans]

def _check_for_approximation(self, doc: spacy.tokens.doc.Doc, labels: List[str]) -> bool:
def _check_for_approximation(
self, doc: spacy.tokens.doc.Doc, labels: List[str]
) -> bool:
tags = " ".join([token.tag_ for token in doc])
ent_labels = [ent.label_ for ent in doc.ents]

@@ -280,25 +316,52 @@ def _check_for_approximation(self, doc: spacy.tokens.doc.Doc, labels: List[str])
return 0
except:
# check for common keywords
keywords = [
approx_keywords = [
"over",
"under",
"approxinately",
"approx",
"nearly",
"fewer than",
"greater than",
"more than",
"less than",
"between",
"a minimum of",
"a maximum of",
"almost",
"roughly",
"closely",
"about",
"at most",
"at least",
"approaching",
"upwards of",
"downwards of",
"roundly",
"circa",
"ca.",
"c.",
"cca",
"within",
]
if any([k.lower() in doc.text for k in keywords]):

not_approx_keywords = {
"exactly",
"precicely",
"a total of",
"only",
}
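# approximation keywords are checked first, so text containing both (e.g. "a total of about 200") is still treated as approximate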
if any([k in doc.text.lower() for k in approx_keywords]):
return 1

if any([k in doc.text.lower() for k in not_approx_keywords]):
return 0

# check for common POS tag combinations (example: "About 200 people" -> "RB CD NNS")
# check for any math symbols (>=, ~, etc) or if a number ends with a plus/plus-minus sign
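# e.g. "200+", "200±", ">=200" and "~200" are all flagged as approximations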
if any([x in tags for x in ["NFP", "IN JJS CD", "RB CD NNS", "IN CD NNS", ":"]]) or regex.findall(
r"[0-9]+(\+|±)|(=)*(>|<)(=)*|(~)", doc.text
):
if any(
[x in tags for x in ["NFP", "IN JJS CD", "RB CD NNS", "IN CD NNS", ":"]]
) or regex.findall(r"[0-9]+(\+|±)|(=)*(>|<)(=)*|(~)", doc.text):
return 1

# check if all tokens in the string are number/money-related
@@ -308,8 +371,16 @@ def _check_for_approximation(self, doc: spacy.tokens.doc.Doc, labels: List[str])
# check the spans only if the text contains an adverb followed by a number
# if ent spans are the same as num spans, it's not an approx
elif "RB CD" == tags:
ent_spans = self._extract_spans([ent for ent in doc.to_json()["ents"] if ent["label"] in labels])
num_spans = self._extract_spans([token for token in doc.to_json()["tokens"] if token["tag"] in ["CD"]])
ent_spans = self._extract_spans(
[ent for ent in doc.to_json()["ents"] if ent["label"] in labels]
)
num_spans = self._extract_spans(
[
token
for token in doc.to_json()["tokens"]
if token["tag"] in ["CD"]
]
)
return 0 if ent_spans and ent_spans == num_spans else 1

# check if there are no entities in the specified labels
@@ -336,7 +407,94 @@ def _extract_range(self, text: str) -> Tuple[float]:
except:
return None

def _extract_approximate_quantifiers(self, text: str) -> Tuple[float] | None:
def _extract_special_ranges(self, text: str) -> Tuple[float, float] | None:
approximately = [
"approx",
"nearly",
"almost",
"roughly" "closely",
"about",
"roundly",
"circa",
"ca.",
"c.",
"cca",
"within the range of",
"in the range of",
]
over = [
"upwards of",
"approaching",
"a maximum of",
"over",
"greater than",
"more than",
"at least",
"above",
]
under = [
"downwards of",
"under",
"fewer than",
"below",
"less than",
"a minimum of",
"at most",
]

scales = [
"thousand",
"million",
"billion",
"trillion",
]


for phrase_list in (approximately, over, under):
expression = "(\\d+\\s+)\\w*\\s+({phrases})|({phrases})\\s+(\\d+\\s+({scales}))|({phrases})\\s+(\\d+)\\s*"
expression = expression.format(phrases="|".join(phrase_list), scales="|".join(scales))
phrases = regex.findall(expression, text)

if len(phrases) > 0:
num = None
for i in phrases:
# keep only the regex groups that start with a digit
num = [x for x in i if x and x[0].isdigit()]
if len(num) == 1:
num = num[0]
break
if isinstance(num, list) and len(num) >= 2:
# cap at the first 2 numbers found and return them as a range
try:
_min = self._extract_single_number(num[0])[0]
_max = self._extract_single_number(num[1])[0]
return (_min, _max)
except BaseException:
pass

if isinstance(num, str) and num.strip().isdigit():
num = float(num)
if phrase_list is approximately:
# assume "approximately n" means a window of roughly +/-5% around n
return (num * 0.95, num * 1.05)
elif phrase_list is over:
# TODO: decide on an upper bound for "over"/"more than" phrases
pass
elif phrase_list is under:
# TODO: decide on a lower bound for "under"/"less than" phrases
pass

# TODO: handle the remaining expressions (e.g. "at most", "fewer than"):
# check if the expression is found in the string,
# get regex groups 1 and 2 (expression and digit\s*[money]),
# convert digit\s*[money] to a number,
# and return (min, max)

def _extract_approximate_quantifiers(self, text: str) -> Tuple[float, float] | None:
one, ten, hun, tho, mil, bil, tri = (
1,
10,
@@ -421,7 +579,13 @@ def _extract_approximate_quantifiers(self, text: str) -> Tuple[float] | None:
},
}

ranges = {"scales": (2, 9), "few": (2, 6), "couple": (2, 3), "dozen": (2, 6), "single_dozen": (1, 1)}
ranges = {
"scales": (2, 9),
"few": (2, 6),
"couple": (2, 3),
"dozen": (2, 6),
"single_dozen": (1, 1),
}
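# (min, max) ranges for vague quantifiers such as "a few", "a couple of", or "dozens"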

def _check(_dict, key, text):
for phrase, degree in _dict[key].items():
@@ -486,7 +650,9 @@ def extract_numbers(
try:
# if all fails, try by normalizing the numbers to words
doc = self.nlp(self._normalize_num(doc, to_word=True))
numbers = self.extract_numbers_from_entities(doc, labels)
numbers = self.extract_numbers_from_entities(
doc, labels
)
except BaseException:
return (None, None, None)

10 changes: 9 additions & 1 deletion Database/tests/test_normalize_numbers.py
@@ -121,6 +121,7 @@ def test__normalize_num_to_words(self, test_input, expected):
("exactly 200", 0),
("more than 200", 1),
("2,000", 0),
("Greater than 700", 1)
],
)
def test__check_for_approximation(self, test_input, expected):
@@ -157,9 +158,16 @@ def test__extract_range(self, test_input, expected):
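# expected output format: (min, max, approximate_flag)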
("$35.63 million", (35630000, 35630000, 0)),
("$3.6 million", (3600000, 3600000, 0)),
("Damage: At least $129 million", (129000000, 129000000, 1)),
("At least 73", (73, 73, 1)),
("At least 73", (73, 73, 1)), # guidelines v2 -> (73, 79, 1)
("925000000", (925000000, 925000000, 0)),
(925000000, (925000000, 925000000, 0)),
(23.4, (23.4, 23.4, 0)),
# fails! (guidelines v2 expectations noted in the trailing comments)
("Greater than 7010 were killed", (7010, 7010, 1)),  # guidelines v2 -> (7010, 7999, 1)
("Less than 400", (400, 400, 1)),  # guidelines v2 -> (300, 399, 1)
("a minimum of 410 billion", (410000000000, 410000000000, 1)),  # guidelines v2 -> (410000000000, 499999999999, 1)
],
)
def test_extract_numbers(self, test_input, expected):
