🍻 Add drunken code to reset later
i-be-snek committed Jul 30, 2024
1 parent 767c709 commit 2fe2351
Showing 2 changed files with 203 additions and 29 deletions.
222 changes: 194 additions & 28 deletions Database/scr/normalize_numbers.py
@@ -60,7 +60,9 @@ def _preprocess(self, text: str):
regex.sub(
"[A-Za-z]+",
lambda ele: (
f" {ele[0]} " if ele[0] not in lookup["style_2"].keys() else f' {lookup["style_2"][ele[0]]} '
f" {ele[0]} "
if ele[0] not in lookup["style_2"].keys()
else f' {lookup["style_2"][ele[0]]} '
),
text,
),
@@ -69,7 +71,9 @@ def _preprocess(self, text: str):
# remove any iso-4217 currency codes
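# e.g. "USD 2 million in damage" -> "2 million in damage"; three-letter tokens that are not valid currency codes are kept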
text = regex.sub(
"[A-Z]{3}\s+",
lambda ele: ("" if self._check_currency(ele[0].strip()) == True else f" {ele[0]} "),
lambda ele: (
"" if self._check_currency(ele[0].strip()) == True else f" {ele[0]} "
),
text,
).strip()

@@ -120,7 +124,9 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
if len(regex.findall(r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b", text)) == 1:
other_scales = ["crore", "lakh", "crores", "lakhs"]
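# South Asian scale words handled separately: 1 lakh = 100,000 and 1 crore = 10,000,000 (e.g. "5 lakh" -> 500000.0)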
matches = regex.findall(
r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b\s+(?:" + "|".join(other_scales) + ")",
r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b\s+(?:"
+ "|".join(other_scales)
+ ")",
text,
)
if len(matches):
@@ -132,17 +138,26 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
"lakh": 1e5,
"lakhs": 1e5,
}
return [self.atof(numbers[0]) * other_scales_2_num[numbers[1]]]
return [
self.atof(numbers[0]) * other_scales_2_num[numbers[1]]
]
except BaseException:
raise BaseException

else:
try:
# try normalizing the numbers to words, then extract the numbers
# (e.g. "2 million" -> "two million" -> 2000000.0)
assert len(regex.findall(r"[0-9]+[,.]a?[0-9]*|[0-9]+", text)) == 1, BaseException
normalized_text = self._normalize_num(self.nlp(text), to_word=True)
number = text2num(normalized_text, lang=self.lang, relaxed=True)
assert (
len(regex.findall(r"[0-9]+[,.]a?[0-9]*|[0-9]+", text))
== 1
), BaseException
normalized_text = self._normalize_num(
self.nlp(text), to_word=True
)
number = text2num(
normalized_text, lang=self.lang, relaxed=True
)
except:
# handle decimals:
# if there is a decimal followed by a million/billion, etc
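# e.g. "1.5 million" should yield 1.5 * 1,000,000 = 1500000.0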
@@ -154,11 +169,24 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
"trillion",
]
if len(regex.findall(r"[0-9]+[.]{1}[0-9]+", text)) == 1:
numbers = [token.text for token in self.nlp(text) if token.like_num]
if len(numbers) == 2 and len(set(numbers[1].split(" ")).intersection(scales)) != 0:
numbers = [
token.text
for token in self.nlp(text)
if token.like_num
]
if (
len(numbers) == 2
and len(
set(numbers[1].split(" ")).intersection(scales)
)
!= 0
):
try:
return [
self.atof(numbers[0]) * text2num(numbers[1], lang=self.lang, relaxed=True)
self.atof(numbers[0])
* text2num(
numbers[1], lang=self.lang, relaxed=True
)
]
except BaseException:
raise BaseException
@@ -168,7 +196,9 @@ def _extract_single_number(self, text: str) -> List[float] | BaseException:
raise BaseException()
return [number]

def _extract_numbers_from_tokens(self, doc: spacy.tokens.doc.Doc) -> List[float] | BaseException:
def _extract_numbers_from_tokens(
self, doc: spacy.tokens.doc.Doc
) -> List[float] | BaseException:
numbers = []
tmp_num = ""
num_ranges = []
@@ -203,9 +233,9 @@ def _normalize_num(doc, to_word=False) -> str:
new = ""
for token in doc:
# sometimes wrong tags are assigned, so we need to check both the tags and whether the token is a number by regex
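# the regex acts as a fallback for digit tokens (e.g. "1,200") that occasionally receive a non-CD tag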
if (token.tag_ in ["CD", "SYM"] and token.text not in ["<", ">", "<=", ">="]) or (
regex.match(r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b", token.text)
):
if (
token.tag_ in ["CD", "SYM"] and token.text not in ["<", ">", "<=", ">="]
) or (regex.match(r"\b(?<!\.)\d+(?:,\d+)*(?:\.\d+)?\b", token.text)):
try:
new += num2words(token.text) if to_word else token.text
if token.whitespace_:
@@ -250,14 +280,18 @@ def _extract_numbers_from_entities(
try:
return self._extract_numbers_from_tokens(doc)
except:
return self._extract_numbers_from_tokens(self.nlp(transcribed_text))
return self._extract_numbers_from_tokens(
self.nlp(transcribed_text)
)

try:
number = self.atof(transcribed_text)
numbers.append(number)
except:
try:
normalized_num = self._normalize_num(self.nlp(transcribed_text), to_word=False)
normalized_num = self._normalize_num(
self.nlp(transcribed_text), to_word=False
)
numbers.append(self.atof(normalized_num))
except BaseException:
raise BaseException
@@ -270,7 +304,9 @@ def _extract_spans(
):
return [(span["start"], span["end"]) for span in spans]

def _check_for_approximation(self, doc: spacy.tokens.doc.Doc, labels: List[str]) -> bool:
def _check_for_approximation(
self, doc: spacy.tokens.doc.Doc, labels: List[str]
) -> bool:
tags = " ".join([token.tag_ for token in doc])
ent_labels = [ent.label_ for ent in doc.ents]

@@ -280,25 +316,52 @@ def _check_for_approximation(self, doc: spacy.tokens.doc.Doc, labels: List[str])
return 0
except:
# check for common keywords
keywords = [
approx_keywords = [
"over",
"under",
"approxinately",
"approx",
"nearly",
"fewer than",
"greater than",
"more than",
"less than",
"between",
"a minimum of",
"a maximum of",
"almost",
"roughly",
"closely",
"about",
"at most",
"at least",
"approaching",
"upwards of",
"downwards of",
"roundly",
"circa",
"ca.",
"c.",
"cca",
"within",
]
if any([k.lower() in doc.text for k in keywords]):

not_approx_keywords = {
"exactly",
"precicely",
"a total of",
"only",
}
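# approximation keywords are checked first, so text containing both (e.g. "a total of about 200") is still treated as approximate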
if any([k in doc.text.lower() for k in approx_keywords]):
return 1

if any([k in doc.text.lower() for k in not_approx_keywords]):
return 0

# check for common POS tag combinations (example: "About 200 people" -> "RB CD NNS")
# check for any math symbols (>=, ~, etc) or if a number ends with a plus/plus-minus sign
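# e.g. "200+", "200±", ">=200" and "~200" are all flagged as approximations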
if any([x in tags for x in ["NFP", "IN JJS CD", "RB CD NNS", "IN CD NNS", ":"]]) or regex.findall(
r"[0-9]+(\+|±)|(=)*(>|<)(=)*|(~)", doc.text
):
if any(
[x in tags for x in ["NFP", "IN JJS CD", "RB CD NNS", "IN CD NNS", ":"]]
) or regex.findall(r"[0-9]+(\+|±)|(=)*(>|<)(=)*|(~)", doc.text):
return 1

# check if all tokens in the string are number/money-related
@@ -308,8 +371,16 @@ def _check_for_approximation(self, doc: spacy.tokens.doc.Doc, labels: List[str])
# check the spans only if the text contains an adverb followed by a number
# if ent spans are the same as num spans, it's not an approx
elif "RB CD" == tags:
ent_spans = self._extract_spans([ent for ent in doc.to_json()["ents"] if ent["label"] in labels])
num_spans = self._extract_spans([token for token in doc.to_json()["tokens"] if token["tag"] in ["CD"]])
ent_spans = self._extract_spans(
[ent for ent in doc.to_json()["ents"] if ent["label"] in labels]
)
num_spans = self._extract_spans(
[
token
for token in doc.to_json()["tokens"]
if token["tag"] in ["CD"]
]
)
return 0 if ent_spans and ent_spans == num_spans else 1

# check if there are no entities in the specified labels
@@ -336,7 +407,94 @@ def _extract_range(self, text: str) -> Tuple[float]:
except:
return None

def _extract_approximate_quantifiers(self, text: str) -> Tuple[float] | None:
def _extract_special_ranges(self, text: str) -> Tuple[float, float] | None:
approximately = [
"approx",
"nearly",
"almost",
"roughly" "closely",
"about",
"roundly",
"circa",
"ca.",
"c.",
"cca",
"within the range of",
"in the range of",
]
over = [
"upwards of",
"approaching",
"a maximum of",
"over",
"greater than",
"more than",
"at least",
"above",
]
under = [
"downwards of",
"under",
"fewer than",
"below",
"less than",
"a minimum of",
"at most",
]

scales = [
"thousand",
"million",
"billion",
"trillion",
]


for phrase_list in (approximately, over, under):
expression = "(\\d+\\s+)\\w*\\s+({phrases})|({phrases})\\s+(\\d+\\s+({scales}))|({phrases})\\s+(\\d+)\\s*"
expression = expression.format(phrases="|".join(phrase_list), scales="|".join(scales))
phrases = regex.findall(expression, text)

if len(phrases) > 0:
num = None
for i in phrases:
# keep only the regex groups that start with a digit
num = [x for x in i if x and x[0].isdigit()]
if len(num) == 1:
num = num[0]
break
if isinstance(num, list) and len(num) >= 2:
# cap at the first 2 numbers found and return them as a range
try:
_min = self._extract_single_number(num[0])[0]
_max = self._extract_single_number(num[1])[0]
return (_min, _max)
except BaseException:
pass

if isinstance(num, str) and num.strip().isdigit():
num = float(num)
if phrase_list is approximately:
# assume "approximately n" means a window of roughly +/-5% around n
return (num * 0.95, num * 1.05)
elif phrase_list is over:
# TODO: decide on an upper bound for "over"/"more than" phrases
pass
elif phrase_list is under:
# TODO: decide on a lower bound for "under"/"less than" phrases
pass

# TODO: handle the remaining expressions (e.g. "at most", "fewer than"):
# check if the expression is found in the string,
# get regex groups 1 and 2 (expression and digit\s*[money]),
# convert digit\s*[money] to a number,
# and return (min, max)

def _extract_approximate_quantifiers(self, text: str) -> Tuple[float, float] | None:
one, ten, hun, tho, mil, bil, tri = (
1,
10,
@@ -421,7 +579,13 @@ def _extract_approximate_quantifiers(self, text: str) -> Tuple[float] | None:
},
}

ranges = {"scales": (2, 9), "few": (2, 6), "couple": (2, 3), "dozen": (2, 6), "single_dozen": (1, 1)}
ranges = {
"scales": (2, 9),
"few": (2, 6),
"couple": (2, 3),
"dozen": (2, 6),
"single_dozen": (1, 1),
}
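# (min, max) ranges for vague quantifiers such as "a few", "a couple of", or "dozens"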

def _check(_dict, key, text):
for phrase, degree in _dict[key].items():
@@ -486,7 +650,9 @@ def extract_numbers(
try:
# if all fails, try by normalizing the numbers to words
doc = self.nlp(self._normalize_num(doc, to_word=True))
numbers = self.extract_numbers_from_entities(doc, labels)
numbers = self.extract_numbers_from_entities(
doc, labels
)
except BaseException:
return (None, None, None)

10 changes: 9 additions & 1 deletion Database/tests/test_normalize_numbers.py
@@ -121,6 +121,7 @@ def test__normalize_num_to_words(self, test_input, expected):
("exactly 200", 0),
("more than 200", 1),
("2,000", 0),
("Greater than 700", 1)
],
)
def test__check_for_approximation(self, test_input, expected):
@@ -157,9 +158,16 @@ def test__extract_range(self, test_input, expected):
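# expected output format: (min, max, approximate_flag)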
("$35.63 million", (35630000, 35630000, 0)),
("$3.6 million", (3600000, 3600000, 0)),
("Damage: At least $129 million", (129000000, 129000000, 1)),
("At least 73", (73, 73, 1)),
("At least 73", (73, 73, 1)), # guidelines v2 -> (73, 79, 1)
("925000000", (925000000, 925000000, 0)),
(925000000, (925000000, 925000000, 0)),
(23.4, (23.4, 23.4, 0)),
# fails! (guidelines v2 expectations noted in the trailing comments)
("Greater than 7010 were killed", (7010, 7010, 1)),  # guidelines v2 -> (7010, 7999, 1)
("Less than 400", (400, 400, 1)),  # guidelines v2 -> (300, 399, 1)
("a minimum of 410 billion", (410000000000, 410000000000, 1)),  # guidelines v2 -> (410000000000, 499999999999, 1)
],
)
def test_extract_numbers(self, test_input, expected):
