Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update normalization of ranges to match new guidelines #87

Merged
merged 7 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 35 additions & 26 deletions Database/scr/normalize_numbers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from math import floor, isnan
from math import isnan
from typing import Dict, List, Tuple, Union

import regex
Expand Down Expand Up @@ -168,14 +168,14 @@ def __init__(self, nlp: spacy.language, locale_config: str):
"no injuries",
"no casualties",
"no deaths",
"minimal",
"no fatalities",
]
self.unknown_phrases = [
"minimal",
"negligible",
"inconsequential",
"minor",
"limited",
]
self.unknown_phrases = [
"absent",
"does not mention",
"indefinite",
Expand Down Expand Up @@ -501,6 +501,25 @@ def _extract_simple_range(self, text: str) -> Tuple[float]:
except:
return None

def _get_scale(self, n_init: float | int):
"""
Determine the scale of a number
"""
n = int(n_init) if isinstance(n_init, float) and n_init.is_integer() else n_init
abs_n = abs(n)
n_str = str(abs_n)

if isinstance(n, int):
# Check if the last digit is zero
trailing_zeros = len(n_str) - len(n_str.rstrip("0"))
scale = 10**trailing_zeros

elif isinstance(n, float):
_, part_dec = n_str.split(".")
scale = 10 ** (-len(part_dec))

return n, scale

def _extract_complex_range(self, text: str) -> Tuple[float, float] | None:
phrases = {
"approx": {"list": sorted(self.approximately, reverse=True)},
Expand All @@ -514,7 +533,9 @@ def _extract_complex_range(self, text: str) -> Tuple[float, float] | None:
any_digit = "[\d,.]*"
expression = "({any_digit})\s*({scales})*\s*({phrases})[,.]*\s*({any_digit})\s*({scales})*"
expression = expression.format(
phrases="|".join(v["list"]), scales="|".join(self.scales), any_digit=any_digit
phrases="|".join(v["list"]),
scales="|".join(self.scales),
any_digit=any_digit,
)
matches = regex.findall(expression, text, flags=regex.IGNORECASE | regex.MULTILINE)

Expand Down Expand Up @@ -546,32 +567,20 @@ def _extract_complex_range(self, text: str) -> Tuple[float, float] | None:
if any([x in [y.lower() for y in text.split()] for x in self.family_synonyms])
else (1, 1)
)
scale = pow(10, len(str(int(num))) - 1)
multip = int(str(int(num))[0])
n, scale = self._get_scale(num)

if k == "approx":
return (
floor(num * 0.95) * lower_mod,
floor(num * 1.05) * upper_mod,
)
return ((max(0, n - scale) * lower_mod), (n + scale) * upper_mod)
if "over" in k:
inc = 0 if "inclusive" in k else 1
return (
(num + inc) * lower_mod,
num + 5 if (scale == 1 and upper_mod == 1) else ((scale * (multip + 1)) - 1) * upper_mod,
)
return ((n + inc) * lower_mod, (n + scale + inc) * upper_mod)

if "under" in k:
inc = 0 if "inclusive" in k else 1
if (num - (scale * multip)) / num > 0.08:
_min, _max = (
0 if (scale == 1 and multip == 1) else ((scale * multip) + 1) * upper_mod,
(num - inc) * lower_mod,
)
else:
_min, _max = (
0 if (scale == 1 and multip == 1) else ((scale * (multip - 1)) + 1) * upper_mod,
(num - inc) * lower_mod,
)
return (_min, _max)
return (
max(0, n - scale - inc) * lower_mod,
max(0, n - inc) * upper_mod,
)

def _extract_approximate_quantifiers(self, text: str) -> Tuple[float, float] | None:
one, ten, hun, tho, mil, bil, tri = (
Expand Down
82 changes: 41 additions & 41 deletions tests/test_normalize_numbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def test__check_for_approximation(self, test_input, expected):
("0 - 352", (0, 352)),
("23- 55", (23, 55)),
("24,501-61,672", (24501, 61672)),
# not meant to handle this case
# cases meant to fail
(">=12", None),
("12", None),
("twelve and one hundred", None),
Expand All @@ -153,29 +153,29 @@ def test__extract_simple_range(self, test_input, expected):
("23mil", (23000000, 23000000, 0)),
("110 - 352", (110, 352, 1)),
("between 11 and 17 people were affected", (11, 17, 1)),
("Nearly 300 homes were destroyed", (285, 315, 1)),
("Nearly 300 homes were destroyed", (200, 400, 1)),
("$3.6 million", (3600000, 3600000, 0)),
("$35.63 million", (35630000, 35630000, 0)),
("$3.6 million", (3600000, 3600000, 0)),
("Damage: At least $129 million", (129000000, 199999999, 1)),
("At least 73", (73, 79, 1)),
("Damage: At least $129 million", (129000000, 130000000, 1)),
("At least 73", (73, 74, 1)),
("925000000", (925000000, 925000000, 0)),
(925000000, (925000000, 925000000, 0)),
(23.4, (23.4, 23.4, 0)),
("More than 7010 were killed", (7011, 7999, 1)),
("Less than 400", (301, 399, 1)),
("More than 7010 were killed", (7011, 7021, 1)),
("Less than 400", (299, 399, 1)),
(
"a minimum of 410 billion",
(410000000000, 499999999999, 1),
(410000000000, 420000000000, 1),
),
("603+", (603, 699, 1)),
(">=293", (293, 299, 1)),
("~293", (278, 307, 1)),
(">= $27 million", (27000000, 29999999, 1)),
("$27 million or more", (27000000, 29999999, 1)),
("about A$500 million", (475000000, 525000000, 1)),
("over US$500 million", (500000001, 599999999, 1)),
("over USD 1.0 billion", (1000000001, 1999999999, 1)),
("603+", (603, 604, 1)),
(">=293", (293, 294, 1)),
("~293", (292, 294, 1)),
(">= $27 million", (27000000, 28000000, 1)),
("$27 million or more", (27000000, 28000000, 1)),
("about A$500 million", (400000000, 600000000, 1)),
("over US$500 million", (500000001, 600000001, 1)),
("over USD 1.0 billion", (1000000001, 2000000001, 1)),
("15 billion yuan", (15000000000, 15000000000, 0)),
("100 million pesos", (100000000, 100000000, 0)),
("between 20.2 and 30.4", (20.2, 30.4, 1)),
Expand Down Expand Up @@ -216,34 +216,34 @@ def test__extract_approximate_quantifiers(self, test_input, expected):
"test_input, expected",
[
# approx
("Almost 30", (28, 31)), # rounded down! (floor(30*0.95), floor(30*1.05))
("approximately 7000000000 dollars", (6650000000, 7350000000)),
("Around 7000 homes were destroyed", (6650, 7350)),
("roughly, 4 injuries had been reported", (3, 4)), # rounded down! (floor(4*0.95), floor(4*1.05))
("~45", (42, 47)),
("Almost 30", (20, 40)),
("approximately 7000000000 dollars", (6000000000, 8000000000)),
("Around 7000 homes were destroyed", (6000, 8000)),
("roughly, 4 injuries had been reported", (3, 5)),
("~45", (44, 46)),
# over
("Greater than 300", (301, 399)),
("The number of deaths certainly exceeded 66", (67, 69)),
("more than 6 families were displaced", ((6 + 1) * 3, 6 * 5)),
("at least 3600 were reported missing", (3600, 3999)),
("no less than 55 injuries were reported in the media", (55, 59)),
("> 45", (46, 49)),
(">=5", (5, 10)), # created range by adding 5 since scale == 1
("greater than or equal to 9", (9, 14)), # created range by adding 5 since scale == 1
("45+ deaths were reported by the news", (45, 49)),
("311,000,000+ Euros", (311000000, 399999999)),
(">693 million", (693000001, 699999999)),
("Greater than 300", (301, 401)),
("The number of deaths certainly exceeded 66", (67, 68)),
("more than 6 families were displaced", (7 * 3, 8 * 5)), # 7*3, 7*5
("at least 3600 were reported missing", (3600, 3700)),
("no less than 55 injuries were reported in the media", (55, 56)),
("> 45", (46, 47)),
(">=5", (5, 6)),
("greater than or equal to 9", (9, 10)),
("45+ deaths were reported by the news", (45, 46)),
("311,000,000+ Euros", (311000000, 312000000)),
(">693 million", (693000001, 694000001)),
# under
("less than 230000000 dollars were paid out in insurance costs", (200000001, 229999999)),
("No more than 23 million dollars", (20000001, 23000000)),
("Up to 7 billion dollars", (6000000001, 7000000000)),
("at most 3284 casualties were reported", (3001, 3284)),
("Up to 7000000 dollars", (6000001, 7000000)),
("Up to 7,000,000 dollars", (6000001, 7000000)),
("less than 1", (0, 0)),
("no more than 1 was injured", (0, 1)),
("≤7000000", (6000001, 7000000)),
# cases this function does not handle; meant to raise BaseException
("less than 230000000 dollars were paid out in insurance costs", (219999999, 229999999)),
("No more than 23 million dollars", (22000000, 23000000)),
("Up to 7 billion dollars", (6000000000, 7000000000)),
("at most 3284 casualties were reported", (3283, 3284)),
("Up to 7000000 dollars", (6000000, 7000000)),
("Up to 7,000,000 dollars", (6000000, 7000000)),
("less than 1", (0, 0)), # 1,1
("no more than 1 was injured", (0, 1)), # 1,1
("≤7000000", (6000000, 7000000)),
# cases meant to fail
("six families were displaced", None),
],
)
Expand Down
Loading