Skip to content

Commit

Permalink
Validate currency in schema (#186)
Browse files Browse the repository at this point in the history
  • Loading branch information
i-be-snek authored Nov 6, 2024
1 parent 703ec60 commit 2e3bcb9
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 51 deletions.
27 changes: 5 additions & 22 deletions Database/gold_from_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
from datetime import datetime

import pandas as pd
from iso4217 import Currency
from tqdm import tqdm

from Database.scr.log_utils import Logging
from Database.scr.normalize_locations import NormalizeLocation
from Database.scr.normalize_utils import NormalizeUtils

tqdm.pandas()
utils = NormalizeUtils()


def flatten(xss):
Expand All @@ -34,25 +35,7 @@ def fix_column_names(df):
return df


def _check_currency(currency_text: str) -> bool:
try:
Currency(currency_text)
return True
except ValueError as err:
logger.error(err)
return False


def _check_date(year: int, month: int, day: int) -> bool:
try:
datetime(year, month, day)
return True
except ValueError as err:
logger.error(f"Y: {year}; M: {month}; D: {day}. Error: {err}")
return False


def _split_range(text: str) -> tuple[float, None]:
def _split_range(text: str) -> tuple[str | None, str | None]:
r = text.split("-")
if len(r) == 1:
return (r[0], r[0])
Expand Down Expand Up @@ -223,7 +206,7 @@ def flatten_data_table():
logger.info(f"Validating Units for monetary type columns...")
for col in currency_unit_cols:
data_table[f"{col}_valid_currency"] = data_table[col].progress_apply(
lambda x: all([_check_currency(y) for y in x]) if x else True
lambda x: all([utils.check_currency(y) for y in x]) if x else True
)
assert all(data_table[f"{col}_valid_currency"])
data_table.drop(columns=[f"{col}_valid_currency"], inplace=True)
Expand Down Expand Up @@ -253,7 +236,7 @@ def flatten_data_table():
f"{date_type}_Date_Month",
f"{date_type}_Date_Day",
]
].progress_apply(lambda x: _check_date(x[0], x[1], x[2]) if all(x) else True)
].progress_apply(lambda x: utils.check_date(year=x[0], month=x[1], day=x[2]) if all(x) else True)

assert all(data_table[f"{date_type}_valid_date"])
data_table.drop(columns=[f"{date_type}_valid_date"], inplace=True)
Expand Down
45 changes: 23 additions & 22 deletions Database/insert_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
from pandarallel import pandarallel
from tqdm import tqdm

from Database.scr.normalize_utils import GeoJsonUtils, Logging, NormalizeUtils
from Database.scr.normalize_utils import (
CategoricalValidation,
GeoJsonUtils,
Logging,
NormalizeUtils,
)

pandarallel.initialize(progress_bar=False, nb_workers=5)

Expand Down Expand Up @@ -111,6 +116,7 @@
args = parser.parse_args()
logger = Logging.get_logger(f"database-insertion", level="INFO", filename="v1_full_run_insertion_raw.log")
utils = NormalizeUtils()
validation = CategoricalValidation()

connection = sqlite3.connect(args.database_name)
cursor = connection.cursor()
Expand All @@ -128,10 +134,10 @@
logger.info(f"Inserting {main_level}...\n")
for f in tqdm(files, desc="Files"):
data = pd.read_parquet(f"{args.file_dir}/{f}", engine="fastparquet")
data = utils.replace_nulls(data)

logger.info("Converting everything to strings...")
for c in data.columns:
data[c] = data[c].astype(str)
# turn invalid currencies to None (for L1 only)
data = data.parallel_apply(lambda row: validation.validate_currency_monetary_impact(row), axis=1)

# geojson in l1 is always of type list
if args.dump_geojson_to_file:
Expand All @@ -156,12 +162,14 @@
axis=1,
)

for i in ["GeoJson", "Norm"]:
data[f"{col}_{i}"] = data[f"{col}_{i}"].astype(str)
logger.info("Converting everything to strings...")
data.replace(float("nan"), None, inplace=True)
for c in data.columns:
data[c] = data[c].astype(str)

# change if_exists to "append" to avoid overwriting the database
# choose "replace" to overwrite the database with a fresh copy of the data
for i in tqdm(range(len(data)), desc=f"Inserting {f} into {args.database_name}"):
for i in tqdm(range(len(data)), desc=f"Inserting {args.event_level} into {args.database_name}"):
try:
data.iloc[i : i + 1].to_sql(
name=args.target_table,
Expand Down Expand Up @@ -193,20 +201,11 @@
data = pd.read_parquet(f"{args.file_dir}/{f}", engine="fastparquet")
data = utils.replace_nulls(data)

logger.info("Converting everything to strings...")
for c in data.columns:
data[c] = data[c].astype(str)

logger.info(f"Popping GeoJson files out for level {args.event_level} and onto disk")
if args.dump_geojson_to_file:
for col, _type in event_levels[args.event_level]["location_columns"].items():
logger.info(f"Processing GeoJson column {col} in {args.event_level}; File: {f}")
if _type == list:
for i in ["GeoJson", "Norm"]:
data[f"{col}_{i}"] = data[f"{col}_{i}"].parallel_apply(
lambda x: ast.literal_eval(x) if isinstance(x, str) else []
)

data[f"{col}_GeoJson"] = data.parallel_apply(
lambda row: (
[
Expand All @@ -219,8 +218,6 @@
),
axis=1,
)
for i in ["GeoJson", "Norm"]:
data[f"{col}_{i}"] = data[f"{col}_{i}"].astype(str)

elif _type == str:
data[f"{col}_GeoJson"] = data.parallel_apply(
Expand All @@ -231,12 +228,15 @@
),
axis=1,
)
for i in ["GeoJson", "Norm"]:
data[f"{col}_{i}"] = data[f"{col}_{i}"].astype(str)

# change if_exists to "append" to avoid overwriting the database
# choose "replace" to overwrite the database with a fresh copy of the data
for i in tqdm(range(len(data)), desc=f"Inserting {f} into {args.database_name}"):
logger.info("Converting everything to strings...")
data.replace(float("nan"), None, inplace=True)
for c in data.columns:
data[c] = data[c].astype(str)

for i in tqdm(range(len(data)), desc=f"Inserting {args.event_level} into {args.database_name}"):
try:
data.iloc[i : i + 1].to_sql(
name=args.target_table,
Expand All @@ -253,7 +253,8 @@
err_row["ERROR"] = err
errors = pd.concat([errors, err_row], ignore_index=True)

geojson_utils.store_non_english_nids()
if args.dump_geojson_to_file:
geojson_utils.store_non_english_nids()

if errors.shape != (0, 0):
from time import time
Expand Down
6 changes: 3 additions & 3 deletions Database/schema/L1_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ CREATE TABLE Total_Summary (
Event_ID TEXT PRIMARY KEY NOT NULL CHECK (length(Event_ID) == 7), /* COMMENT 'UID' */
Event_Names OBJECT NOT NULL, /* COMMENT 'Array' */
Sources OBJECT NOT NULL, /* COMMENT 'Array' */
Main_Event TEXT NOT NULL CHECK (Main_Event IN ("Flood", "Extratropical Storm/Cyclone", "Tropical Storm/Cyclone", "Extreme Temperature", "Drought", "Wildfire", "Tornado",)); /* COMMENT 'Categorical' */
Main_Event TEXT NOT NULL CHECK (Main_Event IN ('Flood', 'Extratropical Storm/Cyclone', 'Tropical Storm/Cyclone', 'Extreme Temperature', 'Drought', 'Wildfire', 'Tornado')), /* COMMENT 'Categorical' */
Hazards OBJECT NOT NULL, /* COMMENT 'Array', categorical */

Administrative_Areas_Norm OBJECT NOT NULL, /* COMMENT 'Array' of TEXT/NULL */
Expand Down Expand Up @@ -51,7 +51,7 @@ CREATE TABLE Total_Summary (
Total_Insured_Damage_Min REAL CHECK (Total_Insured_Damage_Min > 0 OR Total_Insured_Damage_Min == NULL),
Total_Insured_Damage_Max REAL CHECK (Total_Insured_Damage_Max > 0 OR Total_Insured_Damage_Max == NULL),
Total_Insured_Damage_Approx INTEGER CHECK (Total_Insured_Damage_Approx == 1 OR Total_Insured_Damage_Approx == 0 OR Total_Insured_Damage_Approx == NULL), /* COMMENT 'Boolean' */
Total_Insured_Damage_Unit TEXT, /* COMMENT 'currency' */
Total_Insured_Damage_Unit TEXT CHECK (Total_Insured_Damage_Unit in ('None', 'AFN', 'EUR', 'ALL', 'DZD', 'USD', 'AOA', 'XCD', 'ARS', 'AMD', 'AWG', 'AUD', 'AZN', 'BSD', 'BHD', 'BDT', 'BBD', 'BYN', 'BZD', 'XOF', 'BMD', 'INR', 'BTN', 'BOB', 'BOV', 'BAM', 'BWP', 'NOK', 'BRL', 'BND', 'BGN', 'BIF', 'CVE', 'KHR', 'XAF', 'CAD', 'KYD', 'CLP', 'CLF', 'CNY', 'COP', 'COU', 'KMF', 'CDF', 'NZD', 'CRC', 'HRK', 'CUP', 'CUC', 'ANG', 'CZK', 'DKK', 'DJF', 'DOP', 'EGP', 'SVC', 'ERN', 'SZL', 'ETB', 'FKP', 'FJD', 'XPF', 'GMD', 'GEL', 'GHS', 'GIP', 'GTQ', 'GBP', 'GNF', 'GYD', 'HTG', 'HNL', 'HKD', 'HUF', 'ISK', 'IDR', 'XDR', 'IRR', 'IQD', 'ILS', 'JMD', 'JPY', 'JOD', 'KZT', 'KES', 'KPW', 'KRW', 'KWD', 'KGS', 'LAK', 'LBP', 'LSL', 'ZAR', 'LRD', 'LYD', 'CHF', 'MOP', 'MKD', 'MGA', 'MWK', 'MYR', 'MVR', 'MRU', 'MUR', 'XUA', 'MXN', 'MXV', 'MDL', 'MNT', 'MAD', 'MZN', 'MMK', 'NAD', 'NPR', 'NIO', 'NGN', 'OMR', 'PKR', 'PAB', 'PGK', 'PYG', 'PEN', 'PHP', 'PLN', 'QAR', 'RON', 'RUB', 'RWF', 'SHP', 'WST', 'STN', 'SAR', 'RSD', 'SCR', 'SLL', 'SLE', 'SGD', 'XSU', 'SBD', 'SOS', 'SSP', 'LKR', 'SDG', 'SRD', 'SEK', 'CHE', 'CHW', 'SYP', 'TWD', 'TJS', 'TZS', 'THB', 'TOP', 'TTD', 'TND', 'TRY', 'TMT', 'UGX', 'UAH', 'AED', 'USN', 'UYU', 'UYI', 'UYW', 'UZS', 'VUV', 'VES', 'VED', 'VND', 'YER', 'ZMW', 'ZWL', 'XBA', 'XBB', 'XBC', 'XBD', 'XTS', 'XXX', 'XAU', 'XPD', 'XPT', 'XAG')), /* COMMENT 'currency' */
Total_Insured_Damage_Inflation_Adjusted INTEGER, /* COMMENT 'Boolean' */
Total_Insured_Damage_Inflation_Adjusted_Year INTEGER CHECK (
length(Total_Insured_Damage_Inflation_Adjusted_Year) == 4
Expand All @@ -62,7 +62,7 @@ CREATE TABLE Total_Summary (
Total_Damage_Min REAL CHECK (Total_Damage_Min > 0 OR Total_Damage_Min == NULL),
Total_Damage_Max REAL CHECK (Total_Damage_Max > 0 OR Total_Damage_Max == NULL),
Total_Damage_Approx INTEGER CHECK (Total_Damage_Approx == 1 OR Total_Damage_Approx == 0 OR Total_Damage_Approx == NULL), /* COMMENT 'Boolean' */
Total_Damage_Unit TEXT, /* COMMENT 'currency' */
Total_Damage_Unit TEXT CHECK (Total_Damage_Unit in ('None', 'AFN', 'EUR', 'ALL', 'DZD', 'USD', 'AOA', 'XCD', 'ARS', 'AMD', 'AWG', 'AUD', 'AZN', 'BSD', 'BHD', 'BDT', 'BBD', 'BYN', 'BZD', 'XOF', 'BMD', 'INR', 'BTN', 'BOB', 'BOV', 'BAM', 'BWP', 'NOK', 'BRL', 'BND', 'BGN', 'BIF', 'CVE', 'KHR', 'XAF', 'CAD', 'KYD', 'CLP', 'CLF', 'CNY', 'COP', 'COU', 'KMF', 'CDF', 'NZD', 'CRC', 'HRK', 'CUP', 'CUC', 'ANG', 'CZK', 'DKK', 'DJF', 'DOP', 'EGP', 'SVC', 'ERN', 'SZL', 'ETB', 'FKP', 'FJD', 'XPF', 'GMD', 'GEL', 'GHS', 'GIP', 'GTQ', 'GBP', 'GNF', 'GYD', 'HTG', 'HNL', 'HKD', 'HUF', 'ISK', 'IDR', 'XDR', 'IRR', 'IQD', 'ILS', 'JMD', 'JPY', 'JOD', 'KZT', 'KES', 'KPW', 'KRW', 'KWD', 'KGS', 'LAK', 'LBP', 'LSL', 'ZAR', 'LRD', 'LYD', 'CHF', 'MOP', 'MKD', 'MGA', 'MWK', 'MYR', 'MVR', 'MRU', 'MUR', 'XUA', 'MXN', 'MXV', 'MDL', 'MNT', 'MAD', 'MZN', 'MMK', 'NAD', 'NPR', 'NIO', 'NGN', 'OMR', 'PKR', 'PAB', 'PGK', 'PYG', 'PEN', 'PHP', 'PLN', 'QAR', 'RON', 'RUB', 'RWF', 'SHP', 'WST', 'STN', 'SAR', 'RSD', 'SCR', 'SLL', 'SLE', 'SGD', 'XSU', 'SBD', 'SOS', 'SSP', 'LKR', 'SDG', 'SRD', 'SEK', 'CHE', 'CHW', 'SYP', 'TWD', 'TJS', 'TZS', 'THB', 'TOP', 'TTD', 'TND', 'TRY', 'TMT', 'UGX', 'UAH', 'AED', 'USN', 'UYU', 'UYI', 'UYW', 'UZS', 'VUV', 'VES', 'VED', 'VND', 'YER', 'ZMW', 'ZWL', 'XBA', 'XBB', 'XBC', 'XBD', 'XTS', 'XXX', 'XAU', 'XPD', 'XPT', 'XAG')), /* COMMENT 'currency' */
Total_Damage_Inflation_Adjusted INTEGER, /* COMMENT 'Boolean' */
Total_Damage_Inflation_Adjusted_Year INTEGER CHECK (
length(Total_Damage_Inflation_Adjusted_Year) == 4
Expand Down
2 changes: 1 addition & 1 deletion Database/schema/L2_schema_template.sql
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ CREATE TABLE Instance_Per_Administrative_Areas_type_monetary (
Num_Min REAL NOT NULL CHECK (Num_Min >= 0),
Num_Max REAL NOT NULL CHECK (Num_Max >= 0),
Num_Approx INTEGER NOT NULL CHECK (Num_Approx == 1 OR Num_Approx == 0), /* COMMENT 'Boolean' */
Num_Unit TEXT NOT NULL, /* COMMENT 'currency' */
Num_Unit TEXT NOT NULL CHECK (Num_Unit in ('AFN', 'EUR', 'ALL', 'DZD', 'USD', 'AOA', 'XCD', 'ARS', 'AMD', 'AWG', 'AUD', 'AZN', 'BSD', 'BHD', 'BDT', 'BBD', 'BYN', 'BZD', 'XOF', 'BMD', 'INR', 'BTN', 'BOB', 'BOV', 'BAM', 'BWP', 'NOK', 'BRL', 'BND', 'BGN', 'BIF', 'CVE', 'KHR', 'XAF', 'CAD', 'KYD', 'CLP', 'CLF', 'CNY', 'COP', 'COU', 'KMF', 'CDF', 'NZD', 'CRC', 'HRK', 'CUP', 'CUC', 'ANG', 'CZK', 'DKK', 'DJF', 'DOP', 'EGP', 'SVC', 'ERN', 'SZL', 'ETB', 'FKP', 'FJD', 'XPF', 'GMD', 'GEL', 'GHS', 'GIP', 'GTQ', 'GBP', 'GNF', 'GYD', 'HTG', 'HNL', 'HKD', 'HUF', 'ISK', 'IDR', 'XDR', 'IRR', 'IQD', 'ILS', 'JMD', 'JPY', 'JOD', 'KZT', 'KES', 'KPW', 'KRW', 'KWD', 'KGS', 'LAK', 'LBP', 'LSL', 'ZAR', 'LRD', 'LYD', 'CHF', 'MOP', 'MKD', 'MGA', 'MWK', 'MYR', 'MVR', 'MRU', 'MUR', 'XUA', 'MXN', 'MXV', 'MDL', 'MNT', 'MAD', 'MZN', 'MMK', 'NAD', 'NPR', 'NIO', 'NGN', 'OMR', 'PKR', 'PAB', 'PGK', 'PYG', 'PEN', 'PHP', 'PLN', 'QAR', 'RON', 'RUB', 'RWF', 'SHP', 'WST', 'STN', 'SAR', 'RSD', 'SCR', 'SLL', 'SLE', 'SGD', 'XSU', 'SBD', 'SOS', 'SSP', 'LKR', 'SDG', 'SRD', 'SEK', 'CHE', 'CHW', 'SYP', 'TWD', 'TJS', 'TZS', 'THB', 'TOP', 'TTD', 'TND', 'TRY', 'TMT', 'UGX', 'UAH', 'AED', 'USN', 'UYU', 'UYI', 'UYW', 'UZS', 'VUV', 'VES', 'VED', 'VND', 'YER', 'ZMW', 'ZWL', 'XBA', 'XBB', 'XBC', 'XBD', 'XTS', 'XXX', 'XAU', 'XPD', 'XPT', 'XAG')), /* COMMENT 'currency' */
Num_Inflation_Adjusted INTEGER, /* COMMENT 'Boolean' */
Num_Inflation_Adjusted_Year INTEGER CHECK (
length(Num_Inflation_Adjusted_Year) == 4
Expand Down
2 changes: 1 addition & 1 deletion Database/schema/L3_schema_template.sql
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ CREATE TABLE Specific_Instance_Per_Administrative_Area_type_monetary (
Num_Min REAL NOT NULL CHECK (Num_Min >= 0),
Num_Max REAL NOT NULL CHECK (Num_Max >= 0),
Num_Approx INTEGER NOT NULL CHECK (Num_Approx == 1 OR Num_Approx == 0), /* COMMENT 'Boolean' */
Num_Unit TEXT NOT NULL, /* COMMENT 'currency' */
Num_Unit TEXT NOT NULL CHECK (Num_Unit in ('AFN', 'EUR', 'ALL', 'DZD', 'USD', 'AOA', 'XCD', 'ARS', 'AMD', 'AWG', 'AUD', 'AZN', 'BSD', 'BHD', 'BDT', 'BBD', 'BYN', 'BZD', 'XOF', 'BMD', 'INR', 'BTN', 'BOB', 'BOV', 'BAM', 'BWP', 'NOK', 'BRL', 'BND', 'BGN', 'BIF', 'CVE', 'KHR', 'XAF', 'CAD', 'KYD', 'CLP', 'CLF', 'CNY', 'COP', 'COU', 'KMF', 'CDF', 'NZD', 'CRC', 'HRK', 'CUP', 'CUC', 'ANG', 'CZK', 'DKK', 'DJF', 'DOP', 'EGP', 'SVC', 'ERN', 'SZL', 'ETB', 'FKP', 'FJD', 'XPF', 'GMD', 'GEL', 'GHS', 'GIP', 'GTQ', 'GBP', 'GNF', 'GYD', 'HTG', 'HNL', 'HKD', 'HUF', 'ISK', 'IDR', 'XDR', 'IRR', 'IQD', 'ILS', 'JMD', 'JPY', 'JOD', 'KZT', 'KES', 'KPW', 'KRW', 'KWD', 'KGS', 'LAK', 'LBP', 'LSL', 'ZAR', 'LRD', 'LYD', 'CHF', 'MOP', 'MKD', 'MGA', 'MWK', 'MYR', 'MVR', 'MRU', 'MUR', 'XUA', 'MXN', 'MXV', 'MDL', 'MNT', 'MAD', 'MZN', 'MMK', 'NAD', 'NPR', 'NIO', 'NGN', 'OMR', 'PKR', 'PAB', 'PGK', 'PYG', 'PEN', 'PHP', 'PLN', 'QAR', 'RON', 'RUB', 'RWF', 'SHP', 'WST', 'STN', 'SAR', 'RSD', 'SCR', 'SLL', 'SLE', 'SGD', 'XSU', 'SBD', 'SOS', 'SSP', 'LKR', 'SDG', 'SRD', 'SEK', 'CHE', 'CHW', 'SYP', 'TWD', 'TJS', 'TZS', 'THB', 'TOP', 'TTD', 'TND', 'TRY', 'TMT', 'UGX', 'UAH', 'AED', 'USN', 'UYU', 'UYI', 'UYW', 'UZS', 'VUV', 'VES', 'VED', 'VND', 'YER', 'ZMW', 'ZWL', 'XBA', 'XBB', 'XBC', 'XBD', 'XTS', 'XXX', 'XAU', 'XPD', 'XPT', 'XAG')), /* COMMENT 'currency' */
Num_Inflation_Adjusted INTEGER, /* COMMENT 'Boolean' */
Num_Inflation_Adjusted_Year INTEGER CHECK (
length(Num_Inflation_Adjusted_Year) == 4
Expand Down
35 changes: 33 additions & 2 deletions Database/scr/normalize_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import os
import pathlib
import re
from typing import Tuple, Union
from datetime import datetime
from typing import Any, Tuple, Union

import pandas as pd
import pycountry
import shortuuid
from dateparser.date import DateDataParser
from dateparser.search import search_dates
from iso4217 import Currency
from spacy import language as spacy_language
from unidecode import unidecode

Expand Down Expand Up @@ -215,13 +217,29 @@ def df_to_json(
pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)
slc.to_json(fname, **json_wargs)

def check_currency(self, currency_text: str) -> bool:
try:
Currency(currency_text)
return True
except ValueError as err:
self.logger.error(f"Bad currency found: `{currency_text}`: {err}")
return False

def check_date(self, year: int, month: int, day: int) -> bool:
try:
datetime(year, month, day)
return True
except ValueError as err:
self.logger.error(f"Y: {year}; M: {month}; D: {day}. Error: {err}")
return False


class NormalizeJsonOutput:
def __init__(self):
self.logger = Logging.get_logger("normalize-utils-json")

@staticmethod
def infer_date_from_dict(x: any) -> str:
def infer_date_from_dict(x: Any) -> str:
"""
This function normalizes date output in various formats by some LLMs.
Current usecases:
Expand Down Expand Up @@ -656,3 +674,16 @@ def validate_main_event_hazard_relation(
except BaseException as err:
self.logger.error(f"Could not validate relationship between {hazards} and {main_event}. Error: {err}")
return row

def validate_currency_monetary_impact(self, row: dict) -> dict:
cols = ["Total_{}_Min", "Total_{}_Max", "Total_{}_Approx", "Total_{}_Unit", "Total_{}_Inflation_Adjusted"]

for category in ["Damage", "Insured_Damage"]:
try:
Currency(row[f"Total_{category}_Unit"])
except ValueError as err:
self.logger.error(f"""Invalid currency {row[f"Total_{category}_Unit"]}. Error: {err}""")
for c in cols:
cat = c.format(category)
row[cat] = None
return row

0 comments on commit 2e3bcb9

Please sign in to comment.