Skip to content

Commit

Permalink
173 Validate categorical fields Main_Event and Hazards (#181)
Browse files Browse the repository at this point in the history
  • Loading branch information
i-be-snek authored Oct 29, 2024
1 parent fb9a707 commit b41b580
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 2 deletions.
30 changes: 29 additions & 1 deletion Database/parse_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from Database.scr.log_utils import Logging
from Database.scr.normalize_locations import NormalizeLocation
from Database.scr.normalize_numbers import NormalizeNumber
from Database.scr.normalize_utils import NormalizeUtils
from Database.scr.normalize_utils import CategoricalValidation, NormalizeUtils

tqdm.pandas()

Expand Down Expand Up @@ -224,6 +224,33 @@ def parse_main_events(df: pd.DataFrame, target_columns: list):
events["Event_Names"] = events[event_name_col].progress_apply(
lambda x: ([x.strip()] if isinstance(x, str) else ([y.strip() for y in x]) if isinstance(x, list) else None)
)

hazards, main_event = "Hazards", "Main_Event"
if hazards in events.columns:
logger.info(f"STEP: Validation of Categorical Types for col {hazards}")
events[hazards] = events[hazards].apply(
lambda hazard_list: [
y
for y in [
validation.validate_categorical(h, categories=validation.hazards_categories) for h in hazard_list
]
if y
]
if hazard_list
else None
)

if main_event in events.columns:
logger.info(f"STEP: Validation of Categorical Types for col {main_event}")
events[main_event] = events[main_event].progress_apply(
lambda main_event_type: validation.validate_categorical(
main_event_type, categories=list(validation.main_event_categories.keys())
)
)
if all([x in events.columns for x in [hazards, main_event]]):
logger.info(f"STEP: Validation relationship between col {hazards} and col {main_event}")
events = events.progress_apply(lambda row: validation.validate_main_event_hazard_relation(row), axis=1)

logger.info("Converting annotation columns to strings to store in sqlite3")
annotation_cols = [col for col in events.columns if col.endswith(("_with_annotation", "_Annotation"))]

Expand Down Expand Up @@ -702,6 +729,7 @@ def get_target_cols() -> tuple[list]:
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)

utils = NormalizeUtils()
validation = CategoricalValidation()
nlp = utils.load_spacy_model(args.spaCy_model_name)

norm_num = NormalizeNumber(nlp, locale_config=args.locale_config)
Expand Down
2 changes: 1 addition & 1 deletion Database/schema/L1_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ CREATE TABLE Total_Summary (
Event_ID TEXT PRIMARY KEY NOT NULL CHECK (length(Event_ID) == 7), /* COMMENT 'UID' */
Event_Names OBJECT NOT NULL, /* COMMENT 'Array' */
Sources OBJECT NOT NULL, /* COMMENT 'Array' */
Main_Event TEXT NOT NULL, /* COMMENT 'Categorical' */
Main_Event TEXT NOT NULL CHECK (Main_Event IN ("Flood", "Extratropical Storm/Cyclone", "Tropical Storm/Cyclone", "Extreme Temperature", "Drought", "Wildfire", "Tornado",)); /* COMMENT 'Categorical' */
Hazards OBJECT NOT NULL, /* COMMENT 'Array', categorical */

Administrative_Areas_Norm OBJECT NOT NULL, /* COMMENT 'Array' of TEXT/NULL */
Expand Down
44 changes: 44 additions & 0 deletions Database/scr/normalize_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,3 +590,47 @@ def geojson_to_file(self, geojson_obj: str, area_name: str) -> str:
self.logger.debug(f"Could not process GeoJson to file. Error: {err}")
return None
return nid


class CategoricalValidation:
def __init__(self):
self.logger = Logging.get_logger("categorical-validation-utils")
self.main_event_categories = {
"Flood": ["Flood"],
"Extratropical Storm/Cyclone": ["Wind", "Flood", "Blizzard", "Hail"],
"Tropical Storm/Cyclone": ["Wind", "Flood", "Lightning"],
"Extreme Temperature": ["Heatwave", "Cold Spell"],
"Drought": ["Drought"],
"Wildfire": ["Wildfire"],
"Tornado": ["Wind"],
}

self.hazards_categories = [
"Wind",
"Flood",
"Blizzard",
"Hail",
"Drought",
"Heatwave",
"Lightning",
"Cold Spell",
"Wildfire",
]

def validate_categorical(self, text: str, categories: list) -> str | None:
try:
cat_idx = [x.lower() for x in categories].index(text.lower())
return categories[cat_idx]
except BaseException as err:
self.logger.warning(f"Value `{text}` may be invalid for this category. Error: {err}")
return

def validate_main_event_hazard_relation(
self, row: dict, hazards: str = "Hazards", main_event: str = "Main_Event"
) -> dict:
try:
related_hazards = [x for x in self.main_event_categories[row[main_event]]]
row[hazards] = list(set([h for h in row[hazards] if h.lower() in [x.lower() for x in related_hazards]]))
except BaseException as err:
self.logger.error(f"Could not validate relationship between {hazards} and {main_event}. Error: {err}")
return row

0 comments on commit b41b580

Please sign in to comment.