Skip to content

Commit

Permalink
🚧 Fill l3->l2 impact gap
Browse files Browse the repository at this point in the history
  • Loading branch information
i-be-snek committed Nov 13, 2024
1 parent 4255fcf commit 5ebf4bc
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 54 deletions.
95 changes: 46 additions & 49 deletions Database/fill_data_gap.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,66 +17,63 @@
args = parser.parse_args()
dg_util = DataGapUtils()
l1, l2, l3 = dg_util.load_data(input_dir=args.input_dir)
logger.info("Data loaded!")
event_id, date_year_suffix = "Event_ID", "_Date_Year"
admin_areas = "Administrative_Areas"
admin_area = "Administrative_Area"
num_min, num_max, num_approx = "Num_Min", "Num_Max", "Num_Approx"

for event_id in list(l1[event_id].unique()):
logger.info("Filling the time gap...")
for e_id in list(l1[event_id].unique()):
replace_with_date = (
l1.loc[l1[event_id] == event_id][[x for x in l1.columns if date_year_suffix in x]].iloc[0].to_dict()
l1.loc[l1[event_id] == e_id][[x for x in l1.columns if date_year_suffix in x]].iloc[0].to_dict()
)

for level in [l2, l3]:
for impact in level.keys():
level[impact][level[impact][event_id] == event_id] = level[impact][
level[impact][event_id] == event_id
].apply(lambda row: dg_util.fill_date(row, replace_with_date=replace_with_date), axis=1)
level[impact][level[impact][event_id] == e_id] = level[impact][level[impact][event_id] == e_id].apply(
lambda row: dg_util.fill_date(row, replace_with_date=replace_with_date),
axis=1,
)

for e_id in list(l1[event_id].unique()):
l1_areas = l1.loc[l1[event_id] == e_id][f"{admin_areas}_Norm"].iloc[0]
area_col_suffix = ["Norm", "Type", "GID", "GeoJson"]
l1_target_area_cols = [f"{admin_areas}_{s}" for s in area_col_suffix]
# Replace NaNs with NoneType
for level in [l2, l3]:
for impact in level.keys():
level[impact].replace(float("nan"), None, inplace=True)

for impact in l2.keys():
try:
l2_series = l2[impact][l2[impact][event_id] == e_id][f"{admin_areas}_Norm"]
logger.info("Filling impacts upward (l3->l2) if an l3 administrative area is missing from l2")
new_l2_rows: dict[str, list] = {
"Affected": [],
"Buildings_Damaged": [],
"Damage": [],
"Deaths": [],
"Displaced": [],
"Homeless": [],
"Injuries": [],
"Insured_Damage": [],
}

if not l2_series.empty:
for n in range(len(l2_series)):
l2_areas, l2_idx = {}, []
for e_id in list(l1[event_id].unique()):
for impact in l3.keys():
l3_areas = l3[impact][(~l3[impact][num_min].isna()) & (l3[impact][event_id] == e_id)][
f"{admin_area}_Norm"
].tolist()

l2_list = l2_series.iloc[n]
l2_idx = [l2_list.index(area) for area in l2_list if area not in l1_areas]
l2_areas = dg_util.flatten(l2[impact][l2[impact][event_id] == e_id][f"{admin_areas}_Norm"].tolist())

if l2_idx:
logger.info(f"Filling area data gap for Event_ID {e_id} for {impact} at l2")
target_area_cols = [f"{admin_areas}_{s}" for s in area_col_suffix]
l2_areas = l2[impact][l2[impact][event_id] == e_id][target_area_cols].to_dict(orient="list")
for k, v in l2_areas.items():
l2_areas[k] = [v[n][idx] for idx in l2_idx]
l1.loc[l1[event_id] == e_id][l1_target_area_cols].apply(
lambda row: dg_util.fill_area(row, l2_areas, area_col=admin_areas),
axis=1,
)
except BaseException as err:
logger.error(f"Could not fill area data gap for {impact} at l2. Error: {err}")
# check l3 impacts not found in l2
areas_not_in_l2 = [x for x in l3_areas if x not in l2_areas]
if areas_not_in_l2:
for area in areas_not_in_l2:
logger.info(
f"Administrative Area in l3 missing in l2 found for impact {impact} in Event_ID {e_id}. Area(s): {area}"
)
l3_rows = l3[impact][
(~l3[impact][num_min].isna())
& (l3[impact][event_id] == e_id)
& (l3[impact][f"{admin_area}_Norm"] == area)
].to_dict(orient="records")
for r in l3_rows:
new_l2_rows[impact].append(dg_util.l3_to_l2(l3_row=r))

for impact in l3.keys():
try:
l3_series = l3[impact][l3[impact][event_id] == e_id][f"{admin_area}_Norm"]
if not l3_series.empty:
for n in range(len(l3_series)):
l3_area = {}
l3_str = l3_series.iloc[n]
if isinstance(l3_str, str) and l3_str not in l1_areas:
logger.info(f"Filling area data gap for Event_ID {e_id} for {impact} at l3")
target_area_cols = [f"{admin_area}_{s}" for s in area_col_suffix]
l3_area = l3[impact][l3[impact][event_id] == e_id][target_area_cols].iloc[n].to_dict()
for k, v in l3_area.items():
l3_area[k] = [v]
l1.loc[l1[event_id] == e_id][l1_target_area_cols].apply(
lambda row: dg_util.fill_area(row, l3_area, area_col=admin_area),
axis=1,
)
except BaseException as err:
logger.error(f"Could not fill area data gap for {impact} at l3. Error: {err}")
print(new_l2_rows)
# TODO: append new_l2_rows
25 changes: 20 additions & 5 deletions Database/scr/normalize_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,16 @@ def load_data(self, input_dir: str) -> tuple[pd.DataFrame, dict[str, pd.DataFram

self.logger.info("Loading l1 files...")
l1 = pd.read_parquet(l1_filename, engine="fastparquet")
l1 = norm_utils.replace_nulls(l1).replace({float("nan"): None})
l1 = norm_utils.replace_nulls(l1)
l1.replace({float("nan"): None}, inplace=True)
l2 = {}
self.logger.info("Loading l2 files...")

for f, c in tqdm(zip(l2_filenames, l2_categories), desc="L2 files..."):
try:
tmp_df = pd.read_parquet(f"{input_dir}/l2/{f}", engine="fastparquet")
tmp_df = norm_utils.replace_nulls(tmp_df).replace({float("nan"): None})
tmp_df = norm_utils.replace_nulls(tmp_df)
tmp_df.replace({float("nan"): None}, inplace=True)

l2[c] = tmp_df
del tmp_df
Expand All @@ -44,7 +46,8 @@ def load_data(self, input_dir: str) -> tuple[pd.DataFrame, dict[str, pd.DataFram
for f, c in tqdm(zip(l3_filenames, l3_categories), desc="L3 files..."):
try:
tmp_df = pd.read_parquet(f"{input_dir}/l3/{f}", engine="fastparquet")
tmp_df = norm_utils.replace_nulls(tmp_df).replace({float("nan"): None})
tmp_df = norm_utils.replace_nulls(tmp_df)
tmp_df.replace({float("nan"): None}, inplace=True)

l3[c] = tmp_df
del tmp_df
Expand All @@ -70,5 +73,17 @@ def fill_area(row: dict, missing_areas: dict[str, list], area_col: str) -> dict:
return row

@staticmethod
def fill_count(row: dict, replace_with_count: dict):
pass
def l3_to_l2(l3_row: dict) -> dict:
l2_row = {}
for k in l3_row.keys():
if "Administrative_Area" in k:
l2_name = k.replace("Area", "Areas")
l2_row[l2_name] = [l3_row[k]]
del l2_name
elif "Location" not in k:
l2_row[k] = l3_row[k]
return l2_row

@staticmethod
def flatten(xss: list[list]) -> list:
return [x for xs in xss for x in xs]

0 comments on commit 5ebf4bc

Please sign in to comment.