Skip to content

Commit

Permalink
Evaluating subevents (#43)
Browse files Browse the repository at this point in the history
Co-authored-by: Shorouq <[email protected]>
  • Loading branch information
i-be-snek and i-be-snek authored Jul 31, 2024
1 parent 7d1b825 commit 6ab138d
Show file tree
Hide file tree
Showing 37 changed files with 1,490 additions and 137 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/run_eval_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# CI workflow: run the Evaluation unit tests with pytest on every push/PR to main.
# NOTE(review): indentation below is reconstructed — the scraped source had all
# leading whitespace stripped. Action version tags were also garbled by the
# scraper ("[email protected]"); v4/v5/v4 are the current majors as of this
# commit's date (Jul 2024) — confirm against the repository.
name: Run Unit Tests for Evaluation scripts via Pytest

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    name: Run Tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history: some tooling (e.g. versioning, git-lfs checks) needs it.
          fetch-depth: 0

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML cannot reinterpret the version as a float.
          python-version: "3.11"

      - id: cache-dependencies
        name: Cache dependencies
        uses: actions/cache@v4
        with:
          # Poetry is configured below to create the venv in-project, so this
          # path is what the install step populates.
          path: ${{ github.workspace }}/.venv
          key: dependencies-${{ hashFiles('**/poetry.lock') }}
          restore-keys: dependencies-

      - name: Install dependencies
        # FIX: the cache action exposes the hit flag as an *output*
        # (steps.<id>.outputs.cache-hit), not as a step property. The original
        # `steps.cache-dependencies.cache-hit` is always empty, so the original
        # condition never skipped the install even on a cache hit.
        if: steps.cache-dependencies.outputs.cache-hit != 'true'
        run: |
          python3 -m pip install -U pip poetry
          poetry --version
          poetry check --no-interaction
          poetry config virtualenvs.in-project true
          poetry install --no-interaction

      - name: Run tests
        run: |
          poetry run pytest -ra -s tests
Binary file modified Database/gold/ImpactDB_DataTable_Validation.xlsx
Binary file not shown.
3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Affected.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Buildings_Damaged.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Damage.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Deaths.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Displaced.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Events.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Homeless.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Injured.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Insured_Damage.parquet

This file was deleted.

3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Affected.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Buildings_Damaged.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Damage.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Deaths.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Displaced.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Events.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Homeless.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Injured.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Insured_Damage.parquet
Git LFS file not shown
13 changes: 8 additions & 5 deletions Database/gold_from_excel.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pathlib
import argparse
import pathlib
import re

import pandas as pd
from scr.normalize_utils import Logging

from Database.scr.normalize_utils import Logging

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
Expand Down Expand Up @@ -58,6 +59,7 @@ def flatten(xss):
# main and specific impact events have these three column sets in common
shared_cols = [
"Event_ID",
"Event_ID_decimal",
"Source",
"Event_Name",
]
Expand Down Expand Up @@ -115,7 +117,8 @@ def flatten(xss):
for i in ["Insured_Damage", "Damage"]:
convert_to_boolean.extend([x for x in specific_impacts_columns[i] if "_Adjusted" in x and "_Year" not in x])

convert_to_float = ["Event_ID"]
convert_to_float = ["Event_ID_decimal"]


def flatten_data_table():
logger.info("Loading excel file...")
Expand Down Expand Up @@ -197,7 +200,7 @@ def flatten_data_table():
)

logger.info("Splitting main events from specific impact")
data_table["main"] = data_table.Event_ID.apply(lambda x: float(x).is_integer())
data_table["main"] = data_table.Event_ID_decimal.apply(lambda x: float(x).is_integer())
data_table["main"].value_counts()

logger.info("Storing Main Events table")
Expand Down Expand Up @@ -271,4 +274,4 @@ def flatten_data_table():
logger.info(f"Creating {args.output_dir} if it does not exist!")
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)

flatten_data_table()
flatten_data_table()
5 changes: 3 additions & 2 deletions Database/merge_json_output.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import pathlib
from scr.normalize_utils import Logging, NormalizeJsonOutput

from Database.scr.normalize_utils import Logging, NormalizeJsonOutput

if __name__ == "__main__":
logger = Logging.get_logger("merge-mixtral-or-mistral-output")
Expand Down Expand Up @@ -34,7 +35,7 @@
logger.info(args)

logger.info(f"Creating {args.output_dir} if it does not exist!")
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)

json_utils = NormalizeJsonOutput()
dfs = json_utils.merge_json(args.input_dir)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#### Post-processed files

This is where parsed LLM outputs are stored in .parquet

Suggested breakdown:

```shell
Expand All @@ -8,8 +10,8 @@ Suggested breakdown:
└── nlp4climate # <-- ℹ️ Broader name to group experiments
├── dev # <-- ℹ️ dev set, specific to this group of experiments
│ ├── gpt4_experiment.parquet
│ └── mistral_experiment.json
│ └── mistral_experiment.parquet
└── test # <-- ℹ️ test set, specific to this group of experiments
├── gpt4_experiment.parquet
└── mistral_experiment.json
└── mistral_experiment.parquet
```
28 changes: 15 additions & 13 deletions Database/parse_events.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import argparse
import re
import pathlib
import re

import pandas as pd

from scr.normalize_locations import NormalizeLocation
from scr.normalize_numbers import NormalizeNumber
from scr.normalize_utils import Logging, NormalizeUtils
from Database.scr.normalize_locations import NormalizeLocation
from Database.scr.normalize_numbers import NormalizeNumber
from Database.scr.normalize_utils import Logging, NormalizeUtils

if __name__ == "__main__":
logger = Logging.get_logger("parse_events")
Expand Down Expand Up @@ -81,7 +82,7 @@
logger.info(f"Passed args: {args}")

logger.info(f"Creating {args.output_dir} if it does not exist!")
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)

utils = NormalizeUtils()
nlp = utils.load_spacy_model(args.spaCy_model_name)
Expand Down Expand Up @@ -138,7 +139,7 @@
logger.info(f"Normalizing boolean column {inflation_adjusted_col}")
events[inflation_adjusted_col] = events[inflation_adjusted_col].replace(
{_no: False, _yes: True}, regex=True
)
)

logger.info("Normalizing nulls")
events = utils.replace_nulls(events)
Expand Down Expand Up @@ -203,7 +204,6 @@
)

if args.location_column in events.columns and args.country_column in events.columns:

logger.info("Normalizing Locations")
events["Location_Tmp"] = events["Location"].apply(
lambda locations: (
Expand Down Expand Up @@ -312,17 +312,19 @@

sub_event = pd.concat([sub_event.Event_ID, sub_event[col].apply(pd.Series)], axis=1)

logger.info(f"Dropping any columns with non-str column names due to None types in the dicts {[c for c in sub_event.columns if not isinstance(c, str)]}")
logger.info(
f"Dropping any columns with non-str column names due to None types in the dicts {[c for c in sub_event.columns if not isinstance(c, str)]}"
)
sub_event = sub_event[[c for c in sub_event.columns if isinstance(c, str)]]

logger.info(f"Normalizing nulls for subevent {col}")
sub_event = utils.replace_nulls(sub_event)

specific_total_cols = [
col
for col in sub_event.columns
if col.startswith("Num_")
or col.endswith("_Damage")
or col.endswith("Damage")
and "Date" not in col
and args.location_column not in col
]
Expand Down Expand Up @@ -389,10 +391,10 @@
lambda country: (norm_loc.get_gadm_gid(country=country) if country else None)
)

'''
"""
logger.info(f"Dropping columns with no locations for subevent {col}")
sub_event.dropna(subset=[f"Location_{location_col}"], how="all", inplace=True)
'''
"""
logger.info(f"Normalizing location names for subevent {col}")
sub_event[
[
Expand Down Expand Up @@ -427,7 +429,7 @@
)

def normalize_location_rows_if_country(row):
# if location and country are identical in subevents, generalize country normalization
# if location and country are identical in subevents, generalize country normalization
if row[f"Location_{location_col}"] == row[args.country_column]:
for i in ["Norm", "Type", "GeoJson", "GID"]:
row[f"Location_{location_col}_{i}"] = row[f"Country_{i}"]
Expand Down
6 changes: 3 additions & 3 deletions Database/scr/normalize_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def get_gadm_gid(
@staticmethod
def extract_locations(
text: str,
) -> tuple[list] | None:
) -> tuple[list[str]]:
"""
Extracts countries and sublocations from the '|, &' string format
Example:
Expand All @@ -496,7 +496,7 @@ def extract_locations(
try:
split_by_pipe = text.split("|")
except BaseException:
return
return [], []
try:
if split_by_pipe:
for s in split_by_pipe:
Expand All @@ -507,7 +507,7 @@ def extract_locations(
locations.extend([locations_tmp])
return countries, locations
except BaseException:
return
return [], []

def _debug(self, response):
self.logger.debug(type(response))
Expand Down
Empty file added Evaluation/__init__.py
Empty file.
4 changes: 2 additions & 2 deletions Evaluation/comparer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import normaliser
from Evaluation.normaliser import Normaliser


class Comparer:
Expand All @@ -8,7 +8,7 @@ def __init__(self, null_penalty: bool, target_columns: list[str]):
"""Initialisation."""
# Penalty score if one field is None, but not the other
self.null_penalty = null_penalty
self.norm = normaliser.Normaliser()
self.norm = Normaliser()
self.target_columns = target_columns

def target_col(self, l) -> list:
Expand Down
Loading

0 comments on commit 6ab138d

Please sign in to comment.