Skip to content

Commit

Permalink
Evaluating subevents (#43)
Browse files Browse the repository at this point in the history
Co-authored-by: Shorouq <[email protected]>
  • Loading branch information
i-be-snek and i-be-snek authored Jul 31, 2024
1 parent 7d1b825 commit 6ab138d
Show file tree
Hide file tree
Showing 37 changed files with 1,490 additions and 137 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/run_eval_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# CI workflow: run the Evaluation unit tests with pytest on every push/PR to main.
# NOTE(review): indentation below is reconstructed — the scraped source had all
# leading whitespace stripped. Action version tags were also garbled by the
# scraper ("[email protected]"); v4/v5/v4 are the current majors as of this
# commit's date (Jul 2024) — confirm against the repository.
name: Run Unit Tests for Evaluation scripts via Pytest

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    name: Run Tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history: some tooling (e.g. versioning, git-lfs checks) needs it.
          fetch-depth: 0

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML cannot reinterpret the version as a float.
          python-version: "3.11"

      - id: cache-dependencies
        name: Cache dependencies
        uses: actions/cache@v4
        with:
          # Poetry is configured below to create the venv in-project, so this
          # path is what the install step populates.
          path: ${{ github.workspace }}/.venv
          key: dependencies-${{ hashFiles('**/poetry.lock') }}
          restore-keys: dependencies-

      - name: Install dependencies
        # FIX: the cache action exposes the hit flag as an *output*
        # (steps.<id>.outputs.cache-hit), not as a step property. The original
        # `steps.cache-dependencies.cache-hit` is always empty, so the original
        # condition never skipped the install even on a cache hit.
        if: steps.cache-dependencies.outputs.cache-hit != 'true'
        run: |
          python3 -m pip install -U pip poetry
          poetry --version
          poetry check --no-interaction
          poetry config virtualenvs.in-project true
          poetry install --no-interaction

      - name: Run tests
        run: |
          poetry run pytest -ra -s tests
Binary file modified Database/gold/ImpactDB_DataTable_Validation.xlsx
Binary file not shown.
3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Affected.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Buildings_Damaged.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Damage.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Deaths.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Displaced.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Events.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Homeless.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Injured.parquet

This file was deleted.

3 changes: 0 additions & 3 deletions Database/gold/gold_from_excel/Insured_Damage.parquet

This file was deleted.

3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Affected.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Buildings_Damaged.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Damage.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Deaths.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Displaced.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Events.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Homeless.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Injured.parquet
Git LFS file not shown
3 changes: 3 additions & 0 deletions Database/gold/specific_instances/Insured_Damage.parquet
Git LFS file not shown
13 changes: 8 additions & 5 deletions Database/gold_from_excel.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pathlib
import argparse
import pathlib
import re

import pandas as pd
from scr.normalize_utils import Logging

from Database.scr.normalize_utils import Logging

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
Expand Down Expand Up @@ -58,6 +59,7 @@ def flatten(xss):
# main and specific impact events have these three column sets in common
shared_cols = [
"Event_ID",
"Event_ID_decimal",
"Source",
"Event_Name",
]
Expand Down Expand Up @@ -115,7 +117,8 @@ def flatten(xss):
for i in ["Insured_Damage", "Damage"]:
convert_to_boolean.extend([x for x in specific_impacts_columns[i] if "_Adjusted" in x and "_Year" not in x])

convert_to_float = ["Event_ID"]
convert_to_float = ["Event_ID_decimal"]


def flatten_data_table():
logger.info("Loading excel file...")
Expand Down Expand Up @@ -197,7 +200,7 @@ def flatten_data_table():
)

logger.info("Splitting main events from specific impact")
data_table["main"] = data_table.Event_ID.apply(lambda x: float(x).is_integer())
data_table["main"] = data_table.Event_ID_decimal.apply(lambda x: float(x).is_integer())
data_table["main"].value_counts()

logger.info("Storing Main Events table")
Expand Down Expand Up @@ -271,4 +274,4 @@ def flatten_data_table():
logger.info(f"Creating {args.output_dir} if it does not exist!")
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)

flatten_data_table()
flatten_data_table()
5 changes: 3 additions & 2 deletions Database/merge_json_output.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import pathlib
from scr.normalize_utils import Logging, NormalizeJsonOutput

from Database.scr.normalize_utils import Logging, NormalizeJsonOutput

if __name__ == "__main__":
logger = Logging.get_logger("merge-mixtral-or-mistral-output")
Expand Down Expand Up @@ -34,7 +35,7 @@
logger.info(args)

logger.info(f"Creating {args.output_dir} if it does not exist!")
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)

json_utils = NormalizeJsonOutput()
dfs = json_utils.merge_json(args.input_dir)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#### Post-processed files

This is where parsed LLM outputs are stored in .parquet

Suggested breakdown:

```shell
Expand All @@ -8,8 +10,8 @@ Suggested breakdown:
└── nlp4climate # <-- ℹ️ Broader name to group experiments
├── dev # <-- ℹ️ dev set, specific to this group of experiments
│ ├── gpt4_experiment.parquet
│ └── mistral_experiment.json
│ └── mistral_experiment.parquet
└── test # <-- ℹ️ test set, specific to this group of experiments
├── gpt4_experiment.parquet
└── mistral_experiment.json
└── mistral_experiment.parquet
```
28 changes: 15 additions & 13 deletions Database/parse_events.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import argparse
import re
import pathlib
import re

import pandas as pd

from scr.normalize_locations import NormalizeLocation
from scr.normalize_numbers import NormalizeNumber
from scr.normalize_utils import Logging, NormalizeUtils
from Database.scr.normalize_locations import NormalizeLocation
from Database.scr.normalize_numbers import NormalizeNumber
from Database.scr.normalize_utils import Logging, NormalizeUtils

if __name__ == "__main__":
logger = Logging.get_logger("parse_events")
Expand Down Expand Up @@ -81,7 +82,7 @@
logger.info(f"Passed args: {args}")

logger.info(f"Creating {args.output_dir} if it does not exist!")
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True)

utils = NormalizeUtils()
nlp = utils.load_spacy_model(args.spaCy_model_name)
Expand Down Expand Up @@ -138,7 +139,7 @@
logger.info(f"Normalizing boolean column {inflation_adjusted_col}")
events[inflation_adjusted_col] = events[inflation_adjusted_col].replace(
{_no: False, _yes: True}, regex=True
)
)

logger.info("Normalizing nulls")
events = utils.replace_nulls(events)
Expand Down Expand Up @@ -203,7 +204,6 @@
)

if args.location_column in events.columns and args.country_column in events.columns:

logger.info("Normalizing Locations")
events["Location_Tmp"] = events["Location"].apply(
lambda locations: (
Expand Down Expand Up @@ -312,17 +312,19 @@

sub_event = pd.concat([sub_event.Event_ID, sub_event[col].apply(pd.Series)], axis=1)

logger.info(f"Dropping any columns with non-str column names due to None types in the dicts {[c for c in sub_event.columns if not isinstance(c, str)]}")
logger.info(
f"Dropping any columns with non-str column names due to None types in the dicts {[c for c in sub_event.columns if not isinstance(c, str)]}"
)
sub_event = sub_event[[c for c in sub_event.columns if isinstance(c, str)]]

logger.info(f"Normalizing nulls for subevent {col}")
sub_event = utils.replace_nulls(sub_event)

specific_total_cols = [
col
for col in sub_event.columns
if col.startswith("Num_")
or col.endswith("_Damage")
or col.endswith("Damage")
and "Date" not in col
and args.location_column not in col
]
Expand Down Expand Up @@ -389,10 +391,10 @@
lambda country: (norm_loc.get_gadm_gid(country=country) if country else None)
)

'''
"""
logger.info(f"Dropping columns with no locations for subevent {col}")
sub_event.dropna(subset=[f"Location_{location_col}"], how="all", inplace=True)
'''
"""
logger.info(f"Normalizing location names for subevent {col}")
sub_event[
[
Expand Down Expand Up @@ -427,7 +429,7 @@
)

def normalize_location_rows_if_country(row):
# if location and country are identical in subevents, generalize country normalization
# if location and country are identical in subevents, generalize country normalization
if row[f"Location_{location_col}"] == row[args.country_column]:
for i in ["Norm", "Type", "GeoJson", "GID"]:
row[f"Location_{location_col}_{i}"] = row[f"Country_{i}"]
Expand Down
6 changes: 3 additions & 3 deletions Database/scr/normalize_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def get_gadm_gid(
@staticmethod
def extract_locations(
text: str,
) -> tuple[list] | None:
) -> tuple[list[str]]:
"""
Extracts countries and sublocations from the '|, &' string format
Example:
Expand All @@ -496,7 +496,7 @@ def extract_locations(
try:
split_by_pipe = text.split("|")
except BaseException:
return
return [], []
try:
if split_by_pipe:
for s in split_by_pipe:
Expand All @@ -507,7 +507,7 @@ def extract_locations(
locations.extend([locations_tmp])
return countries, locations
except BaseException:
return
return [], []

def _debug(self, response):
self.logger.debug(type(response))
Expand Down
Empty file added Evaluation/__init__.py
Empty file.
4 changes: 2 additions & 2 deletions Evaluation/comparer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import normaliser
from Evaluation.normaliser import Normaliser


class Comparer:
Expand All @@ -8,7 +8,7 @@ def __init__(self, null_penalty: bool, target_columns: list[str]):
"""Initialisation."""
# Penalty score if one field is None, but not the other
self.null_penalty = null_penalty
self.norm = normaliser.Normaliser()
self.norm = Normaliser()
self.target_columns = target_columns

def target_col(self, l) -> list:
Expand Down
Loading

0 comments on commit 6ab138d

Please sign in to comment.