diff --git a/Database/gold/gold_from_excel_2/Affected.parquet b/Database/gold/gold_from_excel_2/Affected.parquet deleted file mode 100644 index ab2283090..000000000 --- a/Database/gold/gold_from_excel_2/Affected.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04cf088bd965b9c65af9e28b2427e64d0acbada0fdb67b9f956b12321be5f4bc -size 27316 diff --git a/Database/gold/gold_from_excel_2/Buildings_Damaged.parquet b/Database/gold/gold_from_excel_2/Buildings_Damaged.parquet deleted file mode 100644 index 21f800396..000000000 --- a/Database/gold/gold_from_excel_2/Buildings_Damaged.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6536b9f8e83bac69a746f0e12afec9dde21ab1e0bb740c06952ba8245a500dd8 -size 28526 diff --git a/Database/gold/gold_from_excel_2/Damage.parquet b/Database/gold/gold_from_excel_2/Damage.parquet deleted file mode 100644 index 680b7f4c9..000000000 --- a/Database/gold/gold_from_excel_2/Damage.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33397cf34322b3713754580e9fdac2597264380bb9dc02ec956d723b19bf90da -size 29985 diff --git a/Database/gold/gold_from_excel_2/Deaths.parquet b/Database/gold/gold_from_excel_2/Deaths.parquet deleted file mode 100644 index ddf839d9d..000000000 --- a/Database/gold/gold_from_excel_2/Deaths.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f008ca007eb1c12a9b94899df06d50aaf7771710743e35af322004254b2ba1e7 -size 28477 diff --git a/Database/gold/gold_from_excel_2/Displaced.parquet b/Database/gold/gold_from_excel_2/Displaced.parquet deleted file mode 100644 index 7da37003b..000000000 --- a/Database/gold/gold_from_excel_2/Displaced.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61f0a601dfb9191c17b42050a3d414edf9629583a79a708889077c34f40a84bb -size 27660 diff --git a/Database/gold/gold_from_excel_2/Events.parquet 
b/Database/gold/gold_from_excel_2/Events.parquet deleted file mode 100644 index 696474fe0..000000000 --- a/Database/gold/gold_from_excel_2/Events.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:86155cdf45bc57dfee0c8508da5f704abef572334652d0408a46b6b0ea6c93ce -size 40090 diff --git a/Database/gold/gold_from_excel_2/Homeless.parquet b/Database/gold/gold_from_excel_2/Homeless.parquet deleted file mode 100644 index 7ee79297b..000000000 --- a/Database/gold/gold_from_excel_2/Homeless.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:45e933c3d06ba8ecc8dba20349cab8864d32cd11247bff693bfc633df1c16095 -size 27116 diff --git a/Database/gold/gold_from_excel_2/Injured.parquet b/Database/gold/gold_from_excel_2/Injured.parquet deleted file mode 100644 index b27d720a9..000000000 --- a/Database/gold/gold_from_excel_2/Injured.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e0045f950ff2d740bb506183ec8fb1493da5643d10ace107234db976eb074ee -size 27320 diff --git a/Database/gold/gold_from_excel_2/Insured_Damage.parquet b/Database/gold/gold_from_excel_2/Insured_Damage.parquet deleted file mode 100644 index 571eb7bc8..000000000 --- a/Database/gold/gold_from_excel_2/Insured_Damage.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b7799dcee9ed7c84ebd9fee1bca6b150b29025df1dec0d6dd5c0e953034421d -size 28771 diff --git a/Database/scr/normalize_utils.py b/Database/scr/normalize_utils.py index baa61c755..e9493dbc2 100644 --- a/Database/scr/normalize_utils.py +++ b/Database/scr/normalize_utils.py @@ -285,8 +285,8 @@ def merge_json(self, file_path_dir: str) -> list[pd.DataFrame]: return dfs - @staticmethod def save_json( + self, dfs: list[pd.DataFrame], model_name: str, output_dir: str, @@ -314,11 +314,12 @@ def save_json( """ Takes a list of dataframes, merges it into a single file, and stores file in output_dir with 
the correct set and model names """ - captured_columns = set([x for xs in [df.columns for df in dfs] for x in xs]) + captured_columns = set([x for xs in [df.keys() for df in dfs] for x in xs]) + self.logger.info(f"Captured Columns: {captured_columns}") model_output = pd.DataFrame(dfs, columns=[c for c in columns if c in captured_columns]) filename = f"{output_dir}/{model_name}.json" model_output.to_json( filename, orient="records", ) - return filename + return filename \ No newline at end of file diff --git a/README.md b/README.md index ec123766b..d37ea391e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Wikimapcts is the first version of climate impact dataset creating by generative AI GPT4.0 -### Dependencies +## Dependencies Prerequisite: - Install [`poetry`](https://python-poetry.org/docs/#installation) Then activate a virtual environment and install the dependencies: @@ -26,27 +26,28 @@ pre-commit installed at .git/hooks/pre-commit git lfs install ``` -### Quickstart +## Quickstart -#### Parsing and evaluation pipeline +### Parsing and evaluation pipeline If you have generated some LLM output and would like to test it against the dev and test gold sets, here is a list of command to enable you to experiment with this yourself. -1. Choose a new experiment name! You will use this for the whole pipeline +#### (Step 1) Experiment name - **PRESTEP**: - If the system output is split across several files (such as Mixtral and Mistral system outputs), then first merge it: +Choose a new experiment name! You will use this for the whole pipeline. 
- ```shell - poetry run python3 Database/merge_json_output.py \ - --input_dir Database/raw// \ - --output_dir Database/raw/ \ - --model_name - ``` +#### PRESTEP (before Step 2): +If the system output is split across several files (such as Mixtral and Mistral system outputs), then first merge it: +```shell +poetry run python3 Database/merge_json_output.py \ +--input_dir Database/raw// \ +--output_dir Database/raw/ \ +--model_name +``` > [!WARNING] -> Your raw system output files should always land in the [`Database/raw/`] directory! +> Your raw system output files should always land in the `Database/raw/` directory! > [!TIP] > JSON files can be formatted easily with pre-commit: @@ -55,31 +56,32 @@ If you have generated some LLM output and would like to test it against the dev > pre-commit run --files Database/raw//> > ``` -2. Once all system output files are merged into a single JSON file (**or if this was already the case, such as with GPT4 output**), you can parse them so they are ready to be evaluated. - The parsing script [`Database/parse_events.py`](Database/parse_events.py) will normalize numbers (to min and max) and locations (using OpenStreetMap) and output a JSON file. +#### (Step 2) Parsing events and subevents - ```shell - - poetry run python3 Database/parse_events.py \ - --raw_dir Database/raw/ \ - --filename \ - --output_dir Database/output/ \ - - # "sub", "main" or "all" - --event_type all \ +Once all system output files are merged into a single JSON file (**or if this was already the case, such as with GPT4 output**), you can parse them so they are ready to be evaluated. + The parsing script [`Database/parse_events.py`](Database/parse_events.py) will normalize numbers (to min and max) and locations (using OpenStreetMap) and output a JSON file. 
- # if your country and location columns have a different name - # you can specify it here (otherwise, defaults to - # "Country" and "Location" (respectively)): - --country_column "Custom_Country_Column" \ - --location_column "Locations" - ``` +```shell +poetry run python3 Database/parse_events.py \ +--raw_dir Database/raw/ \ +--filename \ +--output_dir Database/output/ \ + +# "sub", "main" or "all" +--event_type all \ + +# if your country and location columns have a different name +# you can specify it here (otherwise, defaults to +# "Country" and "Location" (respectively)): +--country_column "Custom_Country_Column" \ +--location_column "Locations" +``` > [!WARNING] > Normalizing countries will go slow the first time. This is because we are using a free API (currently!). However, each time this script is run locally, geopy will cache the results, meaning that it will go faster the next time you run it on your local branch. Allow for 15-20 minutes the first time. -3. Evaluate against the dev and test sets +#### (Step 3) Evaluate against the dev and test sets ##### (A) Choose your config and columns The python dictionary in weights.py contains different weight configs. For example, the experiments nlp4climate weighs all the column types equally but excludes the "Event_Name" from evaluation. @@ -131,7 +133,7 @@ poetry run python3 Evaluation/evaluator.py --sys-file Database/output/nlp4clima --weights_config nlp4climate ``` -#### Parsing and normalization +### Parsing and normalization If you have new events to add to the database, first parse them and insert them. @@ -145,7 +147,7 @@ If you have new events to add to the database, first parse them and insert them. poetry run python3 Database/parse_events.py --help ``` -#### Inserting +### Inserting - To insert new main events: ```shell @@ -168,7 +170,7 @@ If you have new events to add to the database, first parse them and insert them. 
poetry run python3 Database/parse_events.py --help ``` -#### Database-related +### Database-related - To generate the database according to [`Database/schema.sql`](Database/schema.sql): ```shell @@ -201,9 +203,19 @@ To be implemented: > Please don't track or push excel sheets into the repository > The file `Database/gold/ImpactDB_DataTable_Validation.xlsx` has the latest gold annotations from 01/06/2024 and will be updated in the future. -#### Develop +### Develop + +Always pull a fresh copy of the `main` branch first! To add a new feature, check out a new branch from the `main` branch, make changes there, and push the new branch upstream to open a PR. PRs should result in a **squash commit** in the `main` branch. **It is recommended to code responsibly and ask someone to review your code. You can always tag @i-be-snek as a reviewer.** -Always pull a fresh copy of the `main` branch first! To add a new feature, check out a new branch from the `main` branch, make changes there, and push the new branch upstream to open a PR. PRs should result in a **squash commit** in the `main` branch. It is recommended to code responsibly and ask someone to review your code. +And don't forget to pull large files from Git Large File Storage! + +``` +# always pull first +git pull origin main + +# fetch all large files +git lfs fetch --all +``` Make sure any new dependencies are handled by `poetry`.You should be tracking and pushing both `poetry.lock` and `pyproject.toml` files. There is no need to manually add dependencies to the `pyproject.toml` file. Instead, use `poetry` commands: @@ -216,7 +228,7 @@ poetry add pandas -G main poetry add ipykernel@6.29.4 -G dev ``` -#### Problems? +### Problems? Start an Issue on GitHub if you find a bug in the code or have suggestions for a feature you need. If you run into an error or problem, please include the error trace or logs! :D @@ -224,7 +236,7 @@ If you run into an error or problem, please include the error trace or logs! 
:D > [!TIP] > Consult this [Github Cheat Sheet](https://education.github.com/git-cheat-sheet-education.pdf) -#### Sources & Citations +### Sources & Citations - GADM world data | `Database/data/gadm_world.csv` https://gadm.org/license.html