Full run pipeline (#158)
Co-authored-by: Ni Li <[email protected]>
i-be-snek and liniiiiii authored Oct 16, 2024
1 parent a79fcfc commit fa3a25a
Showing 3,562 changed files with 8,760,893 additions and 86 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
9 changes: 7 additions & 2 deletions .gitignore
@@ -15,8 +15,8 @@ Visualizations/**/plots
 **/*.env

 #ignore .jsonl file for batch process
-**/*basic.jsonl
-**/*impact.jsonl
+**/*.jsonl
+

 # ... except for this excel file (gold flat-format annotation table)
 !Database/gold/ImpactDB_DataTable_Validation.xlsx
@@ -35,3 +35,8 @@ geopy_cache.sqlite

 # vscode
 .vscode
+
+# full runs
+**/*.json.fixed.json
+**/geojson
+*full_run*/
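The new .gitignore rules keep full-run artifacts (fixed JSON files, geojson folders, and any directory whose name contains full_run) out of version control. A quick way to confirm which pattern matches a given path is git check-ignore -v; the sketch below calls it from Python with hypothetical paths, not files from this commit.

# Hypothetical paths used only to illustrate the new ignore rules.
import subprocess

candidate_paths = [
    "Database/full_run_2024/output.jsonl",               # *full_run*/ and **/*.jsonl
    "Database/geojson/sweden.geojson",                    # **/geojson
    "Database/gold/ImpactDB_DataTable_Validation.xlsx",   # re-included by the ! rule
]

for path in candidate_paths:
    # `git check-ignore -v` prints the matching pattern, or nothing if the path is not ignored
    result = subprocess.run(["git", "check-ignore", "-v", path], capture_output=True, text=True)
    print(path, "->", result.stdout.strip() or "not ignored")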
43 changes: 24 additions & 19 deletions Database/Prompts/batch_output_retrivel.py
@@ -115,26 +115,31 @@ def get_message_by_custom_id(batch_responses, custom_id):
 batch_id = batch.id
 output_file_id = batch.output_file_id
-# Retrieve the batch details
-client.batches.retrieve(batch_id)
-# Retrieve the file content associated with the output_file_id
-file_response = client.files.content(output_file_id)
-batch_responses = file_response.text
-
-# Iterate over the parsed JSON lines and find all matching custom_ids
-res = [json.loads(line) for line in batch_responses.strip().splitlines()]
-for i in res:
-    custom_id = i.get("custom_id", "")
-    if Event_ID in custom_id:
-        try:
-            # Retrieve the message content for the matching custom_id
-            message_content = get_message_by_custom_id(res, custom_id)
-
-            # Update the df dictionary with the message content directly
-            df.update(message_content)
-        except json.JSONDecodeError as e:
-            # If a JSONDecodeError occurs, log the error in the df
-            df["Json_Error"] = str(e)
+try:
+    client.batches.retrieve(batch_id)
+    # Retrieve the file content associated with the output_file_id
+    file_response = client.files.content(output_file_id)
+    batch_responses = file_response.text
+
+    # Iterate over the parsed JSON lines and find all matching custom_ids
+    res = [json.loads(line) for line in batch_responses.strip().splitlines()]
+    for i in res:
+        custom_id = i.get("custom_id", "")
+
+        if custom_id is not None and Event_ID in custom_id:
+            try:
+                # Retrieve the message content for the matching custom_id
+                message_content = get_message_by_custom_id(res, custom_id)
+
+                # Update the df dictionary with the message content directly
+                df.update(message_content)
+            except json.JSONDecodeError as e:
+                # If a JSONDecodeError occurs, log the error in the df
+                df["Json_Error"] = str(e)
+# Append the dictionary to the response list
+except ValueError:
+    pass

 response.append(df)

 out_file_path = (
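The retrieval code above now wraps the whole batch lookup in a try/except ValueError, so an event whose batch output cannot be fetched is skipped instead of aborting the run, while the inner except still records per-line JSON errors in df. A minimal, self-contained sketch of the same pattern follows; the helper name, arguments, and the ValueError assumption are illustrative, not the module's exact API.

# Minimal sketch of the retrieval pattern introduced above; names are illustrative.
import json
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


def collect_event_messages(batch, event_id: str, get_message_by_custom_id) -> dict:
    """Return message content for every custom_id of a batch that contains event_id."""
    df: dict = {}
    try:
        client.batches.retrieve(batch.id)
        file_response = client.files.content(batch.output_file_id)
        # Each line of the batch output file is one JSON response object
        res = [json.loads(line) for line in file_response.text.strip().splitlines()]
        for item in res:
            custom_id = item.get("custom_id", "")
            if custom_id and event_id in custom_id:
                try:
                    df.update(get_message_by_custom_id(res, custom_id))
                except json.JSONDecodeError as e:
                    df["Json_Error"] = str(e)
    except ValueError:
        # e.g. the batch has no retrievable output yet; skip it and keep the run going
        pass
    return df

Passing get_message_by_custom_id in as a callable keeps the sketch independent of the rest of batch_output_retrivel.py.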
4 changes: 2 additions & 2 deletions Database/Prompts/prompts.py
@@ -3,7 +3,7 @@
 # V_2 is the list of prompts for L1-3 with annotations giving the header names, finalized on 20240715
 # V_3_1 is a version based on V2, but with frozen variable names matching the schema we confirmed, 20240823
 # V_3_2 is the version based on V3, but in L1 only prompts the model to capture countries, and we use this as the final version for the test and full run
-# V_3_2 is the version based on V3_2, but the infobox and the whole text are feed in the end of the prompt.
+# V_3_3 is the version based on V3_2, but the infobox and the whole text are feed in the end of the prompt.
 # V_4 is the one with two prompts for each impact category, one prompt for L1/2 and one for L3
 # V_5 is the one with three prompts for each impact category
 # V_6 is a version based on V3 but forces the model not to generate null for non-nullable items, and also for L1 only asks for country information
@@ -1177,7 +1177,7 @@
 Ensure to capture all instances of affected people mentioned in the article, including direct and indirect causes. Only Give Json output, no extra explanation needed. """
 ],
 "buildings_damaged": [
-"""Based on the provided article {Info_Box} {Whole_Text},
+"""Based on information box {Info_Box} and header-content pair article {Whole_Text},
 extract the number of damaged buildings associated with the {Event_Name},
 covering a wide range of building types such as structures, homes, houses, households, apartments, office buildings, retail stores, hotels, schools, hospitals, and more,
 along with supporting annotations from the article. The number of damaged buildings information can be splited into 3 parts,
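The reworded buildings_damaged prompt above feeds the infobox and the header-content article text through named placeholders. A small illustration of how such a template can be rendered, assuming str.format-style substitution; the shortened template, infobox text, and event name below are invented.

# Illustrative only: filling a V_3_3-style template with named placeholders.
# The prompt text is a shortened stand-in, not the actual buildings_damaged prompt.
template = (
    "Based on information box {Info_Box} and header-content pair article {Whole_Text}, "
    "extract the number of damaged buildings associated with the {Event_Name}."
)

prompt = template.format(
    Info_Box="| Deaths | 12 | ... |",          # hypothetical infobox text
    Whole_Text="Impact: Several homes ...",    # hypothetical header-content pairs
    Event_Name="1999 Example Cyclone",         # hypothetical event name
)
print(prompt)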
6 changes: 2 additions & 4 deletions Database/Prompts/run_prompts.py
@@ -7,12 +7,10 @@
 import openai
 from dotenv import load_dotenv

-# the prompt list need to use the same variable names in our schema, and each key contains 1+ prompts
-
-from Database.Prompts.prompts import V_3_3 as target_prompts
-
+from Database.Prompts.prompts import V_3_2 as target_prompts
 from Database.scr.log_utils import Logging

+# the prompt list need to use the same variable names in our schema, and each key contains 1+ prompts
 if __name__ == "__main__":
     logger = Logging.get_logger("run prompts")
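The comment retained in run_prompts.py states that the prompt dictionary must use the schema's variable names as keys, with one or more prompts per key. A sketch of that structure and of iterating over it; the keys and prompt texts below are invented for illustration, not the real V_3_2 contents.

# Sketch of the structure the comment above describes: keys follow the schema's
# variable names and each key maps to one or more prompt templates.
V_3_2_example = {
    "buildings_damaged": [
        "Based on information box {Info_Box} ... extract the number of damaged buildings ...",
    ],
    "affected_people": [
        "First prompt covering L1/L2 ...",
        "Second prompt covering L3 ...",
    ],
}

for schema_field, prompts in V_3_2_example.items():
    for i, prompt_template in enumerate(prompts):
        print(f"{schema_field} / prompt {i}: {prompt_template[:40]}...")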
