Full run pipeline (#158)
Co-authored-by: Ni Li <[email protected]>
i-be-snek and liniiiiii authored Oct 16, 2024
1 parent a79fcfc commit fa3a25a
Showing 3,562 changed files with 8,760,893 additions and 86 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
9 changes: 7 additions & 2 deletions .gitignore
@@ -15,8 +15,8 @@ Visualizations/**/plots
 **/*.env

 #ignore .jsonl file for batch process
-**/*basic.jsonl
-**/*impact.jsonl
+**/*.jsonl
+

 # ... except for this excel file (gold flat-format annotation table)
 !Database/gold/ImpactDB_DataTable_Validation.xlsx
@@ -35,3 +35,8 @@ geopy_cache.sqlite

 # vscode
 .vscode
+
+# full runs
+**/*.json.fixed.json
+**/geojson
+*full_run*/
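The new .gitignore rules keep full-run artifacts (fixed JSON files, geojson folders, and any directory whose name contains full_run) out of version control. A quick way to confirm which pattern matches a given path is git check-ignore -v; the sketch below calls it from Python with hypothetical paths, not files from this commit.

# Hypothetical paths used only to illustrate the new ignore rules.
import subprocess

candidate_paths = [
    "Database/full_run_2024/output.jsonl",               # *full_run*/ and **/*.jsonl
    "Database/geojson/sweden.geojson",                    # **/geojson
    "Database/gold/ImpactDB_DataTable_Validation.xlsx",   # re-included by the ! rule
]

for path in candidate_paths:
    # `git check-ignore -v` prints the matching pattern, or nothing if the path is not ignored
    result = subprocess.run(["git", "check-ignore", "-v", path], capture_output=True, text=True)
    print(path, "->", result.stdout.strip() or "not ignored")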
43 changes: 24 additions & 19 deletions Database/Prompts/batch_output_retrivel.py
@@ -115,26 +115,31 @@ def get_message_by_custom_id(batch_responses, custom_id):
 batch_id = batch.id
 output_file_id = batch.output_file_id
-# Retrieve the batch details
-client.batches.retrieve(batch_id)
-# Retrieve the file content associated with the output_file_id
-file_response = client.files.content(output_file_id)
-batch_responses = file_response.text
-
-# Iterate over the parsed JSON lines and find all matching custom_ids
-res = [json.loads(line) for line in batch_responses.strip().splitlines()]
-for i in res:
-    custom_id = i.get("custom_id", "")
-    if Event_ID in custom_id:
-        try:
-            # Retrieve the message content for the matching custom_id
-            message_content = get_message_by_custom_id(res, custom_id)
-
-            # Update the df dictionary with the message content directly
-            df.update(message_content)
-        except json.JSONDecodeError as e:
-            # If a JSONDecodeError occurs, log the error in the df
-            df["Json_Error"] = str(e)
+try:
+    client.batches.retrieve(batch_id)
+    # Retrieve the file content associated with the output_file_id
+    file_response = client.files.content(output_file_id)
+    batch_responses = file_response.text
+
+    # Iterate over the parsed JSON lines and find all matching custom_ids
+    res = [json.loads(line) for line in batch_responses.strip().splitlines()]
+    for i in res:
+        custom_id = i.get("custom_id", "")
+
+        if custom_id is not None and Event_ID in custom_id:
+            try:
+                # Retrieve the message content for the matching custom_id
+                message_content = get_message_by_custom_id(res, custom_id)
+
+                # Update the df dictionary with the message content directly
+                df.update(message_content)
+            except json.JSONDecodeError as e:
+                # If a JSONDecodeError occurs, log the error in the df
+                df["Json_Error"] = str(e)
+# Append the dictionary to the response list
+except ValueError:
+    pass

 response.append(df)

 out_file_path = (
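The retrieval code above now wraps the whole batch lookup in a try/except ValueError, so an event whose batch output cannot be fetched is skipped instead of aborting the run, while the inner except still records per-line JSON errors in df. A minimal, self-contained sketch of the same pattern follows; the helper name, arguments, and the ValueError assumption are illustrative, not the module's exact API.

# Minimal sketch of the retrieval pattern introduced above; names are illustrative.
import json
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


def collect_event_messages(batch, event_id: str, get_message_by_custom_id) -> dict:
    """Return message content for every custom_id of a batch that contains event_id."""
    df: dict = {}
    try:
        client.batches.retrieve(batch.id)
        file_response = client.files.content(batch.output_file_id)
        # Each line of the batch output file is one JSON response object
        res = [json.loads(line) for line in file_response.text.strip().splitlines()]
        for item in res:
            custom_id = item.get("custom_id", "")
            if custom_id and event_id in custom_id:
                try:
                    df.update(get_message_by_custom_id(res, custom_id))
                except json.JSONDecodeError as e:
                    df["Json_Error"] = str(e)
    except ValueError:
        # e.g. the batch has no retrievable output yet; skip it and keep the run going
        pass
    return df

Passing get_message_by_custom_id in as a callable keeps the sketch independent of the rest of batch_output_retrivel.py.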
4 changes: 2 additions & 2 deletions Database/Prompts/prompts.py
@@ -3,7 +3,7 @@
 # V_2 is the list of prompts for L1-3 with annotations giving the header names, finalized on 20240715
 # V_3_1 is a version based on V2, but with frozen variable names matching the schema we confirmed, 20240823
 # V_3_2 is the version based on V3, but in L1 only prompts the model to capture countries, and we use this as the final version for the test and full run
-# V_3_2 is the version based on V3_2, but the infobox and the whole text are feed in the end of the prompt.
+# V_3_3 is the version based on V3_2, but the infobox and the whole text are feed in the end of the prompt.
 # V_4 is the one with two prompts for each impact category, one prompt for L1/2 and one for L3
 # V_5 is the one with three prompts for each impact category
 # V_6 is a version based on V3 but forces the model not to generate null for non-nullable items, and also for L1 only asks for country information
@@ -1177,7 +1177,7 @@
 Ensure to capture all instances of affected people mentioned in the article, including direct and indirect causes. Only Give Json output, no extra explanation needed. """
 ],
 "buildings_damaged": [
-"""Based on the provided article {Info_Box} {Whole_Text},
+"""Based on information box {Info_Box} and header-content pair article {Whole_Text},
 extract the number of damaged buildings associated with the {Event_Name},
 covering a wide range of building types such as structures, homes, houses, households, apartments, office buildings, retail stores, hotels, schools, hospitals, and more,
 along with supporting annotations from the article. The number of damaged buildings information can be splited into 3 parts,
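The reworded buildings_damaged prompt above feeds the infobox and the header-content article text through named placeholders. A small illustration of how such a template can be rendered, assuming str.format-style substitution; the shortened template, infobox text, and event name below are invented.

# Illustrative only: filling a V_3_3-style template with named placeholders.
# The prompt text is a shortened stand-in, not the actual buildings_damaged prompt.
template = (
    "Based on information box {Info_Box} and header-content pair article {Whole_Text}, "
    "extract the number of damaged buildings associated with the {Event_Name}."
)

prompt = template.format(
    Info_Box="| Deaths | 12 | ... |",          # hypothetical infobox text
    Whole_Text="Impact: Several homes ...",    # hypothetical header-content pairs
    Event_Name="1999 Example Cyclone",         # hypothetical event name
)
print(prompt)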
6 changes: 2 additions & 4 deletions Database/Prompts/run_prompts.py
@@ -7,12 +7,10 @@
 import openai
 from dotenv import load_dotenv

-# the prompt list need to use the same variable names in our schema, and each key contains 1+ prompts
-
-from Database.Prompts.prompts import V_3_3 as target_prompts
-
+from Database.Prompts.prompts import V_3_2 as target_prompts
 from Database.scr.log_utils import Logging

+# the prompt list need to use the same variable names in our schema, and each key contains 1+ prompts
 if __name__ == "__main__":
     logger = Logging.get_logger("run prompts")
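The comment retained in run_prompts.py states that the prompt dictionary must use the schema's variable names as keys, with one or more prompts per key. A sketch of that structure and of iterating over it; the keys and prompt texts below are invented for illustration, not the real V_3_2 contents.

# Sketch of the structure the comment above describes: keys follow the schema's
# variable names and each key maps to one or more prompt templates.
V_3_2_example = {
    "buildings_damaged": [
        "Based on information box {Info_Box} ... extract the number of damaged buildings ...",
    ],
    "affected_people": [
        "First prompt covering L1/L2 ...",
        "Second prompt covering L3 ...",
    ],
}

for schema_field, prompts in V_3_2_example.items():
    for i, prompt_template in enumerate(prompts):
        print(f"{schema_field} / prompt {i}: {prompt_template[:40]}...")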
