Skip to content

Commit

Permalink
Relates to issue pashpashpash#2: prompt.py citations use {}, not []
Browse files: browse the repository at this point in the history
  • Loading branch information
1ashtray committed Jan 17, 2025
1 parent 71546e3 commit ed28388
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 6 deletions.
6 changes: 3 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def process_batch(start_idx, pages):
)
return start_idx, response

with concurrent.futures.ThreadPoolExecutor() as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
futures = []
for i in range(0, len(volume), batch_size):
pages = volume[i : i + batch_size]
Expand Down Expand Up @@ -144,8 +144,8 @@ def process_batch(start_idx, pages):
print("Profile pages:", profile_pages)
print("Chapter pages:", chapter_pages)

chapter_pages = [1]
profile_pages = [0]
#chapter_pages = [1]
#profile_pages = [0]
print(f"{len(volume)}")
print("\n__________\n")
print("Saving important pages to disk for QA...")
Expand Down
6 changes: 3 additions & 3 deletions citation_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def clean_and_relocate_citations_sequence(text):
)

# Define a pattern to match sequences of citations that come before a period
pattern = re.compile(r"((?:\[\^\d+\])+)(\.)")
pattern = re.compile(r"((?:\[\^\{\d+\}\])+)(\.)")
# Relocate the entire sequence of citations to after the period
relocated_text = re.sub(pattern, r"\2\1", cleaned_text)

Expand All @@ -21,15 +21,15 @@ def clean_and_relocate_citations_sequence(text):
def extract_text_and_citations(text, images, images_unscaled):
text = clean_and_relocate_citations_sequence(text)
# Split text by citations, capturing the citations as well
parts = re.split(r"(\[\^[\d\]]+\])", text)
parts = re.split(r"(\[\^\{\d+\}\])", text)

# Initialize variables to store the current text block and its citations
current_text = ""
citations = []
output = []

for part in parts:
if re.match(r"\[\^[\d\]]+\]", part):
if re.match(r"\[\^\{\d+\}\]", part):
current_citations = [int(num) for num in re.findall(r"\d+", part)]
valid_citations = [num for num in current_citations if num < len(images)]
citations.extend(valid_citations)
Expand Down

0 comments on commit ed28388

Please sign in to comment.