Relates to issue pashpashpash#2: prompt.py emits citations as [^{n}] (number wrapped in braces), not [^n]

1ashtray · Jan 17, 2025 · ed28388
1 parent 71546e3
commit ed28388
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 6 deletions.
diff --git a/app.py b/app.py
@@ -115,7 +115,7 @@ def process_batch(start_idx, pages):
         )
         return start_idx, response
 
-    with concurrent.futures.ThreadPoolExecutor() as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
         futures = []
         for i in range(0, len(volume), batch_size):
             pages = volume[i : i + batch_size]
@@ -144,8 +144,8 @@ def process_batch(start_idx, pages):
     print("Profile pages:", profile_pages)
     print("Chapter pages:", chapter_pages)
 
-    chapter_pages = [1]
-    profile_pages = [0]
+    #chapter_pages = [1]
+    #profile_pages = [0]
     print(f"{len(volume)}")
     print("\n__________\n")
     print("Saving important pages to disk for QA...")

diff --git a/citation_processing.py b/citation_processing.py
@@ -11,7 +11,7 @@ def clean_and_relocate_citations_sequence(text):
     )
 
     # Define a pattern to match sequences of citations that come before a period
-    pattern = re.compile(r"((?:\[\^\d+\])+)(\.)")
+    pattern = re.compile(r"((?:\[\^\{\d+\}\])+)(\.)")
     # Relocate the entire sequence of citations to after the period
     relocated_text = re.sub(pattern, r"\2\1", cleaned_text)
 
@@ -21,15 +21,15 @@ def clean_and_relocate_citations_sequence(text):
 def extract_text_and_citations(text, images, images_unscaled):
     text = clean_and_relocate_citations_sequence(text)
     # Split text by citations, capturing the citations as well
-    parts = re.split(r"(\[\^[\d\]]+\])", text)
+    parts = re.split(r"(\[\^\{\d+\}\])", text)
 
     # Initialize variables to store the current text block and its citations
     current_text = ""
     citations = []
     output = []
 
     for part in parts:
-        if re.match(r"\[\^[\d\]]+\]", part):
+        if re.match(r"\[\^\{\d+\}\]", part):
             current_citations = [int(num) for num in re.findall(r"\d+", part)]
             valid_citations = [num for num in current_citations if num < len(images)]
             citations.extend(valid_citations)