extractor.py
import json
import logging
import os
import time

import pandas as pd
import fitz  # PyMuPDF
import spacy
import scispacy  # (optional: this import ensures SciSpaCy is installed)

from pyclowder.extractors import Extractor
import pyclowder.files
import pyclowder.datasets

# create log object with current module name
log = logging.getLogger(__name__)


class PyMuPDFExtractor(Extractor):
    def __init__(self):
        Extractor.__init__(self)

        # add any additional arguments to parser
        # self.parser.add_argument('--max', '-m', type=int, nargs='?', default=-1,
        #                          help='maximum number (default=-1)')

        # parse command line and load default logging configuration
        self.setup()

        # setup logging for the extractor
        logging.getLogger('pyclowder').setLevel(logging.INFO)
        logging.getLogger('__main__').setLevel(logging.INFO)

        # Load the SciSpaCy model
        self.nlp = spacy.load("en_core_sci_sm")
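        # Note: "en_core_sci_sm" is not bundled with spacy/scispacy and must be
        # installed separately (e.g. via pip from the SciSpaCy model releases);
        # spacy.load() raises OSError if the model is missing.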
    def extract_sections(self, page):
        """
        Extract words from the page, build a combined text string, and use the
        SciSpaCy model to segment the text into sentences.

        This function uses page.get_text("words") to retrieve all words (with
        coordinates), then concatenates them in reading order, inserting a single
        space between words. It returns the list of sentence texts produced by
        the SciSpaCy sentence tokenizer.
        """
        # Each word is a tuple: (x0, y0, x1, y1, word, block_no, line_no, word_no)
        words = page.get_text("words")

        # Sort words in reading order.
        words.sort(key=lambda w: (w[5], w[6], w[7]))

        # Build a combined text string and record each word's character offsets.
        # (word_offsets is currently unused; it is kept so sentences could later
        # be mapped back to page coordinates.)
        combined_text_parts = []
        current_offset = 0
        word_offsets = []  # List of tuples (start, end) for each word.
        for w in words:
            word_text = w[4]
            combined_text_parts.append(word_text)
            start = current_offset
            end = start + len(word_text)
            word_offsets.append((start, end))
            # Assume a single space between words.
            current_offset = end + 1
        combined_text = " ".join(combined_text_parts)
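        # Worked example of the offset bookkeeping above (hypothetical input):
        # for the words ["Deep", "learning"], word_offsets becomes
        # [(0, 4), (5, 13)] and combined_text is "Deep learning".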
        # Process the combined text with SciSpaCy.
        doc = self.nlp(combined_text)
        sentences = [sent.text.strip() for sent in doc.sents]
        return sentences
    def process_message(self, connector, host, secret_key, resource, parameters):
        # Process the file and upload the results.
        # Uncomment to see the resource:
        # log.info(resource)
        # {'type': 'file', 'id': '6435b226e4b02b1506038ec5', 'intermediate_id': '6435b226e4b02b1506038ec5', 'name': 'N18-3011.pdf', 'file_ext': '.pdf', 'parent': {'type': 'dataset', 'id': '64344255e4b0a99d8062e6e0'}, 'local_paths': ['/tmp/tmp2hw6l5ra.pdf']}
        input_file = resource["local_paths"][0]
        input_file_id = resource['id']
        dataset_id = resource['parent'].get('id')
        input_filename = os.path.splitext(os.path.basename(resource["name"]))[0]
        input_file_ext = resource['file_ext']

        if input_file_ext == ".pdf":
            output_json_file = input_filename + "-pymupdf.json"
            output_csv_file = input_filename + "-pymupdf.csv"
        else:
            raise ValueError("Input file is not a PDF")
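        # For example, the sample resource above ("N18-3011.pdf") yields
        # "N18-3011-pymupdf.json" and "N18-3011-pymupdf.csv".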

        # These process messages will appear in the Clowder UI under Extractions.
        connector.message_process(resource, "Loading contents of file for pymupdf extraction...")

        # -------------------- Main Processing Loop --------------------
        # We save two outputs for each PDF:
        #   1. A JSON file containing the list of extracted sentences (per page).
        #   2. A CSV file with one row per sentence.
        # (Code for saving an annotated PDF is kept below, commented out.)
        try:
            start_time = time.time()
            doc = fitz.open(input_file)

            # Create a new document for the annotated PDF.
            # new_doc = fitz.open()

            # This dictionary will hold the sentences extracted per page.
            sentences_output = {"pages": []}
            all_sentences = []

            for page_number in range(len(doc)):
                page = doc.load_page(page_number)

                # Create a new page in the new document with the same dimensions.
                # new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height)
                # new_page.show_pdf_page(new_page.rect, doc, page_number)  # Copy original content.

                # Use PyMuPDF's text extraction; use OCR if needed.
                # if not page.get_text():
                #     tp = page.get_textpage_ocr(dpi=300, full=True)
                # else:
                #     tp = page.get_textpage()
                # (We could still extract the text dictionary for further processing.)
                # text_dict = tp.extractDICT()

                # Extract sentences using our SciSpaCy helper.
                sentences = self.extract_sections(page)
                for sentence in sentences:
                    all_sentences.append({"file": os.path.basename(input_file), "section": "", "sentence": sentence, "coordinates": ""})

                # Save the sentences (without coordinates) for this page.
                sentences_output["pages"].append({"page_number": page_number, "sentences": sentences})

                # (Optional) If you want to visualize, you can draw bounding boxes for
                # other elements, but here we skip drawing the sentence boxes to avoid
                # clutter. For example, you might still draw text spans in red:
                # for block in text_dict.get("blocks", []):
                #     if "lines" in block:
                #         for line in block["lines"]:
                #             for span in line["spans"]:
                #                 bbox = span["bbox"]
                #                 # Draw text span boxes in red (comment out if not needed).
                #                 new_page.draw_rect(bbox, color=(1, 0, 0), width=0.5)

                # (Optional) Draw table bounding boxes if found.
                # tables = page.find_tables()
                # if tables:
                #     for table in tables:
                #         table_bbox = table.bbox
                #         new_page.draw_rect(table_bbox, color=(0.5, 0, 0), width=1.5)
                #         print(f"Table found in {input_file}, page {page_number}: {table_bbox}")

            # Save the annotated PDF in the output folder.
            # new_doc.save(output_pdf_file, garbage=4, deflate=True, clean=True)
            # new_doc.close()
            doc.close()

            # Save the JSON file containing only the extracted sentences.
            with open(output_json_file, "w", encoding="utf-8") as jf:
                json.dump(sentences_output, jf, ensure_ascii=False, indent=4)
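            # The resulting JSON has the shape:
            #   {"pages": [{"page_number": 0, "sentences": ["...", "..."]}, ...]}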

            # Convert sentences to CSV.
            sentences_df = pd.DataFrame(all_sentences, columns=['file', 'section', 'sentence', 'coordinates'])
            sentences_df = sentences_df.astype(str)
            sentences_df.to_csv(output_csv_file, index=False)

            end_time = time.time()
            processing_time = end_time - start_time
            log.info(f"Processed {input_filename} in {processing_time:.2f} seconds.")
            log.info("Output JSON file generated: %s", output_json_file)
            log.info("Output CSV file generated: %s", output_csv_file)
            connector.message_process(resource, "PyMuPDF extraction completed.")

            # Clean up existing duplicates of the output files in the dataset.
            connector.message_process(resource, "Checking for duplicate files...")
            files_in_dataset = pyclowder.datasets.get_file_list(connector, host, secret_key, dataset_id)
            for file in files_in_dataset:
                if file["filename"] == output_csv_file or file["filename"] == output_json_file:
                    url = '%sapi/files/%s?key=%s' % (host, file["id"], secret_key)
                    connector.delete(url, verify=connector.ssl_verify if connector else True)

            # Upload outputs to Clowder.
            connector.message_process(resource, "Uploading output files to Clowder...")
            json_fileid = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, output_json_file)
            csv_fileid = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, output_csv_file)

            # Upload metadata to the dataset.
            extracted_files = [
                {"file_id": input_file_id, "filename": input_filename, "description": "Input PDF file"},
                {"file_id": json_fileid, "filename": output_json_file, "description": "PyMuPDF JSON output file"},
                {"file_id": csv_fileid, "filename": output_csv_file, "description": "PyMuPDF CSV output file"},
            ]
            content = {"extractor": "pymupdf-extractor", "extracted_files": extracted_files}
            context = "http://clowder.ncsa.illinois.edu/contexts/metadata.jsonld"
            # created_at = datetime.now().strftime("%a %d %B %H:%M:%S UTC %Y")
            user_id = "http://clowder.ncsa.illinois.edu/api/users"  # TODO: can update user id in config
            agent = {"@type": "user", "user_id": user_id}
            metadata = {"@context": [context], "agent": agent, "content": content}
            pyclowder.datasets.upload_metadata(connector, host, secret_key, dataset_id, metadata)
        except Exception as e:
            log.error(f"PyMuPDF Extractor error processing file {input_filename}: {e}")
            connector.message_process(resource, f"PyMuPDF Extractor error processing file {input_filename}: {e}")


if __name__ == "__main__":
    # Uncomment for local testing:
    # input_file = "data/2020.acl-main.207.pdf"
    # output_pdf_file = os.path.splitext(os.path.basename(input_file))[0] + "-pymupdf.pdf"
    # output_json_file = os.path.splitext(os.path.basename(input_file))[0] + "-pymupdf.json"
    # output_csv_file = os.path.splitext(os.path.basename(input_file))[0] + "-pymupdf.csv"
    # extractor = PyMuPDFExtractor()
    # doc = fitz.open(input_file)
    # # This dictionary will hold the sentences extracted per page.
    # sentences_output = {"pages": []}
    # all_sentences = []
    # for page_number in range(len(doc)):
    #     page = doc.load_page(page_number)
    #     # Extract sentences using our SciSpaCy helper.
    #     sentences = extractor.extract_sections(page)
    #     # Save the sentences for this page.
    #     for sentence in sentences:
    #         all_sentences.append({"file": os.path.basename(input_file), "section": "", "sentence": sentence, "coordinates": ""})
    #     sentences_output["pages"].append({"page_number": page_number, "sentences": sentences})
    # doc.close()
    # # Save the JSON file containing only the extracted sentences.
    # with open(output_json_file, "w", encoding="utf-8") as jf:
    #     json.dump(sentences_output, jf, ensure_ascii=False, indent=4)
    # # Convert sentences to CSV.
    # sentences_df = pd.DataFrame(all_sentences, columns=['file', 'section', 'sentence', 'coordinates'])
    # sentences_df = sentences_df.astype(str)
    # sentences_df.to_csv(output_csv_file, index=False)

    extractor = PyMuPDFExtractor()
    extractor.start()
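    # start() registers the extractor and begins listening for file events on
    # the message bus (typically RabbitMQ, configured through pyclowder's
    # standard CLI flags / environment variables parsed in setup()).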