Skip to content

Commit 0e71f61

Browse files
authored
6 fix filename json output (#7)
* updated input filename * updated extractors info with correct repo name * updated filenames * logging resource * fixed filenames * log filenames * more logs * right filetype * corrected log format * cleaned code * clean logs * updated changelog * 8 convert json output to txt (#9) * renamed doc2json to doc2txt * code cleanup * added json2txt conversion * changing to localhost for testing * for local testing * create init * update renamed module * updated renamed module doc2txt * updated arguments * updated arguments * moved process json to process pdf * updated import * added back extractor * back to extractor grobid * update writing to file * some extra comments * updated text data lookup * updated text key extraction * catch return files * updated extractor info * updated changelog
1 parent c8ef835 commit 0e71f61

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+148
-78
lines changed

Diff for: CHANGELOG

+2
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
88

99
### Added
1010
- AllenAI s2orc-doc2json as a clowder extractor. [#3](https://github.com/clowder-framework/text-extractor/issues/3)
11+
- Conversion of json output to text. [#8](https://github.com/clowder-framework/text-extractor/issues/8)
1112

1213
### Changed
1314

1415

1516
### Fixed
17+
- Output json filename changed to same name as input file. [#6](https://github.com/clowder-framework/text-extractor/issues/6)
1618

Diff for: Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
FROM python:3.10
22

3-
COPY doc2json ./doc2json
3+
COPY doc2txt ./doc2txt
44
COPY tests ./tests
55
COPY build_run.sh setup.py textextractor.py requirements.txt extractor_info.json ./
66

File renamed without changes.

Diff for: doc2json/config.py renamed to doc2txt/config.py

File renamed without changes.

Diff for: doc2json/flask/app.py renamed to doc2txt/flask/app.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
import hashlib
55
import requests
66
from flask import Flask, request, jsonify, flash, url_for, redirect, render_template, send_file
7-
from doc2json.grobid2json.process_pdf import process_pdf_stream
8-
from doc2json.tex2json.process_tex import process_tex_stream
9-
from doc2json.jats2json.process_jats import process_jats_stream
7+
from doc2txt.grobid2json.process_pdf import process_pdf_stream
8+
from doc2txt.tex2json.process_tex import process_tex_stream
9+
from doc2txt.jats2json.process_jats import process_jats_stream
1010

1111
app = Flask(__name__)
1212

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

Diff for: doc2json/grobid2json/grobid/grobid_client.py renamed to doc2txt/grobid2json/grobid/grobid_client.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import time
66
import glob
77
import logging
8-
from doc2json.grobid2json.grobid.client import ApiClient
8+
from doc2txt.grobid2json.grobid.client import ApiClient
99
import ntpath
1010
from typing import List
1111

@@ -124,28 +124,31 @@ def process_pdf_stream(self, pdf_file: str, pdf_strm: bytes, output: str, servic
124124
elif status != 200:
125125
with open(os.path.join(output, "failed.log"), "a+") as failed:
126126
failed.write(pdf_file.strip(".pdf") + "\n")
127-
print('Processing failed with error ' + str(status))
127+
log.error('Processing failed with error %s', str(status))
128128
return ""
129129
else:
130130
return res.text
131131

132-
def process_pdf(self, pdf_file: str, output: str, service: str) -> None:
132+
def process_pdf(self, pdf_file: str, input_filename: str, output: str, service: str) -> None:
133133
# check if TEI file is already produced
134134
# we use ntpath here to be sure it will work on Windows too
135-
pdf_file_name = ntpath.basename(pdf_file)
136-
filename = os.path.join(output, os.path.splitext(pdf_file_name)[0] + '.tei.xml')
135+
#pdf_file_name = ntpath.basename(pdf_file)
136+
filename = os.path.join(output, input_filename + '.tei.xml')
137137
if os.path.isfile(filename):
138138
return
139139

140-
log.info("PDF File to process is %s", pdf_file)
140+
log.info("Processing pdf file in path %s with name %s", pdf_file, input_filename)
141141
pdf_strm = open(pdf_file, 'rb').read()
142-
tei_text = self.process_pdf_stream(pdf_file, pdf_strm, output, service)
142+
tei_text = self.process_pdf_stream(input_filename, pdf_strm, output, service)
143143

144144
# writing TEI file
145145
if tei_text:
146146
with io.open(filename, 'w+', encoding='utf8') as tei_file:
147-
log.info("writing to tei file %s", tei_file)
147+
log.info("Writing to tei file %s", tei_file)
148148
tei_file.write(tei_text)
149+
else:
150+
log.error("TEI processing unsuccessful")
151+
149152

150153
def process_citation(self, bib_string: str, log_file: str) -> str:
151154
# process citation raw string and return corresponding dict
File renamed without changes.

Diff for: doc2json/grobid2json/process_pdf.py renamed to doc2txt/grobid2json/process_pdf.py

+28-20
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
from bs4 import BeautifulSoup
77
from typing import Optional, Dict
88

9-
from doc2json.grobid2json.grobid.grobid_client import GrobidClient
10-
from doc2json.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json, convert_tei_xml_soup_to_s2orc_json
9+
from doc2txt.grobid2json.grobid.grobid_client import GrobidClient
10+
from doc2txt.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json, convert_tei_xml_soup_to_s2orc_json
11+
from doc2txt.json2txt.json2txt import process_json
1112

1213
BASE_TEMP_DIR = 'temp'
1314
BASE_OUTPUT_DIR = 'output'
14-
BASE_LOG_DIR = 'log'
1515

1616
# create log object with current module name
1717
log = logging.getLogger(__name__)
@@ -40,54 +40,61 @@ def process_pdf_stream(input_file: str, sha: str, input_stream: bytes, grobid_co
4040

4141
def process_pdf_file(
4242
input_file: str,
43-
temp_dir: str = BASE_TEMP_DIR,
44-
output_dir: str = BASE_OUTPUT_DIR,
43+
input_filename :str,
44+
temp_dir: str,
45+
output_dir: str,
4546
grobid_config: Optional[Dict] = None
46-
) -> str:
47+
) -> [str, str, str]:
4748
"""
4849
Process a PDF file and get JSON representation
49-
:param input_file:
50+
:param input_file: input file resource
51+
:param input_filename: input filename resource
5052
:param temp_dir:
5153
:param output_dir:
52-
:return:
54+
:return: xml output file, json output file, txt output file
5355
"""
5456
os.makedirs(temp_dir, exist_ok=True)
5557
os.makedirs(output_dir, exist_ok=True)
5658

57-
# get paper id as the name of the file
58-
paper_id = '.'.join(input_file.split('/')[-1].split('.')[:-1])
59-
tei_file = os.path.join(temp_dir, f'{paper_id}.tei.xml')
60-
output_file = os.path.join(output_dir, f'{paper_id}.json')
61-
log.info("Files %s, %s, %s", paper_id, tei_file, output_file)
59+
# filenames for tei and json outputs
60+
tei_file = os.path.join(temp_dir, f'{input_filename}.tei.xml')
61+
json_file = os.path.join(output_dir, f'{input_filename}.json')
62+
txt_file = os.path.join(output_dir, f'{input_filename}.txt')
6263

6364
# check if input file exists and output file doesn't
6465
if not os.path.exists(input_file):
6566
raise FileNotFoundError(f"{input_file} doesn't exist")
66-
if os.path.exists(output_file):
67-
print(f'{output_file} already exists!')
67+
if os.path.exists(json_file):
68+
log.warning(f'{json_file} already exists!')
6869

6970
# process PDF through Grobid -> TEI.XML
7071
client = GrobidClient(grobid_config)
7172
# TODO: compute PDF hash
7273
# TODO: add grobid version number to output
73-
client.process_pdf(input_file, temp_dir, "processFulltextDocument")
74+
client.process_pdf(input_file, input_filename, temp_dir, "processFulltextDocument")
7475

7576
# process TEI.XML -> JSON
7677
assert os.path.exists(tei_file)
7778
paper = convert_tei_xml_file_to_s2orc_json(tei_file)
7879

7980
# write to file
80-
with open(output_file, 'w') as outf:
81+
with open(json_file, 'w') as outf:
8182
json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
8283

83-
return output_file
84+
# extract text field from json and write to file
85+
output_txt = process_json(json_file, "text")
86+
with open(txt_file, 'w') as outfile:
87+
for text in output_txt:
88+
outfile.write(f"{text}\n")
89+
90+
return tei_file, json_file, txt_file
8491

8592

8693
if __name__ == '__main__':
8794
parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
8895
parser.add_argument("-i", "--input", default=None, help="path to the input PDF file")
8996
parser.add_argument("-t", "--temp", default=BASE_TEMP_DIR, help="path to the temp dir for putting tei xml files")
90-
parser.add_argument("-o", "--output", default=BASE_OUTPUT_DIR, help="path to the output dir for putting json files")
97+
parser.add_argument("-o", "--output", default=BASE_OUTPUT_DIR, help="path to the output dir for putting json and txt files")
9198
parser.add_argument("-k", "--keep", action='store_true')
9299

93100
args = parser.parse_args()
@@ -102,7 +109,8 @@ def process_pdf_file(
102109
os.makedirs(temp_path, exist_ok=True)
103110
os.makedirs(output_path, exist_ok=True)
104111

105-
process_pdf_file(input_path, temp_path, output_path)
112+
input_filename = os.path.splitext(os.path.basename(input_path))[0]
113+
tei_file, json_file, txt_file = process_pdf_file(input_path, input_filename, temp_path, output_path)
106114

107115
runtime = round(time.time() - start_time, 3)
108116
print("runtime: %s seconds " % (runtime))

Diff for: doc2json/grobid2json/tei_to_json.py renamed to doc2txt/grobid2json/tei_to_json.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77
from bs4 import BeautifulSoup, NavigableString
88
from typing import List, Dict, Tuple
99

10-
from doc2json.s2orc import Paper
10+
from doc2txt.s2orc import Paper
1111

12-
from doc2json.utils.grobid_util import parse_bib_entry, extract_paper_metadata_from_grobid_xml
13-
from doc2json.utils.citation_util import SINGLE_BRACKET_REGEX, BRACKET_REGEX, BRACKET_STYLE_THRESHOLD
14-
from doc2json.utils.citation_util import is_expansion_string, _clean_empty_and_duplicate_authors_from_grobid_parse
15-
from doc2json.utils.refspan_util import sub_spans_and_update_indices
12+
from doc2txt.utils.grobid_util import parse_bib_entry, extract_paper_metadata_from_grobid_xml
13+
from doc2txt.utils.citation_util import SINGLE_BRACKET_REGEX, BRACKET_REGEX, BRACKET_STYLE_THRESHOLD
14+
from doc2txt.utils.citation_util import is_expansion_string, _clean_empty_and_duplicate_authors_from_grobid_parse
15+
from doc2txt.utils.refspan_util import sub_spans_and_update_indices
1616

1717

1818
REPLACE_TABLE_TOKS = {
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

Diff for: doc2txt/json2txt/json2txt.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Convert json to text
2+
3+
import json
4+
import logging
5+
6+
# create log object with current module name
7+
log = logging.getLogger(__name__)
8+
9+
10+
def process_json(input_file, key):
    """
    Convert a json file to a list of text entries.

    The output starts with the value of the top-level "title" field. It is
    followed by the values stored under `key` in the entries of
    pdf_parse.abstract and then of pdf_parse.body_text, in file order.

    Args:
        input_file (str): Path of the json input file
        key (str): Json field key to extract from the abstract and body entries
    Returns:
        output (list): Title followed by the values extracted from the json
    Raises:
        KeyError: If "title", "pdf_parse", "abstract" or "body_text" is missing
    """
    # The context manager closes the file even when json.load raises.
    # JSON text is UTF-8 by specification, so do not depend on the locale.
    with open(input_file, encoding="utf-8") as json_file:
        json_data = json.load(json_file)  # load json object to a dictionary

    # if using grobid, one can also use the pdf_parse key and title key.
    title_text = json_data["title"]
    pdf_json_data = json_data["pdf_parse"]
    abstract_data = pdf_json_data["abstract"]
    body_data = pdf_json_data["body_text"]

    # title first, then abstract entries, then body entries
    output = [title_text]
    output.extend(item_generator(abstract_data, key))
    output.extend(item_generator(body_data, key))
    return output
40+
41+
42+
def item_generator(json_data, lookup_key):
    """
    Yield the values stored under a key in parsed json data.

    A dict yields its own value for lookup_key when the key is present;
    values stored under other keys are not searched. A list is walked in
    order and every item is handled the same way, so lists nested in lists
    are followed. Any other type yields nothing.

    Args:
        json_data (dict | list): Parsed json data
        lookup_key (str): Json field key to extract data
    Yields:
        The value found under lookup_key, once per matching dict
    """
    if isinstance(json_data, dict):
        # no nested lookups. only the value held directly by this dictionary
        # is yielded; dictionaries stored under other keys are skipped
        if lookup_key in json_data:
            yield json_data[lookup_key]
    elif isinstance(json_data, list):
        for item in json_data:
            yield from item_generator(item, lookup_key)

Diff for: doc2json/s2orc.py renamed to doc2txt/s2orc.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from datetime import datetime
66
from typing import Dict, List, Optional
7-
from doc2json.config import *
7+
from doc2txt.config import *
88

99

1010
CORRECT_KEYS = {
File renamed without changes.

Diff for: doc2json/spp2json/process_pdf.py renamed to doc2txt/spp2json/process_pdf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import time
55
from typing import Dict
66

7-
from doc2json.spp2json.spp.spp_client import SppClient
8-
from doc2json.spp2json.spp.spp_json_to_s2orc_json import convert_spp_json_to_s2orc_json
7+
from doc2txt.spp2json.spp.spp_client import SppClient
8+
from doc2txt.spp2json.spp.spp_json_to_s2orc_json import convert_spp_json_to_s2orc_json
99

1010

1111

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

Diff for: doc2txt/utils/__init__.py

Whitespace-only changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

Diff for: extractor_info.json

+6-8
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,24 @@
11
{
22
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
3-
"name": "textextractor",
4-
"version": "0.1",
5-
"description": "extracts text from pdf files",
3+
"name": "extractors-pdf2text",
4+
"version": "0.4",
5+
"description": "Extracts text from pdf files. Creates an xml, json and txt file and uploads to Clowder dataset. Uses Grobid service and AllenAI s2orc-doc2json",
66
"author": "Mathew, Minu <[email protected]>; Lo, Kyle and Wang, Lucy Lu and Neumann, Mark and Kinney, Rodney and Weld, Daniel",
77
"contributors": [],
88
"contexts": [{}],
99
"repository": [
1010
{
1111
"repType": "git",
12-
"repUrl": "https://github.com/clowder-framework/text-extractor"
12+
"repUrl": "https://github.com/clowder-framework/extractors-s2orc-pdf2text"
1313
}
1414
],
1515
"process": {
1616
"file": [
17-
"application/pdf",
18-
"application/*",
19-
"text/plain"
17+
"application/pdf"
2018
]
2119
},
2220
"external_services": [],
2321
"dependencies": [],
2422
"bibtex": [],
25-
"labels": ["Type/Any"]
23+
"labels": ["Type/PDF"]
2624
}

Diff for: setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import setuptools
33

44
setuptools.setup(
5-
name='doc2json',
5+
name='doc2txt',
66
version='0.1',
77
packages=setuptools.find_packages(),
88
install_requires=[

Diff for: tests/test_end_to_end.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
import unittest
33
import shutil
44

5-
from doc2json.grobid2json.process_pdf import process_pdf_file
6-
from doc2json.tex2json.process_tex import process_tex_file
7-
from doc2json.jats2json.process_jats import process_jats_file
5+
from doc2txt.grobid2json.process_pdf import process_pdf_file
6+
from doc2txt.tex2json.process_tex import process_tex_file
7+
from doc2txt.jats2json.process_jats import process_jats_file
88

99
TEST_PDF_INPUT_DATA = os.path.join('tests', 'pdf')
1010
TEST_PDF_TEMP_DATA = os.path.join('tests', 'pdf_temp')

Diff for: tests/test_read_write.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import unittest
33
import json
44

5-
from doc2json.s2orc import load_s2orc
5+
from doc2txt.s2orc import load_s2orc
66

77
JSON_INPUT_DATA = os.path.join('tests', 'pdf', 'N18-3011.json')
88

Diff for: tests/test_s2orc_versions.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import unittest
33
import json
44

5-
from doc2json.s2orc import load_s2orc
5+
from doc2txt.s2orc import load_s2orc
66

77
TEST_S2ORC_INPUT_DATA = os.path.join('tests', 's2orc')
88
TEST_S2ORC_CURRENT = os.path.join(TEST_S2ORC_INPUT_DATA, '20210101')

0 commit comments

Comments
 (0)