clowder-framework
diff --git a/Diff for: ‎CHANGELOG
+2 b/Diff for: ‎CHANGELOG
+2
diff --git a/Diff for: ‎Dockerfile
+1-1 b/Diff for: ‎Dockerfile
+1-1
diff --git a/Diff for: ‎doc2json/__init__.py renamed to ‎doc2txt/__init__.py b/Diff for: ‎doc2json/__init__.py renamed to ‎doc2txt/__init__.py
diff --git a/Diff for: ‎doc2json/config.py renamed to ‎doc2txt/config.py b/Diff for: ‎doc2json/config.py renamed to ‎doc2txt/config.py
diff --git a/Diff for: ‎doc2json/flask/app.py renamed to ‎doc2txt/flask/app.py
+3-3 b/Diff for: ‎doc2json/flask/app.py renamed to ‎doc2txt/flask/app.py
+3-3
diff --git a/Diff for: ‎doc2json/flask/static/style.css renamed to ‎doc2txt/flask/static/style.css b/Diff for: ‎doc2json/flask/static/style.css renamed to ‎doc2txt/flask/static/style.css
diff --git a/Diff for: ‎doc2json/flask/templates/home.html renamed to ‎doc2txt/flask/templates/home.html b/Diff for: ‎doc2json/flask/templates/home.html renamed to ‎doc2txt/flask/templates/home.html
diff --git a/Diff for: ‎doc2json/grobid2json/__init__.py renamed to ‎doc2txt/grobid2json/__init__.py b/Diff for: ‎doc2json/grobid2json/__init__.py renamed to ‎doc2txt/grobid2json/__init__.py
diff --git a/Diff for: ‎doc2json/grobid2json/grobid/Readme.md renamed to ‎doc2txt/grobid2json/grobid/Readme.md b/Diff for: ‎doc2json/grobid2json/grobid/Readme.md renamed to ‎doc2txt/grobid2json/grobid/Readme.md
diff --git a/Diff for: ‎doc2json/grobid2json/grobid/__init__.py renamed to ‎doc2txt/grobid2json/grobid/__init__.py b/Diff for: ‎doc2json/grobid2json/grobid/__init__.py renamed to ‎doc2txt/grobid2json/grobid/__init__.py
diff --git a/Diff for: ‎doc2json/grobid2json/grobid/client.py renamed to ‎doc2txt/grobid2json/grobid/client.py b/Diff for: ‎doc2json/grobid2json/grobid/client.py renamed to ‎doc2txt/grobid2json/grobid/client.py
diff --git a/Diff for: ‎doc2json/grobid2json/grobid/config.yaml renamed to ‎doc2txt/grobid2json/grobid/config.yaml b/Diff for: ‎doc2json/grobid2json/grobid/config.yaml renamed to ‎doc2txt/grobid2json/grobid/config.yaml
diff --git a/Diff for: ‎doc2json/grobid2json/grobid/grobid.properties renamed to ‎doc2txt/grobid2json/grobid/grobid.properties b/Diff for: ‎doc2json/grobid2json/grobid/grobid.properties renamed to ‎doc2txt/grobid2json/grobid/grobid.properties
diff --git a/Diff for: ‎doc2json/grobid2json/grobid/grobid_client.py renamed to ‎doc2txt/grobid2json/grobid/grobid_client.py
+11-8 b/Diff for: ‎doc2json/grobid2json/grobid/grobid_client.py renamed to ‎doc2txt/grobid2json/grobid/grobid_client.py
+11-8
diff --git a/Diff for: ‎doc2json/grobid2json/pdf_to_tei.py renamed to ‎doc2txt/grobid2json/pdf_to_tei.py b/Diff for: ‎doc2json/grobid2json/pdf_to_tei.py renamed to ‎doc2txt/grobid2json/pdf_to_tei.py
diff --git a/Diff for: ‎doc2json/grobid2json/process_pdf.py renamed to ‎doc2txt/grobid2json/process_pdf.py
+28-20 b/Diff for: ‎doc2json/grobid2json/process_pdf.py renamed to ‎doc2txt/grobid2json/process_pdf.py
+28-20
diff --git a/Diff for: ‎doc2json/grobid2json/tei_to_json.py renamed to ‎doc2txt/grobid2json/tei_to_json.py
+5-5 b/Diff for: ‎doc2json/grobid2json/tei_to_json.py renamed to ‎doc2txt/grobid2json/tei_to_json.py
+5-5
diff --git a/Diff for: ‎doc2json/jats2json/__init__.py renamed to ‎doc2txt/jats2json/__init__.py b/Diff for: ‎doc2json/jats2json/__init__.py renamed to ‎doc2txt/jats2json/__init__.py
diff --git a/Diff for: ‎doc2json/jats2json/jats_to_json.py renamed to ‎doc2txt/jats2json/jats_to_json.py b/Diff for: ‎doc2json/jats2json/jats_to_json.py renamed to ‎doc2txt/jats2json/jats_to_json.py
diff --git a/Diff for: ‎doc2json/jats2json/pmc_utils/__init__.py renamed to ‎doc2txt/jats2json/pmc_utils/__init__.py b/Diff for: ‎doc2json/jats2json/pmc_utils/__init__.py renamed to ‎doc2txt/jats2json/pmc_utils/__init__.py
diff --git a/Diff for: ‎doc2json/jats2json/pmc_utils/all_tag_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/all_tag_utils.py b/Diff for: ‎doc2json/jats2json/pmc_utils/all_tag_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/all_tag_utils.py
diff --git a/Diff for: ‎doc2json/jats2json/pmc_utils/back_tag_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/back_tag_utils.py b/Diff for: ‎doc2json/jats2json/pmc_utils/back_tag_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/back_tag_utils.py
diff --git a/Diff for: ‎doc2json/jats2json/pmc_utils/extract_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/extract_utils.py b/Diff for: ‎doc2json/jats2json/pmc_utils/extract_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/extract_utils.py
diff --git a/Diff for: ‎doc2json/jats2json/pmc_utils/front_tag_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/front_tag_utils.py b/Diff for: ‎doc2json/jats2json/pmc_utils/front_tag_utils.py renamed to ‎doc2txt/jats2json/pmc_utils/front_tag_utils.py
diff --git a/Diff for: ‎doc2json/jats2json/pmc_utils/tests.py renamed to ‎doc2txt/jats2json/pmc_utils/tests.py b/Diff for: ‎doc2json/jats2json/pmc_utils/tests.py renamed to ‎doc2txt/jats2json/pmc_utils/tests.py
diff --git a/Diff for: ‎doc2json/jats2json/process_jats.py renamed to ‎doc2txt/jats2json/process_jats.py b/Diff for: ‎doc2json/jats2json/process_jats.py renamed to ‎doc2txt/jats2json/process_jats.py
diff --git a/Diff for: ‎doc2json/spp2json/__init__.py renamed to ‎doc2txt/json2txt/__init__.py b/Diff for: ‎doc2json/spp2json/__init__.py renamed to ‎doc2txt/json2txt/__init__.py
diff --git a/Diff for: ‎doc2txt/json2txt/json2txt.py
+62 b/Diff for: ‎doc2txt/json2txt/json2txt.py
+62
diff --git a/Diff for: ‎doc2json/s2orc.py renamed to ‎doc2txt/s2orc.py
+1-1 b/Diff for: ‎doc2json/s2orc.py renamed to ‎doc2txt/s2orc.py
+1-1
diff --git a/Diff for: ‎doc2json/spp2json/spp/__init__.py renamed to ‎doc2txt/spp2json/__init__.py b/Diff for: ‎doc2json/spp2json/spp/__init__.py renamed to ‎doc2txt/spp2json/__init__.py
diff --git a/Diff for: ‎doc2json/spp2json/process_pdf.py renamed to ‎doc2txt/spp2json/process_pdf.py
+2-2 b/Diff for: ‎doc2json/spp2json/process_pdf.py renamed to ‎doc2txt/spp2json/process_pdf.py
+2-2
diff --git a/Diff for: ‎doc2json/tex2json/__init__.py renamed to ‎doc2txt/spp2json/spp/__init__.py b/Diff for: ‎doc2json/tex2json/__init__.py renamed to ‎doc2txt/spp2json/spp/__init__.py
diff --git a/Diff for: ‎doc2json/spp2json/spp/spp_client.py renamed to ‎doc2txt/spp2json/spp/spp_client.py b/Diff for: ‎doc2json/spp2json/spp/spp_client.py renamed to ‎doc2txt/spp2json/spp/spp_client.py
diff --git a/Diff for: ‎doc2json/spp2json/spp/spp_json_to_s2orc_json.py renamed to ‎doc2txt/spp2json/spp/spp_json_to_s2orc_json.py b/Diff for: ‎doc2json/spp2json/spp/spp_json_to_s2orc_json.py renamed to ‎doc2txt/spp2json/spp/spp_json_to_s2orc_json.py
diff --git a/Diff for: ‎doc2json/utils/__init__.py renamed to ‎doc2txt/tex2json/__init__.py b/Diff for: ‎doc2json/utils/__init__.py renamed to ‎doc2txt/tex2json/__init__.py
diff --git a/Diff for: ‎doc2json/tex2json/process_tex.py renamed to ‎doc2txt/tex2json/process_tex.py b/Diff for: ‎doc2json/tex2json/process_tex.py renamed to ‎doc2txt/tex2json/process_tex.py
diff --git a/Diff for: ‎doc2json/tex2json/tex_to_xml.py renamed to ‎doc2txt/tex2json/tex_to_xml.py b/Diff for: ‎doc2json/tex2json/tex_to_xml.py renamed to ‎doc2txt/tex2json/tex_to_xml.py
diff --git a/Diff for: ‎doc2json/tex2json/xml_to_json.py renamed to ‎doc2txt/tex2json/xml_to_json.py b/Diff for: ‎doc2json/tex2json/xml_to_json.py renamed to ‎doc2txt/tex2json/xml_to_json.py
diff --git a/Diff for: ‎doc2txt/utils/__init__.py b/Diff for: ‎doc2txt/utils/__init__.py
diff --git a/Diff for: ‎doc2json/utils/citation_util.py renamed to ‎doc2txt/utils/citation_util.py b/Diff for: ‎doc2json/utils/citation_util.py renamed to ‎doc2txt/utils/citation_util.py
diff --git a/Diff for: ‎doc2json/utils/grobid_util.py renamed to ‎doc2txt/utils/grobid_util.py b/Diff for: ‎doc2json/utils/grobid_util.py renamed to ‎doc2txt/utils/grobid_util.py
diff --git a/Diff for: ‎doc2json/utils/latex_util.py renamed to ‎doc2txt/utils/latex_util.py b/Diff for: ‎doc2json/utils/latex_util.py renamed to ‎doc2txt/utils/latex_util.py
diff --git a/Diff for: ‎doc2json/utils/refspan_util.py renamed to ‎doc2txt/utils/refspan_util.py b/Diff for: ‎doc2json/utils/refspan_util.py renamed to ‎doc2txt/utils/refspan_util.py
diff --git a/Diff for: ‎doc2json/utils/soup_utils.py renamed to ‎doc2txt/utils/soup_utils.py b/Diff for: ‎doc2json/utils/soup_utils.py renamed to ‎doc2txt/utils/soup_utils.py
diff --git a/Diff for: ‎extractor_info.json
+6-8 b/Diff for: ‎extractor_info.json
+6-8
diff --git a/Diff for: ‎setup.py
+1-1 b/Diff for: ‎setup.py
+1-1
diff --git a/Diff for: ‎tests/test_end_to_end.py
+3-3 b/Diff for: ‎tests/test_end_to_end.py
+3-3
diff --git a/Diff for: ‎tests/test_read_write.py
+1-1 b/Diff for: ‎tests/test_read_write.py
+1-1
diff --git a/Diff for: ‎tests/test_s2orc_versions.py
+1-1 b/Diff for: ‎tests/test_s2orc_versions.py
+1-1
@@ -8,9 +8,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ### Added
 - AllenAI s2orc-doc2json as a clowder extractor. [#3](https://github.com/clowder-framework/text-extractor/issues/3)
+- Conversion of json output to text. [#8](https://github.com/clowder-framework/text-extractor/issues/8)
 
 ### Changed
 
 
 ### Fixed
+- Output json filename changed to same name as input file. [#6](https://github.com/clowder-framework/text-extractor/issues/6)
 
@@ -1,6 +1,6 @@
 FROM python:3.10
 
-COPY doc2json ./doc2json
+COPY doc2txt ./doc2txt
 COPY tests ./tests
 COPY build_run.sh setup.py textextractor.py requirements.txt extractor_info.json ./
 
 
@@ -4,9 +4,9 @@
 import hashlib
 import requests
 from flask import Flask, request, jsonify, flash, url_for, redirect, render_template, send_file
-from doc2json.grobid2json.process_pdf import process_pdf_stream
-from doc2json.tex2json.process_tex import process_tex_stream
-from doc2json.jats2json.process_jats import process_jats_stream
+from doc2txt.grobid2json.process_pdf import process_pdf_stream
+from doc2txt.tex2json.process_tex import process_tex_stream
+from doc2txt.jats2json.process_jats import process_jats_stream
 
 app = Flask(__name__)
 
 
@@ -5,7 +5,7 @@
 import time
 import glob
 import logging
-from doc2json.grobid2json.grobid.client import ApiClient
+from doc2txt.grobid2json.grobid.client import ApiClient
 import ntpath
 from typing import List
 
@@ -124,28 +124,31 @@ def process_pdf_stream(self, pdf_file: str, pdf_strm: bytes, output: str, servic
         elif status != 200:
             with open(os.path.join(output, "failed.log"), "a+") as failed:
                 failed.write(pdf_file.strip(".pdf") + "\n")
-            print('Processing failed with error ' + str(status))
+            log.error('Processing failed with error %s', str(status))
             return ""
         else:
             return res.text
 
-    def process_pdf(self, pdf_file: str, output: str, service: str) -> None:
+    def process_pdf(self, pdf_file: str, input_filename: str, output: str, service: str) -> None:  # check if TEI file is already produced
+        # Writes <output>/<input_filename>.tei.xml from the service response; returns early if that file exists
         # we use ntpath here to be sure it will work on Windows too
-        pdf_file_name = ntpath.basename(pdf_file)
-        filename = os.path.join(output, os.path.splitext(pdf_file_name)[0] + '.tei.xml')
+        # the base name is no longer derived from pdf_file via ntpath.basename; the caller passes input_filename
+        filename = os.path.join(output, input_filename + '.tei.xml')
         if os.path.isfile(filename):
             return
 
-        log.info("PDF File to process is %s", pdf_file)
+        log.info("Processing pdf file in path %s with name %s", pdf_file, input_filename)
         pdf_strm = open(pdf_file, 'rb').read()
-        tei_text = self.process_pdf_stream(pdf_file, pdf_strm, output, service)
+        tei_text = self.process_pdf_stream(input_filename, pdf_strm, output, service)
 
         # writing TEI file
         if tei_text:
             with io.open(filename, 'w+', encoding='utf8') as tei_file:
-                log.info("writing to tei file %s", tei_file)
+                log.info("Writing to tei file %s", tei_file)  # NOTE(review): logs the file object, not the path held in filename
                 tei_file.write(tei_text)
+        else:
+            log.error("TEI processing unsuccessful")
+
 
     def process_citation(self, bib_string: str, log_file: str) -> str:
         # process citation raw string and return corresponding dict
 
@@ -6,12 +6,12 @@
 from bs4 import BeautifulSoup
 from typing import Optional, Dict
 
-from doc2json.grobid2json.grobid.grobid_client import GrobidClient
-from doc2json.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json, convert_tei_xml_soup_to_s2orc_json
+from doc2txt.grobid2json.grobid.grobid_client import GrobidClient
+from doc2txt.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json, convert_tei_xml_soup_to_s2orc_json
+from doc2txt.json2txt.json2txt import process_json
 
 BASE_TEMP_DIR = 'temp'
 BASE_OUTPUT_DIR = 'output'
-BASE_LOG_DIR = 'log'
 
 # create log object with current module name
 log = logging.getLogger(__name__)
@@ -40,54 +40,61 @@ def process_pdf_stream(input_file: str, sha: str, input_stream: bytes, grobid_co
 
 def process_pdf_file(
         input_file: str,
-        temp_dir: str = BASE_TEMP_DIR,
-        output_dir: str = BASE_OUTPUT_DIR,
+        input_filename :str,
+        temp_dir: str,
+        output_dir: str,
         grobid_config: Optional[Dict] = None
-) -> str:
+) -> [str, str, str]:  # NOTE(review): a list literal is not a valid type hint; the function returns a 3-tuple of str
     """
     Process a PDF file and get JSON representation
-    :param input_file:
+    :param input_file: path to the input PDF file; FileNotFoundError is raised if it does not exist
+    :param input_filename: base name used to build the .tei.xml, .json and .txt output file names
     :param temp_dir:
     :param output_dir:
-    :return:
+    :return: paths of the tei xml file (in temp_dir), the json file and the txt file (both in output_dir)
     """
     os.makedirs(temp_dir, exist_ok=True)
     os.makedirs(output_dir, exist_ok=True)
 
-    # get paper id as the name of the file
-    paper_id = '.'.join(input_file.split('/')[-1].split('.')[:-1])
-    tei_file = os.path.join(temp_dir, f'{paper_id}.tei.xml')
-    output_file = os.path.join(output_dir, f'{paper_id}.json')
-    log.info("Files %s, %s, %s", paper_id, tei_file, output_file)
+    # filenames for tei, json and txt outputs, all derived from input_filename
+    tei_file = os.path.join(temp_dir, f'{input_filename}.tei.xml')
+    json_file = os.path.join(output_dir, f'{input_filename}.json')
+    txt_file = os.path.join(output_dir, f'{input_filename}.txt')
 
     # check if input file exists and output file doesn't
     if not os.path.exists(input_file):
         raise FileNotFoundError(f"{input_file} doesn't exist")
-    if os.path.exists(output_file):
-        print(f'{output_file} already exists!')
+    if os.path.exists(json_file):
+        log.warning(f'{json_file} already exists!')  # warning only: processing continues and the file is rewritten below
 
     # process PDF through Grobid -> TEI.XML
     client = GrobidClient(grobid_config)
     # TODO: compute PDF hash
     # TODO: add grobid version number to output
-    client.process_pdf(input_file, temp_dir, "processFulltextDocument")
+    client.process_pdf(input_file, input_filename, temp_dir, "processFulltextDocument")
 
     # process TEI.XML -> JSON
     assert os.path.exists(tei_file)
     paper = convert_tei_xml_file_to_s2orc_json(tei_file)
 
     # write to file
-    with open(output_file, 'w') as outf:
+    with open(json_file, 'w') as outf:
         json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
 
-    return output_file
+    # extract the "text" values from the json just written and write them to the txt file, one per line
+    output_txt = process_json(json_file, "text")
+    with open(txt_file, 'w') as outfile:
+        for text in output_txt:
+            outfile.write(f"{text}\n")
+
+    return tei_file, json_file, txt_file
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
     parser.add_argument("-i", "--input", default=None, help="path to the input PDF file")
     parser.add_argument("-t", "--temp", default=BASE_TEMP_DIR, help="path to the temp dir for putting tei xml files")
-    parser.add_argument("-o", "--output", default=BASE_OUTPUT_DIR, help="path to the output dir for putting json files")
+    parser.add_argument("-o", "--output", default=BASE_OUTPUT_DIR, help="path to the output dir for putting json and txt files")
     parser.add_argument("-k", "--keep", action='store_true')
 
     args = parser.parse_args()
@@ -102,7 +109,8 @@ def process_pdf_file(
     os.makedirs(temp_path, exist_ok=True)
     os.makedirs(output_path, exist_ok=True)
 
-    process_pdf_file(input_path, temp_path, output_path)
+    input_filename = os.path.splitext(os.path.basename(input_path))[0]
+    tei_file, json_file, txt_file = process_pdf_file(input_path, input_filename, temp_path, output_path)
 
     runtime = round(time.time() - start_time, 3)
     print("runtime: %s seconds " % (runtime))
 
@@ -7,12 +7,12 @@
 from bs4 import BeautifulSoup, NavigableString
 from typing import List, Dict, Tuple
 
-from doc2json.s2orc import Paper
+from doc2txt.s2orc import Paper
 
-from doc2json.utils.grobid_util import parse_bib_entry, extract_paper_metadata_from_grobid_xml
-from doc2json.utils.citation_util import SINGLE_BRACKET_REGEX, BRACKET_REGEX, BRACKET_STYLE_THRESHOLD
-from doc2json.utils.citation_util import is_expansion_string, _clean_empty_and_duplicate_authors_from_grobid_parse
-from doc2json.utils.refspan_util import sub_spans_and_update_indices
+from doc2txt.utils.grobid_util import parse_bib_entry, extract_paper_metadata_from_grobid_xml
+from doc2txt.utils.citation_util import SINGLE_BRACKET_REGEX, BRACKET_REGEX, BRACKET_STYLE_THRESHOLD
+from doc2txt.utils.citation_util import is_expansion_string, _clean_empty_and_duplicate_authors_from_grobid_parse
+from doc2txt.utils.refspan_util import sub_spans_and_update_indices
 
 
 REPLACE_TABLE_TOKS = {
 
@@ -0,0 +1,62 @@
+# Convert json to text
+
+import json
+import logging
+
+# create log object with current module name
+log = logging.getLogger(__name__)
+
+
+def process_json(input_file, key):
+    """
+    Convert a json file with "title" and "pdf_parse" entries to a list of text values.
+    Collects the title, then every `key` value found in the abstract and body_text entries.
+    Args:
+        input_file (str): Path of the json input file
+        key (str): Json field key to extract data from (the caller in process_pdf.py passes "text")
+    Returns:
+        output (list): Title first, then abstract values, then body values, in file order
+    """
+    json_file = open(input_file)  # NOTE(review): not closed if json.load or a key lookup below raises; a `with` block would fix that
+    json_data = json.load(json_file)  # load json object to a dictionary
+    # the title is read from the top level; abstract and body_text are read from under "pdf_parse"
+    title_text = json_data["title"]
+    pdf_json_data = json_data["pdf_parse"]
+    abstract_data = pdf_json_data["abstract"]
+    body_data = pdf_json_data["body_text"]
+    output = []
+    # append title text to output
+    output.append(title_text)
+    # append every `key` value found in the abstract entries
+    for i in item_generator(abstract_data, key):
+        output.append(i)
+    # append every `key` value found in the body_text entries
+    for i in item_generator(body_data, key):
+        output.append(i)
+
+    json_file.close()
+
+    return output
+
+
+def item_generator(json_data, lookup_key):
+    """
+    Generator that extracts a field from json data.
+    Yields the value stored under lookup_key in each dictionary it reaches
+    Args:
+        json_data (dict | list): Json input data; lists are walked item by item, other types yield nothing
+        lookup_key (str): Json field key to extract data
+    Yields:
+        The value of lookup_key for each dictionary reached; values under other keys are not searched
+    """
+    if isinstance(json_data, dict):
+        for k, v in json_data.items():
+            if k == lookup_key:
+                yield v
+            else:
+                # yield from item_generator(v, lookup_key)
+                # no nested lookups. only text in the first dictionary items is appended to output
+                pass
+    elif isinstance(json_data, list):
+        for item in json_data:
+            yield from item_generator(item, lookup_key)
@@ -4,7 +4,7 @@
 
 from datetime import datetime
 from typing import Dict, List, Optional
-from doc2json.config import *
+from doc2txt.config import *
 
 
 CORRECT_KEYS = {
 
@@ -4,8 +4,8 @@
 import time
 from typing import Dict
 
-from doc2json.spp2json.spp.spp_client import SppClient
-from doc2json.spp2json.spp.spp_json_to_s2orc_json import convert_spp_json_to_s2orc_json
+from doc2txt.spp2json.spp.spp_client import SppClient
+from doc2txt.spp2json.spp.spp_json_to_s2orc_json import convert_spp_json_to_s2orc_json
 
 
 
 
@@ -1,26 +1,24 @@
 {
   "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
-  "name": "textextractor",
-  "version": "0.1",
-  "description": "extracts text from pdf files",
+  "name": "extractors-pdf2text",
+  "version": "0.4",
+  "description": "Extracts text from pdf files. Creates an xml, json and txt file and uploads to Clowder dataset. Uses Grobid service and AllenAI s2orc-doc2json",
   "author": "Mathew, Minu <[email protected]>; Lo, Kyle  and Wang, Lucy Lu  and Neumann, Mark  and Kinney, Rodney  and Weld, Daniel",
   "contributors": [],
   "contexts": [{}],
   "repository": [
     {
       "repType": "git",
-      "repUrl": "https://github.com/clowder-framework/text-extractor"
+      "repUrl": "https://github.com/clowder-framework/extractors-s2orc-pdf2text"
     }
   ],
   "process": {
     "file": [
-      "application/pdf",
-      "application/*",
-      "text/plain"
+      "application/pdf"
     ]
   },
   "external_services": [],
   "dependencies": [],
   "bibtex": [],
-  "labels": ["Type/Any"]
+  "labels": ["Type/PDF"]
 }
@@ -2,7 +2,7 @@
 import setuptools
 
 setuptools.setup(
-    name='doc2json',
+    name='doc2txt',
     version='0.1',
     packages=setuptools.find_packages(),
     install_requires=[
 
@@ -2,9 +2,9 @@
 import unittest
 import shutil
 
-from doc2json.grobid2json.process_pdf import process_pdf_file
-from doc2json.tex2json.process_tex import process_tex_file
-from doc2json.jats2json.process_jats import process_jats_file
+from doc2txt.grobid2json.process_pdf import process_pdf_file
+from doc2txt.tex2json.process_tex import process_tex_file
+from doc2txt.jats2json.process_jats import process_jats_file
 
 TEST_PDF_INPUT_DATA = os.path.join('tests', 'pdf')
 TEST_PDF_TEMP_DATA = os.path.join('tests', 'pdf_temp')
 
@@ -2,7 +2,7 @@
 import unittest
 import json
 
-from doc2json.s2orc import load_s2orc
+from doc2txt.s2orc import load_s2orc
 
 JSON_INPUT_DATA = os.path.join('tests', 'pdf', 'N18-3011.json')
 
 
@@ -2,7 +2,7 @@
 import unittest
 import json
 
-from doc2json.s2orc import load_s2orc
+from doc2txt.s2orc import load_s2orc
 
 TEST_S2ORC_INPUT_DATA = os.path.join('tests', 's2orc')
 TEST_S2ORC_CURRENT = os.path.join(TEST_S2ORC_INPUT_DATA, '20210101')