Tcat auto import (#203)
* Basic processor to send info to a TCAT instance

* get_results_url() added to dataset

* Attempt to handle docker to docker tcat exports

* rename Docker db to 4cat_db to avoid confusing myself

* reorder config.py

* add tcat username and pass to config.py

* add access log for Gunicorn

* Add tcat user/pass, url parsing, and better error handling

* Handle already uploaded case

* typo in tcat_auto_upload

* more typos! stupid cut and paste with your tabs and spaces...

* Fix docker compose for merge

* Small changes + One-click preset

Co-authored-by: Stijn Peeters <[email protected]>
3 people authored Dec 8, 2021
1 parent 1881dad commit 938c168
Showing 6 changed files with 239 additions and 33 deletions.
14 changes: 14 additions & 0 deletions common/lib/dataset.py
@@ -911,6 +911,20 @@ def get_extension(self):
return self.get_results_path().suffix[1:]
return False

def get_result_url(self):
"""
        Gets the 4CAT frontend URL of a dataset file.

        Uses the FlaskConfig attributes from config.py (i.e., SERVER_NAME and
        SERVER_HTTPS) plus the hardcoded '/result/' path.
        TODO: create a more dynamic method of obtaining the URL.
"""
filename = self.get_results_path().name
url_to_file = ('https://' if config.FlaskConfig.SERVER_HTTPS else 'http://') + \
config.FlaskConfig.SERVER_NAME + '/result/' + filename
return url_to_file
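    # For illustration, a minimal sketch of what this returns, assuming the
    # hypothetical values SERVER_NAME = "4cat.example.com" and SERVER_HTTPS = True
    # in config.FlaskConfig, and a result file named "abcd1234.csv":
    #
    #     dataset.get_result_url()  # -> "https://4cat.example.com/result/abcd1234.csv"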


def __getattr__(self, attr):
"""
Getter so we don't have to use .data all the time
67 changes: 37 additions & 30 deletions config.py-example
@@ -18,6 +18,43 @@ DATASOURCES = {
"twitterv2": {}
}

#####################
# Processor Options #
#####################

# download_images.py
MAX_NUMBER_IMAGES = 1000

# YouTube variables to use for processors
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
YOUTUBE_DEVELOPER_KEY = ""

# Tumblr API keys to use for data capturing
TUMBLR_CONSUMER_KEY = ""
TUMBLR_CONSUMER_SECRET_KEY = ""
TUMBLR_API_KEY = ""
TUMBLR_API_SECRET_KEY = ""

# Reddit API keys
REDDIT_API_CLIENTID = ""
REDDIT_API_SECRET = ""

# tcat_auto_upload.py
TCAT_SERVER = ''
TCAT_TOKEN = ''
TCAT_USERNAME = ''
TCAT_PASSWORD = ''
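# A hedged example of how these might be filled in (all values hypothetical):
#
#     TCAT_SERVER = 'https://tcat.example.com'  # may also be a list/tuple of
#                                               # instances; one is then picked
#                                               # at random for each upload
#     TCAT_TOKEN = 'shared-import-token'
#     TCAT_USERNAME = 'tcat_admin'
#     TCAT_PASSWORD = 'supers3cr3t'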

# pix-plot.py
# If you host a version of https://github.com/digitalmethodsinitiative/dmi_pix_plot, you can use a processor to publish
# downloaded images into a PixPlot there
PIXPLOT_SERVER = ""

########################
# 4CAT General Options #
########################

# Configure how the tool is to be named in its web interface. The backend will
# always refer to '4CAT' - the name of the software, and a 'powered by 4CAT'
# notice may also show up in the web interface regardless of the value entered here.
@@ -31,10 +68,6 @@ DB_USER = "fourcat"
DB_NAME = "fourcat"
DB_PASSWORD = "supers3cr3t"

# Processor Options
# download_images.py
MAX_NUMBER_IMAGES = 1000

# Path to folders where logs/images/data may be saved.
# Paths are relative to the folder this config file is in.
PATH_ROOT = os.path.abspath(os.path.dirname(__file__)) # better don't change this
@@ -83,36 +116,11 @@ MAIL_USERNAME = ""
MAIL_PASSWORD = ""
NOREPLY_EMAIL = "noreply@localhost"


# Scrape settings for data sources that contain their own scrapers
SCRAPE_TIMEOUT = 5 # how long to wait for a scrape request to finish?
SCRAPE_PROXIES = {"http": []} # Items in this list should be formatted like "http://111.222.33.44:1234"
IMAGE_INTERVAL = 3600

# YouTube variables to use for processors
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
YOUTUBE_DEVELOPER_KEY = ""

# Tumblr API keys to use for data capturing
TUMBLR_CONSUMER_KEY = ""
TUMBLR_CONSUMER_SECRET_KEY = ""
TUMBLR_API_KEY = ""
TUMBLR_API_SECRET_KEY = ""

# Reddit API keys
REDDIT_API_CLIENTID = ""
REDDIT_API_SECRET = ""

# PixPlot Server
# If you host a version of https://github.com/digitalmethodsinitiative/dmi_pix_plot, you can use a processor to publish
# downloaded images into a PixPlot there
PIXPLOT_SERVER = ""

# Explorer settings
# The maximum allowed amount of rows (prevents timeouts and memory errors)
MAX_POSTS_EXPLORER = 500000

# Web tool settings
class FlaskConfig:
FLASK_APP = 'webtool/fourcat'
@@ -123,7 +131,6 @@ class FlaskConfig:
HOSTNAME_WHITELIST_API = ["localhost"] # hostnames matching these are exempt from rate limiting
HOSTNAME_WHITELIST_NAME = "Automatic login"


##########
# DOCKER #
##########
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.6'

services:
db:
container_name: db
container_name: 4cat_db
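    # note: the compose service is still named 'db', so other containers keep
    # resolving it at hostname 'db'; only the container's display name changes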
image: postgres:latest
environment:
- POSTGRES_USER=${POSTGRES_USER}
134 changes: 134 additions & 0 deletions processors/conversion/tcat_auto_upload.py
@@ -0,0 +1,134 @@
"""
Send TCAT-ready json to a particular TCAT instance
"""
import requests
import random
from urllib.parse import urlparse

from backend.abstract.processor import BasicProcessor
from common.lib.exceptions import ProcessorException

import config

__author__ = "Dale Wahl"
__credits__ = ["Dale Wahl"]
__maintainer__ = "Dale Wahl"
__email__ = "[email protected]"


class FourcatToDmiTcatUploader(BasicProcessor):
"""
    Send TCAT-ready JSON to a particular TCAT instance.
    The file is imported by TCAT's import-jsondump.php.
"""
type = "tcat-auto-upload" # job type ID
category = "Conversion" # category
title = "Upload to DMI-TCAT" # title displayed in UI
    description = "Send TCAT-ready JSON to a particular DMI-TCAT instance."  # description displayed in UI
extension = "html" # extension of result file, used internally and in UI

@classmethod
def is_compatible_with(cls, module=None):
"""
Determine if processor is compatible with dataset
:param module: Dataset or processor to determine compatibility with
"""
return module.type == "convert-ndjson-for-tcat" and \
hasattr(config, 'TCAT_SERVER') and \
config.TCAT_SERVER and \
hasattr(config, 'TCAT_TOKEN') and \
hasattr(config, 'TCAT_USERNAME') and \
hasattr(config, 'TCAT_PASSWORD')

def process(self):
"""
Send TCAT-ready json to a particular TCAT instance.
"""
self.dataset.update_status("Preparing upload")
bin_name = ''.join(e if e.isalnum() else '_' for e in self.dataset.top_parent().get_label())
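        # e.g. a label like "important tweets!" becomes the bin name "important_tweets_"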
self.dataset.log('Label for DMI-TCAT bin_name: ' + bin_name)

url_to_file = self.dataset.get_parent().get_result_url()
self.dataset.log('File location URL: ' + url_to_file)

        tcat_server = config.TCAT_SERVER
        if isinstance(config.TCAT_SERVER, (list, tuple, set)):
            # if multiple TCAT instances are configured, pick one at random;
            # list() guards against random.choice failing on a set
            tcat_server = random.choice(list(config.TCAT_SERVER))

        # Docker shenanigans: if 4CAT and TCAT both run in containers on the same
        # host, the file URL must point at the Docker host, not the container's localhost
        self.dataset.log('Docker search: ' + str('host.docker.internal' in tcat_server))
        if 'host.docker.internal' in tcat_server:
            url_to_file = url_to_file.replace('localhost', 'host.docker.internal')
            self.dataset.log('New URL: ' + url_to_file)

query = str(self.dataset.top_parent().get_parameters().get("query", ""))
self.dataset.log('Twitter query: ' + query)

# TCAT authorization information
auth = (config.TCAT_USERNAME, config.TCAT_PASSWORD)

        # keep only the scheme and host of the configured server and append the
        # import endpoint, so any path in TCAT_SERVER is ignored
        parsed_uri = urlparse(tcat_server)
        post_json_url = '{uri.scheme}://{uri.netloc}/api/import-from-4cat.php'.format(uri=parsed_uri)
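        # e.g. 'https://tcat.example.com/api/import-from-4cat.php' (hypothetical hostname)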
self.dataset.update_status("Sending dataset to DMI-TCAT: %s" % post_json_url)
response = requests.post(post_json_url, auth=auth, data={
'url': url_to_file,
'name': bin_name,
'query': query,
'token': config.TCAT_TOKEN,
})

        if response.status_code == 404:
            raise ProcessorException('DMI-TCAT URL 404 error at %s' % tcat_server)
        elif response.status_code != 200:
            raise ProcessorException('DMI-TCAT connection error %i: %s' % (response.status_code, str(response.reason)))

try:
resp_content = response.json()
except ValueError:
# If import-jsondump.php fails, no json response is returned
if 'The query bin' in response.text and 'already exists' in response.text:
# Query bin already uploaded
# TODO: look at possibility to add to existing bin?
self.dataset.update_status("DMI-TCAT bin already exists; unable to add to existing bin.", is_final=True)
self.dataset.finish(0)
return
else:
# Something else is wrong...
raise ProcessorException('DMI-TCAT Unexpected response: %s - %s - %s' % (response.status_code, str(response.reason), response.text))

        if 'success' not in resp_content:
            # a JSON response was returned, but not the one we're expecting!
            raise ProcessorException('DMI-TCAT Unexpected response: %s - %s - %s' % (response.status_code, str(response.reason), response.text))
        elif not resp_content['success']:
            # 'success' should be True if the upload was successful
            raise ProcessorException('DMI-TCAT Import failure: %s' % str(resp_content))

self.dataset.update_status("Waiting for upload to complete")
        # Unsure how to query TCAT for progress; even an invalid bin_name returns
        # a 200 response. We could attempt to parse the resulting HTML.

self.dataset.update_status("Upload complete, writing HTML file")
# Create HTML file
        tcat_result_url = tcat_server.replace('/api/import-from-4cat.php', '').rstrip('/') + '/analysis/index.php?dataset=' + bin_name
html_file = self.get_html_page(tcat_result_url)

# Write HTML file
with self.dataset.get_results_path().open("w", encoding="utf-8") as output_file:
output_file.write(html_file)

# Finish
self.dataset.update_status("Finished")
self.dataset.finish(self.dataset.top_parent().num_rows)

def get_html_page(self, url):
"""
        Return an HTML string that redirects to the location of the DMI-TCAT dataset.
"""
return f"<head><meta http-equiv='refresh' content='0; URL={url}'></head>"
4 changes: 2 additions & 2 deletions processors/conversion/twitter_ndjson_to_tcat_json.py
@@ -18,8 +18,8 @@ class ConvertNDJSONToJSON(BasicProcessor):
type = "convert-ndjson-for-tcat" # job type ID
category = "Conversion" # category
title = "Convert to TCAT JSON" # title displayed in UI
description = "Convert a NDJSON Twitter file to TCAT JSON format. Can be imported with TCAT's import-jsondump.php script." # description displayed in UI
extension = "json" # extension of result file, used internally and in UI
description = "Convert a Twitter dataset to a format compatible with DMI-TCAT and upload it to an available DMI-TCAT instance." # description displayed in UI
extension = "html" # extension of result file, used internally and in UI

@classmethod
def is_compatible_with(cls, module=None):
51 changes: 51 additions & 0 deletions processors/presets/upload-to-dmi-tcat.py
@@ -0,0 +1,51 @@
"""
Upload Twitter dataset to DMI-TCAT instance
"""
from backend.abstract.preset import ProcessorPreset

import config

class FourcatToDmiTcatConverterAndUploader(ProcessorPreset):
"""
Run processor pipeline to extract neologisms
"""
type = "preset-upload-tcat" # job type ID
category = "Presets" # category. 'Presets' are always listed first in the UI.
title = "Upload to DMI-TCAT" # title displayed in UI
description = "Convert the dataset to a format compatible with DMI-TCAT and upload it to an available instance." # description displayed in UI
extension = "svg"

@classmethod
def is_compatible_with(cls, module=None):
"""
Determine if processor is compatible with dataset
:param module: Dataset or processor to determine compatibility with
"""
return module.type == "twitterv2-search" and \
hasattr(config, 'TCAT_SERVER') and \
config.TCAT_SERVER and \
hasattr(config, 'TCAT_TOKEN') and \
hasattr(config, 'TCAT_USERNAME') and \
hasattr(config, 'TCAT_PASSWORD')

def get_processor_pipeline(self):
"""
This queues a series of post-processors to upload a dataset to a
DMI-TCAT instance.
"""

pipeline = [
# first, convert to import-able format
{
"type": "convert-ndjson-for-tcat",
"parameters": {}
},
# then, upload it
{
"type": "tcat-auto-upload",
"parameters": {}
}
]

return pipeline
