diff --git a/common/lib/dataset.py b/common/lib/dataset.py
index 9a393e5ec..335668035 100644
--- a/common/lib/dataset.py
+++ b/common/lib/dataset.py
@@ -911,6 +911,20 @@ def get_extension(self):
             return self.get_results_path().suffix[1:]
         return False
 
+    def get_result_url(self):
+        """
+        Gets the 4CAT frontend URL of a dataset file.
+
+        Uses the config.py FlaskConfig attributes (i.e., SERVER_NAME and
+        SERVER_HTTPS) plus the hardcoded '/result/' path.
+        TODO: create a more dynamic method of obtaining the URL.
+        """
+        filename = self.get_results_path().name
+        url_to_file = ('https://' if config.FlaskConfig.SERVER_HTTPS else 'http://') + \
+            config.FlaskConfig.SERVER_NAME + '/result/' + filename
+        return url_to_file
+
+
     def __getattr__(self, attr):
         """
         Getter so we don't have to use .data all the time
diff --git a/config.py-example b/config.py-example
index 3b3823e42..f6790f50f 100644
--- a/config.py-example
+++ b/config.py-example
@@ -18,6 +18,47 @@ DATASOURCES = {
     "twitterv2": {}
 }
 
+#####################
+# Processor Options #
+#####################
+
+# download_images.py
+MAX_NUMBER_IMAGES = 1000
+
+# YouTube variables to use for processors
+YOUTUBE_API_SERVICE_NAME = "youtube"
+YOUTUBE_API_VERSION = "v3"
+YOUTUBE_DEVELOPER_KEY = ""
+
+# Tumblr API keys to use for data capturing
+TUMBLR_CONSUMER_KEY = ""
+TUMBLR_CONSUMER_SECRET_KEY = ""
+TUMBLR_API_KEY = ""
+TUMBLR_API_SECRET_KEY = ""
+
+# Reddit API keys
+REDDIT_API_CLIENTID = ""
+REDDIT_API_SECRET = ""
+
+# tcat_auto_upload.py
+TCAT_SERVER = ''
+TCAT_TOKEN = ''
+TCAT_USERNAME = ''
+TCAT_PASSWORD = ''
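+# TCAT_SERVER may also be a list or tuple of instance URLs, in which case the
+# upload processor picks one at random (see processors/conversion/tcat_auto_upload.py).
+# TCAT_USERNAME and TCAT_PASSWORD are the instance's HTTP auth credentials;
+# TCAT_TOKEN is sent with each upload and should match the token configured in TCAT.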
+
+# pix-plot.py
+# If you host a version of https://github.com/digitalmethodsinitiative/dmi_pix_plot, you can use a processor to publish
+# downloaded images into a PixPlot there
+PIXPLOT_SERVER = ""
+
+########################
+# 4CAT General Options #
+########################
+
 # Configure how the tool is to be named in its web interface. The backend will
 # always refer to '4CAT' - the name of the software, and a 'powered by 4CAT'
 # notice may also show up in the web interface regardless of the value entered here.
@@ -31,10 +72,6 @@ DB_USER = "fourcat"
 DB_NAME = "fourcat"
 DB_PASSWORD = "supers3cr3t"
 
-# Processor Options
-# download_images.py
-MAX_NUMBER_IMAGES = 1000
-
 # Path to folders where logs/images/data may be saved.
 # Paths are relative to the folder this config file is in.
 PATH_ROOT = os.path.abspath(os.path.dirname(__file__)) # better don't change this
@@ -83,36 +120,11 @@ MAIL_USERNAME = ""
 MAIL_PASSWORD = ""
 NOREPLY_EMAIL = "noreply@localhost"
 
-
 # Scrape settings for data sources that contain their own scrapers
 SCRAPE_TIMEOUT = 5 # how long to wait for a scrape request to finish?
 SCRAPE_PROXIES = {"http": []} # Items in this list should be formatted like "http://111.222.33.44:1234"
 IMAGE_INTERVAL = 3600
 
-# YouTube variables to use for processors
-YOUTUBE_API_SERVICE_NAME = "youtube"
-YOUTUBE_API_VERSION = "v3"
-YOUTUBE_DEVELOPER_KEY = ""
-
-# Tumblr API keys to use for data capturing
-TUMBLR_CONSUMER_KEY = ""
-TUMBLR_CONSUMER_SECRET_KEY = ""
-TUMBLR_API_KEY = ""
-TUMBLR_API_SECRET_KEY = ""
-
-# Reddit API keys
-REDDIT_API_CLIENTID = ""
-REDDIT_API_SECRET = ""
-
-# PixPlot Server
-# If you host a version of https://github.com/digitalmethodsinitiative/dmi_pix_plot, you can use a processor to publish
-# downloaded images into a PixPlot there
-PIXPLOT_SERVER = ""
-
-# Explorer settings
-# The maximum allowed amount of rows (prevents timeouts and memory errors)
-MAX_POSTS_EXPLORER = 500000
-
 # Web tool settings
 class FlaskConfig:
     FLASK_APP = 'webtool/fourcat'
@@ -123,7 +135,6 @@ class FlaskConfig:
     HOSTNAME_WHITELIST_API = ["localhost"] # hostnames matching these are exempt from rate limiting
     HOSTNAME_WHITELIST_NAME = "Automatic login"
 
-
 ##########
 # DOCKER #
 ##########
diff --git a/docker-compose.yml b/docker-compose.yml
index 0e8785626..5496aa215 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.6'
 
 services:
   db:
-    container_name: db
+    container_name: 4cat_db
     image: postgres:latest
     environment:
       - POSTGRES_USER=${POSTGRES_USER}
diff --git a/processors/conversion/tcat_auto_upload.py b/processors/conversion/tcat_auto_upload.py
new file mode 100644
index 000000000..be2cec6c3
--- /dev/null
+++ b/processors/conversion/tcat_auto_upload.py
@@ -0,0 +1,136 @@
+"""
+Send TCAT-ready JSON to a particular TCAT instance
+"""
+import requests
+import random
+from urllib.parse import urlparse
+
+from backend.abstract.processor import BasicProcessor
+from common.lib.exceptions import ProcessorException
+
+import config
+
+__author__ = "Dale Wahl"
+__credits__ = ["Dale Wahl"]
+__maintainer__ = "Dale Wahl"
+__email__ = "d.l.wahl@uva.nl"
+
+
+class FourcatToDmiTcatUploader(BasicProcessor):
+    """
+    Send TCAT-ready JSON to a particular TCAT instance.
+    File to be imported by TCAT's import-jsondump.php
+    """
+    type = "tcat-auto-upload" # job type ID
+    category = "Conversion" # category
+    title = "Upload to DMI-TCAT" # title displayed in UI
+    description = "Send TCAT-ready JSON to a particular DMI-TCAT instance." # description displayed in UI
+    extension = "html" # extension of result file, used internally and in UI
+
+    @classmethod
+    def is_compatible_with(cls, module=None):
+        """
+        Determine if processor is compatible with dataset
+
+        :param module: Dataset or processor to determine compatibility with
+        """
+        return module.type == "convert-ndjson-for-tcat" and \
+               hasattr(config, 'TCAT_SERVER') and \
+               config.TCAT_SERVER and \
+               hasattr(config, 'TCAT_TOKEN') and \
+               hasattr(config, 'TCAT_USERNAME') and \
+               hasattr(config, 'TCAT_PASSWORD')
+
+    def process(self):
+        """
+        Send TCAT-ready JSON to a particular TCAT instance.
+        """
+        self.dataset.update_status("Preparing upload")
+        bin_name = ''.join(e if e.isalnum() else '_' for e in self.dataset.top_parent().get_label())
+        self.dataset.log('Label for DMI-TCAT bin_name: ' + bin_name)
+
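+        # NB: TCAT fetches the dataset file from this URL, so it must be reachable from the TCAT server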
+ """ + self.dataset.update_status("Preparing upload") + bin_name = ''.join(e if e.isalnum() else '_' for e in self.dataset.top_parent().get_label()) + self.dataset.log('Label for DMI-TCAT bin_name: ' + bin_name) + + url_to_file = self.dataset.get_parent().get_result_url() + self.dataset.log('File location URL: ' + url_to_file) + + tcat_server = config.TCAT_SERVER + if type(config.TCAT_SERVER) in (list, tuple, set): + tcat_server = random.choice(config.TCAT_SERVER) + + # DOCKER shenanigans + self.dataset.log('Docker search: ' + str('host.docker.internal' in config.TCAT_SERVER)) + if 'host.docker.internal' in config.TCAT_SERVER: + url_to_file = url_to_file.replace('localhost', 'host.docker.internal') + self.dataset.log('New URL: ' + url_to_file) + + query = str(self.dataset.top_parent().get_parameters().get("query", "")) + self.dataset.log('Twitter query: ' + query) + + # TCAT authorization information + auth = (config.TCAT_USERNAME, config.TCAT_PASSWORD) + + # from urlparse import urlparse # Python 2 + parsed_uri = urlparse(config.TCAT_SERVER) + post_json_url = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri) + post_json_url = post_json_url + '/api/import-from-4cat.php' + self.dataset.update_status("Sending dataset to DMI-TCAT: %s" % post_json_url) + response = requests.post(post_json_url, auth=auth, data={ + 'url': url_to_file, + 'name': bin_name, + 'query': query, + 'token': config.TCAT_TOKEN, + }) + + if response.status_code == 404: + raise ProcessorException('DMI-TCAT URL 404 error at %s' % config.TCAT_SERVER) + elif response.status_code != 200: + raise ProcessorException('DMI-TCAT Connection Error %i error: %s' % (response.status_code, str(response.reason))) + else: + pass + + try: + resp_content = response.json() + except ValueError: + # If import-jsondump.php fails, no json response is returned + if 'The query bin' in response.text and 'already exists' in response.text: + # Query bin already uploaded + # TODO: look at possibility to add to existing bin? + self.dataset.update_status("DMI-TCAT bin already exists; unable to add to existing bin.", is_final=True) + self.dataset.finish(0) + return + else: + # Something else is wrong... + raise ProcessorException('DMI-TCAT Unexpected response: %s - %s - %s' % (response.status_code, str(response.reason), response.text)) + + if 'success' not in resp_content: + # A json response was returned, but not the one we're expecting! 
+        try:
+            resp_content = response.json()
+        except ValueError:
+            # If import-jsondump.php fails, no JSON response is returned
+            if 'The query bin' in response.text and 'already exists' in response.text:
+                # Query bin already uploaded
+                # TODO: look at possibility to add to existing bin?
+                self.dataset.update_status("DMI-TCAT bin already exists; unable to add to existing bin.", is_final=True)
+                self.dataset.finish(0)
+                return
+            else:
+                # Something else is wrong...
+                raise ProcessorException('DMI-TCAT Unexpected response: %s - %s - %s' % (response.status_code, str(response.reason), response.text))
+
+        if 'success' not in resp_content:
+            # A JSON response was returned, but not the one we're expecting!
+            raise ProcessorException('DMI-TCAT Unexpected response: %s - %s - %s' % (response.status_code, str(response.reason), response.text))
+        elif not resp_content['success']:
+            # success should be True if upload was successful
+            raise ProcessorException('DMI-TCAT Import failure: %s' % str(resp_content))
+        else:
+            pass
+
+        self.dataset.update_status("Waiting for upload to complete")
+        # Unsure how to query TCAT; an invalid bin_name still returns a 200 response
+        # Could attempt to parse the resultant HTML
+
+        self.dataset.update_status("Upload complete, writing HTML file")
+        # Create HTML file
+        tcat_result_url = tcat_server.replace('/api/import-from-4cat.php', '').rstrip('/') + '/analysis/index.php?dataset=' + bin_name
+        html_file = self.get_html_page(tcat_result_url)
+
+        # Write HTML file
+        with self.dataset.get_results_path().open("w", encoding="utf-8") as output_file:
+            output_file.write(html_file)
+
+        # Finish
+        self.dataset.update_status("Finished")
+        self.dataset.finish(self.dataset.top_parent().num_rows)
+
+    def get_html_page(self, url):
+        """
+        Returns an HTML string to redirect to the location of the DMI-TCAT dataset.
+        """
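+        # a minimal meta-refresh page that immediately forwards the browser to the TCAT analysis interface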
" diff --git a/processors/conversion/twitter_ndjson_to_tcat_json.py b/processors/conversion/twitter_ndjson_to_tcat_json.py index efee1ef16..c77e7a5fe 100644 --- a/processors/conversion/twitter_ndjson_to_tcat_json.py +++ b/processors/conversion/twitter_ndjson_to_tcat_json.py @@ -18,8 +18,8 @@ class ConvertNDJSONToJSON(BasicProcessor): type = "convert-ndjson-for-tcat" # job type ID category = "Conversion" # category title = "Convert to TCAT JSON" # title displayed in UI - description = "Convert a NDJSON Twitter file to TCAT JSON format. Can be imported with TCAT's import-jsondump.php script." # description displayed in UI - extension = "json" # extension of result file, used internally and in UI + description = "Convert a Twitter dataset to a format compatible with DMI-TCAT and upload it to an available DMI-TCAT instance." # description displayed in UI + extension = "html" # extension of result file, used internally and in UI @classmethod def is_compatible_with(cls, module=None): diff --git a/processors/presets/upload-to-dmi-tcat.py b/processors/presets/upload-to-dmi-tcat.py new file mode 100644 index 000000000..9dd0e780f --- /dev/null +++ b/processors/presets/upload-to-dmi-tcat.py @@ -0,0 +1,51 @@ +""" +Upload Twitter dataset to DMI-TCAT instance +""" +from backend.abstract.preset import ProcessorPreset + +import config + +class FourcatToDmiTcatConverterAndUploader(ProcessorPreset): + """ + Run processor pipeline to extract neologisms + """ + type = "preset-upload-tcat" # job type ID + category = "Presets" # category. 'Presets' are always listed first in the UI. + title = "Upload to DMI-TCAT" # title displayed in UI + description = "Convert the dataset to a format compatible with DMI-TCAT and upload it to an available instance." # description displayed in UI + extension = "svg" + + @classmethod + def is_compatible_with(cls, module=None): + """ + Determine if processor is compatible with dataset + + :param module: Dataset or processor to determine compatibility with + """ + return module.type == "twitterv2-search" and \ + hasattr(config, 'TCAT_SERVER') and \ + config.TCAT_SERVER and \ + hasattr(config, 'TCAT_TOKEN') and \ + hasattr(config, 'TCAT_USERNAME') and \ + hasattr(config, 'TCAT_PASSWORD') + + def get_processor_pipeline(self): + """ + This queues a series of post-processors to upload a dataset to a + DMI-TCAT instance. + """ + + pipeline = [ + # first, convert to import-able format + { + "type": "convert-ndjson-for-tcat", + "parameters": {} + }, + # then, upload it + { + "type": "tcat-auto-upload", + "parameters": {} + } + ] + + return pipeline