Tcat auto import (#203)
* Basic processor to send info to a TCAT instance

* get_results_url() added to dataset

* Attempt to handle docker to docker tcat exports

* rename Docker db to 4cat_db to avoid confusing myself

* reorder config.py

* add tcat username and pass to config.py

* add access log for Gunicorn

* Add tcat user/pass, url parsing, and better error handling

* Handle already uploaded case

* typo in tcat_auto_upload

* more typos! stupid cut and paste with your tabs and spaces...

* Fix docker compose for merge

* Small changes + One-click preset

Co-authored-by: Stijn Peeters <[email protected]>
3 people authored Dec 8, 2021
1 parent 1881dad commit 938c168
Showing 6 changed files with 239 additions and 33 deletions.
14 changes: 14 additions & 0 deletions common/lib/dataset.py
@@ -911,6 +911,20 @@ def get_extension(self):
return self.get_results_path().suffix[1:]
return False

def get_result_url(self):
"""
        Gets the 4CAT frontend URL of a dataset file.

        Uses the FlaskConfig attributes from config.py (i.e., SERVER_NAME and
        SERVER_HTTPS) plus the hardcoded '/result/' path.
        TODO: create a more dynamic method of obtaining the URL.
"""
filename = self.get_results_path().name
url_to_file = ('https://' if config.FlaskConfig.SERVER_HTTPS else 'http://') + \
config.FlaskConfig.SERVER_NAME + '/result/' + filename
return url_to_file
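    # For illustration, a minimal sketch of what this returns, assuming the
    # hypothetical values SERVER_NAME = "4cat.example.com" and SERVER_HTTPS = True
    # in config.FlaskConfig, and a result file named "abcd1234.csv":
    #
    #     dataset.get_result_url()  # -> "https://4cat.example.com/result/abcd1234.csv"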


def __getattr__(self, attr):
"""
Getter so we don't have to use .data all the time
67 changes: 37 additions & 30 deletions config.py-example
@@ -18,6 +18,43 @@ DATASOURCES = {
"twitterv2": {}
}

#####################
# Processor Options #
#####################

# download_images.py
MAX_NUMBER_IMAGES = 1000

# YouTube variables to use for processors
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
YOUTUBE_DEVELOPER_KEY = ""

# Tumblr API keys to use for data capturing
TUMBLR_CONSUMER_KEY = ""
TUMBLR_CONSUMER_SECRET_KEY = ""
TUMBLR_API_KEY = ""
TUMBLR_API_SECRET_KEY = ""

# Reddit API keys
REDDIT_API_CLIENTID = ""
REDDIT_API_SECRET = ""

# tcat_auto_upload.py
TCAT_SERVER = ''
TCAT_TOKEN = ''
TCAT_USERNAME = ''
TCAT_PASSWORD = ''
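# A hedged example of how these might be filled in (all values hypothetical):
#
#     TCAT_SERVER = 'https://tcat.example.com'  # may also be a list/tuple of
#                                               # instances; one is then picked
#                                               # at random for each upload
#     TCAT_TOKEN = 'shared-import-token'
#     TCAT_USERNAME = 'tcat_admin'
#     TCAT_PASSWORD = 'supers3cr3t'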

# pix-plot.py
# If you host a version of https://github.com/digitalmethodsinitiative/dmi_pix_plot, you can use a processor to publish
# downloaded images into a PixPlot there
PIXPLOT_SERVER = ""

########################
# 4CAT General Options #
########################

# Configure how the tool is to be named in its web interface. The backend will
# always refer to '4CAT' - the name of the software, and a 'powered by 4CAT'
# notice may also show up in the web interface regardless of the value entered here.
@@ -31,10 +68,6 @@ DB_USER = "fourcat"
DB_NAME = "fourcat"
DB_PASSWORD = "supers3cr3t"

# Processor Options
# download_images.py
MAX_NUMBER_IMAGES = 1000

# Path to folders where logs/images/data may be saved.
# Paths are relative to the folder this config file is in.
PATH_ROOT = os.path.abspath(os.path.dirname(__file__)) # better don't change this
@@ -83,36 +116,11 @@ MAIL_USERNAME = ""
MAIL_PASSWORD = ""
NOREPLY_EMAIL = "noreply@localhost"


# Scrape settings for data sources that contain their own scrapers
SCRAPE_TIMEOUT = 5 # how long to wait for a scrape request to finish?
SCRAPE_PROXIES = {"http": []} # Items in this list should be formatted like "http://111.222.33.44:1234"
IMAGE_INTERVAL = 3600

# YouTube variables to use for processors
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
YOUTUBE_DEVELOPER_KEY = ""

# Tumblr API keys to use for data capturing
TUMBLR_CONSUMER_KEY = ""
TUMBLR_CONSUMER_SECRET_KEY = ""
TUMBLR_API_KEY = ""
TUMBLR_API_SECRET_KEY = ""

# Reddit API keys
REDDIT_API_CLIENTID = ""
REDDIT_API_SECRET = ""

# PixPlot Server
# If you host a version of https://github.com/digitalmethodsinitiative/dmi_pix_plot, you can use a processor to publish
# downloaded images into a PixPlot there
PIXPLOT_SERVER = ""

# Explorer settings
# The maximum allowed amount of rows (prevents timeouts and memory errors)
MAX_POSTS_EXPLORER = 500000

# Web tool settings
class FlaskConfig:
FLASK_APP = 'webtool/fourcat'
@@ -123,7 +131,6 @@ class FlaskConfig:
HOSTNAME_WHITELIST_API = ["localhost"] # hostnames matching these are exempt from rate limiting
HOSTNAME_WHITELIST_NAME = "Automatic login"


##########
# DOCKER #
##########
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.6'

services:
db:
container_name: db
container_name: 4cat_db
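    # note: the compose service is still named 'db', so other containers keep
    # resolving it at hostname 'db'; only the container's display name changes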
image: postgres:latest
environment:
- POSTGRES_USER=${POSTGRES_USER}
134 changes: 134 additions & 0 deletions processors/conversion/tcat_auto_upload.py
@@ -0,0 +1,134 @@
"""
Send TCAT-ready json to a particular TCAT instance
"""
import requests
import random
from urllib.parse import urlparse

from backend.abstract.processor import BasicProcessor
from common.lib.exceptions import ProcessorException

import config

__author__ = "Dale Wahl"
__credits__ = ["Dale Wahl"]
__maintainer__ = "Dale Wahl"
__email__ = "[email protected]"


class FourcatToDmiTcatUploader(BasicProcessor):
"""
    Send TCAT-ready JSON to a particular TCAT instance.
    The file is imported by TCAT's import-jsondump.php.
"""
type = "tcat-auto-upload" # job type ID
category = "Conversion" # category
title = "Upload to DMI-TCAT" # title displayed in UI
    description = "Send TCAT-ready JSON to a particular DMI-TCAT instance."  # description displayed in UI
extension = "html" # extension of result file, used internally and in UI

@classmethod
def is_compatible_with(cls, module=None):
"""
Determine if processor is compatible with dataset
:param module: Dataset or processor to determine compatibility with
"""
return module.type == "convert-ndjson-for-tcat" and \
hasattr(config, 'TCAT_SERVER') and \
config.TCAT_SERVER and \
hasattr(config, 'TCAT_TOKEN') and \
hasattr(config, 'TCAT_USERNAME') and \
hasattr(config, 'TCAT_PASSWORD')

def process(self):
"""
Send TCAT-ready json to a particular TCAT instance.
"""
self.dataset.update_status("Preparing upload")
bin_name = ''.join(e if e.isalnum() else '_' for e in self.dataset.top_parent().get_label())
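        # e.g. a label like "important tweets!" becomes the bin name "important_tweets_"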
self.dataset.log('Label for DMI-TCAT bin_name: ' + bin_name)

url_to_file = self.dataset.get_parent().get_result_url()
self.dataset.log('File location URL: ' + url_to_file)

        tcat_server = config.TCAT_SERVER
        if isinstance(config.TCAT_SERVER, (list, tuple, set)):
            # if multiple TCAT instances are configured, pick one at random;
            # list() guards against random.choice failing on a set
            tcat_server = random.choice(list(config.TCAT_SERVER))

        # Docker shenanigans: if 4CAT and TCAT both run in containers on the same
        # host, the file URL must point at the Docker host, not the container's localhost
        self.dataset.log('Docker search: ' + str('host.docker.internal' in tcat_server))
        if 'host.docker.internal' in tcat_server:
            url_to_file = url_to_file.replace('localhost', 'host.docker.internal')
            self.dataset.log('New URL: ' + url_to_file)

query = str(self.dataset.top_parent().get_parameters().get("query", ""))
self.dataset.log('Twitter query: ' + query)

# TCAT authorization information
auth = (config.TCAT_USERNAME, config.TCAT_PASSWORD)

        # keep only the scheme and host of the configured server and append the
        # import endpoint, so any path in TCAT_SERVER is ignored
        parsed_uri = urlparse(tcat_server)
        post_json_url = '{uri.scheme}://{uri.netloc}/api/import-from-4cat.php'.format(uri=parsed_uri)
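        # e.g. 'https://tcat.example.com/api/import-from-4cat.php' (hypothetical hostname)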
self.dataset.update_status("Sending dataset to DMI-TCAT: %s" % post_json_url)
response = requests.post(post_json_url, auth=auth, data={
'url': url_to_file,
'name': bin_name,
'query': query,
'token': config.TCAT_TOKEN,
})

        if response.status_code == 404:
            raise ProcessorException('DMI-TCAT URL 404 error at %s' % tcat_server)
        elif response.status_code != 200:
            raise ProcessorException('DMI-TCAT connection error %i: %s' % (response.status_code, str(response.reason)))

try:
resp_content = response.json()
except ValueError:
# If import-jsondump.php fails, no json response is returned
if 'The query bin' in response.text and 'already exists' in response.text:
# Query bin already uploaded
# TODO: look at possibility to add to existing bin?
self.dataset.update_status("DMI-TCAT bin already exists; unable to add to existing bin.", is_final=True)
self.dataset.finish(0)
return
else:
# Something else is wrong...
raise ProcessorException('DMI-TCAT Unexpected response: %s - %s - %s' % (response.status_code, str(response.reason), response.text))

        if 'success' not in resp_content:
            # a JSON response was returned, but not the one we're expecting!
            raise ProcessorException('DMI-TCAT Unexpected response: %s - %s - %s' % (response.status_code, str(response.reason), response.text))
        elif not resp_content['success']:
            # 'success' should be True if the upload was successful
            raise ProcessorException('DMI-TCAT Import failure: %s' % str(resp_content))

self.dataset.update_status("Waiting for upload to complete")
        # Unsure how to query TCAT for progress; even an invalid bin_name returns
        # a 200 response. We could attempt to parse the resulting HTML.

self.dataset.update_status("Upload complete, writing HTML file")
# Create HTML file
        tcat_result_url = tcat_server.replace('/api/import-from-4cat.php', '').rstrip('/') + '/analysis/index.php?dataset=' + bin_name
html_file = self.get_html_page(tcat_result_url)

# Write HTML file
with self.dataset.get_results_path().open("w", encoding="utf-8") as output_file:
output_file.write(html_file)

# Finish
self.dataset.update_status("Finished")
self.dataset.finish(self.dataset.top_parent().num_rows)

def get_html_page(self, url):
"""
        Return an HTML string that redirects to the location of the DMI-TCAT dataset.
"""
return f"<head><meta http-equiv='refresh' content='0; URL={url}'></head>"
4 changes: 2 additions & 2 deletions processors/conversion/twitter_ndjson_to_tcat_json.py
@@ -18,8 +18,8 @@ class ConvertNDJSONToJSON(BasicProcessor):
type = "convert-ndjson-for-tcat" # job type ID
category = "Conversion" # category
title = "Convert to TCAT JSON" # title displayed in UI
description = "Convert a NDJSON Twitter file to TCAT JSON format. Can be imported with TCAT's import-jsondump.php script." # description displayed in UI
extension = "json" # extension of result file, used internally and in UI
description = "Convert a Twitter dataset to a format compatible with DMI-TCAT and upload it to an available DMI-TCAT instance." # description displayed in UI
extension = "html" # extension of result file, used internally and in UI

@classmethod
def is_compatible_with(cls, module=None):
51 changes: 51 additions & 0 deletions processors/presets/upload-to-dmi-tcat.py
@@ -0,0 +1,51 @@
"""
Upload Twitter dataset to DMI-TCAT instance
"""
from backend.abstract.preset import ProcessorPreset

import config

class FourcatToDmiTcatConverterAndUploader(ProcessorPreset):
"""
Run processor pipeline to extract neologisms
"""
type = "preset-upload-tcat" # job type ID
category = "Presets" # category. 'Presets' are always listed first in the UI.
title = "Upload to DMI-TCAT" # title displayed in UI
description = "Convert the dataset to a format compatible with DMI-TCAT and upload it to an available instance." # description displayed in UI
extension = "svg"

@classmethod
def is_compatible_with(cls, module=None):
"""
Determine if processor is compatible with dataset
:param module: Dataset or processor to determine compatibility with
"""
return module.type == "twitterv2-search" and \
hasattr(config, 'TCAT_SERVER') and \
config.TCAT_SERVER and \
hasattr(config, 'TCAT_TOKEN') and \
hasattr(config, 'TCAT_USERNAME') and \
hasattr(config, 'TCAT_PASSWORD')

def get_processor_pipeline(self):
"""
This queues a series of post-processors to upload a dataset to a
DMI-TCAT instance.
"""

pipeline = [
# first, convert to import-able format
{
"type": "convert-ndjson-for-tcat",
"parameters": {}
},
# then, upload it
{
"type": "tcat-auto-upload",
"parameters": {}
}
]

return pipeline
