diff --git a/.env b/.env
index 246fe509b..95de5e6d6 100644
--- a/.env
+++ b/.env
@@ -8,5 +8,5 @@
 SERVER_NAME=localhost
 PUBLIC_PORT=80
 PUBLIC_API_PORT=4444
-# Telegram aparently needs its own port
+# Telegram apparently needs its own port
 TELEGRAM_PORT=443
diff --git a/README.md b/README.md
index 06b652358..fce030b2c 100644
--- a/README.md
+++ b/README.md
@@ -7,10 +7,10 @@

A screenshot of 4CAT, displaying its 'Create Dataset' interfaceA screenshot of 4CAT, displaying a network visualisation of a dataset

-4CAT is a research tool that can be used to analyse and process data from
-online social platforms. Its goal is to make the capture and analysis of data
-from these platforms accessible to people through a web interface, without
-requiring any programming or web scraping skills. Our target audience is
+4CAT is a research tool that can be used to analyse and process data from
+online social platforms. Its goal is to make the capture and analysis of data
+from these platforms accessible to people through a web interface, without
+requiring any programming or web scraping skills. Our target audience is
 researchers, students and journalists interested using Digital Methods in their
 work.
@@ -53,7 +53,7 @@ You can install 4CAT locally or on a server via Docker or manually. The usual
 docker-compose up
 ```

-will work, but detailed and alternative installation
+will work, but detailed and alternative installation
 instructions are available [in our wiki](https://github.com/digitalmethodsinitiative/4cat/wiki/Installing-4CAT).

 Currently 4chan, 8chan, and 8kun require additional steps; please see the wiki.
diff --git a/backend/abstract/processor.py b/backend/abstract/processor.py
index 186e5b5f7..ea6633116 100644
--- a/backend/abstract/processor.py
+++ b/backend/abstract/processor.py
@@ -8,6 +8,7 @@
 import json
 import abc
 import csv
+import os

 from pathlib import Path, PurePath

@@ -512,6 +513,33 @@ def write_archive_and_finish(self, files, num_items=None, compression=zipfile.ZI

         self.dataset.finish(num_items)

+    def create_standalone(self):
+        # copy this dataset - the filtered version - and make that copy standalone
+        # this has the benefit of allowing for all analyses that can be run on
+        # full datasets on the new, filtered copy as well
+        top_parent = self.source_dataset
+
+        standalone = self.dataset.copy(shallow=False)
+        standalone.body_match = "(Filtered) " + top_parent.query
+        standalone.datasource = top_parent.parameters.get("datasource", "custom")
+
+        try:
+            standalone.board = top_parent.board
+        except KeyError:
+            standalone.board = self.type
+
+        standalone.type = "search"
+
+        standalone.detach()
+        standalone.delete_parameter("key_parent")
+
+        self.dataset.copied_to = standalone.key
+
+        # we don't need this file anymore - it has been copied to the new
+        # standalone dataset, and this one is not accessible via the interface
+        # except as a link to the copied standalone dataset
+        os.unlink(self.dataset.get_results_path())
+
     @classmethod
     def is_filter(cls):
         """
diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py
index a22d4326a..898b4824b 100644
--- a/processors/filtering/column_filter.py
+++ b/processors/filtering/column_filter.py
@@ -1,7 +1,6 @@
 """
 Filter posts by a given column
 """
-import os
 import re
 import csv
 import datetime
@@ -200,28 +199,5 @@ def process(self):
     def after_process(self):
         super().after_process()

-        # copy this dataset - the filtered version - and make that copy standalone
-        # this has the benefit of allowing for all analyses that can be run on
-        # full datasets on the new, filtered copy as well
-        top_parent = self.source_dataset
-
-        standalone = self.dataset.copy(shallow=False)
-        standalone.body_match = "(Filtered) " + top_parent.query
-        standalone.datasource = top_parent.parameters.get("datasource", "custom")
-
-        try:
-            standalone.board = top_parent.board
-        except KeyError:
-            standalone.board = self.type
-
-        standalone.type = "search"
-
-        standalone.detach()
-        standalone.delete_parameter("key_parent")
-
-        self.dataset.copied_to = standalone.key
-
-        # we don't need this file anymore - it has been copied to the new
-        # standalone dataset, and this one is not accessible via the interface
-        # except as a link to the copied standalone dataset
-        os.unlink(self.dataset.get_results_path())
+        # Request standalone
+        self.create_standalone()
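Note: with the shared create_standalone() helper above, each filter processor's after_process() reduces to a single call. A minimal sketch of the resulting pattern is below; the class name, type and option values are illustrative only and are not part of this patch.

```python
from backend.abstract.processor import BasicProcessor


class ExampleFilter(BasicProcessor):
    """
    Hypothetical filter processor relying on the shared create_standalone() helper
    """
    type = "example-filter"  # job type ID (illustrative)
    category = "Filtering"
    title = "Example filter"
    description = "Copies the dataset, retaining only matching posts."
    extension = "csv"

    def process(self):
        # write matching items to self.dataset.get_results_path(), then
        # call self.dataset.finish() with the number of items written
        ...

    def after_process(self):
        super().after_process()

        # promote the filtered result to a standalone dataset, using the
        # helper introduced in backend/abstract/processor.py above
        self.create_standalone()
```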
diff --git a/processors/filtering/date_filter.py b/processors/filtering/date_filter.py
new file mode 100644
index 000000000..2f711b81e
--- /dev/null
+++ b/processors/filtering/date_filter.py
@@ -0,0 +1,146 @@
+"""
+Filter posts by dates
+"""
+import csv
+import dateutil.parser
+from datetime import datetime
+
+from backend.abstract.processor import BasicProcessor
+from common.lib.helpers import UserInput
+
+__author__ = "Dale Wahl"
+__credits__ = ["Dale Wahl"]
+__maintainer__ = "Dale Wahl"
+__email__ = "4cat@oilab.eu"
+
+csv.field_size_limit(1024 * 1024 * 1024)
+
+
+class DateFilter(BasicProcessor):
+    """
+    Retain only posts between specific dates
+    """
+    type = "date-filter"  # job type ID
+    category = "Filtering"  # category
+    title = "Filter by date"  # title displayed in UI
+    description = "Copies the dataset, retaining only posts between the given dates. This creates a new, separate \
+                  dataset you can run analyses on."
+    extension = "csv"  # extension of result file, used internally and in UI
+
+    options = {
+        "daterange": {
+            "type": UserInput.OPTION_DATERANGE,
+            "help": "Date range:",
+        },
+        "parse_error": {
+            "type": UserInput.OPTION_CHOICE,
+            "help": "Invalid date formats:",
+            "options": {
+                "return": "Keep invalid dates for new dataset",
+                "reject": "Remove invalid dates for new dataset",
+            },
+            "default": "return"
+        },
+    }
+
+    @classmethod
+    def is_compatible_with(cls, module=None):
+        """
+        Allow processor on CSV files
+
+        :param module: Dataset or processor to determine compatibility with
+        """
+        return module.is_top_dataset() and module.get_extension() == "csv"
+
+    def process(self):
+        """
+        Reads a CSV file, filtering items that match in the required way, and
+        creates a new dataset containing the matching values
+        """
+        # Column to match
+        # 'timestamp' should be a required field in all datasources
+        date_column_name = 'timestamp'
+
+        # Process inputs from user
+        min_date, max_date = self.parameters.get("daterange")
+        # Convert to datetime for easy comparison
+        min_date = datetime.fromtimestamp(min_date).date()
+        max_date = datetime.fromtimestamp(max_date).date()
+        # Decide how to handle invalid dates
+        if self.parameters.get("parse_error") == 'return':
+            keep_errors = True
+        elif self.parameters.get("parse_error") == 'reject':
+            keep_errors = False
+        else:
+            raise ValueError("Error with parse_error types")
+
+        # Track progress
+        processed_items = 0
+        invalid_dates = 0
+        matching_items = 0
+
+        # Start writer
+        with self.dataset.get_results_path().open("w", encoding="utf-8") as outfile:
+            writer = None
+
+            # Loop through items
+            for item in self.iterate_items(self.source_file):
+                if not writer:
+                    # First iteration, check if column actually exists
+                    if date_column_name not in item.keys():
+                        self.dataset.update_status("'%s' column not found in dataset" % date_column_name, is_final=True)
+                        self.dataset.finish(0)
+                        return
+
+                    # initialise csv writer - we do this explicitly rather than
+                    # using self.write_items_and_finish() because else we have
+                    # to store a potentially very large amount of items in
+                    # memory which is not a good idea
+                    writer = csv.DictWriter(outfile, fieldnames=item.keys())
+                    writer.writeheader()
+
+                # Update 4CAT and user on status
+                processed_items += 1
+                if processed_items % 500 == 0:
+                    self.dataset.update_status("Processed %i items (%i matching, %i invalid dates)" % (processed_items,
+                                                                                                       matching_items,
+                                                                                                       invalid_dates))
+
+                # Attempt to parse timestamp
+                try:
+                    item_date = dateutil.parser.parse(item.get(date_column_name))
+                except dateutil.parser.ParserError:
+                    if keep_errors:
+                        # Keep item
+                        invalid_dates += 1
+                        writer.writerow(item)
+                        continue
+                    else:
+                        # Reject item
+                        invalid_dates += 1
+                        continue
+
+                # Only use date for comparison (not time)
+                item_date = item_date.date()
+
+                # Reject dates
+                if min_date and item_date < min_date:
+                    continue
+                if max_date and item_date > max_date:
+                    continue
+
+                # Must be a good date!
+                writer.writerow(item)
+                matching_items += 1
+
+        # Any matches?
+        if matching_items == 0:
+            self.dataset.update_status("No items matched your criteria", is_final=True)
+
+        self.dataset.finish(matching_items)
+
+    def after_process(self):
+        super().after_process()
+
+        # Request standalone
+        self.create_standalone()
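Note: the new date filter compares calendar dates only, so posts on both boundary days are kept. A quick, self-contained illustration of that comparison is below; the dates and timestamp string are made-up example values, not taken from the patch.

```python
from datetime import date

import dateutil.parser

# illustrative range; in the processor these arrive from the OPTION_DATERANGE
# input as Unix timestamps and are converted with datetime.fromtimestamp(...).date()
min_date = date(2021, 1, 1)
max_date = date(2021, 2, 1)

# a post from the last second of the final day still falls inside the range,
# because .date() discards the time component before comparison
item_date = dateutil.parser.parse("2021-02-01 23:59:59").date()
print(min_date <= item_date <= max_date)  # True
```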
diff --git a/processors/filtering/lexical_filter.py b/processors/filtering/lexical_filter.py
index 6d6a340da..2724fe6ae 100644
--- a/processors/filtering/lexical_filter.py
+++ b/processors/filtering/lexical_filter.py
@@ -1,10 +1,7 @@
 """
 Filter posts by lexicon
 """
-import pickle
 import re
-import os
-
 import csv

 from pathlib import Path
@@ -20,6 +17,7 @@

 csv.field_size_limit(1024 * 1024 * 1024)

+
 class LexicalFilter(BasicProcessor):
     """
     Retain only posts matching a given lexicon
@@ -57,7 +55,7 @@ class LexicalFilter(BasicProcessor):

     def process(self):
         """
-        Reads a CSV file, counts occurences of chosen values over all posts,
+        Reads a CSV file, counts occurrences of chosen values over all posts,
         and aggregates the results per chosen time frame
         """

@@ -155,28 +153,5 @@ def process(self):
     def after_process(self):
         super().after_process()

-        # copy this dataset - the filtered version - and make that copy standalone
-        # this has the benefit of allowing for all analyses that can be run on
-        # full datasets on the new, filtered copy as well
-        top_parent = self.source_dataset
-
-        standalone = self.dataset.copy(shallow=False)
-        standalone.body_match = "(Filtered) " + top_parent.query
-        standalone.datasource = top_parent.parameters.get("datasource", "custom")
-
-        try:
-            standalone.board = top_parent.board
-        except KeyError:
-            standalone.board = self.type
-
-        standalone.type = "search"
-
-        standalone.detach()
-        standalone.delete_parameter("key_parent")
-
-        self.dataset.copied_to = standalone.key
-
-        # we don't need this file anymore - it has been copied to the new
-        # standalone dataset, and this one is not accessible via the interface
-        # except as a link to the copied standalone dataset
-        os.unlink(self.dataset.get_results_path())
+        # Request standalone
+        self.create_standalone()
diff --git a/processors/filtering/unique_filter.py b/processors/filtering/unique_filter.py
index 629bd1772..8b7f097b5 100644
--- a/processors/filtering/unique_filter.py
+++ b/processors/filtering/unique_filter.py
@@ -2,14 +2,11 @@
 Filter by unique posts
 """
 import hashlib
-import os
 import csv

 from backend.abstract.processor import BasicProcessor
 from common.lib.helpers import UserInput

-import config
-
 __author__ = "Sal Hagen"
 __credits__ = ["Sal Hagen"]
 __maintainer__ = "Sal Hagen"
@@ -17,6 +14,7 @@

 csv.field_size_limit(1024 * 1024 * 1024)

+
 class UniqueFilter(BasicProcessor):
     """
     Retain only posts matching a given lexicon
@@ -31,11 +29,11 @@ class UniqueFilter(BasicProcessor):
     # interface.
     options = {
         "case_sensitive": {
-        "type": UserInput.OPTION_TOGGLE,
-        "help": "Case sentitive",
-        "default": False,
-        "tooltip": "Check to consider posts with different capitals as different."
-        }
+            "type": UserInput.OPTION_TOGGLE,
+            "help": "Case sensitive",
+            "default": False,
+            "tooltip": "Check to consider posts with different capitals as different."
+        }
     }

     def process(self):
@@ -91,28 +89,5 @@ def process(self):
     def after_process(self):
         super().after_process()

-        # copy this dataset - the filtered version - and make that copy standalone
-        # this has the benefit of allowing for all analyses that can be run on
-        # full datasets on the new, filtered copy as well
-        top_parent = self.source_dataset
-
-        standalone = self.dataset.copy(shallow=False)
-        standalone.body_match = "(Filtered) " + top_parent.query
-        standalone.datasource = top_parent.parameters.get("datasource", "custom")
-
-        try:
-            standalone.board = top_parent.board
-        except KeyError:
-            standalone.board = self.type
-
-        standalone.type = "search"
-
-        standalone.detach()
-        standalone.delete_parameter("key_parent")
-
-        self.dataset.copied_to = standalone.key
-
-        # we don't need this file anymore - it has been copied to the new
-        # standalone dataset, and this one is not accessible via the interface
-        # except as a link to the copied standalone dataset
-        os.unlink(self.dataset.get_results_path())
+        # Request standalone
+        self.create_standalone()
diff --git a/processors/filtering/wildcard_filter.py b/processors/filtering/wildcard_filter.py
index 1f2089091..a32e0a211 100644
--- a/processors/filtering/wildcard_filter.py
+++ b/processors/filtering/wildcard_filter.py
@@ -1,18 +1,12 @@
 """
 Filter posts by lexicon
 """
-import pickle
 import re
-import os
-
 import csv
-from pathlib import Path

 from backend.abstract.processor import BasicProcessor
 from common.lib.helpers import UserInput

-import config
-
 __author__ = "Stijn Peeters"
 __credits__ = ["Stijn Peeters"]
 __maintainer__ = "Stijn Peeters"
@@ -90,28 +84,5 @@ def process(self):
     def after_process(self):
         super().after_process()

-        # copy this dataset - the filtered version - and make that copy standalone
-        # this has the benefit of allowing for all analyses that can be run on
-        # full datasets on the new, filtered copy as well
-        top_parent = self.source_dataset
-
-        standalone = self.dataset.copy(shallow=False)
-        standalone.body_match = "(Filtered) " + top_parent.query
-        standalone.datasource = top_parent.parameters.get("datasource", "custom")
-
-        try:
-            standalone.board = top_parent.board
-        except KeyError:
-            standalone.board = self.type
-
-        standalone.type = "search"
-
-        standalone.detach()
-        standalone.delete_parameter("key_parent")
-
-        self.dataset.copied_to = standalone.key
-
-        # we don't need this file anymore - it has been copied to the new
-        # standalone dataset, and this one is not accessible via the interface
-        # except as a link to the copied standalone dataset
-        os.unlink(self.dataset.get_results_path())
+        # Request standalone
+        self.create_standalone()
diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py
index d7c2326e0..023b8a9e7 100644
--- a/webtool/lib/template_filters.py
+++ b/webtool/lib/template_filters.py
@@ -17,7 +17,7 @@
 @app.template_filter('datetime')
 def _jinja2_filter_datetime(date, fmt=None):
     date = datetime.datetime.utcfromtimestamp(date)
-    format = "%d-%m-%Y" if not fmt else fmt
+    format = "%d %b %Y" if not fmt else fmt

     return date.strftime(format)

@@ -153,4 +153,4 @@ def uniqid():
         "__tool_name_long": config.TOOL_NAME_LONG,
"__announcement": announcement_file.open().read().strip() if announcement_file.exists() else None, "uniqid": uniqid - } \ No newline at end of file + } diff --git a/webtool/templates/processor-option.html b/webtool/templates/processor-option.html index b80425bd0..827a7640f 100644 --- a/webtool/templates/processor-option.html +++ b/webtool/templates/processor-option.html @@ -1,5 +1,5 @@ {% set option_settings = processor.get_options(dataset, current_user)[option] %} -
+
diff --git a/webtool/templates/result-child.html b/webtool/templates/result-child.html
index bde6aa05a..c422b1c50 100644
--- a/webtool/templates/result-child.html
+++ b/webtool/templates/result-child.html
@@ -29,7 +29,7 @@
 {% if item.is_finished() and not is_filtered %}{% endif %}
-{% if item.is_finished() and is_filtered and "copied_to" in item.parameters %}{% endif %}
+{% if item.is_finished() and is_filtered and "copied_to" in item.parameters %}{% endif %}
 {% if item.is_finished() and item.num_rows >= 0 %}
 {% if is_filtered and "copied_to" in item.parameters %}
 New dataset