Skip to content

Commit

Permalink
Expire all datasets (#229)
Browse files Browse the repository at this point in the history
* Expire all datasets

Adds config options EXPIRE_DATASETS and EXPIRE_ALLOW_OPTOUT to set datasets to expire automatically (and optionally allow users to make datasets not expire).

* Don't show 'expires-after' parameter in badge list

* Fix SQL syntax error
  • Loading branch information
stijn-uva authored Feb 9, 2022
1 parent be4e4db commit 1e89883
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 14 deletions.
24 changes: 17 additions & 7 deletions backend/workers/expire_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def work(self):
delete old datasets, do so for all qualifying datasets
:return:
"""
datasets = []

# first get datasets for which the data source specifies that they need
# to be deleted after a certain amount of time
for datasource_id in self.all_modules.datasources:
datasource = self.all_modules.datasources[datasource_id]

Expand All @@ -32,15 +36,21 @@ def work(self):
continue

cutoff = time.time() - datasource.get("expire-datasets")
datasets = self.db.fetchall(
datasets += self.db.fetchall(
"SELECT key FROM datasets WHERE key_parent = '' AND parameters::json->>'datasource' = %s AND timestamp < %s",
(datasource_id, cutoff))

# we instantiate the dataset, because its delete() method does all
# the work (e.g. deleting child datasets) for us
for dataset in datasets:
dataset = DataSet(key=dataset["key"], db=self.db)
dataset.delete()
self.log.info("Deleting dataset %s/%s (expired per configuration)" % (datasource, dataset.key))
# and now find datasets that have their expiration date set
# individually
cutoff = int(time.time())
datasets += self.db.fetchall("SELECT key FROM datasets WHERE parameters::json->>'expires-after' IS NOT NULL AND (parameters::json->>'expires-after')::int > %s", (cutoff,))

# we instantiate the dataset, because its delete() method does all
# the work (e.g. deleting child datasets) for us
for dataset in datasets:
dataset = DataSet(key=dataset["key"], db=self.db)
dataset.delete()
self.log.info("Deleting dataset %s/%s (expired per configuration)" % (dataset.parameters.get("datasource", "unknown"), dataset.key))


self.job.finish()
9 changes: 9 additions & 0 deletions config.py-example
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ PATH_SESSIONS = "sessions" # folder where API session data is stored (e.g., Tele
PATH_VERSION = ".git-checked-out" # file containing a commit ID (everything after the first whitespace found is ignored)
GITHUB_URL = "https://github.com/digitalmethodsinitiative/4cat" # URL to the github repository for this 4CAT instance

# These settings control whether top-level datasets (i.e. those created via the
# 'Create dataset' page) are deleted automatically, and if so, after how much
# time. You can also allow users to cancel this (i.e. opt out). Note that if
# users are allowed to opt out, data sources can still force the expiration of
# datasets created through that data source. This cannot be overridden by the
# user.
# NOTE(review): EXPIRE_DATASETS is presumably a number of seconds - it is shown
# to users through the `timify_long` template filter, which formats its input
# as seconds. Confirm against the code that sets the expiration timestamp.
EXPIRE_DATASETS = 0 # 0 or False-y to not expire
EXPIRE_ALLOW_OPTOUT = True # allow users to opt out of expiration

# 4CAT has an API (available from localhost) that can be used for monitoring
# and will listen for requests on the following port. "0" disables the API.
API_HOST = "localhost"
Expand Down
54 changes: 51 additions & 3 deletions webtool/lib/template_filters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime
import markdown
import json
import time
import uuid
import math
import os
Expand Down Expand Up @@ -60,8 +61,55 @@ def _jinja2_filter_numberify(number):

return time_str.strip()

@app.template_filter('timify_long')
def _jinja2_filter_timify_long(number):
    """
    Make a number look like an indication of time

    :param number:  Number of seconds to convert. If the number is larger than
    the current UNIX timestamp, it is treated as a timestamp in the future and
    the current time is subtracted from it, so the time remaining is shown.
    :return str:  A nice string, for example `1 month, 3 weeks, 4 hours and 2 minutes`.
    Durations shorter than a minute (including negative ones) are returned as
    `less than a minute`.
    """
    now = time.time()
    if number > now:
        # future timestamp: convert to the amount of time left until then
        number -= now

    # negative durations would otherwise be floored to e.g. "-1 months"
    number = max(0, number)

    # (label, length in seconds), from largest to smallest unit; a month is
    # approximated as 30.42 days
    units = (
        ("month", 30.42 * 86400),
        ("week", 7 * 86400),
        ("day", 86400),
        ("hour", 3600),
        ("minute", 60),
    )

    components = []
    for label, length in units:
        amount = math.floor(number / length)
        if amount:
            components.append("%i %s%s" % (amount, label, "s" if amount != 1 else ""))
            number -= amount * length

    if not components:
        # nothing to pop below - less than the smallest unit is left
        return "less than a minute"

    # "a, b and c": everything but the last component is comma-separated
    last_str = components.pop()
    time_str = ""
    if components:
        time_str = ", ".join(components) + " and "

    return time_str + last_str

@app.template_filter("http_query")
def _jinja2_filter_httpquery(data):
Expand All @@ -72,13 +120,11 @@ def _jinja2_filter_httpquery(data):
except TypeError:
return ""


@app.template_filter('markdown')
def _jinja2_filter_markdown(text):
    """
    Template filter that passes text through `markdown.markdown`

    :param text:  Markdown source to convert
    :return:  Whatever `markdown.markdown` returns for the given text
    """
    return markdown.markdown(text)


@app.template_filter('isbool')
def _jinja2_filter_isbool(value):
    """
    Template filter that tells whether a value is a boolean

    :param value:  Value to inspect
    :return bool:  `True` if the value is an instance of `bool`
    """
    is_boolean = isinstance(value, bool)
    return is_boolean
Expand Down Expand Up @@ -197,6 +243,8 @@ def uniqid():
"__tool_name": config.TOOL_NAME,
"__tool_name_long": config.TOOL_NAME_LONG,
"__announcement": announcement_file.open().read().strip() if announcement_file.exists() else None,
"__expire_datasets": config.EXPIRE_DATASETS if hasattr(config, "EXPIRE_DATASETS") else None,
"__expire_optout": config.EXPIRE_ALLOW_OPTOUT if hasattr(config, "EXPIRE_ALLOW_OPTOUT") else None,
"uniqid": uniqid
}

5 changes: 5 additions & 0 deletions webtool/templates/create-dataset.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ <h2><span>Create new dataset</span></h2>
<p><br>Please be conservative; 4CAT is a shared resource and large dataset queries may prevent others
from using it. We recommend starting with smaller date ranges and specific queries and then casting
a wider net if needed.</p>
{% if __expire_datasets %}
<p>Note that datasets will be deleted automatically after {{ __expire_datasets|timify_long }}.
{% if __expire_optout %} You can choose to keep the dataset for longer from the result
page.{% endif %}</p>
{% endif %}
<div class="form-element">
<label for="datasource-select">Data source:</label>
<div>
Expand Down
2 changes: 1 addition & 1 deletion webtool/templates/result-details.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ <h2 class="blocktitle">
<dl class="metadata-wrapper">
{% if timestamp_expires %}
<div class="fullwidth notice">
<strong>Note:</strong> this dataset will no longer be available after {{ timestamp_expires|datetime("%d %b %Y, %H:%M") }}
<strong>Note:</strong> this dataset will no longer be available after {{ timestamp_expires|datetime("%d %b %Y, %H:%M") }}.{% if not expires_by_datasource and can_unexpire %} You can <a href="{{ url_for("keep_dataset", key=dataset.key) }}">cancel deletion</a>.{% endif %}
</div>
{% endif %}
{% if "copied_from" in dataset.parameters %}
Expand Down
2 changes: 1 addition & 1 deletion webtool/templates/result-metadata.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
{% elif parameter == "country_name" and dataset.parameters[parameter] != "all" %}
<span class="inline-label">country:</span>
<span class="inline-query">{{ dataset.parameters.country_name|join(', ') }}</span>
{% elif dataset.parameters[parameter] and parameter[0:4] != "api_" and parameter not in ("jst", "mst", "copied_from", "copied_at", "pseudonymise", "user", "time", "search-scope", "search_scope", "random_amount", "scope_length", "scope_density", "country_name", "min_date", "max_date", "board", "datasource", "type", "label") %}
{% elif dataset.parameters[parameter] and parameter[0:4] != "api_" and parameter not in ("jst", "mst", "copied_from", "copied_at", "pseudonymise", "user", "time", "search-scope", "search_scope", "random_amount", "scope_length", "scope_density", "country_name", "min_date", "max_date", "board", "datasource", "type", "label", "expires-after") %}
{% if not dataset.parameters[parameter]|isbool and dataset.parameters[parameter] %}
<span class="inline-label">{{ parameter }}:</span>
<span class="inline-query has-more" data-max-length="75">{{ dataset.parameters[parameter]|string }}</span>
Expand Down
32 changes: 30 additions & 2 deletions webtool/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def show_frontpage():

datasources = backend.all_modules.datasources


return render_template("frontpage.html", stats=stats, news=news, datasources=datasources)


Expand Down Expand Up @@ -418,8 +417,13 @@ def show_result(key):
# if the datasource is configured for it, this dataset may be deleted at some point
datasource = dataset.parameters.get("datasource", "")
datasources = list(backend.all_modules.datasources.keys())
expires_datasource = False
can_unexpire = hasattr(config, "EXPIRE_ALLOW_OPTOUT") and config.EXPIRE_ALLOW_OPTOUT
if datasource in backend.all_modules.datasources and backend.all_modules.datasources[datasource].get("expire-datasets", None):
timestamp_expires = dataset.timestamp + int(backend.all_modules.datasources[datasource].get("expire-datasets"))
expires_datasource = True
elif dataset.parameters.get("expires-after"):
timestamp_expires = dataset.parameters.get("expires-after")
else:
timestamp_expires = None

Expand All @@ -430,7 +434,8 @@ def show_result(key):

return render_template(template, dataset=dataset, parent_key=dataset.key, processors=backend.all_modules.processors,
is_processor_running=is_processor_running, messages=get_flashed_messages(),
is_favourite=is_favourite, timestamp_expires=timestamp_expires, datasources=datasources)
is_favourite=is_favourite, timestamp_expires=timestamp_expires,
expires_by_datasource=expires_datasource, can_unexpire=can_unexpire, datasources=datasources)


@app.route("/preview-as-table/<string:key>/")
Expand Down Expand Up @@ -593,6 +598,29 @@ def restart_dataset(key):
flash("Dataset queued for re-running.")
return redirect("/results/" + dataset.key + "/")

@app.route("/result/<string:key>/keep/", methods=["GET"])
@login_required
def keep_dataset(key):
    """
    Cancel the scheduled expiration of a dataset

    Removes the `expires-after` parameter from the dataset, unless opting out
    of expiration is disabled in the configuration or the dataset's data
    source forces expiration of its datasets.

    :param str key:  Dataset key
    :return:  A redirect to the dataset's result page on success; a 404 error
    if the dataset does not exist; a 403 error page if expiration cannot be
    cancelled.
    """
    try:
        dataset = DataSet(key=key, db=db)
    except TypeError:
        return error(404, message="Dataset not found.")

    # the 'cancel deletion' link is hidden when opting out is not allowed, but
    # the URL can still be requested directly, so enforce the setting here too
    if not getattr(config, "EXPIRE_ALLOW_OPTOUT", False):
        return render_template("error.html", title="Dataset cannot be kept",
                               message="Opting out of automatic dataset deletion is not allowed. This cannot be overridden."), 403

    if not dataset.key_parent:
        # top-level dataset
        # check if data source forces expiration - in that case, the user
        # cannot reset this
        datasources = backend.all_modules.datasources
        datasource = dataset.parameters.get("datasource")
        if datasource in datasources and datasources[datasource].get("expire-datasets"):
            # `datasource` is the data source ID (a string); the name lives in
            # the data source's entry, with the ID as a fallback
            datasource_name = datasources[datasource].get("name", datasource)
            return render_template("error.html", title="Dataset cannot be kept",
                                   message="All datasets of this data source (%s) are scheduled for automatic deletion. This cannot be overridden." %
                                           datasource_name), 403

    dataset.delete_parameter("expires-after")
    flash("Dataset expiration data removed. The dataset will no longer be deleted automatically.")
    return redirect(url_for("show_result", key=key))


@app.route("/result/<string:key>/nuke/", methods=["GET", "DELETE", "POST"])
@login_required
Expand Down

0 comments on commit 1e89883

Please sign in to comment.