Skip to content

Commit

Permalink
Private datasets (#231)
Browse files Browse the repository at this point in the history
* Private datasets

Can be toggled via the web interface. Private status determines who can view and manipulate datasets via the web interface. It does not encrypt data or anything like it.

* Fix template references to owner
  • Loading branch information
stijn-uva authored Feb 4, 2022
1 parent 881866d commit d6549f1
Show file tree
Hide file tree
Showing 14 changed files with 267 additions and 27 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
1.24
1.25

This file should not be modified. It is used by 4CAT to determine whether it
needs to run migration scripts to e.g. update the database structure to a more
Expand Down
11 changes: 9 additions & 2 deletions backend/abstract/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,15 @@ def after_process(self):
self.log.info("Not running follow-up processor of type %s for dataset %s, no input data for follow-up" % (next_type, self.dataset.key))

elif next_type in available_processors:
next_analysis = DataSet(parameters=next_parameters, type=next_type, db=self.db, parent=self.dataset.key,
extension=available_processors[next_type].extension)
next_analysis = DataSet(
parameters=next_parameters,
type=next_type,
db=self.db,
parent=self.dataset.key,
extension=available_processors[next_type].extension,
is_private=self.dataset.is_private,
owner=self.dataset.owner
)
self.queue.add_job(next_type, remote_id=next_analysis.key)
else:
self.log.warning("Dataset %s (of type %s) wants to run processor %s next, but it is incompatible" % (self.dataset.key, self.type, next_type))
Expand Down
2 changes: 2 additions & 0 deletions backend/database.sql
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ CREATE TABLE IF NOT EXISTS datasets (
key text,
type text DEFAULT 'search',
key_parent text DEFAULT '',
owner VARCHAR DEFAULT 'anonymous',
query text,
job integer DEFAULT 0,
parameters text,
Expand All @@ -40,6 +41,7 @@ CREATE TABLE IF NOT EXISTS datasets (
status text,
num_rows integer DEFAULT 0,
is_finished boolean DEFAULT FALSE,
is_private boolean DEFAULT TRUE,
software_version text,
software_file text DEFAULT '',
annotation_fields text DEFAULT ''
Expand Down
2 changes: 1 addition & 1 deletion backend/workers/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def process_request(self, request, payload):
"is_recurring": (int(job["interval"]) > 0),
"is_maybe_crashed": job["timestamp_claimed"] > 0 and not worker,
"dataset_key": worker.dataset.key if hasattr(worker, "dataset") else None,
"dataset_user": worker.dataset.parameters.get("user", None) if hasattr(worker, "dataset") else None,
"dataset_user": worker.dataset.owner if hasattr(worker, "dataset") else None,
"dataset_parent_key": worker.dataset.top_parent().key if hasattr(worker, "dataset") else None,
"timestamp_queued": job["timestamp"],
"timestamp_claimed": job["timestamp_lastclaimed"]
Expand Down
21 changes: 20 additions & 1 deletion common/lib/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class DataSet(FourcatModule):
staging_area = None

def __init__(self, parameters={}, key=None, job=None, data=None, db=None, parent=None, extension="csv",
type=None):
type=None, is_private=True, owner="anonymous"):
"""
Create new dataset object
Expand Down Expand Up @@ -101,12 +101,14 @@ def __init__(self, parameters={}, key=None, job=None, data=None, db=None, parent
self.data = {
"key": self.key,
"query": self.get_label(parameters, default=type),
"owner": owner,
"parameters": json.dumps(parameters),
"result_file": "",
"status": "",
"type": type,
"timestamp": int(time.time()),
"is_finished": False,
"is_private": is_private,
"software_version": get_software_version(),
"software_file": "",
"num_rows": 0,
Expand Down Expand Up @@ -455,6 +457,23 @@ def delete(self):
# already deleted, apparently
pass

def update_children(self, **kwargs):
	"""
	Apply attribute updates to every descendant dataset

	Walks the dataset tree below this dataset and sets the given
	attributes on each child, recursing so the whole subtree is covered.
	Useful to e.g. propagate a new owner or private status downwards.

	:param kwargs:  Attribute names and values, corresponding to known
	                dataset attributes
	"""
	rows = self.db.fetchall("SELECT * FROM datasets WHERE key_parent = %s", (self.key,))
	for row in rows:
		descendant = DataSet(key=row["key"], db=self.db)
		for attribute, new_value in kwargs.items():
			# setattr() routes through DataSet's attribute handling,
			# same as calling __setattr__ directly
			setattr(descendant, attribute, new_value)

		# recurse so grandchildren (and deeper) are updated too
		descendant.update_children(**kwargs)

def is_finished(self):
"""
Check if dataset is finished
Expand Down
42 changes: 42 additions & 0 deletions helper-scripts/migrate/migrate-1.24-1.25.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Add 'is_deactivated' column to user table
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + "'/../..")
from common.lib.database import Database
from common.lib.logger import Logger

import config

log = Logger(output=True)
db = Database(logger=log, dbname=config.DB_NAME, user=config.DB_USER, password=config.DB_PASSWORD, host=config.DB_HOST,
port=config.DB_PORT, appname="4cat-migrate")

print(" Checking if datasets table has a column 'is_private'...")
has_column = db.fetchone("SELECT COUNT(*) AS num FROM information_schema.columns WHERE table_name = 'datasets' AND column_name = 'is_private'")
if has_column["num"] == 0:
print(" ...No, adding.")
db.execute("ALTER TABLE datasets ADD COLUMN is_private BOOLEAN DEFAULT TRUE")
db.commit()

# make existing datasets all non-private, as they were before
db.execute("UPDATE datasets SET is_private = FALSE")
db.commit()
else:
print(" ...Yes, nothing to update.")

print(" Checking if datasets table has a column 'owner'...")
has_column = db.fetchone("SELECT COUNT(*) AS num FROM information_schema.columns WHERE table_name = 'datasets' AND column_name = 'owner'")
if has_column["num"] == 0:
print(" ...No, adding.")
db.execute("ALTER TABLE datasets ADD COLUMN owner VARCHAR DEFAULT 'anonymous'")
db.commit()

# make existing datasets all non-private, as they were before
db.execute("UPDATE datasets SET owner = parameters::json->>'user' WHERE parameters::json->>'user' IS NOT NULL")
db.commit()
else:
print(" ...Yes, nothing to update.")


print(" Done!")
9 changes: 8 additions & 1 deletion webtool/api_standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,14 @@ def process_standalone(processor):
return error(402, error="Input is empty")

# ok, valid input!
temp_dataset = DataSet(extension="csv", type="standalone", parameters={"user": current_user.get_id(), "after": [processor]}, db=db)
temp_dataset = DataSet(
extension="csv",
type="standalone",
parameters={"next": [processor]},
db=db,
owner=current_user.get_id(),
is_private=True
)
temp_dataset.finish(len(input))

# make sure the file is deleted later, whichever way this request is
Expand Down
80 changes: 71 additions & 9 deletions webtool/api_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,12 @@ def import_dataset():
if not worker:
return error(404, message="Unknown platform or source format")

dataset = DataSet(parameters={"user": current_user.get_id(), "datasource": platform}, type=worker.type, db=db)
dataset = DataSet(
parameters={"datasource": platform},
type=worker.type,
db=db,
owner=current_user.get_id()
)
dataset.update_status("Importing uploaded file...")

# store the file at the result path for the dataset, but with a different suffix
Expand Down Expand Up @@ -295,7 +300,6 @@ def queue_dataset():
status and results.
:return-error 404: If the datasource does not exist.
"""

datasource_id = request.form.get("datasource", "")
if datasource_id not in backend.all_modules.datasources:
return error(404, message="Datasource '%s' does not exist" % datasource_id)
Expand All @@ -318,14 +322,21 @@ def queue_dataset():
else:
raise NotImplementedError("Data sources MUST sanitise input values with validate_query")

sanitised_query["user"] = current_user.get_id()
sanitised_query["datasource"] = datasource_id
sanitised_query["type"] = search_worker_id

sanitised_query["pseudonymise"] = bool(request.form.to_dict().get("pseudonymise", False))
is_private = bool(request.form.to_dict().get("make-private", True))

extension = search_worker.extension if hasattr(search_worker, "extension") else "csv"
dataset = DataSet(parameters=sanitised_query, db=db, type=search_worker_id, extension=extension)
dataset = DataSet(
parameters=sanitised_query,
db=db,
type=search_worker_id,
extension=extension,
is_private=is_private,
owner=current_user.get_id()
)

if request.form.get("label"):
dataset.update_label(request.form.get("label"))
Expand Down Expand Up @@ -374,6 +385,9 @@ def check_dataset():
except TypeError:
return error(404, error="Dataset does not exist.")

if not current_user.can_access_dataset(dataset):
return error(403, error="Dataset is private")

results = dataset.check_dataset_finished()
if results == 'empty':
dataset_data = dataset.data
Expand Down Expand Up @@ -438,7 +452,7 @@ def edit_dataset_label(key):
except TypeError:
return error(404, error="Dataset does not exist.")

if not current_user.is_admin and not current_user.get_id() == dataset.parameters.get("user"):
if not current_user.is_admin and not current_user.get_id() == dataset.owner:
return error(403, message="Not allowed")

dataset.update_label(label)
Expand Down Expand Up @@ -594,7 +608,7 @@ def delete_dataset(key=None):
except TypeError:
return error(404, error="Dataset does not exist.")

if not current_user.is_admin and not current_user.get_id() == dataset.parameters.get("user"):
if not current_user.is_admin and not current_user.get_id() == dataset.owner:
return error(403, message="Not allowed")

# if there is an active or queued job for some child dataset, cancel and
Expand Down Expand Up @@ -658,6 +672,9 @@ def toggle_favourite(key):
except TypeError:
return error(404, error="Dataset does not exist.")

if not current_user.can_access_dataset(dataset):
return error(403, error="This dataset is private")

current_status = db.fetchone("SELECT * FROM users_favourites WHERE name = %s AND key = %s",
(current_user.get_id(), dataset.key))
if not current_status:
Expand All @@ -667,6 +684,38 @@ def toggle_favourite(key):
db.delete("users_favourites", where={"name": current_user.get_id(), "key": dataset.key})
return jsonify({"success": True, "favourite_status": False})

@app.route("/api/toggle-dataset-private/<string:key>")
@login_required
@openapi.endpoint("tool")
def toggle_private(key):
	"""
	Flip a dataset's private status

	A private dataset is only visible to its owner and to admins, with one
	exception: datasets owned by the 'anonymous' user are visible to
	everyone. Only the dataset's owner or an admin may use this endpoint;
	the new status is propagated to all descendant datasets as well.

	:param str key: Key of the dataset to mark as (not) private
	:return: A JSON object with the status of the request

	:return-schema: {type=object,properties={success={type=boolean},is_private={type=boolean}}}

	:return-error 404: If the dataset key was not found
	"""
	try:
		dataset = DataSet(key=key, db=db)
	except TypeError:
		return error(404, error="Dataset does not exist.")

	is_owner = dataset.owner == current_user.get_id()
	if not is_owner and not current_user.is_admin():
		return error(403, error="This dataset is private")

	# flip the flag, then propagate it through the whole dataset tree
	new_status = not dataset.is_private
	dataset.is_private = new_status
	dataset.update_children(is_private=new_status)

	return jsonify({"success": True, "is_private": new_status})

@app.route("/api/queue-processor/", methods=["POST"])
@api_ratelimit
Expand Down Expand Up @@ -731,6 +780,9 @@ def queue_processor(key=None, processor=None):
print("KEY", key)
return error(404, error="Not a valid dataset key.")

if not current_user.can_access_dataset(dataset):
return error(403, error="You cannot run processors on private datasets")

# check if processor is available for this dataset
available_processors = dataset.get_available_processors()
if processor not in available_processors:
Expand All @@ -741,12 +793,19 @@ def queue_processor(key=None, processor=None):
# create a dataset now
try:
options = UserInput.parse_all(available_processors[processor].get_options(dataset, current_user), request.form.to_dict(), silently_correct=False)
options["user"] = current_user.get_id()
except QueryParametersException as e:
return error(400, error=str(e))

analysis = DataSet(parent=dataset.key, parameters=options, db=db,
extension=available_processors[processor].extension, type=processor)
# private or not is inherited from parent dataset
analysis = DataSet(parent=dataset.key,
parameters=options,
db=db,
extension=available_processors[processor].extension,
type=processor,
is_private=dataset.is_private,
owner=current_user.get_id()
)

if analysis.is_new:
# analysis has not been run or queued before - queue a job to run it
queue.add_job(jobtype=processor, remote_id=analysis.key)
Expand Down Expand Up @@ -803,6 +862,9 @@ def check_processor():
except TypeError:
continue

if not current_user.can_access_dataset(dataset):
continue

genealogy = dataset.get_genealogy()
parent = genealogy[-2]
top_parent = genealogy[0]
Expand Down
32 changes: 30 additions & 2 deletions webtool/lib/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,35 @@ def get_by_token(db, token):
else:
return User(db, user)

def __init__(self, db, data, authenticated=False):
def can_access_dataset(self, dataset):
	"""
	Check whether this user may view and manipulate a given dataset

	Non-private datasets are accessible to everyone, and admins can access
	everything. A private dataset is otherwise only accessible to its
	owner — except when it is owned by the generic 'anonymous' user, in
	which case anyone may access it.

	:param dataset:  The dataset to check access to
	:return bool:  True if access is allowed, False otherwise
	"""
	if dataset.is_private and not self.is_admin():
		# private dataset: only the owner may see it, unless it belongs
		# to the shared 'anonymous' user
		return self.get_id() == dataset.owner or dataset.owner == "anonymous"

	return True


def __init__(self, data, authenticated=False):
"""
Instantiate user object
Expand Down Expand Up @@ -325,4 +353,4 @@ def set_password(self, password):
salt = bcrypt.gensalt()
password_hash = bcrypt.hashpw(password.encode("ascii"), salt)

self.db.update("users", where={"name": self.data["name"]}, data={"password": password_hash.decode("utf-8")})
self.db.update("users", where={"name": self.data["name"]}, data={"password": password_hash.decode("utf-8")})
11 changes: 10 additions & 1 deletion webtool/templates/create-dataset.html
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,19 @@ <h2><span>Create new dataset</span></h2>
appropriate.</p>
<div class="form-element">
<label for="data-pseudonymise">Pseudonymise:</label>
<div class="filter-parameters" id="board-filter">
<div class="filter-parameters">
<label><input type="checkbox" checked="checked" name="pseudonymise" id="data-pseudonymise"> Replace author names with hashed values</label>
</div>
</div>
<div class="form-element">
<label for="data-make-private">Make private:</label>
<div class="filter-parameters">
<label><input type="checkbox" name="make-private" id="data-make-private"> Make dataset private</label>
</div>

<button class="tooltip-trigger" aria-controls="tooltip-dataset-private" aria-label="Extended help for option">?</button>
<p role="tooltip" id="tooltip-dataset-private">This will only hide your dataset from other users. It will NOT encrypt your data and instance maintainers will still be able to view it. If you are working with sensitive data, you should consider running your own 4CAT instance.</p>
</div>
<div class="form-element">
<label for="dataset-label">Dataset name:</label>
<input id="dataset-label" name="label">
Expand Down
2 changes: 1 addition & 1 deletion webtool/templates/result-child.html
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ <h4>{{ processors[item.type].title if not deprecated else "(Deprecated analysis)
</li>
{% endif %}
{% endif %}
{% if current_user.is_authenticated and (current_user.get_id() == dataset.parameters.user or current_user.is_admin or item.parameters.userparameters.user == current_user.get_id()) %}
{% if current_user.is_authenticated and (current_user.get_id() == dataset.owner or current_user.is_admin or item.owner == current_user.get_id()) %}
<li>
<a class="property-badge delete-link tooltip-trigger" href="{{ url_for('delete_dataset_interactive', key=item.key) }}" data-confirm-action="delete this dataset"><i class="fa fa-fw fa-times" aria-hidden="true"></i> <span class="sr-only">Delete this analysis</span></a>
<p role="tooltip" id="tooltip-delete-{{ item.key }}" aria-hidden="true">{% if not item.is_finished %}Cancel and d{% else %}D{% endif %}elete this analysis and any underlying analyses</p>
Expand Down
Loading

0 comments on commit d6549f1

Please sign in to comment.