
Commit 0de7baf

stijn-uva and dale-wahl authored
Move datasets between 4CAT servers (#375)
* Export/import datasets between 4CAT servers
* Move datasource and add init, add new setting to allow export, pass 4CAT version for comparison, run some tests. I had no idea where you left off, so I just wanted to test it and see that way. Made some fixes and now see that you worked up until the log and actual data.
* Use version function for comparison: this uses the commit, which makes sense at the moment, but perhaps not in the long term.
* Close, but children fail; possibly just zip files
* Use send_from_directory (tested on 1.5 gig files successfully); cleaned up some errors/logging and added notes
* Use custom exception instead of TypeError when dataset not found
* Use 401 HTTP status for login form
* Ignore hidden files in cleanup worker
* Ensure unique dataset key and allow changing keys
* Allow workers to set dataset key manually
* MANY CHANGES
* Don't print log to stdout twice when not running as daemon
* Remove stray debug print
* Show original timestamp for imported datasets
* Dataset form logic
* Don't show empty dataset status
* Forbid importing unfinished datasets
* Not using this file
* Fix copied comment
* Fix interrupt routine and clean up half-imported data
* Catch dataset not found error in expiration worker
* Hide anon/label options when importing
* Fix delete button on dataset creation page
* Fix interrupting imports
* Remove "filtered from" on imported datasets
* Add DESCRIPTION.md for import data source
* Better markdown
* Return error() rather than raise
* Use replace on result_file to use new_dataset.key (`with_stem` is Python 3.9 and newer)
* Clarify some comments
* get_software_version() -> get_software_commit()
* Use version instead of commit to determine migration compatibility
* More commentary
* Use reserve_results_file() to ensure correct import data path

---------

Co-authored-by: Dale Wahl <[email protected]>
1 parent 07904a6 · commit 0de7baf

28 files changed (+738, -84 lines)

backend/lib/processor.py

Lines changed: 7 additions & 6 deletions
@@ -16,8 +16,9 @@
 from backend.lib.worker import BasicWorker
 from common.lib.dataset import DataSet
 from common.lib.fourcat_module import FourcatModule
-from common.lib.helpers import get_software_version, remove_nuls, send_email
-from common.lib.exceptions import WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, MapItemException
+from common.lib.helpers import get_software_commit, remove_nuls, send_email
+from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException,
+                                   DataSetException, MapItemException)
 from common.config_manager import config, ConfigWrapper


@@ -98,7 +99,7 @@ def work(self):
             # that actually queued the processor, so their config is relevant
             self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
             self.owner = self.dataset.creator
-        except TypeError as e:
+        except DataSetException as e:
             # query has been deleted in the meantime. finish without error,
             # as deleting it will have been a conscious choice by a user
             self.job.finish()
@@ -133,10 +134,10 @@ def work(self):
                 self.job.finish()
                 return

-            except TypeError:
+            except DataSetException:
                 # we need to know what the source_dataset dataset was to properly handle the
                 # analysis
-                self.log.warning("Processor %s queued for orphan query %s: cannot run, cancelling job" % (
+                self.log.warning("Processor %s queued for orphan dataset %s: cannot run, cancelling job" % (
                     self.type, self.dataset.key))
                 self.job.finish()
                 return
@@ -160,7 +161,7 @@ def work(self):

         # start log file
         self.dataset.update_status("Processing data")
-        self.dataset.update_version(get_software_version())
+        self.dataset.update_version(get_software_commit())

         # get parameters
         # if possible, fill defaults where parameters are not provided

backend/workers/cancel_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 Delete and cancel a dataset
 """
 from backend.lib.worker import BasicWorker
-from common.lib.exceptions import JobNotFoundException
+from common.lib.exceptions import JobNotFoundException, DataSetException
 from common.lib.dataset import DataSet
 from common.lib.job import Job

@@ -27,7 +27,7 @@ def work(self):
         try:
             dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
             jobtype = dataset.data["type"]
-        except TypeError:
+        except DataSetException:
             # dataset already deleted, apparently
             self.job.finish()
             return

backend/workers/cleanup_tempfiles.py

Lines changed: 6 additions & 2 deletions
@@ -9,7 +9,7 @@
 from common.config_manager import config
 from backend.lib.worker import BasicWorker
 from common.lib.dataset import DataSet
-from common.lib.exceptions import WorkerInterruptedException
+from common.lib.exceptions import WorkerInterruptedException, DataSetException


 class TempFileCleaner(BasicWorker):
@@ -34,6 +34,10 @@ def work(self):

         result_files = Path(config.get('PATH_DATA')).glob("*")
         for file in result_files:
+            if file.stem.startswith("."):
+                # skip hidden files
+                continue
+
             if self.interrupted:
                 raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")

@@ -50,7 +54,7 @@ def work(self):

             try:
                 dataset = DataSet(key=key, db=self.db)
-            except TypeError:
+            except DataSetException:
                 # the dataset has been deleted since, but the result file still
                 # exists - should be safe to clean up
                 self.log.info("No matching dataset with key %s for file %s, deleting file" % (key, str(file)))

backend/workers/expire_items.py

Lines changed: 10 additions & 5 deletions
@@ -8,7 +8,7 @@

 from backend.lib.worker import BasicWorker
 from common.lib.dataset import DataSet
-from common.config_manager import config
+from common.lib.exceptions import DataSetNotFoundException

 from common.lib.user import User

@@ -55,10 +55,15 @@ def expire_datasets(self):
         """)

         for dataset in datasets:
-            dataset = DataSet(key=dataset["key"], db=self.db)
-            if dataset.is_expired():
-                self.log.info(f"Deleting dataset {dataset.key} (expired)")
-                dataset.delete()
+            try:
+                dataset = DataSet(key=dataset["key"], db=self.db)
+                if dataset.is_expired():
+                    self.log.info(f"Deleting dataset {dataset.key} (expired)")
+                    dataset.delete()
+
+            except DataSetNotFoundException:
+                # dataset already deleted I guess?
+                pass

     def expire_users(self):
         """

common/lib/config_definition.py

Lines changed: 6 additions & 0 deletions
@@ -105,6 +105,12 @@
         "help": "Can use explorer",
         "tooltip": "Controls whether users can use the Explorer feature to navigate datasets."
     },
+    "privileges.can_export_datasets": {
+        "type": UserInput.OPTION_TOGGLE,
+        "default": True,
+        "help": "Can export datasets",
+        "tooltip": "Allows users to export datasets they own to other 4CAT instances."
+    },
     "privileges.admin.can_manage_users": {
         "type": UserInput.OPTION_TOGGLE,
         "default": False,

common/lib/dataset.py

Lines changed: 60 additions & 18 deletions
@@ -3,6 +3,7 @@
 import datetime
 import hashlib
 import fnmatch
+import random
 import shutil
 import json
 import time
@@ -14,9 +15,10 @@
 import backend
 from common.config_manager import config
 from common.lib.job import Job, JobNotFoundException
-from common.lib.helpers import get_software_version, NullAwareTextIOWrapper, convert_to_int
+from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int
 from common.lib.fourcat_module import FourcatModule
-from common.lib.exceptions import ProcessorInterruptedException, DataSetException, MapItemException
+from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException,
+                                   MapItemException)


 class DataSet(FourcatModule):
@@ -78,29 +80,29 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare
             self.key = key
             current = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,))
             if not current:
-                raise TypeError("DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key)
+                raise DataSetNotFoundException("DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key)

             query = current["query"]
         elif job is not None:
             current = self.db.fetchone("SELECT * FROM datasets WHERE parameters::json->>'job' = %s", (job,))
             if not current:
-                raise TypeError("DataSet() requires a valid job ID for its 'job' argument")
+                raise DataSetNotFoundException("DataSet() requires a valid job ID for its 'job' argument")

             query = current["query"]
             self.key = current["key"]
         elif data is not None:
             current = data
             if "query" not in data or "key" not in data or "parameters" not in data or "key_parent" not in data:
-                raise ValueError("DataSet() requires a complete dataset record for its 'data' argument")
+                raise DataSetException("DataSet() requires a complete dataset record for its 'data' argument")

             query = current["query"]
             self.key = current["key"]
         else:
             if parameters is None:
-                raise TypeError("DataSet() requires either 'key', or 'parameters' to be given")
+                raise DataSetException("DataSet() requires either 'key', or 'parameters' to be given")

             if not type:
-                raise ValueError("Datasets must have their type set explicitly")
+                raise DataSetException("Datasets must have their type set explicitly")

             query = self.get_label(parameters, default=type)
             self.key = self.get_key(query, parameters, parent)
@@ -122,7 +124,7 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare
             "timestamp": int(time.time()),
             "is_finished": False,
             "is_private": is_private,
-            "software_version": get_software_version(),
+            "software_version": get_software_commit(),
             "software_file": "",
             "num_rows": 0,
             "progress": 0.0,
@@ -556,7 +558,7 @@ def delete(self, commit=True):
            try:
                child = DataSet(key=child["key"], db=self.db)
                child.delete(commit=commit)
-           except TypeError:
+           except DataSetException:
                # dataset already deleted - race condition?
                pass

@@ -977,7 +979,7 @@ def reserve_result_file(self, parameters=None, extension="csv"):
        self.data["result_file"] = file
        return updated > 0

-   def get_key(self, query, parameters, parent=""):
+   def get_key(self, query, parameters, parent="", time_offset=0):
        """
        Generate a unique key for this dataset that can be used to identify it

@@ -987,6 +989,9 @@ def get_key(self, query, parameters, parent=""):
        :param str query: Query string
        :param parameters: Dataset parameters
        :param parent: Parent dataset's key (if applicable)
+       :param time_offset: Offset to add to the time component of the dataset
+           key. This can be used to ensure a unique key even if the parameters and
+           timing is otherwise identical to an existing dataset's

        :return str: Dataset key
        """
@@ -999,16 +1004,53 @@ def get_key(self, query, parameters, parent=""):
        for key in sorted(parameters):
            param_key[key] = parameters[key]

-       # this ensures a different key for the same query if not queried
-       # at the exact same second. Since the same query may return
-       # different results when done at different times, getting a
-       # duplicate key is not actually always desirable. The resolution
-       # of this salt could be experimented with...
-       param_key["_salt"] = int(time.time())
+       # we additionally use the current time as a salt - this should usually
+       # ensure a unique key for the dataset. if for some reason there is a
+       # hash collision
+       param_key["_salt"] = int(time.time()) + time_offset

        parent_key = str(parent) if parent else ""
        plain_key = repr(param_key) + str(query) + parent_key
-       return hashlib.md5(plain_key.encode("utf-8")).hexdigest()
+       hashed_key = hashlib.md5(plain_key.encode("utf-8")).hexdigest()
+
+       if self.db.fetchone("SELECT key FROM datasets WHERE key = %s", (hashed_key,)):
+           # key exists, generate a new one
+           return self.get_key(query, parameters, parent, time_offset=random.randint(1,10))
+       else:
+           return hashed_key
+
+   def set_key(self, key):
+       """
+       Change dataset key
+
+       In principle, keys should never be changed. But there are rare cases
+       where it is useful to do so, in particular when importing a dataset
+       from another 4CAT instance; in that case it makes sense to try and
+       ensure that the key is the same as it was before. This function sets
+       the dataset key and updates any dataset references to it.
+
+       :param str key: Key to set
+       :return str: Key that was set. If the desired key already exists, the
+           original key is kept.
+       """
+       key_exists = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (key,))
+       if key_exists or not key:
+           return self.key
+
+       old_key = self.key
+       self.db.update("datasets", data={"key": key}, where={"key": old_key})
+
+       # update references
+       self.db.update("datasets", data={"key_parent": key}, where={"key_parent": old_key})
+       self.db.update("datasets_owners", data={"key": key}, where={"key": old_key})
+       self.db.update("jobs", data={"remote_id": key}, where={"remote_id": old_key})
+       self.db.update("users_favourites", data={"key": key}, where={"key": old_key})
+
+       # for good measure
+       self.db.commit()
+       self.key = key
+
+       return self.key

    def get_status(self):
        """
@@ -1186,7 +1228,7 @@ def get_genealogy(self, inclusive=False):
        while key_parent:
            try:
                parent = DataSet(key=key_parent, db=self.db)
-           except TypeError:
+           except DataSetException:
                break

            genealogy.append(parent)
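Taken together, the collision handling in get_key() and the new set_key() make it possible to preserve a dataset's identity when moving it between servers. A rough sketch of that flow; `db`, `original_key` and the parameter values are placeholders, and the real consumer of set_key() is the import worker added in this commit:

    from common.lib.dataset import DataSet

    # Create a fresh dataset record; get_key() re-salts on collision, so the
    # generated key is unique on this instance.
    new_dataset = DataSet(parameters={"datasource": "custom"}, type="custom-import", db=db)

    # If the key from the exporting 4CAT instance is still free here, reuse it
    # so references to the dataset keep working after migration.
    kept_key = new_dataset.set_key(original_key)
    if kept_key != original_key:
        new_dataset.update_status("Original key was taken; keeping the generated key")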

common/lib/exceptions.py

Lines changed: 6 additions & 0 deletions
@@ -53,6 +53,12 @@ class DataSetException(FourcatException):
     """
     pass

+class DataSetNotFoundException(DataSetException):
+    """
+    Raise if dataset does not exist
+    """
+    pass
+

 class JobClaimedException(QueueException):
     """

common/lib/helpers.py

Lines changed: 24 additions & 4 deletions
@@ -18,7 +18,7 @@

 from collections.abc import MutableMapping
 from html.parser import HTMLParser
-from urllib.parse import urlparse
+from pathlib import Path
 from calendar import monthrange

 from common.lib.user_input import UserInput
@@ -99,18 +99,21 @@ def sniff_encoding(file):
     return "utf-8-sig" if maybe_bom == b"\xef\xbb\xbf" else "utf-8"


-def get_software_version():
+def get_software_commit():
     """
-    Get current 4CAT version
+    Get current 4CAT commit hash

     Reads a given version file and returns the first string found in there
     (up until the first space). On failure, return an empty string.

+    Use `get_software_version()` instead if you need the release version
+    number rather than the precise commit hash.
+
     If no version file is available, run `git show` to test if there is a git
     repository in the 4CAT root folder, and if so, what commit is currently
     checked out in it.

-    :return str: 4CAT version
+    :return str: 4CAT git commit hash
     """
     versionpath = config.get('PATH_ROOT').joinpath(config.get('path.versionfile'))

@@ -139,6 +142,23 @@ def get_software_version():
     except OSError:
         return ""

+def get_software_version():
+    """
+    Get current 4CAT version
+
+    This is the actual software version, i.e. not the commit hash (see
+    `get_software_commit()` for that). The current version is stored in a file
+    with a canonical location: if the file doesn't exist, an empty string is
+    returned.
+
+    :return str: Software version, for example `1.37`.
+    """
+    current_version_file = Path(config.get("PATH_ROOT"), "config/.current-version")
+    if not current_version_file.exists():
+        return ""
+
+    with current_version_file.open() as infile:
+        return infile.readline().strip()

 def get_github_version(timeout=5):
     """

common/lib/logger.py

Lines changed: 2 additions & 4 deletions
@@ -210,7 +210,8 @@ def __init__(self, output=False, filename='4cat.log'):
             self.logger.addHandler(slack_handler)
         except Exception:
             # we *may* need the logger before the database is in working order
-            config.db.rollback()
+            if config.db is not None:
+                config.db.rollback()

     def log(self, message, level=logging.INFO, frame=None):
         """
@@ -221,9 +222,6 @@ def log(self, message, level=logging.INFO, frame=None):
         :param frame: Traceback frame. If no frame is given, it is
             extrapolated
         """
-        if self.print_logs and level > logging.DEBUG:
-            print("LOG: %s" % message)
-
         # logging can include the full stack trace in the log, but that's a
         # bit excessive - instead, only include the location the log was called
         if not frame:
