Users and privileges overhaul (#339)
stijn-uva authored Jul 3, 2023
1 parent 0f8c28b commit ba3a675
Showing 252 changed files with 28,432 additions and 2,325 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker_pr_test.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Print log on failure
         if: failure()
         run: |
-          docker cp 4cat_backend:/4cat/data/logs/backend_4cat.log ./backend_4cat.log
+          docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log
           echo "::group::Backend logs"
           cat backend_4cat.log
           echo "::endgroup::"
1 change: 1 addition & 0 deletions .gitignore
@@ -17,6 +17,7 @@
 
 # actual files that are part of 4CAT but should not be included
 config.py
+module_config.bin
 .current-version
 deploy.sh
 module_cache.pb
2 changes: 1 addition & 1 deletion 4cat-daemon.py
@@ -59,7 +59,7 @@
 # we can only import this here, because the version check above needs to be
 # done first, as it may detect that the user needs to migrate first before
 # the config manager can be run properly
-import common.config_manager as config
+from common.config_manager import config
 from common.lib.helpers import call_api
 # ---------------------------------------------
 # Check validity of configuration file
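
The same import change recurs throughout this commit: the config manager is now consumed as a shared instance exported by `common.config_manager` rather than as a bare module, which lets it carry state such as a database connection (see `config.with_db()` in backend/bootstrap.py below). A minimal sketch of the difference at a call site; the setting key is one of the defaults removed from backend/database.sql further down:

    # before: the module itself was the interface
    # import common.config_manager as config

    # after: a single shared object is imported from the module
    from common.config_manager import config

    # reads look the same either way
    fourcat_name = config.get("4cat.name")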
2 changes: 1 addition & 1 deletion VERSION
@@ -1,4 +1,4 @@
-1.33
+1.34
 
 This file should not be modified. It is used by 4CAT to determine whether it
 needs to run migration scripts to e.g. update the database structure to a more
9 changes: 7 additions & 2 deletions backend/bootstrap.py
@@ -11,7 +11,7 @@
 from backend.lib.manager import WorkerManager
 from common.lib.logger import Logger
 
-import common.config_manager as config
+from common.config_manager import config
 
 def run(as_daemon=True):
     pidfile = Path(config.get('PATH_ROOT'), config.get('PATH_LOCKFILE'), "4cat.pid")
@@ -54,13 +54,18 @@ def run(as_daemon=True):
     log = Logger(output=not as_daemon)
 
     log.info("4CAT Backend started, logger initialised")
-    db = Database(logger=log, appname="main")
+    db = Database(logger=log, appname="main",
+                  dbname=config.DB_NAME, user=config.DB_USER, password=config.DB_PASSWORD, host=config.DB_HOST, port=config.DB_PORT)
     queue = JobQueue(logger=log, database=db)
 
     # clean up after ourselves
     db.commit()
     queue.release_all()
 
+    # ensure database consistency for settings table
+    config.with_db(db)
+    config.ensure_database()
+
     # make it happen
     # this is blocking until the back-end is shut down
     WorkerManager(logger=log, database=db, queue=queue, as_daemon=as_daemon)
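
The two calls added at the end resolve an ordering problem: most settings now live in the database, but the connection parameters themselves (the `config.DB_NAME`-style attributes) must still come from a local file, so the config manager only becomes fully usable once it is handed a live connection via `with_db()`. The body of `ensure_database()` is not part of this diff; the following is a hypothetical sketch of what it might do, assuming it back-fills missing rows from the defaults in `common/lib/config_definitions` (the removed TODO in backend/database.sql below suggests as much):

    import json

    # hypothetical stand-in for config.ensure_database(); the real
    # implementation lives in common/config_manager.py, not shown here
    def ensure_database(db):
        # assumption: config_definition maps setting names to dicts that
        # carry a "default" value
        from common.lib.config_definitions import config_definition

        for name, definition in config_definition.items():
            # the unique index on settings (name, tag) makes this idempotent
            db.execute(
                "INSERT INTO settings (name, value, tag) VALUES (%s, %s, %s) "
                "ON CONFLICT DO NOTHING",
                (name, json.dumps(definition.get("default")), ""))
        db.commit()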
67 changes: 31 additions & 36 deletions backend/database.sql
@@ -5,10 +5,16 @@
 
 -- 4CAT settings table
 CREATE TABLE IF NOT EXISTS settings (
-    name TEXT UNIQUE PRIMARY KEY,
-    value TEXT DEFAULT '{}'
+    name TEXT DEFAULT '' NOT NULL,
+    value TEXT DEFAULT '{}' NOT NULL,
+    tag TEXT DEFAULT '' NOT NULL
 );
 
+CREATE UNIQUE INDEX IF NOT EXISTS unique_setting
+    ON settings (
+        name, tag
+    );
+
 -- jobs table
 CREATE TABLE IF NOT EXISTS jobs (
     id SERIAL PRIMARY KEY,
@@ -38,7 +44,6 @@ CREATE TABLE IF NOT EXISTS datasets (
     key text,
     type text DEFAULT 'search',
     key_parent text DEFAULT '',
-    owner VARCHAR DEFAULT 'anonymous',
     query text,
     job integer DEFAULT 0,
     parameters text,
@@ -54,6 +59,15 @@ CREATE TABLE IF NOT EXISTS datasets (
     annotation_fields text DEFAULT ''
 );
 
+CREATE TABLE datasets_owners (
+    "name" text DEFAULT 'anonymous'::text,
+    key text NOT NULL,
+    role TEXT DEFAULT 'owner'
+);
+
+CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ops, key text_ops);
+
+
 -- annotations
 CREATE TABLE IF NOT EXISTS annotations (
     key text UNIQUE PRIMARY KEY,
@@ -75,10 +89,12 @@ CREATE TABLE IF NOT EXISTS users (
     password TEXT,
     is_admin BOOLEAN DEFAULT FALSE,
     register_token TEXT DEFAULT '',
+    timestamp_created INTEGER DEFAULT 0,
     timestamp_token INTEGER DEFAULT 0,
     timestamp_seen INTEGER DEFAULT 0,
     userdata TEXT DEFAULT '{}',
-    is_deactivated BOOLEAN DEFAULT FALSE
+    is_deactivated BOOLEAN DEFAULT FALSE,
+    tags JSONB DEFAULT '[]'
 );
 
 INSERT INTO users
@@ -136,35 +152,14 @@ CREATE FUNCTION count_estimate(query text) RETURNS bigint AS $$
   END;
 $$ LANGUAGE plpgsql VOLATILE STRICT;
 
-
--- fourcat settings insert default settings
--- TODO SHOULD BE ABLE TO REMOVE; all these should have corresponding values in common/lib/config_definitions given defaults
-INSERT INTO settings
-    (name, value)
-    Values
-        ('4cat.datasources', '["bitchute", "custom", "douban", "customimport", "reddit", "telegram", "twitterv2", "tiktok", "instagram", "9gag", "imgur", "linkedin", "parler", "douyin", "twitter-import"]'),
-        ('4cat.name', '"4CAT"'),
-        ('4cat.name_long', '"4CAT: Capture and Analysis Toolkit"'),
-        ('4cat.github_url', '"https://github.com/digitalmethodsinitiative/4cat"'),
-        ('4cat.phone_home_url', '"https://ping.4cat.nl"'),
-        ('path.versionfile', '".git-checked-out"'),
-        ('expire.timeout', '0'),
-        ('expire.allow_optout', 'true'),
-        ('expire.datasources', '{"tumblr": {"timeout": 259200, "allow_optout": false}}'),
-        ('logging.slack.level', '"WARNING"'),
-        ('logging.slack.webhook', 'null'),
-        ('mail.admin_email', 'null'),
-        ('mail.ssl', 'false'),
-        ('mail.username', 'null'),
-        ('mail.password', 'null'),
-        ('mail.noreply', '"noreply@localhost"'),
-        ('fourchan.image_interval', '3600'),
-        ('explorer.max_posts', '100000'),
-        ('flask.flask_app', '"webtool/fourcat"'),
-        ('flask.secret_key', concat('"', substr(md5(random()::text), 0, 25), '"')),
-        ('flask.https', 'false'),
-        ('flask.server_name', '"localhost"'),
-        ('flask.autologin.name', '"Automatic login"'),
-        ('flask.autologin.hostnames', '["localhost"]'),
-        ('flask.autologin.api', '["localhost"]')
-    ON CONFLICT DO NOTHING;
+-- default admin privileges
+INSERT INTO settings (name, value, tag) VALUES
+    ('privileges.admin.can_view_status', 'true', 'admin'),
+    ('privileges.admin.can_manage_users', 'true', 'admin'),
+    ('privileges.admin.can_manage_settings', 'true', 'admin'),
+    ('privileges.admin.can_manage_datasources', 'true', 'admin'),
+    ('privileges.admin.can_manage_notifications', 'true', 'admin'),
+    ('privileges.admin.can_manage_tags', 'true', 'admin'),
+    ('privileges.admin.can_restart', 'true', 'admin'),
+    ('privileges.can_view_all_datasets', 'true', 'admin'),
+    ('privileges.can_view_private_datasets', 'true', 'admin');
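
Taken together, the schema changes carry the overhaul: settings are now unique per (name, tag) pair instead of per name, users gain a JSONB list of tags, and dataset ownership moves from a column on `datasets` into the `datasets_owners` join table, where one dataset can have several owners with different roles. The actual resolution logic lives in the config manager rather than in SQL, but a hypothetical tag-aware lookup against this schema could work as below (`db.fetchone` stands in for 4CAT's database wrapper; parameter style is psycopg2's):

    # sketch: a row tagged with one of the user's tags beats the untagged
    # (global) row for the same setting name
    row = db.fetchone(
        "SELECT value FROM settings "
        "WHERE name = %s AND tag IN (%s, '') "
        "ORDER BY (tag = %s) DESC LIMIT 1",
        ("privileges.can_view_all_datasets", "admin", "admin"))

A user whose `users.tags` value contains "admin" would thereby pick up the privilege rows inserted above, while everyone else falls through to the untagged defaults.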
2 changes: 0 additions & 2 deletions backend/lib/database_mysql.py
@@ -4,8 +4,6 @@
 import pymysql.connections as mysqlconnections
 import pymysql
 
-import common.config_manager as config
-
 class MySQLDatabase:
     """
     Simple database handler for MySQL connections
10 changes: 8 additions & 2 deletions backend/lib/manager.py
@@ -47,7 +47,14 @@ def __init__(self, queue, database, logger, as_daemon=True):
             if hasattr(worker, "ensure_job"):
                 self.queue.add_job(jobtype=worker_name, **worker.ensure_job)
 
-        self.log.info('4CAT Started')
+        self.log.info("4CAT Started")
+
+        # flush module collector log buffer
+        # the logger is not available when this initialises
+        # but it is now!
+        if all_modules.log_buffer:
+            self.log.warning(all_modules.log_buffer)
+            all_modules.log_buffer = ""
 
         # it's time
         self.loop()
@@ -87,7 +94,6 @@ def delegate(self):
                 # worker slots, start a new worker to run it
                 if len(self.worker_pool[jobtype]) < worker_class.max_workers:
                     try:
-                        self.log.debug("Starting new worker for job %s" % jobtype)
                         job.claim()
                         worker = worker_class(logger=self.log, manager=self, job=job, modules=all_modules)
                         worker.start()
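
The buffer flush added to `__init__` papers over a bootstrapping gap: the module collector scans for workers before any logger exists, so its warnings are parked in a plain string until something can emit them. The pattern in isolation, as a sketch (4CAT's actual collector lives in its module loader, which is not part of this excerpt):

    class ModuleCollector:
        """Finds worker modules before logging is available (sketch)"""

        def __init__(self):
            self.log_buffer = ""  # warnings accumulate here as plain text

        def warn(self, message):
            # no logger yet, so keep the message for later
            self.log_buffer += message + "\n"

Once a logger exists, the buffer is emitted and reset exactly as in the hunk above.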
2 changes: 1 addition & 1 deletion backend/abstract/preset.py → backend/lib/preset.py
@@ -2,7 +2,7 @@
 Queue a series of processors at once via a preset
 """
 import abc
-from backend.abstract.processor import BasicProcessor
+from backend.lib.processor import BasicProcessor
 
 from common.lib.dataset import DataSet
 
69 changes: 23 additions & 46 deletions backend/abstract/processor.py → backend/lib/processor.py
@@ -12,12 +12,12 @@
 
 from pathlib import Path, PurePath
 
-import backend
-from backend.abstract.worker import BasicWorker
+from backend.lib.worker import BasicWorker
 from common.lib.dataset import DataSet
 from common.lib.fourcat_module import FourcatModule
 from common.lib.helpers import get_software_version, remove_nuls
 from common.lib.exceptions import WorkerInterruptedException, ProcessorInterruptedException, ProcessorException
+from common.config_manager import config, ConfigWrapper
 
 csv.field_size_limit(1024 * 1024 * 1024)
 
@@ -34,9 +34,8 @@ class BasicProcessor(FourcatModule, BasicWorker, metaclass=abc.ABCMeta):
     To determine whether a processor can process a given dataset, you can
     define a `is_compatible_with(module: FourcatModule = None) -> bool` class
-    method which takes a dataset *or* processor as argument and returns a bool
-    that determines if this processor is considered compatible with that
-    dataset or processor. For example:
+    method which takes a dataset as argument and returns a bool that determines
+    if this processor is considered compatible with that dataset. For example:
 
     .. code-block:: python
@@ -56,6 +55,9 @@ def is_compatible_with(cls, module=None):
     #: The dataset object that the processor is *creating*.
     dataset = None
 
+    #: Owner (username) of the dataset
+    owner = None
+
     #: The dataset object that the processor is *processing*.
     source_dataset = None
 
@@ -74,6 +76,9 @@ def is_compatible_with(cls, module=None):
     #: Configurable options for this processor
     options = {}
 
+    #: 4CAT settings from the perspective of the dataset's owner
+    config = None
+
     #: Values for the processor's options, populated by user input
     parameters = {}
 
@@ -93,13 +98,22 @@ def work(self):
         up.
         """
         try:
+            # a dataset can have multiple owners, but the creator is the user
+            # that actually queued the processor, so their config is relevant
             self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db)
-        except TypeError:
+            self.owner = self.dataset.creator
+        except TypeError as e:
             # query has been deleted in the meantime. finish without error,
             # as deleting it will have been a conscious choice by a user
             self.job.finish()
             return
 
+        # set up config reader using the worker's DB connection and the dataset
+        # creator. This ensures that if a value has been overridden for the
+        # owner, the overridden value is used instead.
+        config.with_db(self.db)
+        self.config = ConfigWrapper(config=config, user=self.owner)
+
         if self.dataset.data.get("key_parent", None):
             # search workers never have parents (for now), so we don't need to
             # find out what the source_dataset dataset is if it's a search worker
@@ -242,7 +256,7 @@ def after_process(self):
                 parent=self.dataset.key,
                 extension=available_processors[next_type].extension,
                 is_private=self.dataset.is_private,
-                owner=self.dataset.owner
+                owner=self.dataset.creator
             )
             self.queue.add_job(next_type, remote_id=next_analysis.key)
         else:
@@ -649,45 +663,6 @@ def get_status(cls):
         """
         return cls.status if hasattr(cls, "status") else None
 
-    @classmethod
-    def get_available_processors(cls, self):
-        """
-        Get list of processors compatible with this processor
-
-        Checks whether this dataset type is one that is listed as being accepted
-        by the processor, for each known type: if the processor does not
-        specify accepted types (via the `is_compatible_with` method), it is
-        assumed it accepts any top-level datasets
-
-        :return dict:  Compatible processors, `name => class` mapping
-        """
-        processors = backend.all_modules.processors
-
-        available = []
-        for processor_type, processor in processors.items():
-            if processor_type.endswith("-search"):
-                continue
-
-            # consider a processor compatible if its is_compatible_with
-            # method returns True *or* if it has no explicit compatibility
-            # check and this dataset is top-level (i.e. has no parent)
-            if hasattr(processor, "is_compatible_with"):
-                if processor.is_compatible_with(module=self):
-                    available.append(processor)
-
-        return available
-
-    @classmethod
-    def is_dataset(cls):
-        """
-        Confirm this is *not* a dataset, but a processor.
-
-        Used for processor compatibility checks.
-
-        :return bool:  Always `False`, because this is a processor.
-        """
-        return False
-
     @classmethod
     def is_top_dataset(cls):
         """
@@ -761,3 +736,5 @@ def process(self):
         To be defined by the child processor.
         """
         pass
+
+
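`ConfigWrapper` is what makes settings owner-sensitive inside processors: `work()` binds `self.config` to the dataset creator, so code that previously read the global `config` object can now see per-user overrides. A sketch of the intended difference, assuming the wrapper mirrors the global object's `get()` interface (its internals are not shown in this diff); the setting key comes from the defaults removed from backend/database.sql above:

    def process(self):
        # same value for every user, as before this commit
        limit_global = config.get("explorer.max_posts")

        # owner-sensitive: if "explorer.max_posts" is overridden for a tag
        # carried by this dataset's creator, the override is returned
        limit_owner = self.config.get("explorer.max_posts")
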
4 changes: 2 additions & 2 deletions backend/abstract/scraper.py → backend/lib/scraper.py
@@ -8,9 +8,9 @@
 import abc
 
 from pathlib import Path
-from backend.abstract.worker import BasicWorker
+from backend.lib.worker import BasicWorker
 
-import common.config_manager as config
+from common.config_manager import config
 
 class BasicHTTPScraper(BasicWorker, metaclass=abc.ABCMeta):
     """
5 changes: 3 additions & 2 deletions backend/abstract/search.py → backend/lib/search.py
@@ -9,9 +9,9 @@
 from pathlib import Path
 from abc import ABC, abstractmethod
 
-import common.config_manager as config
+from common.config_manager import config
 from common.lib.dataset import DataSet
-from backend.abstract.processor import BasicProcessor
+from backend.lib.processor import BasicProcessor
 from common.lib.helpers import strip_tags, dict_search_and_update, remove_nuls, HashCache
 from common.lib.exceptions import WorkerInterruptedException, ProcessorInterruptedException
 
@@ -189,6 +189,7 @@ def import_from_file(self, path):
             }
 
         path.unlink()
+        self.dataset.delete_parameter("file")
 
     def items_to_csv(self, results, filepath):
         """