diff --git a/processors/metrics/hatebase.py b/processors/metrics/hatebase.py index 2fb69f8ce..6a587519f 100644 --- a/processors/metrics/hatebase.py +++ b/processors/metrics/hatebase.py @@ -76,6 +76,7 @@ def process(self): # determine what vocabulary to use language = self.parameters.get("language") columns = self.parameters.get("columns") + self.dataset.log(f"Language: {language}; Columns: {columns}") if not columns: self.dataset.update_status("No columns selected; no data analysed.", is_final=True) @@ -87,9 +88,16 @@ def process(self): hatebase = json.loads(hatebasedata.read()) hatebase = {term.lower(): hatebase[term] for term in hatebase} + self.dataset.log(f"Number of hatebase terms: {len(hatebase)}") hatebase_regex = re.compile(r"\b(" + "|".join([re.escape(term) for term in hatebase]) + r")\b") + if not hatebase or not hatebase_regex: + self.dataset.update_status("No hatebase data found for the selected language.", is_final=True) + self.dataset.finish(0) + return + processed = 0 + matches = 0 with self.dataset.get_results_path().open("w") as output: fieldnames = self.source_dataset.get_item_keys(self) fieldnames += ("hatebase_num", "hatebase_num_ambiguous", "hatebase_num_unambiguous", @@ -124,6 +132,7 @@ def process(self): post_text = ' '.join([str(post.get(c, "")).lower() for c in columns]) for term in hatebase_regex.findall(post_text): + matches += 1 if hatebase[term]["plural_of"]: if hatebase[term]["plural_of"] in terms: continue @@ -157,6 +166,7 @@ def process(self): self.dataset.finish(0) return + self.dataset.log(f"Total terms matched: {matches}") self.dataset.update_status("Finished") self.dataset.finish(processed) @@ -182,6 +192,6 @@ def get_options(cls, parent_dataset=None, user=None): columns = parent_dataset.get_columns() options["columns"]["type"] = UserInput.OPTION_MULTI_SELECT options["columns"]["options"] = {v: v for v in columns} - options["columns"]["default"] = "body" if "body" in columns else sorted(columns).pop() + options["columns"]["default"] = ["body"] if "body" in columns else [sorted(columns).pop()] return options