Skip to content

Commit

Permalink
fix hatebase: default column option for OPTION_MULTI_SELECT must be list
Browse files: browse the repository at this point in the history
  • Loading branch information
dale-wahl committed Aug 27, 2024
1 parent e276033 commit 4ba872b
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion processors/metrics/hatebase.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -76,6 +76,7 @@ def process(self):
# determine what vocabulary to use
language = self.parameters.get("language")
columns = self.parameters.get("columns")
self.dataset.log(f"Language: {language}; Columns: {columns}")

if not columns:
self.dataset.update_status("No columns selected; no data analysed.", is_final=True)
Expand All @@ -87,9 +88,16 @@ def process(self):
hatebase = json.loads(hatebasedata.read())

hatebase = {term.lower(): hatebase[term] for term in hatebase}
self.dataset.log(f"Number of hatebase terms: {len(hatebase)}")
hatebase_regex = re.compile(r"\b(" + "|".join([re.escape(term) for term in hatebase]) + r")\b")

if not hatebase or not hatebase_regex:
self.dataset.update_status("No hatebase data found for the selected language.", is_final=True)
self.dataset.finish(0)
return

processed = 0
matches = 0
with self.dataset.get_results_path().open("w") as output:
fieldnames = self.source_dataset.get_item_keys(self)
fieldnames += ("hatebase_num", "hatebase_num_ambiguous", "hatebase_num_unambiguous",
Expand Down Expand Up @@ -124,6 +132,7 @@ def process(self):

post_text = ' '.join([str(post.get(c, "")).lower() for c in columns])
for term in hatebase_regex.findall(post_text):
matches += 1
if hatebase[term]["plural_of"]:
if hatebase[term]["plural_of"] in terms:
continue
Expand Down Expand Up @@ -157,6 +166,7 @@ def process(self):
self.dataset.finish(0)
return

self.dataset.log(f"Total terms matched: {matches}")
self.dataset.update_status("Finished")
self.dataset.finish(processed)

Expand All @@ -182,6 +192,6 @@ def get_options(cls, parent_dataset=None, user=None):
columns = parent_dataset.get_columns()
options["columns"]["type"] = UserInput.OPTION_MULTI_SELECT
options["columns"]["options"] = {v: v for v in columns}
- options["columns"]["default"] = "body" if "body" in columns else sorted(columns).pop()
+ options["columns"]["default"] = ["body"] if "body" in columns else [sorted(columns).pop()]

return options

0 comments on commit 4ba872b

Please sign in to comment.