Commit

merge get_item_keys w/ get_columns
dale-wahl committed Feb 20, 2025
1 parent 4940702 commit a99ef1a
Showing 8 changed files with 17 additions and 35 deletions.
2 changes: 1 addition & 1 deletion backend/lib/processor.py
@@ -431,7 +431,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset,
     # go through items one by one, optionally mapping them
     if parent_path.suffix.lower() == ".csv":
         # Get field names
-        fieldnames = which_parent.get_item_keys(self)
+        fieldnames = which_parent.get_columns()
         if not update_existing and field_name in fieldnames:
             raise ProcessorException('field_name %s already exists!' % field_name)
         fieldnames.append(field_name)
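Note that the new call drops the processor reference: get_columns() works from the dataset alone. For a CSV parent like the branch above, the columns it returns amount to the keys of the first item, i.e. the CSV header. A minimal standalone sketch of that idea (a hypothetical helper for illustration, not 4CAT's actual implementation, which goes through iterate_items as shown in the dataset.py hunk below):

    import csv

    def columns_from_csv(path):
        # Roughly what get_columns() yields for a CSV results file:
        # the header row, read via DictReader, or [] if the file is empty.
        with open(path, newline="", encoding="utf-8") as infile:
            reader = csv.DictReader(infile)
            return list(reader.fieldnames or [])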
38 changes: 10 additions & 28 deletions common/lib/dataset.py
@@ -408,31 +408,6 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau
         # yield a DatasetItem, which is a dict with some special properties
         yield DatasetItem(mapper=item_mapper, original=original_item, mapped_object=mapped_item, **(mapped_item.get_item_data() if type(mapped_item) is MappedItem else mapped_item))

-def get_item_keys(self, processor=None):
-    """
-    Get item attribute names
-
-    It can be useful to know what attributes an item in the dataset is
-    stored with, e.g. when one wants to produce a new dataset identical
-    to the source_dataset one but with extra attributes. This method provides
-    these, as a list.
-
-    :param BasicProcessor processor: A reference to the processor
-    asking for the item keys, to pass on to iterate_mapped_items
-    :return list: List of keys, may be empty if there are no items in the
-    dataset
-    """
-
-    items = self.iterate_items(processor, warn_unmappable=False)
-    try:
-        keys = list(items.__next__().keys())
-    except (StopIteration, NotImplementedError):
-        return []
-    finally:
-        del items
-
-    return keys
-
 def get_staging_area(self):
     """
     Get path to a temporary folder in which files can be stored before
@@ -836,13 +811,20 @@ def get_columns(self):
     :return list: List of dataset columns; empty list if unable to parse
     """

     if not self.get_results_path().exists():
         # no file to get columns from
-        return False
+        return []

     if (self.get_results_path().suffix.lower() == ".csv") or (self.get_results_path().suffix.lower() == ".ndjson" and self.get_own_processor() is not None and self.get_own_processor().map_item_method_available(dataset=self)):
-        return self.get_item_keys(processor=self.get_own_processor())
+        items = self.iterate_items(warn_unmappable=False)
+        try:
+            keys = list(items.__next__().keys())
+        except (StopIteration, NotImplementedError):
+            # No items or otherwise unable to iterate
+            return []
+        finally:
+            del items
+        return keys
     else:
         # Filetype not CSV or an NDJSON with `map_item`
         return []
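Since get_item_keys() is removed outright rather than deprecated, any external processor that still calls it will raise AttributeError after this commit. A hypothetical one-line shim (not part of this change) would keep such callers working during a transition; the old processor argument is accepted but ignored, since the consolidated method no longer needs it:

    def get_item_keys(self, processor=None):
        # Hypothetical backwards-compatibility shim, not included in this commit:
        # forward the old call to the consolidated get_columns().
        return self.get_columns()

Within this repository no such shim is needed, because every known call site is updated in the same commit, as the remaining hunks show.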
2 changes: 1 addition & 1 deletion processors/conversion/consolidate_urls.py
@@ -263,7 +263,7 @@ def process(self):
     expand_urls = self.parameters.get("expand_urls", False)

     # Get fieldnames
-    fieldnames = self.source_dataset.get_item_keys(self) + ["4CAT_consolidated_urls_"+method]
+    fieldnames = self.source_dataset.get_columns() + ["4CAT_consolidated_urls_"+method]

     # Avoid requesting the same URL multiple times (if redirects are to be resolved)
     redirect_cache = {}
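The extended fieldnames list built here typically becomes the header of the processor's output CSV. A rough sketch of that downstream write pattern (illustrative only; items, consolidate and output_path are stand-ins, and the real processor also resolves redirects via redirect_cache):

    import csv

    def write_with_consolidated_urls(items, fieldnames, new_field, consolidate, output_path):
        # Illustrative sketch: write every source column plus the new
        # "4CAT_consolidated_urls_<method>"-style column for each item.
        with open(output_path, "w", newline="", encoding="utf-8") as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            for item in items:
                row = {field: item.get(field, "") for field in fieldnames}
                row[new_field] = consolidate(item)
                writer.writerow(row)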

2 changes: 1 addition & 1 deletion processors/conversion/extract_urls.py
@@ -230,7 +230,7 @@ def process(self):
     correct_croudtangle = self.parameters.get("correct_croudtangle", False)

     # Create fieldnames
-    fieldnames = self.source_dataset.get_item_keys(self) + ["4CAT_number_unique_urls", "4CAT_extracted_urls"] + ["4CAT_extracted_from_" + column for column in columns]
+    fieldnames = self.source_dataset.get_columns() + ["4CAT_number_unique_urls", "4CAT_extracted_urls"] + ["4CAT_extracted_from_" + column for column in columns]

     # Avoid requesting the same URL multiple times
     cache = {}
2 changes: 1 addition & 1 deletion processors/conversion/ndjson_to_csv.py
@@ -44,7 +44,7 @@ def process(self):
     # We first collect all possible columns for the csv file, then
     # for each item make sure there is a value for all the columns (in the
     # second step)
-    all_keys = self.source_dataset.get_item_keys()
+    all_keys = self.source_dataset.get_columns()

     self.dataset.update_status("Converting file")
     staging_area = self.dataset.get_staging_area()
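The comment above describes a two-step conversion: collect the columns first (now via get_columns()), then make sure every item has a value for each of them. The second step can be sketched as a small helper (hypothetical; the actual processor iterates dataset items rather than raw file lines):

    import json

    def pad_ndjson_item(line, all_keys, default=""):
        # Decode one NDJSON line and fill in `default` for any column
        # the item lacks, so every CSV row has the same set of keys.
        item = json.loads(line)
        return {key: item.get(key, default) for key in all_keys}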
2 changes: 1 addition & 1 deletion processors/filtering/reddit_get_votes.py
@@ -120,7 +120,7 @@ def process(self):

     # now write a new CSV with the updated scores
     # get field names
-    fieldnames = [*self.source_dataset.get_item_keys(self)]
+    fieldnames = [*self.source_dataset.get_columns()]
     if "score" not in fieldnames:
         fieldnames.append("score")
2 changes: 1 addition & 1 deletion processors/metrics/hatebase.py
@@ -99,7 +99,7 @@ def process(self):
     processed = 0
     matches = 0
     with self.dataset.get_results_path().open("w") as output:
-        fieldnames = self.source_dataset.get_item_keys(self)
+        fieldnames = self.source_dataset.get_columns()
         fieldnames += ("hatebase_num", "hatebase_num_ambiguous", "hatebase_num_unambiguous",
                        "hatebase_terms", "hatebase_terms_ambiguous", "hatebase_terms_unambiguous",
                        "hatebase_offensiveness_avg")
2 changes: 1 addition & 1 deletion processors/metrics/overtime-hatebase.py
@@ -103,7 +103,7 @@ def process(self):
     views = {}
     intervals = set()

-    fieldnames = self.source_dataset.get_item_keys(self)
+    fieldnames = self.source_dataset.get_columns()
     if "views" in fieldnames:
         engagement_field = "views"
     elif "score" in fieldnames:
