diff --git a/backend/lib/processor.py b/backend/lib/processor.py
index 2446b3174..ebf721773 100644
--- a/backend/lib/processor.py
+++ b/backend/lib/processor.py
@@ -431,7 +431,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset,
 		# go through items one by one, optionally mapping them
 		if parent_path.suffix.lower() == ".csv":
 			# Get field names
-			fieldnames = which_parent.get_item_keys(self)
+			fieldnames = which_parent.get_columns()
 			if not update_existing and field_name in fieldnames:
 				raise ProcessorException('field_name %s already exists!' % field_name)
 			fieldnames.append(field_name)
diff --git a/common/lib/dataset.py b/common/lib/dataset.py
index e63df388e..b6951503b 100644
--- a/common/lib/dataset.py
+++ b/common/lib/dataset.py
@@ -408,31 +408,6 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau
 			# yield a DatasetItem, which is a dict with some special properties
 			yield DatasetItem(mapper=item_mapper, original=original_item, mapped_object=mapped_item, **(mapped_item.get_item_data() if type(mapped_item) is MappedItem else mapped_item))
 
-	def get_item_keys(self, processor=None):
-		"""
-		Get item attribute names
-
-		It can be useful to know what attributes an item in the dataset is
-		stored with, e.g. when one wants to produce a new dataset identical
-		to the source_dataset one but with extra attributes. This method provides
-		these, as a list.
-
-		:param BasicProcessor processor:  A reference to the processor
-		asking for the item keys, to pass on to iterate_mapped_items
-		:return list:  List of keys, may be empty if there are no items in the
-		dataset
-		"""
-
-		items = self.iterate_items(processor, warn_unmappable=False)
-		try:
-			keys = list(items.__next__().keys())
-		except (StopIteration, NotImplementedError):
-			return []
-		finally:
-			del items
-
-		return keys
-
 	def get_staging_area(self):
 		"""
 		Get path to a temporary folder in which files can be stored before
@@ -836,13 +811,20 @@ def get_columns(self):
 
 		:return list:  List of dataset columns; empty list if unable to parse
 		"""
-
 		if not self.get_results_path().exists():
 			# no file to get columns from
-			return False
+			return []
 
 		if (self.get_results_path().suffix.lower() == ".csv") or (self.get_results_path().suffix.lower() == ".ndjson" and self.get_own_processor() is not None and self.get_own_processor().map_item_method_available(dataset=self)):
-			return self.get_item_keys(processor=self.get_own_processor())
+			items = self.iterate_items(warn_unmappable=False)
+			try:
+				keys = list(items.__next__().keys())
+			except (StopIteration, NotImplementedError):
+				# No items or otherwise unable to iterate
+				return []
+			finally:
+				del items
+			return keys
 		else:
 			# Filetype not CSV or an NDJSON with `map_item`
 			return []
diff --git a/processors/conversion/consolidate_urls.py b/processors/conversion/consolidate_urls.py
index 358e18baa..26d1935e1 100644
--- a/processors/conversion/consolidate_urls.py
+++ b/processors/conversion/consolidate_urls.py
@@ -263,7 +263,7 @@ def process(self):
 		expand_urls = self.parameters.get("expand_urls", False)
 
 		# Get fieldnames
-		fieldnames = self.source_dataset.get_item_keys(self) + ["4CAT_consolidated_urls_"+method]
+		fieldnames = self.source_dataset.get_columns() + ["4CAT_consolidated_urls_"+method]
 
 		# Avoid requesting the same URL multiple times (if redirects are to be resolved)
 		redirect_cache = {}
diff --git a/processors/conversion/extract_urls.py b/processors/conversion/extract_urls.py
index c40800915..6191e9b1b 100644
--- a/processors/conversion/extract_urls.py
+++ b/processors/conversion/extract_urls.py
@@ -230,7 +230,7 @@ def process(self):
 		correct_croudtangle = self.parameters.get("correct_croudtangle", False)
 
 		# Create fieldnames
-		fieldnames = self.source_dataset.get_item_keys(self) + ["4CAT_number_unique_urls", "4CAT_extracted_urls"] + ["4CAT_extracted_from_" + column for column in columns]
+		fieldnames = self.source_dataset.get_columns() + ["4CAT_number_unique_urls", "4CAT_extracted_urls"] + ["4CAT_extracted_from_" + column for column in columns]
 
 		# Avoid requesting the same URL multiple times
 		cache = {}
diff --git a/processors/conversion/ndjson_to_csv.py b/processors/conversion/ndjson_to_csv.py
index c7f46cf3c..a88166dc3 100644
--- a/processors/conversion/ndjson_to_csv.py
+++ b/processors/conversion/ndjson_to_csv.py
@@ -44,7 +44,7 @@ def process(self):
 		# We first collect all possible columns for the csv file, then
 		# for each item make sure there is a value for all the columns (in the
 		# second step)
-		all_keys = self.source_dataset.get_item_keys()
+		all_keys = self.source_dataset.get_columns()
 
 		self.dataset.update_status("Converting file")
 		staging_area = self.dataset.get_staging_area()
diff --git a/processors/filtering/reddit_get_votes.py b/processors/filtering/reddit_get_votes.py
index 52d46d10c..65e647621 100644
--- a/processors/filtering/reddit_get_votes.py
+++ b/processors/filtering/reddit_get_votes.py
@@ -120,7 +120,7 @@ def process(self):
 		# now write a new CSV with the updated scores
 
 		# get field names
-		fieldnames = [*self.source_dataset.get_item_keys(self)]
+		fieldnames = [*self.source_dataset.get_columns()]
 		if "score" not in fieldnames:
 			fieldnames.append("score")
 
diff --git a/processors/metrics/hatebase.py b/processors/metrics/hatebase.py
index 6a587519f..264d9d54d 100644
--- a/processors/metrics/hatebase.py
+++ b/processors/metrics/hatebase.py
@@ -99,7 +99,7 @@ def process(self):
 		processed = 0
 		matches = 0
 		with self.dataset.get_results_path().open("w") as output:
-			fieldnames = self.source_dataset.get_item_keys(self)
+			fieldnames = self.source_dataset.get_columns()
 			fieldnames += ("hatebase_num", "hatebase_num_ambiguous", "hatebase_num_unambiguous", "hatebase_terms",
 						   "hatebase_terms_ambiguous", "hatebase_terms_unambiguous", "hatebase_offensiveness_avg")
 
diff --git a/processors/metrics/overtime-hatebase.py b/processors/metrics/overtime-hatebase.py
index 48a7e9ce3..5172fa23a 100644
--- a/processors/metrics/overtime-hatebase.py
+++ b/processors/metrics/overtime-hatebase.py
@@ -103,7 +103,7 @@ def process(self):
 		views = {}
 		intervals = set()
 
-		fieldnames = self.source_dataset.get_item_keys(self)
+		fieldnames = self.source_dataset.get_columns()
 		if "views" in fieldnames:
 			engagement_field = "views"
 		elif "score" in fieldnames:
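
The practical effect of this change for processor code: `get_columns()` now does the work `get_item_keys()` used to do, takes no processor argument, and returns a list in every case (empty rather than `False` when there is no result file or nothing to iterate). A minimal standalone sketch of that contract follows; `FakeDataset` and its rows are hypothetical stand-ins for illustration, not the real `common.lib.dataset.DataSet`, and only the key-peeking logic mirrors the patched implementation.

class FakeDataset:
	"""Hypothetical stand-in for a 4CAT dataset, for illustration only."""

	def __init__(self, rows):
		self.rows = rows

	def iterate_items(self, warn_unmappable=True):
		# stand-in for DataSet.iterate_items(); yields items as dicts
		yield from self.rows

	def get_columns(self):
		# mirrors the patched logic: peek at the first item's keys and
		# fall back to an empty list if there is nothing to iterate
		items = self.iterate_items(warn_unmappable=False)
		try:
			return list(items.__next__().keys())
		except (StopIteration, NotImplementedError):
			return []
		finally:
			del items


source_dataset = FakeDataset([{"id": 1, "body": "hello"}])
fieldnames = source_dataset.get_columns() + ["4CAT_extracted_urls"]
print(fieldnames)  # ['id', 'body', '4CAT_extracted_urls']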