From 00874576c354235f4655f1d433ec4382010e18e3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 30 May 2024 14:54:51 +0200 Subject: [PATCH] image_category_wall fix float categories --- datasources/douyin/search_douyin.py | 2 +- .../visualisation/image_category_wall.py | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index 5dfa6e7d3..e66b177ff 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -219,7 +219,7 @@ def map_item(item): "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")), # Adding this as different Douyin pages contain different data "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}", - "region": item.get("region"), + "region": item.get("region", ""), "hashtags": ",".join( [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if hashtag_key in tag]), diff --git a/processors/visualisation/image_category_wall.py b/processors/visualisation/image_category_wall.py index c8fbae139..18ce14e81 100644 --- a/processors/visualisation/image_category_wall.py +++ b/processors/visualisation/image_category_wall.py @@ -185,7 +185,7 @@ def process(self): image_data = json.load(file) filename_map = {post_id: staging_area.joinpath(image.get("filename")) for image in image_data.values() if image.get("success") for post_id in image.get("post_ids")} - self.dataset.log(filename_map) + # Organize posts into categories category_type = None categories = {} @@ -200,10 +200,7 @@ def process(self): continue # Identify category type and collect post_category - if post.get(category_column) is None: - self.dataset.finish_with_error("Unable to find category column in dataset") - return - elif special_case and category_column == "top_categories": + if special_case and category_column == "top_categories": if category_type is None: category_type = float # Special case @@ -218,8 +215,11 @@ def process(self): else: if category_type is None: try: - float(post.get(category_column)) - category_type = float + if post.get(category_column) is None: + category_type = str + else: + float(post.get(category_column)) + category_type = float except ValueError: category_type = str @@ -232,6 +232,9 @@ def process(self): else: categories[post_category].append({"id": post.get("id")}) elif category_type == float: + if post.get(category_column) is None: + self.dataset.log(f"Post {post.get('id')} has no data; skipping") + continue try: post_category = float(post.get(category_column)) post_values.append((post_category, post.get("id"))) @@ -240,7 +243,7 @@ def process(self): raise ProcessorException( f"Mixed category types detected; unable to render image wall (item {i} {post_category})") - if len(categories) == 0: + if len(categories) == 0 and len(post_values) == 0: self.dataset.finish_with_error("No categories found") return