From 323a83076bbf4fb66ce6048da4494a9983028b85 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 12 Oct 2023 14:56:33 +0200 Subject: [PATCH] douyin add stream stats and video tags to map_item --- datasources/douyin/search_douyin.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index 519b35240..24ee6dc23 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -53,6 +53,7 @@ def map_item(post): video_description = stream_data.get("title") duration = "Unknown" prevent_download = None + stats = stream_data.get("stats") # Author is stream owner author = stream_data.get("owner") @@ -68,6 +69,7 @@ def map_item(post): video_description = post["desc"] duration = post.get("duration", post.get("video", {}).get("duration", "Unknown")) prevent_download = "yes" if post["download"]["prevent"] else "no" + stats = post["stats"] # Author is, well, author author = post["authorInfo"] @@ -88,7 +90,6 @@ def map_item(post): mix_name_key = "mixName" # Stats - stats = post["stats"] collect_count = stats["collectCount"] comment_count = stats["commentCount"] digg_count = stats["diggCount"] @@ -108,14 +109,17 @@ def map_item(post): if stream_data: subject = "Stream" stream_data = json.loads(stream_data) - post_timestamp = datetime.fromtimestamp(stream_data.get("create_time", post.get("create_time", metadata.get( - "timestamp_collected") / 1000))) # Some posts appear to have no timestamp! We substitute collection time + post_timestamp = datetime.fromtimestamp( + stream_data.get("create_time", post.get("create_time", metadata.get( + "timestamp_collected") / 1000))) # Some posts appear to have no timestamp! We substitute collection time video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1") video_description = stream_data.get("title") duration = "Unknown" # Author is stream owner author = stream_data.get("owner") + video_tags = stream_data.get("video_feed_tag") + stats = stream_data.get("stats") else: post_timestamp = datetime.fromtimestamp(post["create_time"]) @@ -127,6 +131,10 @@ def map_item(post): # Author is, well, author author = post["author"] + video_tags = ",".join( + [item["tag_name"] for item in (post["video_tag"] if post["video_tag"] is not None else []) if + "tag_name" in item]) + stats = post.get("statistics") prevent_download = ("yes" if post["prevent_download"] else "no") if "prevent_download" in post else None @@ -147,7 +155,6 @@ def map_item(post): is_fake_key = "is_ad_fake" # Stats - stats = post.get("statistics") collect_count = stats.get("collect_count") if stats else "Unknown" comment_count = stats.get("comment_count") if stats else "Unknown" digg_count = stats.get("digg_count") if stats else "Unknown" @@ -163,6 +170,10 @@ def map_item(post): mix_current_episode = post.get(mix_info_key, {}).get("statis", {}).get("current_episode", "N/A") + # Stream Stats + count_total_streams_viewers = stats.get("total_user", "N/A") + count_current_stream_viewers = int(stats.get("user_count_str")) if "user_count_str" in stats else "N/A" + # Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed displayed = True if post.get("ZS_collected_from_mix") and not post.get("ZS_first_mix_vid"): @@ -197,6 +208,8 @@ def map_item(post): "forward_count": forward_count, "play_count": play_count, "share_count": share_count, + "count_total_streams_viewers": count_total_streams_viewers, + "count_current_stream_viewers": count_current_stream_viewers, # Author data "author_user_id": post[author_id_key] if author_id_key in post else author.get("uid", author.get("id")), "author_nickname": author["nickname"],