Skip to content

Commit

Permalink
douyin: add stream stats and video tags to map_item
Browse files (browse the repository at this point in the history)
  • Loading branch information
dale-wahl committed Oct 12, 2023
1 parent c53f53d commit 323a830
Showing 1 changed file with 17 additions and 4 deletions.
21 changes: 17 additions & 4 deletions datasources/douyin/search_douyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def map_item(post):
video_description = stream_data.get("title")
duration = "Unknown"
prevent_download = None
stats = stream_data.get("stats")

# Author is stream owner
author = stream_data.get("owner")
Expand All @@ -68,6 +69,7 @@ def map_item(post):
video_description = post["desc"]
duration = post.get("duration", post.get("video", {}).get("duration", "Unknown"))
prevent_download = "yes" if post["download"]["prevent"] else "no"
stats = post["stats"]

# Author is, well, author
author = post["authorInfo"]
Expand All @@ -88,7 +90,6 @@ def map_item(post):
mix_name_key = "mixName"

# Stats
stats = post["stats"]
collect_count = stats["collectCount"]
comment_count = stats["commentCount"]
digg_count = stats["diggCount"]
Expand All @@ -108,14 +109,17 @@ def map_item(post):
if stream_data:
subject = "Stream"
stream_data = json.loads(stream_data)
post_timestamp = datetime.fromtimestamp(stream_data.get("create_time", post.get("create_time", metadata.get(
"timestamp_collected") / 1000))) # Some posts appear to have no timestamp! We substitute collection time
post_timestamp = datetime.fromtimestamp(
stream_data.get("create_time", post.get("create_time", metadata.get(
"timestamp_collected") / 1000))) # Some posts appear to have no timestamp! We substitute collection time
video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
video_description = stream_data.get("title")
duration = "Unknown"

# Author is stream owner
author = stream_data.get("owner")
video_tags = stream_data.get("video_feed_tag")
stats = stream_data.get("stats")

else:
post_timestamp = datetime.fromtimestamp(post["create_time"])
Expand All @@ -127,6 +131,10 @@ def map_item(post):

# Author is, well, author
author = post["author"]
video_tags = ",".join(
[item["tag_name"] for item in (post["video_tag"] if post["video_tag"] is not None else []) if
"tag_name" in item])
stats = post.get("statistics")

prevent_download = ("yes" if post["prevent_download"] else "no") if "prevent_download" in post else None

Expand All @@ -147,7 +155,6 @@ def map_item(post):
is_fake_key = "is_ad_fake"

# Stats
stats = post.get("statistics")
collect_count = stats.get("collect_count") if stats else "Unknown"
comment_count = stats.get("comment_count") if stats else "Unknown"
digg_count = stats.get("digg_count") if stats else "Unknown"
Expand All @@ -163,6 +170,10 @@ def map_item(post):

mix_current_episode = post.get(mix_info_key, {}).get("statis", {}).get("current_episode", "N/A")

# Stream Stats
count_total_streams_viewers = stats.get("total_user", "N/A")
count_current_stream_viewers = int(stats.get("user_count_str")) if "user_count_str" in stats else "N/A"

# Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed
displayed = True
if post.get("ZS_collected_from_mix") and not post.get("ZS_first_mix_vid"):
Expand Down Expand Up @@ -197,6 +208,8 @@ def map_item(post):
"forward_count": forward_count,
"play_count": play_count,
"share_count": share_count,
"count_total_streams_viewers": count_total_streams_viewers,
"count_current_stream_viewers": count_current_stream_viewers,
# Author data
"author_user_id": post[author_id_key] if author_id_key in post else author.get("uid", author.get("id")),
"author_nickname": author["nickname"],
Expand Down

0 comments on commit 323a830

Please sign in to comment.