Skip to content

Commit

Permalink
Don't crash on importing Twitter data with withheld retweets
Browse files: browse the repository at this point in the history
  • Loading branch information
stijn-uva committed Jul 11, 2023
1 parent 5eb2a28 commit a5a1d36
Showing 1 changed file with 21 additions and 13 deletions.
34 changes: 21 additions & 13 deletions datasources/twitter-import/search_twitter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -49,16 +49,22 @@ def map_item(tweet):
@staticmethod
def map_item_modern(tweet):
timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
withheld = False

retweet = tweet["legacy"].get("retweeted_status_result")
if retweet:
# make sure the full RT is included, by default this is shortened
if "tweet" in retweet["result"]:
retweet["result"] = retweet["result"]["tweet"]

t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
print(json.dumps(retweet))
if retweet["result"].get("legacy", {}).get("withheld_scope"):
withheld = True
tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
else:
t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
": " + retweet["result"]["legacy"]["full_text"]
tweet["legacy"]["full_text"] = t_text
tweet["legacy"]["full_text"] = t_text

quote_tweet = tweet.get("quoted_status_result")
if quote_tweet and "tweet" in quote_tweet.get("result", {}):
Expand All @@ -84,13 +90,12 @@ def map_item_modern(tweet):
"quote_count": tweet["legacy"]["quote_count"],
"impression_count": tweet.get("views", {}).get("count", ""),
"is_retweet": "yes" if retweet else "no",
"retweeted_user": retweet["result"]["core"]["user_results"]["result"]["legacy"][
"screen_name"] if retweet else "",
"retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
"is_quote_tweet": "yes" if quote_tweet else "no",
"quoted_user": quote_tweet["result"]["core"]["user_results"]["result"]["legacy"][
"screen_name"] if quote_tweet else "",
"quoted_user": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if quote_tweet else "",
"is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != str(tweet["rest_id"]) else "no",
"replied_user": tweet["legacy"].get("in_reply_to_screen_name", ""),
"is_withheld": "yes" if withheld else "no",
"hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
"urls": ",".join([url["expanded_url"] for url in tweet["legacy"]["entities"]["urls"]]),
"images": ",".join([media["media_url_https"] for media in tweet["legacy"]["entities"].get("media", []) if
Expand All @@ -107,17 +112,21 @@ def map_item_modern(tweet):
def map_item_legacy(tweet):
timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
tweet_id = tweet["legacy"]["id_str"]
withheld = False

retweet = tweet["legacy"].get("retweeted_status_result")
if retweet:
# make sure the full RT is included, by default this is shortened
t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
if retweet["result"].get("legacy", {}).get("withheld_status"):
withheld = True
tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
else:
t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
" " + retweet["result"]["legacy"]["full_text"]
tweet["legacy"]["full_text"] = t_text
tweet["legacy"]["full_text"] = t_text

quote_tweet = tweet.get("quoted_status_result")

quote_tweet = tweet.get("quoted_status_result")
if quote_tweet and "tweet" in quote_tweet.get("result", {}):
# sometimes this is one level deeper, sometimes not...
quote_tweet["result"] = quote_tweet["result"]["tweet"]
Expand All @@ -141,14 +150,13 @@ def map_item_legacy(tweet):
"quote_count": tweet["legacy"]["quote_count"],
"impression_count": tweet.get("ext_views", {}).get("count", ""),
"is_retweet": "yes" if retweet else "no",
"retweeted_user": retweet["result"]["core"]["user_results"]["result"]["legacy"][
"screen_name"] if retweet else "",
"retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
"is_quote_tweet": "yes" if quote_tweet else "no",
"quoted_user": quote_tweet["result"]["core"]["user_results"]["result"]["legacy"][
"screen_name"] if quote_tweet else "",
"quoted_user": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if quote_tweet else "",
"is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != tweet_id else "no",
"replied_user": tweet["legacy"].get("in_reply_to_screen_name", "") if tweet["legacy"].get(
"in_reply_to_screen_name") else "",
"is_withheld": "yes" if withheld else "no",
"hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
"urls": ",".join([url["expanded_url"] for url in tweet["legacy"]["entities"]["urls"]]),
"images": ",".join(
Expand Down

0 comments on commit a5a1d36

Please sign in to comment.