-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
186 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
The RedNote data source can be used to manipulate data collected from [RedNote](https://www.xiaohongshu.com/) - also | ||
known as Xiaohongshu or Little Red Book - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected with the browser extension; 4CAT | ||
cannot collect data on its own. After collecting data with Zeeschuimer it can be uploaded to 4CAT for further processing | ||
and analysis. See the Zeeschuimer documentation for more information on how to collect data with it. | ||
|
||
Data is collected as it is formatted internally by RedNote's website. Posts are stored as (large) JSON objects; it | ||
will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure | ||
is relatively straightforward and contains some data not included in the CSV exports. | ||
|
||
Note that, depending on the page from which data is captured, some metadata may not be available. For example, when capturing | ||
data from the 'Explore' page, the description and time of posting of a post are not available. These are however | ||
available when capturing from a post's page. After importing a dataset to 4CAT, the dataset status will summarise what | ||
information is and is not available. A `missing_fields` column additionally contains the names of columns with missing | ||
data for each imported item. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
""" | ||
Initialize RedNote data source | ||
""" | ||
|
||
# This data source is expected to expose an `init_datasource` function for
# its initialisation. The backend helpers library ships a default
# implementation, which is simply re-exported here.
from common.lib.helpers import init_datasource

# Internal identifier for this data source, followed by its display name
DATASOURCE = "xiaohongshu"
NAME = "Xiaohongshu/RedNote"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
""" | ||
Import scraped RedNote data | ||
It's prohibitively difficult to scrape data from RedNote within 4CAT itself due | ||
to its aggressive rate limiting. Instead, import data collected elsewhere. | ||
""" | ||
from datetime import datetime | ||
|
||
from backend.lib.search import Search | ||
from common.lib.item_mapping import MappedItem, MissingMappedField | ||
|
||
|
||
class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu/XHS data

    This class does not collect data itself (see `get_items`); it only maps
    previously captured objects to 4CAT items via the `map_item*` methods.
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always, since datasets can only be
        created by importing data collected elsewhere
        """
        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map XHS object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat it
        differently. A lot of data is missing from HTML objects.

        :param dict post:  Captured object
        :return MappedItem:  Mapped item
        """
        if post.get("_zs-origin") == "html":
            return SearchRedNote.map_item_from_html(post)

        if "note" in post:
            return SearchRedNote.map_item_from_json_embedded(post)

        if post.get("type") == "video":
            # for video objects the post itself serves as the `note_card`;
            # attach it to a new dict so the caller's object is not modified
            post = {**post, "note_card": post.copy()}

        return SearchRedNote.map_item_from_json_api_explore(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map API-sourced XHS object to 4CAT item

        Most straightforward - JSON objects from the XHS web API, which do
        however not always contain the same fields.

        :param dict post:  Captured object; the note data is read from its
        `note_card` key
        :return MappedItem:  Mapped item
        """
        item = post["note_card"]
        item_id = post.get("id", post.get("note_id"))

        # read (rather than pop) the first image, so that the source object
        # is left intact and mapping it again gives the same result
        if item.get("image_list"):
            image = item["image_list"][0]["url_default"]
        else:
            image = item["cover"]["url_default"]

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

        # `time` is divided by 1000 before use, i.e. treated as milliseconds
        timestamp = item.get("time", None)

        return MappedItem({
            "id": item_id,
            # use the resolved ID, so objects that only have `note_id` do not
            # raise a KeyError here
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": item["desc"] if "desc" in item else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            "image_url": image,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map JSON object from an XHS HTML page

        JSON objects from the HTML are formatted slightly differently, mostly
        in that they use camelCase instead of underscores, but we can also
        make a few more assumptions about the data

        :param dict item:  Captured object; the note data is read from its
        `note` key
        :return MappedItem:  Mapped item
        """
        note = item["note"]

        # read (rather than pop) the first image, so that the source object
        # is left intact and mapping it again gives the same result
        image = note["imageList"][0]["urlDefault"]

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={note['xsecToken']}"

        # NOTE(review): `time` and `interactInfo` are read from the outer
        # object rather than from `note` - confirm this matches the data
        timestamp = item.get("time", None)

        return MappedItem({
            "id": item["id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
            "title": note.get("title", ""),
            "body": note["desc"] if "desc" in note else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": note["user"]["nickname"],
            "author_avatar_url": note["user"]["avatar"],
            "image_url": image,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interactInfo"]["likedCount"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).

        :param dict item:  Pre-mapped object
        :return MappedItem:  Mapped item; body and timestamps are marked as
        missing
        """
        return MappedItem({
            "id": item["id"],
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })