Skip to content

Commit

Permalink
Xiaohongshu/RedNote data source
Browse files Browse the repository at this point in the history
  • Loading branch information
stijn-uva committed Feb 17, 2025
1 parent 6b80bac commit 34b8409
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 0 deletions.
14 changes: 14 additions & 0 deletions datasources/xiaohongshu/DESCRIPTION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
The RedNote data source can be used to manipulate data collected from [RedNote](https://www.xiaohongshu.com/) - also
known as Xiaohongshu or Little Red Book - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected with the browser extension; 4CAT
cannot collect data on its own. After collecting data with Zeeschuimer it can be uploaded to 4CAT for further processing
and analysis. See the Zeeschuimer documentation for more information on how to collect data with it.

Data is collected as it is formatted internally by RedNote's website. Posts are stored as (large) JSON objects; it
will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure
is relatively straightforward and contains some data not included in the CSV exports.

Note that depending on the page data is captured from, some metadata may not be available. For example, when capturing
data from the 'Explore' page, the description and time of posting of a post are not available. These are however
available when capturing from a post's page. After importing a dataset to 4CAT, the dataset status will summarise what
information is and is not available. A `missing_fields` column additionally contains the names of columns with missing
data for each imported item.
12 changes: 12 additions & 0 deletions datasources/xiaohongshu/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Initialize RedNote data source
"""

# An init_datasource function is expected to be available to initialize this
# data source. A default function that does this is available from the
# backend helpers library.
from common.lib.helpers import init_datasource

# Internal identifier for this data source
DATASOURCE = "xiaohongshu"
# Human-readable name of this data source (presumably the label shown to
# users - TODO confirm against how init_datasource uses it)
NAME = "Xiaohongshu/RedNote"
160 changes: 160 additions & 0 deletions datasources/xiaohongshu/search_rednote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
Import scraped RedNote data
It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField


class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu/XHS data

    This class does not collect data itself (`get_items` always raises); it
    only maps previously captured objects to 4CAT items via `map_item`. Three
    object shapes are handled: pre-mapped HTML objects, JSON embedded in an
    HTML page (camelCase keys) and JSON from the web API (snake_case keys).
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Unused
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map XHS object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat it
        differently. A lot of data is missing from HTML objects.

        The passed object is not modified.

        :param dict post:  Captured object
        :return MappedItem:  Mapped item
        """
        if post.get("_zs-origin") == "html":
            return SearchRedNote.map_item_from_html(post)

        if "note" in post:
            return SearchRedNote.map_item_from_json_embedded(post)

        if post.get("type") == "video":
            # video objects carry the note fields at the top level instead of
            # under `note_card`; wrap them in a new dict so the API mapper
            # can be reused without altering the caller's object
            post = {**post, "note_card": post}

        return SearchRedNote.map_item_from_json_api_explore(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map API-sourced XHS object to 4CAT item

        Most straightforward - JSON objects from the XHS web API, which do
        however not always contain the same fields.

        :param dict post:  Object with the note data under `note_card`
        :return MappedItem:  Mapped item; `body` and the timestamp fields are
        marked as missing when the object does not contain them
        :raises KeyError:  If user, like count or image data is absent
        """
        item = post["note_card"]
        item_id = post.get("id", post.get("note_id"))

        # use the first image, or the cover if there is no (non-empty) image
        # list; index rather than pop so the source object is left intact
        if item.get("image_list"):
            image = item["image_list"][0]["url_default"]
        else:
            image = item["cover"]["url_default"]

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

        # `time` is divided by 1000 before conversion, i.e. treated as
        # milliseconds
        timestamp = item.get("time", None)
        if timestamp:
            seconds = timestamp / 1000
            formatted_timestamp = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            formatted_timestamp = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        return MappedItem({
            "id": item_id,
            # use the resolved ID, so objects that only have `note_id` work
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": item["desc"] if "desc" in item else MissingMappedField(""),
            "timestamp": formatted_timestamp,
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            "image_url": image,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map JSON object from an XHS HTML page

        JSON objects from the HTML are formatted slightly differently, mostly
        in that they use camelCase instead of underscores, but we can also
        make a few more assumptions about the data

        :param dict item:  Object with the note data under `note`
        :return MappedItem:  Mapped item; `body` and the timestamp fields are
        marked as missing when the object does not contain them
        :raises KeyError:  If ID, token, user, like count or image data is absent
        :raises IndexError:  If the image list is empty
        """
        note = item["note"]

        # index rather than pop so the source object is left intact
        image = note["imageList"][0]["urlDefault"]

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={note['xsecToken']}"

        # NOTE(review): `time` and `interactInfo` are read from the outer
        # object while all other fields come from `note` - confirm this
        # matches the captured data
        timestamp = item.get("time", None)
        if timestamp:
            seconds = timestamp / 1000
            formatted_timestamp = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            formatted_timestamp = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        return MappedItem({
            "id": item["id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
            "title": note.get("title", ""),
            "body": note["desc"] if "desc" in note else MissingMappedField(""),
            "timestamp": formatted_timestamp,
            "author": note["user"]["nickname"],
            "author_avatar_url": note["user"]["avatar"],
            "image_url": image,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interactInfo"]["likedCount"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).

        :param dict item:  Pre-mapped object
        :return MappedItem:  Mapped item; `body` and the timestamp fields are
        always marked as missing
        :raises KeyError:  If any of the copied fields is absent
        """
        return MappedItem({
            "id": item["id"],
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })

0 comments on commit 34b8409

Please sign in to comment.