Skip to content

Commit

Permalink
Added support for getting <LIMIT> num of issues from backlog
Browse files Browse the repository at this point in the history
  • Loading branch information
Armandur committed Apr 3, 2022
1 parent b2a0bee commit e96dfc1
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 29 deletions.
15 changes: 15 additions & 0 deletions examplebacklogconf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"credentials":
{
"site": "", # URL of reader webpage
"cdn": "", # Optional, some sites may use other CDN than https://mediacdn.prenly.com, look in dev tools network for webp/pdf requests to a CDN
"textalk-auth": "YOUR OWN", # Look in api/v2/ request header (X-Textalk-Content-Client-Authorize), seems to be persistent
"auth": "YOUR OWN" # Look in Storage/Local Storage/prenlyreadersessiontoken or headers of request to api/v2/, remove "Bearer ", seems to be persistent
},
"publication":
{
"title": 4019, # ID of a paper
"limit": 10 # Limit of backlog issues from newest to get
},
"prefix": "Newspaper - " # String to prefix files with
}
13 changes: 7 additions & 6 deletions exampleconf.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
{
"credentials":
{
"site": "", # URL of reader webpage
"cdn": "", # Optional, some sites may use other CDN than https://mediacdn.prenly.com, look in dev tools network for webp/pdf requests to a CDN
"textalk-auth": "YOUR OWN", # Look in api/v2/ request header (X-Textalk-Content-Client-Authorize), seems to be persistent
"auth": "YOUR OWN", # Look in Storage/Local Storage/prenlyreadersessiontoken or headers of request to api/v2/, remove "Bearer ", seems to be persistent
"auth": "YOUR OWN" # Look in Storage/Local Storage/prenlyreadersessiontoken or headers of request to api/v2/, remove "Bearer ", seems to be persistent
},
"issue":
"publication":
{
"title": 4019, # ID of a paper
"uids": [ # UIDs of issues to download from paper, look in request to /api/v2 for issue_uid
"517313",
"490377"
],
"site": "", # URL of reader webpage
"cdn": "" # Some sites may use other CDN than https://mediacdn.prenly.com, look in dev tools network for webp/pdf requests to a CDN
}
]
},
"prefix": "Newspaper - " # String to prefix files with
}
104 changes: 81 additions & 23 deletions prenly-dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,53 @@
from PyPDF2.utils import PdfReadError


def getIssueJSON(session, credentials, issue):
def getContextToken(session, conf):
url = "https://content.textalk.se/api/web-reader/v1/context-token"
headers = {
"Accept": "*/*",
"Accept-Language": "sv-SE,en-US,en",
"Accept-Encoding": "gzip, deflate, br",
"X-Requested-With": "XMLHttpRequest",
"Content-Type": "application/json-rpc",
"X-Textalk-Content-Client-Authorize": conf["credentials"]["textalk-auth"],
"X-Textalk-Content-Product-Type": "webapp",
"Authorization": f"Bearer {conf['credentials']['auth']}",
"Origin": conf["credentials"]["site"],
"Referer": f"{conf['credentials']['site']}/",
"Connection": "keep-alive",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "cross-site"
}
token = json.loads(session.get(url, headers=headers).text)["token"]

return token


def getCatalogueIssues(session, conf, contextToken, limit=50):
issues = []
url = f"https://apicdn.prenly.com/api/web-reader/v1/issues?title_ids[]={conf['publication']['title']}&limit={limit}&context_token={contextToken}"
headers = {
"Accept": "*/*",
"Accept-Language": "sv-SE,en-US,en",
"Accept-Encoding": "gzip",
"Origin": conf["credentials"]["site"],
"Referer": f"{conf['credentials']['site']}/",
"Connection": "keep-alive",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "cross-site",
}

response = session.get(url, headers=headers)
JSON = json.loads(response.text)

for issue in JSON:
issues.append(issue["uid"])
return issues


def getIssueJSON(session, issue, conf):
url = "https://content.textalk.se/api/v2/"

headers = {
Expand All @@ -20,9 +66,9 @@ def getIssueJSON(session, credentials, issue):
"Accept-Encoding": "gzip, deflate, br",
"X-Requested-With": "XMLHttpRequest",
"Content-Type": "application/json-rpc",
"X-Textalk-Content-Client-Authorize": credentials["textalk-auth"],
"X-Textalk-Content-Client-Authorize": conf["credentials"]["textalk-auth"],
"X-Textalk-Content-Product-Type": "webapp",
"Authorization": f"Bearer {credentials['auth']}",
"Authorization": f"Bearer {conf['credentials']['auth']}",
"Connection": "keep-alive",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
Expand Down Expand Up @@ -50,14 +96,14 @@ def getIssueJSON(session, credentials, issue):
return JSON


def getPDF(session, issue, hash, cdn="https://mediacdn.prenly.com"):
def getPDF(session, conf, hash, cdn="https://mediacdn.prenly.com"):
# Some sites may use another cdn

url = f"{cdn}/api/v2/media/get/{issue['title']}/{hash}?h=abc"
url = f"{cdn}/api/v2/media/get/{conf['publication']['title']}/{hash}?h=abc"

headers = {
"Origin": issue["site"],
"Referer": f"{issue['site']}/",
"Origin": conf["credentials"]["site"],
"Referer": f"{conf['credentials']['site']}/",
"Accept": "application/pdf",
"Accept-Language": "sv-SE,sv;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate, br",
Expand Down Expand Up @@ -114,29 +160,40 @@ def pdfMerge(title):


def main(conf):
# Support for multiple uids
for uid in conf["issue"]["uids"]:
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"})

#download newest to limit number of issues of publication
if "limit" in conf["publication"]:
uids = getCatalogueIssues(session, conf, getContextToken(session, conf), conf["publication"]["limit"])
conf["publication"]["uids"] = uids

if len(conf["publication"]["uids"]) == 0 or "uids" not in conf["publication"]:
print("No uids supplied", file=sys.stderr)
exit(1)

if "prefix" not in conf:
conf["prefix"] = ""

for uid in conf["publication"]["uids"]:
issue = {
"title": conf["issue"]["title"],
"title": conf["publication"]["title"],
"uid": uid,
"site": conf["issue"]["site"]
"site": conf["credentials"]["site"]
}

session = requests.Session()
session.headers.update(
{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"})

JSON = getIssueJSON(session, conf["credentials"], issue)
JSON = getIssueJSON(session, issue, conf)
hashes = getHashes(JSON) # Extract the hashes for individual pages

# Get all PDFs and write them to files.
for page_num in hashes:
if "cdn" in conf["issue"] and conf["issue"]["cdn"] != "": # Custom CDN supplied
if "cdn" in conf["credentials"] and conf["credentials"]["cdn"] != "": # Custom CDN supplied
req = getPDF(
session, issue, hashes[page_num], conf["issue"]["cdn"])
session, conf, hashes[page_num], conf["credentials"]["cdn"])
else:
req = getPDF(session, issue, hashes[page_num])
name = JSON['result']['name']
req = getPDF(session, conf, hashes[page_num])

name = f"{conf['prefix']}{JSON['result']['name']}"

content = req.content

Expand All @@ -161,7 +218,7 @@ def opts(argv):
print(usage)
sys.exit(1)

config = {}
conf = {}

try:
# title issue site, cdn, textalk, auth
Expand All @@ -172,14 +229,15 @@ def opts(argv):
print(repr(error), file=sys.stderr)
print(usage)
sys.exit(2)

for opt, arg in opts:
if opt == "--json":
with open(arg, "r") as file:
config = json.loads(file.read())
conf = json.loads(file.read())
break
# TODO rest of options, build config with supplied options

main(config)
main(conf)


if __name__ == '__main__':
Expand Down

0 comments on commit e96dfc1

Please sign in to comment.