|
3 | 3 | """
|
4 | 4 |
|
5 | 5 | import argparse
|
| 6 | +import base64 |
6 | 7 | import datetime
|
7 | 8 | import glob
|
8 | 9 | import io
|
|
35 | 36 | TalkByTopic = namedtuple('TalkByTopic', 'link speaker title topic')
|
36 | 37 |
|
37 | 38 | LDS_ORG_URL = 'https://www.churchofjesuschrist.org'
|
38 |
| -ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/general-conference/conferences' |
39 |
| -ALL_TOPICS_URL = f'{LDS_ORG_URL}/general-conference/topics' |
| 39 | +ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/study/general-conference' |
| 40 | +ALL_TOPICS_URL = f'{LDS_ORG_URL}/study/general-conference/topics' |
40 | 41 |
|
41 | 42 | GET_LANGS_REGEX = 'data-lang=\".*?\" data-clang=\"(.*?)\">(.*?)</a>'
|
42 | 43 | CONFERENCES_REGEX = '<a[^>]*href="([^"]*)"[^>]*><div[^>]*><img[^>]*></div><span[^>]*>([A-Z][a-z]* \d{4})</span></a>'
|
43 | 44 | CONFERENCE_GROUPS_REGEX = '<a[^>]*href="([^"]*)"[^>]*><div[^>]*><img[^>]*></div><span[^>]*>(\d{4}.\d{4})</span></a>'
|
44 | 45 | CONFERENCE_GROUPS_RANGE_REGEX = '.*/(\d{4})(\d{4})\?lang=.*'
|
45 | 46 | CONFERENCE_LINK_YEAR_MONTH_REGEX = '.*(\d{4})/(\d{2})\?lang=.*'
|
46 | 47 |
|
| 48 | +SCRIPT_BASE64_REGEX = '<script>window.__INITIAL_STATE__[^"]*"([^"]*)";</script>' |
47 | 49 | MP3_DOWNLOAD_REGEX = '<a[^>]*href="([^"]*)"[^>]*>This Page \(MP3\).*?</a>'
|
48 |
| -MP3_FILENAME_REGEX = '.*/(.*\.mp3)\?lang=.*' |
| 50 | +MP3_DOWNLOAD_FILENAME_REGEX = '.*/(.*\.mp3)\?lang=.*' |
| 51 | +MP3_MEDIAURL_REGEX = '{"mediaUrl":"([^"]*)","variant":"audio"}' |
| 52 | +MP3_MEDIAURL_FILENAME_REGEX = '.*/(.*\.mp3)' |
49 | 53 |
|
50 | 54 | SESSIONS_REGEX = '<a[^>]*href="([^"]*)"[^>]*><div[^>]*><p><span[^>]*>([^<]*)</span></p></div></a><ul[^>]*>(.*?)</ul>'
|
51 | 55 | SESSION_TALKS_REGEX = '<a[^>]*href="([^"]*)"[^>]*><div[^>]*><p><span[^>]*>([^<]*)</span></p><p[^>]*>([^<]*)</p></div></a>'
|
@@ -302,10 +306,23 @@ def get_all_talks_by_topic(args):
|
302 | 306 | def get_audio(args, talk):
|
303 | 307 | link_html = get_html(args, f'{LDS_ORG_URL}{decode(talk.link)}')
|
304 | 308 | mp3_link = re.search(MP3_DOWNLOAD_REGEX, link_html)
|
305 |
| - if not mp3_link: |
| 309 | + # In April 2022 the MP3 link became buried in a base64-encoded script section |
| 310 | + match = re.search(SCRIPT_BASE64_REGEX, link_html) |
| 311 | + if mp3_link: |
| 312 | + # Extract and reuse the filename from the MP3 URL (exclude language) |
| 313 | + mp3_file = re.match(MP3_DOWNLOAD_FILENAME_REGEX, mp3_link.group(1)) |
| 314 | + elif not mp3_link and not match: |
306 | 315 | return
|
| 316 | + elif not mp3_link and match: |
| 317 | + # MP3 link is probably in the base64 encoded script section |
| 318 | + script_data = str(base64.b64decode(match.group(1))) |
| 319 | + # Search for the JSON object containing the mediaUrl key and value |
| 320 | + mp3_link = re.search(MP3_MEDIAURL_REGEX, script_data) |
| 321 | + if not mp3_link: |
| 322 | + return |
| 323 | + # Extract and reuse the filename from the MP3 URL |
| 324 | + mp3_file = re.match(MP3_MEDIAURL_FILENAME_REGEX, mp3_link.group(1)) |
307 | 325 |
|
308 |
| - mp3_file = re.match(MP3_FILENAME_REGEX, mp3_link.group(1)) |
309 | 326 | if not mp3_file:
|
310 | 327 | return
|
311 | 328 |
|
|
0 commit comments