From 0eccfd32e2c531acd11f55df8aaa3aae1e700baa Mon Sep 17 00:00:00 2001 From: bradlywilson Date: Sun, 26 Oct 2025 18:23:17 -0600 Subject: [PATCH] Some files are m4a on the Church website and were getting ignored; those are now downloaded and also added to the playlists. Also there were a few talks that didn't have an audio file that caused the script to break so I added a try/except to prevent that. --- gen_conf_downloader.py | 48 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index 86ad416..eb6a1ec 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -32,7 +32,7 @@ from urllib.parse import quote_plus import urllib.request import zlib -from mutagen.mp3 import MP3 +import mutagen Conference = namedtuple('Conference', 'link title year month') Session = namedtuple('Session', 'conference link title number') @@ -57,6 +57,7 @@ MP3_MEDIAURL_REGEX = r'{"mediaUrl":"([^"]*)","variant":"audio"}' MP3_MEDIAURL_FILENAME_REGEX = r'.*/(.*\.mp3)' +M4A_MEDIAURL_FILENAME_REGEX = r'.*/(.*\.m4a)' SESSIONS_REGEX = r']*href="([^"]*)"[^>]*>]*>

]*>([^<]*)

]*>(.*?)' SESSION_TALKS_REGEX = r']*href="([^"]*)"[^>]*>]*>

]*>(.*?)

]*>([^<]*)

' @@ -180,12 +181,14 @@ def clean_title(title): def create_mp3_filename(args, talk): - return "{}-{:02d}-{}-{}-{}.mp3".format( + ## Took out extension. We'll add it in get_audio after determining if mp3 or m4a + fname = "{}-{:02d}-{}-{}-{}.".format( talk.session.conference.year, talk.session.conference.month, talk.session.number * 100 + talk.number, talk.speaker.lower().replace(" ", "-"), args.lang) + return fname def create_playlists(args, all_talks): @@ -224,6 +227,8 @@ def download_all_content(args): for talk in all_talks: progress_bar.set_description_str(talk.title, refresh=True) audio = get_audio(args, f'{LDS_ORG_URL}{decode(talk.link)}', create_mp3_filename(args, talk)) + if args.verbose and not audio: + sys.stderr.write(f'No audio file found at URL') if audio and download_audio(progress_bar, args, get_relative_path(args, talk.session), audio): if not args.noplaylists: update_playlists(args, playlists, talk, audio) @@ -243,6 +248,8 @@ def download_all_content(args): def download_audio(progress_bar, args, relpath, audio): # If audio file doesn't yet exist, attempt to retrieve it file_path = f'{get_output_dir(args)}/{relpath}/{audio.file}' + if args.verbose: + print("Using file path: {}".format(file_path)) if not os.path.isfile(file_path): try: req = urllib.request.Request(audio.link) @@ -380,11 +387,21 @@ def get_audio_2024(args, url): driver.get(url) try: options = driver.find_element(By.XPATH, '//button[@title="Options"]') + print("Giving options click with URL: {}".format(url)) options.click() + ## Some pages don't have an audio file, eg + ## 1993-10 Combatting Spiritual Drift—Our Global Pandemic By Elder Russell M. Nelson + ## https://www.churchofjesuschrist.org/study/general-conference/1993/10/combatting-spiritual-drift-our-global-pandemic?lang=eng + ## 2014-10 Joseph Smith By Elder Neil L. 
Andersen + ## https://www.churchofjesuschrist.org/study/general-conference/2014/10/joseph-smith?lang=eng try: download = WebDriverWait(driver, 20).until( EC.element_to_be_clickable((By.XPATH, '//button[@data-testid="download-menu-button"]')) ) + except: + pass + else: + print("Downloading with URL: {}".format(url)) download.click() add_to_cache(args, driver.page_source, url) return re.search(MP3_DOWNLOAD_REGEX, driver.page_source) @@ -396,6 +413,7 @@ def get_audio_2024(args, url): def get_audio(args, url, mp3_file_name): link_html = get_html(args, url) + ext = "mp3" mp3_link = re.search(MP3_DOWNLOAD_REGEX, link_html) # In April 2022 the MP3 link became buried in base64 encoded script section match = re.search(SCRIPT_BASE64_REGEX, link_html) @@ -411,17 +429,27 @@ def get_audio(args, url, mp3_file_name): # Search for JSON object containing mediaUrl key and value mp3_link = re.search(MP3_MEDIAURL_REGEX, script_data) if not mp3_link: + if args.verbose: + print("No regular link found, using post 2024 style") mp3_link = get_audio_2024(args, url) if not mp3_link: sys.stderr.write(f'Problem finding mp3 link ({url}') return # Extract and reuse the filename from the MP3 URL + if args.verbose: + print("Defined mp3_link as: {}".format(mp3_link)) + mp3_file = re.match(MP3_MEDIAURL_FILENAME_REGEX, mp3_link.group(1)) + if not mp3_file: + ## Some file are occasionally m4a instead of mp3 + mp3_file = re.match(M4A_MEDIAURL_FILENAME_REGEX, mp3_link.group(1)) + ext = "m4a" if not mp3_file: return # Create audio object with link and filename + mp3_file_name = str(mp3_file_name) + ext return Audio(mp3_link.group(1), mp3_file_name) @@ -636,7 +664,14 @@ def update_playlists(args, playlists, talk, audio): month_path = get_month_path(args, talk.session.conference) session_path = get_session_path(args, talk.session, nonumbers=True) relative_path = get_relative_path(args, talk.session) - duration = MP3(f'{get_output_dir(args)}/{relative_path}/{audio.file}').info.length + # ~ duration = 
MP3(f'{get_output_dir(args)}/{relative_path}/{audio.file}').info.length + file_name, file_extension = os.path.splitext(audio.file) + duration = 0 + if file_extension == 'mp3': + duration = mutagen.mp3(f'{get_output_dir(args)}/{relative_path}/{audio.file}').info.length + elif file_extension == 'm4a': + #if not duration: #file_extension == 'm4a' + duration = mutagen.mp4(f'{get_output_dir(args)}/{relative_path}/{audio.file}').info.length # Add this talk to the year, conference, or session playlists playlists[f'Conferences/{year_path}'].append({'duration' : duration, 'path' : f'../{relative_path}/{audio.file}', 'title' : talk.title}) @@ -700,7 +735,10 @@ def write_playlist_file(args, playlist_path, playlist_data): f.write("#EXTM3U\n\n") for audio_info in playlist_data: f.write(f"#EXTINF:{get_duration_text(audio_info['duration'])}, {audio_info['title']}\n") - f.write(audio_info['path'].replace("/","\\")) + if os.name == 'nt': ## For windows use backslashes in paths + f.write(audio_info['path'].replace("/","\\")) + else: ## MacOS, Unix, Linux, BSD, leave the slashes. + f.write(audio_info['path']) f.write("\n\n") @@ -717,6 +755,8 @@ def write_playlists(args, playlists): def write_mp3_to_disk(data, filename): os.makedirs(os.path.dirname(filename), exist_ok=True) + if args.verbose: + print("Saving file: {}".format(filename)) with open(filename, "wb") as f: f.write(data)