From b7c564f71d2c12af684e7ce0a08f64fa77645b46 Mon Sep 17 00:00:00 2001 From: glop102 Date: Thu, 1 Sep 2022 02:50:49 -0400 Subject: [PATCH 1/3] Added tags to the title for common failure conditions There are two common failure modes for e(x)hentai galleries 1) the gallery is expunged (eg copyright takedown) 2) the user is out of GP to download the archive Failure 2 is easily detectable as the response to getting the archive URL specifically reports such a failure. Failure 1 is more of an educated guess as the point of detection is getting a 404 even though we were given an archive key. Also fixed a seemingly innocuous bug in the condition determining which login service to use. The condition to choose x or non-x was backwards. --- plugins/EHentai Downloader/main.py | 34 ++++++++++++++++++------------ 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/plugins/EHentai Downloader/main.py b/plugins/EHentai Downloader/main.py index 5c8dabd..f77a972 100644 --- a/plugins/EHentai Downloader/main.py +++ b/plugins/EHentai Downloader/main.py @@ -97,7 +97,7 @@ def download_query(item, is_exhentai): """ # get ehentai login - login_site = URLS['eh'] if is_exhentai else URLS['ex'] + login_site = URLS['ex'] if is_exhentai else URLS['eh'] login_status = hpx.command.GetLoginStatus(login_site) login_session = None if login_status: @@ -165,19 +165,27 @@ def download_query(item, is_exhentai): session=login_session ) r = hpx.command.SinglePOSTRequest().request(a_url, req_props) - if r.ok and "Key missing, or incorrect key provided" not in r.text: - soup = BeautifulSoup(r.text, "html.parser") - dp_url = soup.find("p", id="continue") - if dp_url and dp_url.a: # finally - download_requests.append( - DownloadRequest( - downloaditem=item, - url=dp_url.a['href'] + '?start=1', - properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session - filename=item.name.strip()+'.zip')) - archive_req = True + if r.ok: + if 
"Insufficient funds" in r.text: + log.info("Unable to grab gallery archive due to insufficent funds (GP) on the account") + item.name = "(Insufficient GP) "+item.name + elif "Key missing, or incorrect key provided" not in r.text: + soup = BeautifulSoup(r.text, "html.parser") + dp_url = soup.find("p", id="continue") + if dp_url and dp_url.a: # finally + download_requests.append( + DownloadRequest( + downloaditem=item, + url=dp_url.a['href'] + '?start=1', + properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session + filename=item.name.strip()+'.zip')) + archive_req = True else: log.warning(f"got invalid key page or bad status: {r.status_code}") + if r.status_code == 404 and "This gallery is currently unavailable" in r.text: + #We know that there is a valid key for us to get here, so the gallery existed at some point in the past + #Most of the time, it is a copyright takedown, but I don't see a simple way to determine what is the cause + item.name = "(Gallery Expunged) "+item.name else: log.warning(f"didn't find archiver key for data: {eh_data}") @@ -219,4 +227,4 @@ def parse_url(url): gallery_id, gallery_token = gallery_id_token.split('/') else: log.warning("Error extracting g_id and g_token from url: {}".format(url)) - return int(gallery_id), gallery_token \ No newline at end of file + return int(gallery_id), gallery_token From c3389884c10a5e35dce5976b4ab40d74eafcdc2b Mon Sep 17 00:00:00 2001 From: glop102 Date: Wed, 14 Sep 2022 23:58:15 -0400 Subject: [PATCH 2/3] Failed attempt at expunged galleries This is only a failure because the flag seems to make zero sense. I have examples where things are marked as expunged but are still totally available on both domains, archives included. I also have examples where the archives are not available but the expunged flag is still false. 
I have not found a single example yet where a gallery is marked as expunged and is unavailable, ergo, the flag is useless. --- plugins/EHentai Downloader/main.py | 34 ++++++++++++++++-------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/plugins/EHentai Downloader/main.py b/plugins/EHentai Downloader/main.py index f77a972..996e8d5 100644 --- a/plugins/EHentai Downloader/main.py +++ b/plugins/EHentai Downloader/main.py @@ -137,19 +137,22 @@ def download_query(item, is_exhentai): log.info("got empty response when trying to retrieve archiver key, this usually means that user has no access to exhentai") if response and not 'error' in response: for gdata in response['gmetadata']: + # Often the gallery is still available to download even though it is flagged as expunged + # This makes no sense to me, as other times when the gallery is not available, this flag is still set to false and so seems like it is totally pointless + # if "expunged" in gdata and gdata["expunged"]==True: + # item.name = "(Gallery Expunged) "+item.name + if 'title' in gdata: + item.name = gdata['title'] + if 'thumb' in gdata: + download_requests.append( + DownloadRequest( + downloaditem=item, + url=gdata['thumb'], + is_thumbnail=True, + properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session + )) + thumbnail_req = True if 'archiver_key' in gdata: - if 'title' in gdata: - item.name = gdata['title'] - if 'thumb' in gdata: - download_requests.append( - DownloadRequest( - downloaditem=item, - url=gdata['thumb'], - is_thumbnail=True, - properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session - )) - thumbnail_req = True - log.info(f"found archiver key for gallery {(gid, gtoken)}") a_key = gdata['archiver_key'] a_url = URLS['ex_archiver' if is_exhentai else 'e_archiver'].format(gallery_id=gid, gallery_token=gtoken, 
archiver_key=a_key) @@ -184,9 +187,8 @@ def download_query(item, is_exhentai): log.warning(f"got invalid key page or bad status: {r.status_code}") if r.status_code == 404 and "This gallery is currently unavailable" in r.text: #We know that there is a valid key for us to get here, so the gallery existed at some point in the past - #Most of the time, it is a copyright takedown, but I don't see a simple way to determine what is the cause - item.name = "(Gallery Expunged) "+item.name - + #This seems like it is most of the time a copyright takedown, but I have no idea why this is not marked as expunged + item.name = "(Gallery Unavailable) "+item.name else: log.warning(f"didn't find archiver key for data: {eh_data}") except Exception as e: @@ -212,7 +214,7 @@ def download_done(result): should return: the same :class:`DownloadResult` that was provided to the handler, potentially modified on the 'path' or `status` and `reason` properties """ - # there's nothing special to post-process in the case of nhentai downloader, so just return the result as is + # there's nothing special to post-process in the case of e(x)hentai downloader, so just return the result as is log.info(f"download of archive was successful for {result.downloaditem.name}") return result From c5e6c02f1ef7cae9c665cd517e85ed839db50cfd Mon Sep 17 00:00:00 2001 From: glop102 Date: Fri, 16 Sep 2022 16:03:47 -0400 Subject: [PATCH 3/3] Small refactor to the ehentai downloader The code was a bit silly with the indentation due to nesting things inside of if statements that never even had an else clause. Mostly code was just moved up in scope and the if statements were inverted to return early when meeting the pre-condition. Also added in comments for different sections and organized the sections to keep logical parts together. 
--- plugins/EHentai Downloader/main.py | 203 ++++++++++++++++------------- 1 file changed, 111 insertions(+), 92 deletions(-) diff --git a/plugins/EHentai Downloader/main.py b/plugins/EHentai Downloader/main.py index 996e8d5..bfaaa06 100644 --- a/plugins/EHentai Downloader/main.py +++ b/plugins/EHentai Downloader/main.py @@ -96,111 +96,128 @@ def download_query(item, is_exhentai): (though only once, meaning, no handler will be called upon again with the exact same URL during a single session) """ + gid, gtoken = parse_url(item.url) + download_requests = [] + + #=============================================================================== # get ehentai login login_site = URLS['ex'] if is_exhentai else URLS['eh'] login_status = hpx.command.GetLoginStatus(login_site) login_session = None if login_status: login_session = hpx.command.GetLoginSession(login_site) - - gid, gtoken = parse_url(item.url) - - download_requests = [] - - thumbnail_req = False - archive_req = False - - if login_session: - log.info("logged in, attempting to download archive") - # get the archiver key - log.info("getting archiver key") - # prepare request - eh_data = { - 'method': 'gdata', - 'gidlist': [[gid, gtoken]], - } - req_props = hpx.command.RequestProperties( - headers=HEADERS, - json=eh_data, - session=login_session - ) - api_url = URLS['ex_api' if is_exhentai else 'e_api'] - log.info(f"requesting with api url {api_url}") - r = hpx.command.SinglePOSTRequest().request(api_url, req_props) - - if r.ok: - try: - try: - response = r.json - except json.JSONDecodeError: - response = None - log.info("got empty response when trying to retrieve archiver key, this usually means that user has no access to exhentai") - if response and not 'error' in response: - for gdata in response['gmetadata']: - # Often the gallery is still available to download even though it is flagged as expunged - # This makes no sense to me, as other times when the gallery is not available, this flag is still set to false and 
so seems like it is totally pointless - # if "expunged" in gdata and gdata["expunged"]==True: - # item.name = "(Gallery Expunged) "+item.name - if 'title' in gdata: - item.name = gdata['title'] - if 'thumb' in gdata: + if not login_session: + log.warning("unable to get a login sesion for querying gallery data") + return () + log.info("logged in, attempting to download archive") + + #=============================================================================== + # get the gallery metadata which should have the archive key + # https://ehwiki.org/wiki/API#Gallery_Metadata + log.info("getting archiver key") + eh_data = { + 'method': 'gdata', + 'gidlist': [[gid, gtoken]], + } + req_props = hpx.command.RequestProperties( + headers=HEADERS, + json=eh_data, + session=login_session + ) + api_url = URLS['ex_api' if is_exhentai else 'e_api'] + log.info(f"requesting with api url {api_url}") + r = hpx.command.SinglePOSTRequest().request(api_url, req_props) + if not r.ok: + log.warning(f"got invalid metadata page or bad status: {r.status_code}") + log.debug(r.text) + return () + try: + response = r.json + except json.JSONDecodeError: + response = None + log.info("got empty response when trying to retrieve archiver key, this usually means that user has no access to exhentai") + return () + if not response or 'error' in response: + log.warning("response has an error of some sort, and so we have no archive key to use") + log.debug(r.text) + return () + + #=============================================================================== + # Read the metadata of the gallery to fill out the download queue item + # + # While in theory we should only ever have a single entry in the response, best keep it more general just in case this gets retrofitted to handle multiple urls at once + # The information we reliably get is the thumbnail url and the title of the gallery + # We seem to also always get an archive key, but the key is not always valid, and so the archive url request can fail + 
# + # Yes, there is an expunged flag in the metadata, but it is always false when the gallery/archive is not available + # It is also true sometimes and yet the gallery/archive is totally accessable and so is meaningless + for gdata in response['gmetadata']: + archive_req = False + try: + if 'title' in gdata: + item.name = gdata['title'] + if 'thumb' in gdata: + download_requests.append( + DownloadRequest( + downloaditem=item, + url=gdata['thumb'], + is_thumbnail=True, + properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session + )) + if 'archiver_key' in gdata: + log.info(f"found archiver key for gallery {(gid, gtoken)}") + a_key = gdata['archiver_key'] + a_url = URLS['ex_archiver' if is_exhentai else 'e_archiver'].format(gallery_id=gid, gallery_token=gtoken, archiver_key=a_key) + form_data = { + "dltype": "org", #original quality, instead of a resampled version + "dlcheck": "Download Original Archive" + } + req_props = hpx.command.RequestProperties( + headers=HEADERS, + data=form_data, + session=login_session + ) + r = hpx.command.SinglePOSTRequest().request(a_url, req_props) + if r.ok: + if "Insufficient funds" in r.text: + log.info("Unable to grab gallery archive due to insufficent funds (GP) on the account") + item.name = "(Insufficient GP) "+item.name + elif "Key missing, or incorrect key provided" not in r.text: + soup = BeautifulSoup(r.text, "html.parser") + dp_url = soup.find("p", id="continue") + if dp_url and dp_url.a: # finally download_requests.append( DownloadRequest( downloaditem=item, - url=gdata['thumb'], - is_thumbnail=True, + url=dp_url.a['href'] + '?start=1', properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session - )) - thumbnail_req = True - if 'archiver_key' in gdata: - log.info(f"found archiver key for gallery {(gid, gtoken)}") - a_key = gdata['archiver_key'] - a_url = 
URLS['ex_archiver' if is_exhentai else 'e_archiver'].format(gallery_id=gid, gallery_token=gtoken, archiver_key=a_key) - # prepare request - # get the download url - form_data = { - "dltype": "org", - "dlcheck": "Download Original Archive" - } - req_props = hpx.command.RequestProperties( - headers=HEADERS, - data=form_data, - session=login_session - ) - r = hpx.command.SinglePOSTRequest().request(a_url, req_props) - if r.ok: - if "Insufficient funds" in r.text: - log.info("Unable to grab gallery archive due to insufficent funds (GP) on the account") - item.name = "(Insufficient GP) "+item.name - elif "Key missing, or incorrect key provided" not in r.text: - soup = BeautifulSoup(r.text, "html.parser") - dp_url = soup.find("p", id="continue") - if dp_url and dp_url.a: # finally - download_requests.append( - DownloadRequest( - downloaditem=item, - url=dp_url.a['href'] + '?start=1', - properties=hpx.command.RequestProperties(method=hpx.Method.GET, headers=HEADERS, session=login_session), # we need to use the same session - filename=item.name.strip()+'.zip')) - archive_req = True - else: - log.warning(f"got invalid key page or bad status: {r.status_code}") - if r.status_code == 404 and "This gallery is currently unavailable" in r.text: - #We know that there is a valid key for us to get here, so the gallery existed at some point in the past - #This seems like it is most of the time a copyright takedown, but I have no idea why this is not marked as expunged - item.name = "(Gallery Unavailable) "+item.name - else: - log.warning(f"didn't find archiver key for data: {eh_data}") - except Exception as e: - log.debug(f"got an error, last request content: \n\t {r.text}") - raise - - if not archive_req: - pass - # TODO: download individual images instead + filename=item.name.strip()+'.zip')) + archive_req = True + log.debug(f"adding the archive url {download_requests[-1].url}") + if not archive_req: + log.info("Something went wrong and we did not actually find a URL") + #TODO 
Actually better handle the various cases of why we do not have a url + else: + log.warning(f"got invalid key page or bad status: {r.status_code}") + if r.status_code == 404 and "This gallery is currently unavailable" in r.text: + #We know that there is a valid key for us to get here, so the gallery existed at some point in the past + #This seems like it is most of the time a copyright takedown, but I have no idea why this is not marked as expunged + item.name = "(Gallery Unavailable) "+item.name + else: + log.warning(f"didn't find archiver key for data: {eh_data}") + item.name = "(Archive Unavailable) "+item.name + except Exception as e: + log.debug(f"got an error, last request content: \n\t {r.text}") + raise + + if not archive_req: + pass + # TODO: download individual images instead if download_requests: log.info(f"was able to prepare {len(download_requests)} requests") + else: + log.info("unable to prepare any URLs to download") return tuple(download_requests) @hpx.attach("Download.done", trigger=[EX_IDENTIFIER, EH_IDENTIFIER]) @@ -216,6 +233,8 @@ def download_done(result): """ # there's nothing special to post-process in the case of e(x)hentai downloader, so just return the result as is log.info(f"download of archive was successful for {result.downloaditem.name}") + #TODO Mark it as a failure if there was only a thumbnail to download + #TODO Archive the individual images together into a cbz or something if we grabbed individual images return result def parse_url(url):