From b1e35d0d6d82f278513b82f9dcf4be6edba5f9ff Mon Sep 17 00:00:00 2001 From: filak Date: Fri, 7 Jan 2022 00:11:31 +0100 Subject: [PATCH 1/4] Update app.py --- sickle/app.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sickle/app.py b/sickle/app.py index 86e3599..a4c8faa 100644 --- a/sickle/app.py +++ b/sickle/app.py @@ -89,6 +89,7 @@ def __init__(self, endpoint, default_retry_after=60, class_mapping=None, encoding=None, + custom_http_adapter=None, **request_args): self.endpoint = endpoint @@ -113,6 +114,10 @@ def __init__(self, endpoint, self.request_args = request_args self.session = requests.Session() + if custom_http_adapter: + self.session.mount(endpoint, custom_http_adapter) + + def harvest(self, **kwargs): # pragma: no cover """Make HTTP requests to the OAI server. From c98ce36f4ff617bb156a95258cb6dd3f3375302c Mon Sep 17 00:00:00 2001 From: filak Date: Fri, 7 Jan 2022 01:03:02 +0100 Subject: [PATCH 2/4] Update app.py --- sickle/app.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/sickle/app.py b/sickle/app.py index a4c8faa..0517abb 100644 --- a/sickle/app.py +++ b/sickle/app.py @@ -12,6 +12,8 @@ import time import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry from sickle.iterator import BaseOAIIterator, OAIItemIterator from sickle.response import OAIResponse @@ -56,11 +58,11 @@ class Sickle(object): use the value from the retry-after header (if present) and will wait the specified number of seconds between retries. :type max_retries: int - :param retry_status_codes: HTTP status codes to retry (default will only retry on 503) + :param retry_status_codes: HTTP status codes to retry (default will retry on 429, 500, 502, 503 and 504) :type retry_status_codes: iterable - :param default_retry_after: default number of seconds to wait between retries in case no retry-after header is found - on the response (defaults to 60 seconds) - :type default_retry_after: int + :param retry_backoff_factor: Backoff factor to apply between retries after the second try, + if no Retry-After header is sent by the server. Default: 2.0 + :type retry_backoff_factor: float :type protocol_version: str :param class_mapping: A dictionary that maps OAI verbs to classes representing OAI items. If not provided, @@ -86,7 +88,8 @@ def __init__(self, endpoint, iterator=OAIItemIterator, max_retries=0, retry_status_codes=None, - default_retry_after=60, + default_retry_after=None, + retry_backoff_factor=2, class_mapping=None, encoding=None, custom_http_adapter=None, @@ -105,14 +108,24 @@ def __init__(self, endpoint, else: raise TypeError( "Argument 'iterator' must be subclass of %s" % BaseOAIIterator.__name__) - self.max_retries = max_retries - self.retry_status_codes = retry_status_codes or [503] - self.default_retry_after = default_retry_after + + if default_retry_after is not None: + logger.warning("default_retry_after is no longer supported, please use retry_backoff_factor instead.") + + retry_adapter = requests.adapters.HTTPAdapter(max_retries=Retry( + total=max_retries, + backoff_factor=retry_backoff_factor, + status_forcelist=retry_status_codes or [429, 500, 502, 503, 504], + method_whitelist=frozenset(['GET', 'POST']) + )) + self.session = requests.Session() + self.session.mount('https://', retry_adapter) + self.session.mount('http://', retry_adapter) + self.oai_namespace = OAI_NAMESPACE % self.protocol_version self.class_mapping = class_mapping or DEFAULT_CLASS_MAP self.encoding = encoding self.request_args = request_args - self.session = requests.Session() if custom_http_adapter: self.session.mount(endpoint, custom_http_adapter) @@ -125,23 +138,17 @@ def harvest(self, **kwargs): # pragma: no cover :rtype: :class:`sickle.OAIResponse` """ http_response = self._request(kwargs) - for _ in range(self.max_retries): - if self._is_error_code(http_response.status_code) \ - and http_response.status_code in self.retry_status_codes: - retry_after = self.get_retry_after(http_response) - logger.warning( - "HTTP %d! Retrying after %d seconds..." % (http_response.status_code, retry_after)) - time.sleep(retry_after) - http_response = self._request(kwargs) - http_response.raise_for_status() if self.encoding: http_response.encoding = self.encoding return OAIResponse(http_response, params=kwargs) def _request(self, kwargs): if self.http_method == 'GET': - return self.session.get(self.endpoint, params=kwargs, **self.request_args) - return self.session.post(self.endpoint, data=kwargs, **self.request_args) + response = self.session.get(self.endpoint, params=kwargs, **self.request_args) + else: + response = self.session.post(self.endpoint, data=kwargs, **self.request_args) + response.raise_for_status() + return response def ListRecords(self, ignore_deleted=False, **kwargs): """Issue a ListRecords request. From 553606fcc698f1689d036d70199363a91a759075 Mon Sep 17 00:00:00 2001 From: filak Date: Fri, 7 Jan 2022 21:17:08 +0100 Subject: [PATCH 3/4] Update app.py --- sickle/app.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/sickle/app.py b/sickle/app.py index 0517abb..6869ff8 100644 --- a/sickle/app.py +++ b/sickle/app.py @@ -12,8 +12,7 @@ import time import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry +from urllib3.util.retry import Retry from sickle.iterator import BaseOAIIterator, OAIItemIterator from sickle.response import OAIResponse @@ -35,6 +34,14 @@ 'Identify': Identify, } +def get_adapter(max_retries, custom_adapter=None): + + if custom_adapter: + custom_adapter.max_retries = max_retries + return custom_adapter + else: + return requests.adapters.HTTPAdapter(max_retries=max_retries) + class Sickle(object): """Client for harvesting OAI interfaces. @@ -75,6 +82,8 @@ class Sickle(object): information is missing, `requests` will fallback to `'ISO-8859-1'`. :type encoding: str + :param custom_http_adapter: instance of subclass of requests.adapters.HTTPAdapter + :param custom_https_adapter: instance of subclass of requests.adapters.HTTPAdapter :param request_args: Arguments to be passed to requests when issuing HTTP requests. Useful examples are `auth=('username', 'password')` for basic auth-protected endpoints or `timeout=`. @@ -93,6 +102,7 @@ def __init__(self, endpoint, class_mapping=None, encoding=None, custom_http_adapter=None, + custom_https_adapter=None, **request_args): self.endpoint = endpoint @@ -112,23 +122,22 @@ def __init__(self, endpoint, if default_retry_after is not None: logger.warning("default_retry_after is no longer supported, please use retry_backoff_factor instead.") - retry_adapter = requests.adapters.HTTPAdapter(max_retries=Retry( - total=max_retries, - backoff_factor=retry_backoff_factor, - status_forcelist=retry_status_codes or [429, 500, 502, 503, 504], - method_whitelist=frozenset(['GET', 'POST']) - )) + max_retries = Retry( + total=max_retries, + backoff_factor=retry_backoff_factor, + status_forcelist=retry_status_codes or [429, 500, 502, 503, 504], + method_whitelist=frozenset(['GET', 'POST']) + ) + self.session = requests.Session() - self.session.mount('https://', retry_adapter) - self.session.mount('http://', retry_adapter) + + self.session.mount('https://', get_adapter(max_retries, custom_https_adapter)) + self.session.mount('http://', get_adapter(max_retries, custom_http_adapter)) self.oai_namespace = OAI_NAMESPACE % self.protocol_version self.class_mapping = class_mapping or DEFAULT_CLASS_MAP self.encoding = encoding - self.request_args = request_args - - if custom_http_adapter: - self.session.mount(endpoint, custom_http_adapter) + self.request_args = request_args def harvest(self, **kwargs): # pragma: no cover From ad6ef20d6534635083675815120cddf8cb4cf6a1 Mon Sep 17 00:00:00 2001 From: filak Date: Tue, 27 Sep 2022 15:59:34 +0200 Subject: [PATCH 4/4] Update response.py --- sickle/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sickle/response.py b/sickle/response.py index 76f6897..cb7a290 100644 --- a/sickle/response.py +++ b/sickle/response.py @@ -8,7 +8,7 @@ from lxml import etree -XMLParser = etree.XMLParser(remove_blank_text=True, recover=True, resolve_entities=False) +XMLParser = etree.XMLParser(remove_blank_text=True, huge_tree=True, recover=True, resolve_entities=False) class OAIResponse(object):