diff --git a/sickle/app.py b/sickle/app.py
index 86e3599..6869ff8 100644
--- a/sickle/app.py
+++ b/sickle/app.py
@@ -12,6 +12,7 @@
 import time
 
 import requests
+from urllib3.util.retry import Retry
 
 from sickle.iterator import BaseOAIIterator, OAIItemIterator
 from sickle.response import OAIResponse
@@ -33,6 +34,20 @@
     'Identify': Identify,
 }
 
+
+def get_adapter(max_retries, custom_adapter=None):
+    """Return the HTTP adapter to mount on the session.
+
+    :param max_retries: Retry configuration stored on the adapter as ``max_retries``.
+    :param custom_adapter: Optional adapter instance to reuse. If given, its ``max_retries``
+                           attribute is overwritten and the same instance is returned.
+    :return: ``custom_adapter`` if given, otherwise a new ``requests.adapters.HTTPAdapter``.
+    """
+    if custom_adapter is not None:
+        custom_adapter.max_retries = max_retries
+        return custom_adapter
+    return requests.adapters.HTTPAdapter(max_retries=max_retries)
+
 
 class Sickle(object):
     """Client for harvesting OAI interfaces.
@@ -56,11 +71,11 @@ class Sickle(object):
                         use the value from the retry-after header (if present) and will wait the specified number of
                         seconds between retries.
     :type max_retries: int
-    :param retry_status_codes: HTTP status codes to retry (default will only retry on 503)
+    :param retry_status_codes: HTTP status codes to retry (default will retry on 429, 500, 502, 503 and 504)
     :type retry_status_codes: iterable
-    :param default_retry_after: default number of seconds to wait between retries in case no retry-after header is found
-                                on the response (defaults to 60 seconds)
-    :type default_retry_after: int
+    :param retry_backoff_factor: Backoff factor to apply between retries after the second try,
+                                 if no Retry-After header is sent by the server. Default: 2.0
+    :type retry_backoff_factor: float
     :type protocol_version: str
     :param class_mapping: A dictionary that maps OAI verbs to classes representing
                           OAI items. If not provided,
@@ -73,6 +88,8 @@ class Sickle(object):
                          information is missing, `requests` will fallback to
                          `'ISO-8859-1'`.
     :type encoding: str
+    :param custom_http_adapter: instance of subclass of requests.adapters.HTTPAdapter
+    :param custom_https_adapter: instance of subclass of requests.adapters.HTTPAdapter
     :param request_args: Arguments to be passed to requests when issuing HTTP
                          requests. Useful examples are `auth=('username', 'password')`
                          for basic auth-protected endpoints or `timeout=`.
@@ -86,9 +103,12 @@ def __init__(self, endpoint,
                  iterator=OAIItemIterator,
                  max_retries=0,
                  retry_status_codes=None,
-                 default_retry_after=60,
+                 default_retry_after=None,
+                 retry_backoff_factor=2,
                  class_mapping=None,
                  encoding=None,
+                 custom_http_adapter=None,
+                 custom_https_adapter=None,
                  **request_args):
 
         self.endpoint = endpoint
@@ -104,14 +124,36 @@ def __init__(self, endpoint,
         else:
             raise TypeError(
                 "Argument 'iterator' must be subclass of %s" % BaseOAIIterator.__name__)
-        self.max_retries = max_retries
-        self.retry_status_codes = retry_status_codes or [503]
-        self.default_retry_after = default_retry_after
+
+        if default_retry_after is not None:
+            logger.warning("default_retry_after is no longer supported, please use retry_backoff_factor instead.")
+
+        retry_options = dict(
+            total=max_retries,
+            backoff_factor=retry_backoff_factor,
+            status_forcelist=retry_status_codes or [429, 500, 502, 503, 504],
+            # Hand back the last response once retries are exhausted so that
+            # raise_for_status() in _request() still raises requests' HTTPError.
+            raise_on_status=False,
+        )
+        retry_methods = frozenset(['GET', 'POST'])
+        try:
+            # urllib3 >= 1.26 renamed ``method_whitelist`` to ``allowed_methods``;
+            # the old keyword was removed in urllib3 2.0.
+            max_retries = Retry(allowed_methods=retry_methods, **retry_options)
+        except TypeError:
+            # Older urllib3 releases only accept the original keyword.
+            max_retries = Retry(method_whitelist=retry_methods, **retry_options)
+
+        self.session = requests.Session()
+
+        self.session.mount('https://', get_adapter(max_retries, custom_https_adapter))
+        self.session.mount('http://', get_adapter(max_retries, custom_http_adapter))
+
         self.oai_namespace = OAI_NAMESPACE % self.protocol_version
         self.class_mapping = class_mapping or DEFAULT_CLASS_MAP
         self.encoding = encoding
         self.request_args = request_args
-        self.session = requests.Session()
 
     def harvest(self, **kwargs):  # pragma: no cover
         """Make HTTP requests to the OAI server.
@@ -120,23 +162,17 @@ def harvest(self, **kwargs):  # pragma: no cover
         :rtype: :class:`sickle.OAIResponse`
         """
         http_response = self._request(kwargs)
-        for _ in range(self.max_retries):
-            if self._is_error_code(http_response.status_code) \
-                    and http_response.status_code in self.retry_status_codes:
-                retry_after = self.get_retry_after(http_response)
-                logger.warning(
-                    "HTTP %d! Retrying after %d seconds..." % (http_response.status_code, retry_after))
-                time.sleep(retry_after)
-                http_response = self._request(kwargs)
-        http_response.raise_for_status()
         if self.encoding:
             http_response.encoding = self.encoding
         return OAIResponse(http_response, params=kwargs)
 
     def _request(self, kwargs):
         if self.http_method == 'GET':
-            return self.session.get(self.endpoint, params=kwargs, **self.request_args)
-        return self.session.post(self.endpoint, data=kwargs, **self.request_args)
+            response = self.session.get(self.endpoint, params=kwargs, **self.request_args)
+        else:
+            response = self.session.post(self.endpoint, data=kwargs, **self.request_args)
+        response.raise_for_status()
+        return response
 
     def ListRecords(self, ignore_deleted=False, **kwargs):
         """Issue a ListRecords request.
diff --git a/sickle/response.py b/sickle/response.py
index 76f6897..cb7a290 100644
--- a/sickle/response.py
+++ b/sickle/response.py
@@ -8,7 +8,7 @@
 
 from lxml import etree
 
-XMLParser = etree.XMLParser(remove_blank_text=True, recover=True, resolve_entities=False)
+XMLParser = etree.XMLParser(remove_blank_text=True, huge_tree=True, recover=True, resolve_entities=False)
 
 
 class OAIResponse(object):