From f749b1d2da82adc0269b5a96685276a81089f716 Mon Sep 17 00:00:00 2001
From: Jorge Rivera
Date: Fri, 19 Dec 2025 15:17:47 +0100
Subject: [PATCH] handle broken urls

---
 CHANGELOG.md             |  5 ++++-
 pyproject.toml           |  2 +-
 src/oda_reader/common.py | 17 +++++++++++------
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9454209..12bc067 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,10 @@
 # Changelog for oda_reader
 
+## 1.3.5 (2025-12-19)
+- Fixes `_get_dataflow_version()` to gracefully handle URLs without a version pattern instead of crashing.
+
 ## 1.3.4 (2025-12-19)
-- Improves robustness of dataflow version fallback logic. The API error detection now checks response content regardless of HTTP status code, handling cases where error messages are returned with non-404 status codes.
+- Improves robustness of dataflow version fallback logic. The API error detection now checks response content regardless of HTTP status code, handling cases where error messages like "Could not find Dataflow" are returned with various status codes.
 
 ## 1.3.3 (2025-12-19)
 - Reverts DAC1 dataflow version from 1.6 to 1.5 to ensure compatibility with published data.
diff --git a/pyproject.toml b/pyproject.toml
index cf65be9..d61f764 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "oda_reader"
-version = "1.3.4"
+version = "1.3.5"
 description = "A simple package to import ODA data from the OECD's API and AidData's database"
 readme = "README.md"
 license = "MIT"
diff --git a/src/oda_reader/common.py b/src/oda_reader/common.py
index fa85646..a16813c 100644
--- a/src/oda_reader/common.py
+++ b/src/oda_reader/common.py
@@ -197,11 +197,15 @@ def _replace_dataflow_version(url: str, version: str) -> str:
     return re.sub(pattern, f",{version}/", url)
 
 
-def _get_dataflow_version(url: str) -> str:
-    """Get the dataflow version from the URL."""
-    pattern = r",(\d+\.\d+)/"
+def _get_dataflow_version(url: str) -> str | None:
+    """Get the dataflow version from the URL.
 
-    return re.search(pattern, url).group(1)
+    Returns:
+        The version string if found, None otherwise.
+    """
+    pattern = r",(\d+\.\d+)/"
+    match = re.search(pattern, url)
+    return match.group(1) if match else None
 
 
 def _get_response_text(url: str, headers: dict) -> tuple[int, str, bool]:
@@ -284,9 +288,10 @@ def get_data_from_api(url: str, compressed: bool = True, retries: int = 0) -> st
     # Check for dataflow not found errors - these should trigger version fallback
     # This check happens regardless of status code since the API may return
     # error messages with various status codes (404, 200, etc.)
-    if _is_dataflow_not_found_error(response):
+    # Only attempt fallback if the URL contains a dataflow version pattern
+    version = _get_dataflow_version(url)
+    if _is_dataflow_not_found_error(response) and version is not None:
         if retries < MAX_RETRIES:
-            version = _get_dataflow_version(url)
             new_version = str(round(float(version) - FALLBACK_STEP, 1))
             new_url = _replace_dataflow_version(url=url, version=new_version)
             logger.info(