diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 95b20f5..61fc62b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -22,12 +22,15 @@ jobs:
         uses: astral-sh/setup-uv@v6
         with:
           python-version: ${{ matrix.python-version }}
+          enable-cache: false

      - name: Install dependencies
        run: uv sync --all-groups

      - name: Run unit tests
        run: uv run pytest tests/ -n auto -m "not integration" -v
+        env:
+          ODA_READER_CACHE_DIR: ${{ runner.temp }}/oda_cache

      - name: Integration Tests
        if: github.event_name == 'pull_request' && matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb357d5..17e5537 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog for oda_reader
 
+## 1.3.1 (2025-06-27)
+- Improves cache management for very large files. Introduces tests and improves documentation.
+
 ## 1.3.0 (2025-06-16)
 - Improves cache management.
diff --git a/docs/docs/advanced.md b/docs/docs/advanced.md
index 0b8977c..9d84816 100644
--- a/docs/docs/advanced.md
+++ b/docs/docs/advanced.md
@@ -45,9 +45,9 @@ OECD occasionally changes dataflow versions (schema updates). ODA Reader handles
 
 When a dataflow version returns 404 (not found), ODA Reader automatically:
 
-1. Tries the configured version (e.g., `1.0`)
-2. If 404, retries with `0.9`
-3. Continues decrementing: `0.8`, `0.7`, `0.6`
+1. Tries the configured version (e.g., `1.5`)
+2. If 404, retries with `1.4`
+3. Continues decrementing: `1.3`, `1.2`, `1.1`
 4. Returns data from first successful version (up to 5 attempts)
 
 This means your code keeps working even when OECD makes breaking schema changes.
@@ -58,9 +58,9 @@ This means your code keeps working even when OECD makes breaking schema changes.
 from oda_reader import download_dac1
 
 # ODA Reader will automatically try:
-# 1.0 -> 404
-# 0.9 -> 404
-# 0.8 -> Success! Returns data with version 0.8
+# 1.5 -> 404
+# 1.4 -> 404
+# 1.3 -> Success! Returns data with version 1.3
 data = download_dac1(start_year=2022, end_year=2022)
 ```
@@ -71,11 +71,11 @@
 You'll see a message indicating which version succeeded.
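+
+To surface that message in a script, raise the log level before downloading (a minimal sketch, assuming ODA Reader emits its messages through Python's standard `logging` module):
+
+```python
+import logging
+
+logging.basicConfig(level=logging.INFO)  # show info-level messages
+
+from oda_reader import download_dac1
+
+data = download_dac1(start_year=2022, end_year=2022)
+# The log output indicates which dataflow version was used
+```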
 
 You can specify an exact dataflow version:
 
 ```python
-# Force use of version 0.8
+# Force use of version 1.3
 data = download_dac1(
     start_year=2022,
     end_year=2022,
-    dataflow_version="0.8"
+    dataflow_version="1.3"
 )
 ```
@@ -177,26 +177,6 @@ combined = pd.merge(
 - Column names and codes must align
 - Filter carefully to avoid double-counting
 
-## Custom Schema Handling
-
-If you need custom schema translation beyond built-in options:
-
-### Access Raw Data and Translate Manually
-
-```python
-# Get raw API data
-data = download_dac1(
-    start_year=2022,
-    end_year=2022,
-    pre_process=False,
-    dotstat_codes=False
-)
-
-# Apply custom transformations
-data = data.rename(columns={'DONOR': 'donor_custom'})
-data['donor_custom'] = data['donor_custom'].map(my_custom_mapping)
-```
-
 ### Load Schema Mapping Files
 
 ```python
@@ -234,51 +214,6 @@ def get_crs_data():
     return pd.read_parquet("/data/crs_full.parquet")
 ```
 
-### Refresh Strategy
-
-```python
-from pathlib import Path
-from datetime import datetime, timedelta
-
-def refresh_if_old(file_path, max_age_days=7):
-    """Re-download if file is older than max_age_days"""
-    path = Path(file_path)
-
-    if not path.exists():
-        print("File doesn't exist, downloading...")
-        bulk_download_crs(save_to_path=file_path)
-        return
-
-    file_age = datetime.now() - datetime.fromtimestamp(path.stat().st_mtime)
-
-    if file_age > timedelta(days=max_age_days):
-        print(f"File is {file_age.days} days old, refreshing...")
-        bulk_download_crs(save_to_path=file_path)
-    else:
-        print(f"File is recent ({file_age.days} days old), using cached version")
-
-# Use in pipeline
-refresh_if_old("/data/crs_full.parquet", max_age_days=7)
-crs_data = pd.read_parquet("/data/crs_full.parquet")
-```
-
-### Memory-Efficient Aggregation
-
-```python
-# Process bulk CRS in chunks, aggregate results
-sector_totals = {}
-
-for chunk in bulk_download_crs(as_iterator=True):
-    # Aggregate by sector
-    sector_sums = chunk.groupby('purpose_code')['usd_commitment'].sum()
-
-    # Accumulate
-    for sector, amount in sector_sums.items():
-        sector_totals[sector] = sector_totals.get(sector, 0) + amount
-
-print(f"Total sectors: {len(sector_totals)}")
-```
-
 ## Debugging Tips
 
 ### Enable Verbose Logging
diff --git a/docs/docs/bulk-downloads.md b/docs/docs/bulk-downloads.md
index 73c2e1e..6892632 100644
--- a/docs/docs/bulk-downloads.md
+++ b/docs/docs/bulk-downloads.md
@@ -71,8 +71,6 @@ bulk_download_crs(
 )
 ```
 
-The reduced version omits some descriptive columns but retains all flow amounts and key dimensions.
-
 ## Memory-Efficient Processing with Iterators
 
 For very large files, process in chunks to avoid loading the entire dataset into memory:
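+
+For instance, you can reduce chunks into a running aggregate (a minimal sketch, assuming `as_iterator=True` yields pandas DataFrame chunks with .Stat-style column names):
+
+```python
+from oda_reader import bulk_download_crs
+
+total_commitments = 0.0
+for chunk in bulk_download_crs(as_iterator=True):
+    # Only the current chunk is held in memory
+    total_commitments += chunk["usd_commitment"].sum()
+
+print(f"Total commitments: {total_commitments:,.0f}")
+```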
@@ -206,37 +204,6 @@ Bulk downloads already have:
 
 See [Schema Translation](schema-translation.md) for detailed comparison.
 
-## Combining Bulk and API Downloads
-
-You can mix approaches:
-
-```python
-# Download full CRS as bulk file
-crs_full = bulk_download_crs()
-
-# Use API for recent updates or specific queries
-crs_recent = download_crs(
-    start_year=2023,
-    end_year=2023,
-    filters={"donor": "USA"}
-)
-
-# Combine if schemas match
-# (you may need to harmonize column names first)
-```
-
-## Performance Comparison
-
-Approximate times (varies by network speed and OECD server load):
-
-| Method | Dataset Size | Time |
-|--------|-------------|------|
-| API download (filtered) | 10,000 rows | 10-30 seconds |
-| API download (large query) | 100,000 rows | 2-5 minutes |
-| Bulk download CRS | ~2 million rows | 1-2 minutes |
-| Bulk + iterator (filter) | Process 2 million rows | 2-5 minutes |
-
-Bulk downloads are consistently fast regardless of query complexity, while API times vary significantly with query size.
 
 ## Troubleshooting
diff --git a/docs/docs/datasets.md b/docs/docs/datasets.md
index dd3d3e6..9e627d6 100644
--- a/docs/docs/datasets.md
+++ b/docs/docs/datasets.md
@@ -7,16 +7,17 @@ ODA Reader provides access to five datasets covering official development assist
 | Dataset | What It Contains | Use When |
 |---------|------------------|----------|
 | **DAC1** | Aggregate flows by donor | Analyzing overall ODA trends, donor performance |
-| **DAC2a** | Bilateral flows by donor-recipient | Recipient-level analysis, who gives to whom |
+| **DAC2a** | Bilateral flows by donor-recipient | Recipient-level analysis |
 | **CRS** | Project-level microdata | Sector analysis, project details, activity-level data |
 | **Multisystem** | Multilateral system usage | Analyzing multilateral channels and contributions |
-| **AidData** | Chinese development finance | Non-DAC donor analysis, Chinese aid flows |
+| **AidData** | Chinese development finance | Chinese aid flows |
 
 ## DAC1: Aggregate Flows
 
 **What it contains**: Total ODA and OOF by donor, aggregated across all recipients and sectors. This is the highest-level view of development assistance.
 
 **Key dimensions**:
+
 - Donor (bilateral donors and multilateral organizations)
 - Measure type (ODA, OOF, grants, loans, etc.)
 - Flow type (commitments, disbursements, grant equivalents)
@@ -24,6 +25,7 @@ ODA Reader provides access to five datasets covering official development assist
 - Unit measure (USD millions, national currency, etc.)
 
 **Use when**:
+
 - You need donor-level totals
 - Analyzing overall ODA trends over time
 - Comparing donor performance
@@ -56,12 +58,14 @@ oda_constant = download_dac1(
 **What it contains**: Bilateral ODA and OOF flows broken down by both donor and recipient country. Shows who gives to whom.
 
 **Key dimensions**:
+
 - Donor (bilateral donors)
 - Recipient (receiving countries and regions)
 - Measure type (bilateral ODA, imputed multilateral, etc.)
 - Price base (current or constant)
 
 **Use when**:
+
 - Analyzing flows to specific recipient countries
 - Understanding bilateral relationships
 - Studying geographic distribution of aid
@@ -95,6 +99,7 @@ germany_eastafrica = download_dac2a(
 
 **What it contains**: Individual project and activity-level data with detailed information about each development assistance activity. This is the most granular dataset.
 
 **Key dimensions**:
+
 - Donor
 - Recipient
 - Sector (purpose codes at various levels of detail)
@@ -104,6 +109,7 @@ germany_eastafrica = download_dac2a(
 - Microdata flag (True for project-level, False for semi-aggregates)
 
 **Use when**:
+
 - You need project-level details (descriptions, amounts, sectors)
 - Analyzing sector-specific flows
 - Understanding implementation channels
@@ -153,6 +159,7 @@ semi_agg = download_crs(
 **What it contains**: Data on how DAC members use the multilateral aid system, including core contributions to multilateral organizations and earmarked funding.
 
 **Key dimensions**:
+
 - Donor
 - Recipient (multilateral organizations)
 - Channel (specific multilateral organizations)
@@ -160,6 +167,7 @@ semi_agg = download_crs(
 - Measure type
 
 **Use when**:
+
 - Analyzing multilateral contributions
 - Understanding core vs. earmarked funding
 - Studying specific multilateral channels (World Bank, UN agencies, etc.)
@@ -191,6 +199,7 @@ ida_contributions = download_multisystem(
 **What it contains**: Project-level data on Chinese development finance activities, compiled by AidData. Covers official finance from China that may not be reported to the OECD.
 
 **Key dimensions**:
+
 - Commitment year
 - Recipient country
 - Sector
@@ -198,9 +207,9 @@ ida_contributions = download_multisystem(
 - Flow amounts and types
 
 **Use when**:
+
 - Analyzing Chinese development finance
-- Comparing traditional DAC donors with China
-- Studying non-DAC donor activities
+- Comparing DAC donors with China
 
 **Example**:
@@ -213,7 +222,7 @@ chinese_aid = download_aiddata(start_year=2015, end_year=2020)
 
 # AidData is downloaded as bulk file, filtered by year after download
 ```
 
-**Note**: AidData comes from Excel files, not the OECD API. It uses a different schema than DAC datasets.
+**Note**: AidData comes from Excel files published on the AidData website, not the OECD API. It uses a different schema than DAC datasets.
 
 ## Discovering Available Filters
diff --git a/docs/docs/filtering.md b/docs/docs/filtering.md
index 8d2156c..9eba74d 100644
--- a/docs/docs/filtering.md
+++ b/docs/docs/filtering.md
@@ -101,12 +101,13 @@ multisystem_filters = get_available_filters("multisystem")
 ### DAC1 and DAC2a
 
 Common dimensions:
+
 - `donor` - Donor country (ISO3 codes like "USA", "GBR", "FRA")
 - `recipient` - Recipient country or region (DAC2a only)
 - `measure` - Type of flow (ODA, OOF, grants, loans, etc.)
 - `flow_type` - Commitments, disbursements, net flows, etc.
 - `price_base` - "V" for current prices, "Q" for constant prices
-- `unit_measure` - "USD" for US dollars, "XDC" for national currency
+- `unit_measure` - "USD" for US dollars
 
 **Example**: Get net ODA disbursements in constant prices:
 
 data = download_dac1(
@@ -127,6 +128,7 @@
 ### CRS (Creditor Reporting System)
 
 CRS has additional dimensions:
+
 - `sector` - Purpose codes (5-digit codes like "12220" for basic health)
 - `channel` - Implementing organization (government, NGO, multilateral, etc.)
 - `modality` - Grant, loan, equity, etc.
@@ -178,6 +180,7 @@ The `_T` suffix means "total" - it aggregates across that dimension to avoid dou
 ### Multisystem
 
 Multisystem tracks multilateral contributions:
+
 - `donor` - Contributing country
 - `channel` - Specific multilateral organization (e.g., "44002" for World Bank IDA)
 - `flow_type` - Commitments, disbursements
@@ -214,7 +217,8 @@ print(data['measure'].unique())  # See all measure codes
 
 3. **Use trial and error**: Download a small query and examine column values
 
-**Note**: Codes differ between API schema and .Stat schema. By default, ODA Reader returns .Stat codes. See [Schema Translation](schema-translation.md) for details.
+**Note**: Codes differ between API schema and .Stat schema. When making API calls, you must use the
+API schema. However, by default, ODA Reader returns .Stat codes. See [Schema Translation](schema-translation.md) for details.
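+
+For example (a minimal sketch; `filters` usage follows the examples earlier on this page):
+
+```python
+from oda_reader import download_dac1
+
+# Filter with the API schema code for the donor dimension...
+data = download_dac1(
+    start_year=2022,
+    end_year=2022,
+    filters={"donor": "USA"},
+)
+
+# ...but, by default, codes in the returned frame follow the .Stat schema
+print(data.head())
+```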
 
 ## Empty Filters
diff --git a/docs/docs/getting-started.md b/docs/docs/getting-started.md
index 9279bdf..dcb8901 100644
--- a/docs/docs/getting-started.md
+++ b/docs/docs/getting-started.md
@@ -13,7 +13,7 @@ pip install oda-reader
 
 Or using uv (recommended for faster installs):
 
 ```bash
-uv pip install oda-reader
+uv add oda-reader
 ```
 
 That's it! ODA Reader and its dependencies (pandas, requests, pyarrow, etc.) are now installed.
@@ -121,6 +121,4 @@ Now that you've downloaded your first datasets, explore:
 
 **Query is slow**: First-time queries can take 10-30 seconds as ODA Reader fetches from OECD's API. Subsequent identical queries are instant due to caching.
 
-**Rate limit errors**: By default, ODA Reader limits to 20 requests per 60 seconds. This should prevent rate limit errors. If you see them, your cache might have been cleared. Wait a minute and retry.
-
-**Import errors**: Make sure you installed with dependencies: `pip install oda-reader` (not just `oda_reader`).
+**Rate limit errors**: By default, ODA Reader limits itself to 20 requests per hour. This should prevent rate limit errors. If you see them, your cache might have been cleared. Wait and retry.
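+
+If results look stale, you can inspect or reset the HTTP cache (a sketch; these helpers are exercised in the package's test suite, and the import path is an assumption):
+
+```python
+from oda_reader import clear_http_cache, get_http_cache_info  # import path assumed
+
+print(get_http_cache_info())  # cached response and redirect counts
+clear_http_cache()            # wipe cached responses and retry the query
+```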
diff --git a/docs/docs/index.md b/docs/docs/index.md
index bb8ccd3..ccab9b3 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -1,12 +1,12 @@
 # ODA Reader
 
-**Programmatic access to OECD DAC data without the headaches**
+**Programmatic access to OECD DAC data**
 
-Working with OECD Development Assistance Committee (DAC) data is frustrating. You need to navigate multiple datasets (DAC1, DAC2a, CRS), understand complex SDMX API syntax, manage rate limits, and reconcile different schema versions. The OECD doesn't provide any first-party Python library to help.
+Working with OECD Development Assistance Committee (DAC) data can be frustrating. You need to navigate multiple datasets (DAC1, DAC2a, CRS, ...), understand complex SDMX API syntax, manage strict rate limits, and reconcile different schema versions. The OECD doesn't provide any first-party Python library to help.
 
-Worse, the OECD has a habit of introducing undocumented schema changes, breaking link URLs, and making format changes without notice. What works today might break tomorrow, making it extremely difficult to build robust data pipelines for research and analysis.
+Unfortunately, the OECD has a habit of introducing undocumented schema changes, breaking link URLs, and making format changes without notice. What works today might break tomorrow, making it very difficult to build robust data pipelines for research and analysis.
 
-ODA Reader eliminates these headaches. It provides a unified Python interface that handles complexity for you: automatic version fallbacks when schemas change, consistent APIs across datasets, smart caching to reduce dependency on flaky endpoints, and schema translation between API and legacy formats.
+ODA Reader eliminates these headaches. It provides a unified Python interface that handles the complexity for you: automatic discovery of the latest schema version, consistent APIs across datasets, smart caching to reduce dependency on flaky endpoints, and schema translation between the Data Explorer API and OECD.Stat formats.
 
 **Key features**:
@@ -15,7 +15,8 @@ ODA Reader eliminates these headaches. It provides a unified Python interface th
 - **Bulk download large files** with memory-efficient streaming for the full CRS (1GB+)
 - **Automatic rate limiting** and caching to work within API constraints
 - **Schema translation** between Data Explorer API and OECD.Stat formats
-- **Version fallback** automatically retries with older schema versions when OECD makes breaking changes
+- **Version fallback** automatically searches for the most recent schema version, since versions
+  can change unexpectedly with new data releases
 
 **Built for researchers, analysts, and developers** who need reliable, programmatic access to ODA data without fighting infrastructure.
diff --git a/docs/docs/schema-translation.md b/docs/docs/schema-translation.md
index f335ca5..6d1c6a3 100644
--- a/docs/docs/schema-translation.md
+++ b/docs/docs/schema-translation.md
@@ -7,6 +7,7 @@ OECD DAC data exists in two schema formats: the modern Data Explorer API schema
 ### Data Explorer API Schema (New)
 
 The current OECD Data Explorer uses a new schema:
+
 - Column names: `DONOR`, `RECIPIENT`, `MEASURE`, etc. (all caps)
 - Dimension codes: Modern conventions (e.g., donor codes)
 - Used by: API downloads (`download_dac1()`, `download_crs()`, etc.)
@@ -14,6 +15,7 @@ The current OECD Data Explorer uses a new schema:
 ### OECD.Stat Schema (Legacy)
 
 The older OECD.Stat system uses a different schema:
+
 - Column names: `DonorCode`, `RecipientCode`, `Measure`, etc. (mixed case)
 - Dimension codes: Legacy conventions, sometimes different from API codes
 - Used by: Bulk download files, historical .Stat exports
@@ -122,7 +124,6 @@ data = download_dac1(start_year=2022, end_year=2022)
 
 **Pros**:
 - Works with existing .Stat-based workflows
-- Codes are human-readable (ISO3 country codes)
 - Compatible with bulk download files
 
 ### Mode 2: Raw API Response
@@ -148,7 +149,6 @@ data = download_dac1(
 
 **Cons**:
 - Harder to work with (inconsistent naming)
-- Codes are not human-readable
 
 ### Mode 3: Preprocessed with API Codes
@@ -179,30 +179,23 @@ data = download_dac1(
 
 ### Donor Codes
 
-| API Code | .Stat Code | Country |
-|----------|------------|---------|
-| `1` | `AUS` | Australia |
-| `2` | `AUT` | Austria |
-| `12` | `USA` | United States |
-| `301` | `GBR` | United Kingdom |
+| .Stat Code | API Code | Country |
+|------------|----------|---------|
+| `1` | `AUS` | Australia |
+| `2` | `AUT` | Austria |
+| `12` | `USA` | United States |
+| `301` | `GBR` | United Kingdom |
 
 ### Measure Codes (DAC1)
 
-| API Code | .Stat Code | Description |
-|----------|------------|-------------|
-| `100` | `1010` | Net ODA |
-| `106` | `1011` | ODA Grants |
-| `11017` | `11017` | Grant equiv. of loans |
+| .Stat Code | API Code | Description |
+|------------|----------|-------------|
+| `100` | `1010` | Net ODA |
+| `106` | `1011` | ODA Grants |
+| `11017` | `11017` | Grant equiv. of loans |
 
 (Note: Some codes are the same across schemas)
 
-### Flow Type Codes
-
-| API Code | .Stat Code | Description |
-|----------|------------|-------------|
-| `A` | `1140` | Disbursements |
-| `C` | `1110` | Commitments |
-| `D` | `1160` | Net flows |
 
 Translation mappings are maintained in `src/oda_reader/schemas/mappings/` as JSON files.
@@ -262,20 +255,20 @@ bulk_data = bulk_data.rename(columns={
 
 ## When to Use Which Mode
 
 **Use default mode (pre_process=True, dotstat_codes=True)**:
-- ✅ General analysis and research
-- ✅ Combining API downloads with bulk files
-- ✅ Working with historical .Stat exports
-- ✅ Human-readable codes (ISO3 country codes)
+
+- General analysis and research
+- Combining API downloads with bulk files
+- Working with historical .Stat exports
+- Human-readable codes (ISO3 country codes)
 
 **Use raw mode (pre_process=False, dotstat_codes=False)**:
-- ✅ Debugging API issues
-- ✅ Understanding API response structure
-- ❌ Not recommended for analysis
+- Debugging API issues
+- Understanding API response structure
 
 **Use API codes mode (pre_process=True, dotstat_codes=False)**:
-- ✅ Working exclusively with new Data Explorer API
-- ✅ When you prefer OECD's latest code conventions
-- ❌ Avoid if combining with bulk downloads or .Stat files
+- Working exclusively with new Data Explorer API
+- When you prefer OECD's latest code conventions
+- Avoid if combining with bulk downloads or .Stat files
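+
+Side by side, the three modes look like this (parameters exactly as documented above):
+
+```python
+from oda_reader import download_dac1
+
+# Mode 1 (default): preprocessed, .Stat codes
+default_df = download_dac1(start_year=2022, end_year=2022)
+
+# Mode 2: raw API response
+raw_df = download_dac1(start_year=2022, end_year=2022, pre_process=False, dotstat_codes=False)
+
+# Mode 3: preprocessed, API codes
+api_df = download_dac1(start_year=2022, end_year=2022, pre_process=True, dotstat_codes=False)
+```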
@@ -262,20 +255,20 @@ bulk_data = bulk_data.rename(columns={ ## When to Use Which Mode **Use default mode (pre_process=True, dotstat_codes=True)**: -- ✅ General analysis and research -- ✅ Combining API downloads with bulk files -- ✅ Working with historical .Stat exports -- ✅ Human-readable codes (ISO3 country codes) + +- General analysis and research +- Combining API downloads with bulk files +- Working with historical .Stat exports +- Human-readable codes (ISO3 country codes) **Use raw mode (pre_process=False, dotstat_codes=False)**: -- ✅ Debugging API issues -- ✅ Understanding API response structure -- ❌ Not recommended for analysis +- Debugging API issues +- Understanding API response structure **Use API codes mode (pre_process=True, dotstat_codes=False)**: -- ✅ Working exclusively with new Data Explorer API -- ✅ When you prefer OECD's latest code conventions -- ❌ Avoid if combining with bulk downloads or .Stat files +- Working exclusively with new Data Explorer API +- When you prefer OECD's latest code conventions +- Avoid if combining with bulk downloads or .Stat files ## Finding Code Mappings diff --git a/docs/docs/why-oda-reader.md b/docs/docs/why-oda-reader.md index 35bedc0..5b6346f 100644 --- a/docs/docs/why-oda-reader.md +++ b/docs/docs/why-oda-reader.md @@ -1,6 +1,5 @@ # Why ODA Reader? -This page explains why ODA Reader exists, how it compares to alternatives, and when you might want to use it (or not). ## The Problem with OECD DAC Data Access @@ -8,7 +7,7 @@ The OECD Development Assistance Committee publishes comprehensive data on offici **No official Python library**: The OECD doesn't provide any first-party Python tools for accessing DAC data. You're on your own to figure out the SDMX API, construct queries, and parse responses. -**Undocumented breaking changes**: The OECD regularly introduces schema changes without documentation or warning. A dataflow version that worked last month might return 404 errors today. Link URLs change, breaking saved bookmarks and automated downloads. +**Undocumented breaking changes**: The OECD regularly introduces schema changes without documentation or warning. A dataflow version that worked last month might return 404 errors today, or worse - return outdated data. Link URLs change, breaking saved bookmarks and automated downloads. **Inconsistent formats**: Different datasets use different schemas. The new Data Explorer API uses one set of dimension codes, while legacy .Stat files and bulk downloads use another. Reconciling these takes significant effort. @@ -31,7 +30,7 @@ The OECD Development Assistance Committee publishes comprehensive data on offici ### Manual Downloads from OECD.Stat -**Approach**: Download CSV or Excel files from OECD.Stat portal manually. +**Approach**: Download Parquet, CSV or Excel files from the data-explorer manually. **Challenges**: - No automation - manual clicking and downloading @@ -81,43 +80,24 @@ API calls to OECD are slow (often 10-30 seconds per query) and subject to rate l You can disable or clear caching when you need fresh data. -### How Version Fallback Works - -When OECD changes a dataflow schema version, ODA Reader: -1. Tries the configured version (e.g., `1.0`) -2. If 404 error, automatically retries with `0.9` -3. Continues decrementing (0.8, 0.7, 0.6) up to 5 attempts -4. Returns data from first successful version - -This means your code keeps working even when OECD makes breaking changes. 
 
 ## Limitations and When Not to Use ODA Reader
 
-**Be honest about limitations:**
-
-❌ **Not for real-time data**: Caching introduces delays. If you need the absolute latest data published in the last hour, you'll need to clear cache or use the OECD portal directly.
-❌ **Requires Python knowledge**: This is a Python package. If you're not comfortable with Python and pandas, the OECD.Stat portal's Excel downloads might be easier.
+**Requires Python knowledge**: This is a Python package. If you're not comfortable with Python and pandas, the OECD.Stat portal's Excel downloads might be easier.
 
-❌ **Only covers DAC data**: ODA Reader focuses exclusively on Development Assistance Committee datasets. For other OECD data (economic indicators, education statistics, etc.), you'll need different tools.
+**Mostly focused on DAC data**: ODA Reader focuses on Development Assistance Committee datasets. However, we recently introduced data from AidData.
 
-❌ **Bulk downloads limited**: Only CRS, Multisystem, and AidData have bulk download options. For other datasets, you must use the API.
-
-❌ **Dependent on OECD availability**: While caching helps, initial downloads still depend on OECD's servers being available and responsive.
+**Dependent on OECD availability**: While caching helps, initial downloads still depend on OECD's servers being available and responsive.
 
 ## When to Use ODA Reader
 
-✅ You're doing research or analysis that requires ODA/OOF data
-
-✅ You need programmatic, reproducible access to multiple datasets
-
-✅ You're building data pipelines that need to be robust to OECD's changes
-
-✅ You want to avoid manually managing API rate limits and caching
-
-✅ You need to work with both API and bulk download formats
-
-✅ You're comfortable with Python and pandas
+- You're doing research or analysis that requires ODA/OOF data
+- You need programmatic, reproducible access to multiple datasets
+- You're building data pipelines that need to be robust to OECD's changes
+- You want to avoid manually managing API rate limits and caching
+- You need to work with both API and bulk download formats
+- You're comfortable with Python and pandas
 
 ## Next Steps
diff --git a/pyproject.toml b/pyproject.toml
index 50a7800..9eed507 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "oda_reader"
-version = "1.3.0.post0"
+version = "1.3.1"
 description = "A simple package to import ODA data from the OECD's API and AidData's database"
 readme = "README.md"
 license = "MIT"
@@ -8,6 +8,21 @@ authors = [
     { name = "Jorge Rivera", email = "jorge.rivera@one.org" }
 ]
 requires-python = ">=3.10"
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3 :: Only",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
 dependencies = [
     "filelock>=3.18.0",
     "joblib>=1.4",
diff --git a/src/oda_reader/_cache/README.md b/src/oda_reader/_cache/README.md
index 10bfa47..7b14df0 100644
--- a/src/oda_reader/_cache/README.md
+++ b/src/oda_reader/_cache/README.md
@@ -48,7 +48,7 @@ User Request
      ↓ (cache miss)
 ┌─────────────────────────────────────────┐
 │ HTTP Cache (requests-cache)             │
-│ - SQLite backend                        │
+│ - Filesystem backend                    │
 │ - 7-day TTL                             │
 │ - Caches 200 and 404 responses          │
 └─────────────────────────────────────────┘
@@ -65,7 +65,9 @@ Default location: `~/.cache/oda-reader/{version}/` (macOS/Linux) or `%LOCALAPPDA
 
 ```
 ~/.cache/oda-reader/1.2.2/
-├── http_cache.sqlite          # HTTP response cache
+├── http_cache/                # HTTP response cache (filesystem backend)
+│   ├── ...
+│   └── ...
 ├── dataframes/                # Processed DataFrames
 │   ├── 2986243275235237.parquet
 │   └── 00b396b02a62f1cb.parquet
@@ -320,7 +322,7 @@ Second download: 0.03s (DataFrame cache) - 90x faster
 
 ### Storage Usage
 
 Typical cache sizes:
-- HTTP cache: 1-10 MB (SQLite database)
+- HTTP cache: 1-10 MB per response (filesystem backend, can handle >2GB responses)
 - DataFrame cache: 0.1-1 MB per query (compressed parquet)
 - Bulk files: 100-1000 MB per file (CRS full dataset ~900 MB)
@@ -456,9 +458,10 @@ This ensures different preprocessing options get separate cache entries.
 
 ### HTTP Cache Backend
 
-Uses `requests-cache` with SQLite backend:
-- Database: `{cache_dir}/http_cache.sqlite`
-- Stores responses, redirects, and metadata
+Uses `requests-cache` with filesystem backend:
+- Directory: `{cache_dir}/http_cache/`
+- Stores responses, redirects, and metadata as individual files
+- Handles large responses (>2GB) without issues
 - Automatic cleanup on expiration
 - Thread-safe for concurrent requests
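+
+Concretely, the session is constructed in `src/oda_reader/common.py` along these lines:
+
+```python
+import requests_cache
+
+from oda_reader._cache.config import get_http_cache_path
+
+cache_path = str(get_http_cache_path())  # {cache_dir}/http_cache/
+session = requests_cache.CachedSession(
+    cache_name=cache_path,
+    backend="filesystem",
+    expire_after=604800,  # 7 days
+    allowable_codes=(200, 404),  # cache 404s for version fallback
+    stale_if_error=True,  # fall back to stale cache on API errors
+)
+```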
diff --git a/src/oda_reader/_cache/config.py b/src/oda_reader/_cache/config.py
index 83e8e1d..7827567 100644
--- a/src/oda_reader/_cache/config.py
+++ b/src/oda_reader/_cache/config.py
@@ -11,7 +11,7 @@
 
 # Version for cache versioning (hardcoded to avoid circular import)
 # This should match the version in __init__.py
-__version__ = "1.3.0"
+__version__ = "1.3.1"
 
 # Global override for cache directory (set via set_cache_dir)
 _CACHE_DIR_OVERRIDE: Path | None = None
@@ -72,14 +72,15 @@ def reset_cache_dir() -> None:
 
 
 def get_http_cache_path() -> Path:
-    """Get the path for HTTP response cache (requests-cache SQLite file).
+    """Get the path for HTTP response cache (requests-cache filesystem directory).
 
     Returns:
-        Path: Path to the HTTP cache database file.
+        Path: Path to the HTTP cache directory.
     """
     cache_dir = get_cache_dir()
-    cache_dir.mkdir(parents=True, exist_ok=True)
-    return cache_dir / "http_cache.sqlite"
+    http_cache_dir = cache_dir / "http_cache"
+    http_cache_dir.mkdir(parents=True, exist_ok=True)
+    return http_cache_dir
 
 
 def get_bulk_cache_dir() -> Path:
diff --git a/src/oda_reader/common.py b/src/oda_reader/common.py
index ddcd658..4eef453 100644
--- a/src/oda_reader/common.py
+++ b/src/oda_reader/common.py
@@ -29,6 +29,7 @@ def _get_http_session() -> requests_cache.CachedSession:
     """Get or create the global HTTP cache session.
 
     All responses are cached for 7 days (604800 seconds).
+    Uses filesystem backend to handle large responses (>2GB).
 
     Returns:
         CachedSession: requests-cache session with 7-day expiration.
@@ -40,7 +41,7 @@ def _get_http_session() -> requests_cache.CachedSession:
 
         _HTTP_SESSION = requests_cache.CachedSession(
             cache_name=cache_path,
-            backend="sqlite",
+            backend="filesystem",
             expire_after=604800,  # 7 days
             allowable_codes=(200, 404),  # Cache 404s for version fallback
             stale_if_error=True,  # Use stale cache if API errors
diff --git a/tests/common/unit/test_cache.py b/tests/common/unit/test_cache.py
index 45c30a9..b6c1c18 100644
--- a/tests/common/unit/test_cache.py
+++ b/tests/common/unit/test_cache.py
@@ -43,7 +43,7 @@ def test_enable_cache_sets_flag(self):
 
         assert common._CACHE_ENABLED is True
 
-    def test_clear_cache_resets_counters(self):
+    def test_clear_cache_resets_counters(self, temp_cache_dir):
         """Test that clear_http_cache resets cache statistics."""
         enable_http_cache()
         clear_http_cache()
@@ -53,7 +53,7 @@ def test_clear_cache_resets_counters(self):
 
         assert info["response_count"] == 0
         assert info["redirects_count"] == 0
 
-    def test_get_cache_info_returns_dict(self):
+    def test_get_cache_info_returns_dict(self, temp_cache_dir):
         """Test that get_http_cache_info returns expected structure."""
         enable_http_cache()
diff --git a/tests/conftest.py b/tests/conftest.py
index 1670d6b..f166012 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -27,11 +27,20 @@ def temp_cache_dir(tmp_path, monkeypatch):
     Yields:
         Path: Path to the temporary cache directory
     """
+    from oda_reader import common
+
     cache_dir = tmp_path / "test_cache"
     cache_dir.mkdir()
     monkeypatch.setenv("ODA_READER_CACHE_DIR", str(cache_dir))
+
+    # Reset global HTTP session so it gets reinitialized with new cache path
+    common._HTTP_SESSION = None
+
     yield cache_dir
 
+    # Clean up: reset session again after test
+    common._HTTP_SESSION = None
+
 
 @pytest.fixture
 def rate_limiter_fast():
diff --git a/uv.lock b/uv.lock
index 531fd49..400b7a0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -789,7 +789,7 @@ wheels = [
 
 [[package]]
 name = "oda-reader"
-version = "1.3.0.post0"
+version = "1.3.1"
 source = { editable = "." }
 dependencies = [
     { name = "filelock" },