Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions docs/config-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,36 @@ Note pratiche per `ckan`:
- se il portale restituisce un file URL in `http://`, il toolkit lo forza automaticamente a `https://`
- se `filename` non è dichiarato, il toolkit prova a inferire l'estensione dall'URL risolto

Esempio `sdmx`:

```yaml
raw:
sources:
- name: popolazione_residente
type: sdmx
client:
timeout: 60
retries: 2
args:
agency: IT1
flow: 22_289
version: "1.5"
filters:
FREQ: A
REF_AREA: "001001"
DATA_TYPE: JAN
SEX: "9"
AGE: TOTAL
MARITAL_STATUS: "99"
```

Note pratiche per `sdmx`:

- la `version` è obbligatoria e deve coincidere con la versione corrente esposta dal dataflow
- non esiste fallback silenzioso a `latest`
- in v1 i `filters` sono supportati solo sulle dimensioni di serie, non su `TIME_PERIOD`
- il filtro temporale va applicato nel layer `clean.sql` (per esempio `WHERE TIME_PERIOD = '2024'`), non in `raw.sources[].args.filters`
- il plugin restituisce un CSV normalizzato con colonne `DIM`, `DIM_label` e `value`

## clean

| Campo | Tipo | Default |
Expand Down
1 change: 1 addition & 0 deletions smoke/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Progetti inclusi:
- `smoke/zip_http_csv`: `http_file` + extractor ZIP (`unzip_first_csv`) contro server locale
- `smoke/bdap_http_csv`: `http_file` contro CSV pubblico BDAP
- `smoke/bdap_ckan_csv`: `ckan` contro OpenBDAP, con fallback `package_show` e force `https`
- `smoke/istat_sdmx_22_289`: `sdmx` contro flow ISTAT reale `22_289`
- `smoke/finanze_http_zip_2023`: `http_file` contro ZIP pubblico reale, best-effort

Ogni progetto include:
Expand Down
24 changes: 24 additions & 0 deletions smoke/istat_sdmx_22_289/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# istat_sdmx_22_289

Smoke manuale per `sdmx` su dataflow ISTAT `22_289`.

## Comandi

```bash
toolkit run raw --config dataset.yml
toolkit profile raw --config dataset.yml
toolkit run clean --config dataset.yml
toolkit run mart --config dataset.yml
toolkit status --dataset istat_sdmx_22_289 --year 2024 --latest --config dataset.yml
```

## Verifiche attese

- `./_smoke_out/data/raw/istat_sdmx_22_289/2024/manifest.json`
- `./_smoke_out/data/raw/istat_sdmx_22_289/2024/raw_validation.json`
- `./_smoke_out/data/clean/istat_sdmx_22_289/2024/metadata.json`
- `./_smoke_out/data/mart/istat_sdmx_22_289/2024/mart_ok.parquet`

## Nota

Questo smoke usa un flow ISTAT reale e richiede che la versione `1.5` sia ancora quella esposta dal dataflow `22_289`.
37 changes: 37 additions & 0 deletions smoke/istat_sdmx_22_289/dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Smoke configuration for the `sdmx` source against ISTAT dataflow 22_289.
# All outputs are written under this root directory.
root: "./_smoke_out"

# Dataset identity and the single year processed by this smoke.
dataset:
name: "istat_sdmx_22_289"
years: [2024]

# Raw layer: one SDMX source, pinned to an explicit dataflow version.
raw:
sources:
- name: "popolazione_residente"
type: "sdmx"
args:
agency: "IT1"
flow: "22_289"
version: "1.5"
# One value per series dimension; TIME_PERIOD is not filtered here but
# in sql/clean.sql.
filters:
FREQ: "A"
REF_AREA: "001001"
DATA_TYPE: "JAN"
SEX: "9"
AGE: "TOTAL"
MARITAL_STATUS: "99"
filename: "istat_popolazione_residente_22_289.csv"

# Clean layer: reads the raw CSV with explicit reader settings and runs
# sql/clean.sql.
clean:
sql: "sql/clean.sql"
read_mode: "strict"
read:
mode: "explicit"
delim: ","
encoding: "utf-8"
header: true
validate: {}

# Mart layer: a single table built from sql/mart/mart_ok.sql.
mart:
tables:
- name: "mart_ok"
sql: "sql/mart/mart_ok.sql"
7 changes: 7 additions & 0 deletions smoke/istat_sdmx_22_289/sql/clean.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Clean layer for the ISTAT 22_289 smoke: keeps only the 2024 observations
-- and exposes the observation value as the integer column `residenti`.
SELECT
    TIME_PERIOD,
    REF_AREA,
    REF_AREA_label,
    -- NOTE(review): presumably try_cast yields NULL on a failed conversion
    -- (DuckDB-style semantics) -- confirm against the SQL engine in use.
    try_cast(value AS BIGINT) AS residenti
FROM raw_input
WHERE TIME_PERIOD = '2024'
6 changes: 6 additions & 0 deletions smoke/istat_sdmx_22_289/sql/mart/mart_ok.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Mart table `mart_ok`: selects the four clean-layer columns as they are.
SELECT
    TIME_PERIOD,
    REF_AREA,
    REF_AREA_label,
    residenti
FROM clean_input
2 changes: 2 additions & 0 deletions tests/test_raw_ext_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def test_infer_ext_ckan_uses_resolved_origin():
assert _infer_ext("ckan", {}, origin="https://example.org/archive.zip.php") == ".zip"


def test_infer_ext_sdmx_is_csv():
    """An ``sdmx`` source given only a ``flow`` argument is inferred as ``.csv``."""
    assert _infer_ext("sdmx", {"flow": "22_289"}) == ".csv"


def test_infer_ext_never_returns_php():
assert _infer_ext("http_file", {"url": "https://example.org/download.php?id=42"}) != ".php"
assert _infer_ext("local_file", {"path": "C:/tmp/file.php"}) != ".php"
Expand Down
1 change: 1 addition & 0 deletions tests/test_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@ def test_register_builtin_plugins_registers_present_plugins():

plugins = r.list_plugins()
assert "ckan" in plugins
assert "sdmx" in plugins
assert "http_file" in plugins
assert "local_file" in plugins
154 changes: 154 additions & 0 deletions tests/test_sdmx_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from toolkit.core.exceptions import DownloadError
from toolkit.plugins.sdmx import SdmxSource


class _FakeResponse:
def __init__(self, status_code: int, text: str, url: str):
self.status_code = status_code
self.text = text
self.url = url


# Canned SDMX 2.1 structure message served by the fake dataflow endpoint.
# The referenced data structure (DCIS_POPRES1, agency IT1) declares version "1.5".
DATAFLOW_XML = """<?xml version="1.0" encoding="UTF-8"?>
<mes:Structure xmlns:mes="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message"
xmlns:str="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure">
<mes:Structures>
<str:Dataflows>
<str:Dataflow id="22_289">
<str:Structure>
<Ref id="DCIS_POPRES1" version="1.5" agencyID="IT1" package="datastructure" class="DataStructure" />
</str:Structure>
</str:Dataflow>
</str:Dataflows>
</mes:Structures>
</mes:Structure>
"""

# Canned JSON data payload: a single series ("0:0:0:0:0:0") with two
# observations (2634 and 2621) indexed against the TIME_PERIOD values
# 2024 and 2025, plus id/name pairs for the six series dimensions.
DATA_JSON = """
{
"dataSets": [
{
"series": {
"0:0:0:0:0:0": {
"observations": {
"0": [2634],
"1": [2621]
}
}
}
}
],
"structure": {
"dimensions": {
"series": [
{"id": "FREQ", "values": [{"id": "A", "name": "annual"}]},
{"id": "REF_AREA", "values": [{"id": "001001", "name": "Agliè"}]},
{"id": "DATA_TYPE", "values": [{"id": "JAN", "name": "population on 1st January"}]},
{"id": "SEX", "values": [{"id": "9", "name": "total"}]},
{"id": "AGE", "values": [{"id": "TOTAL", "name": "total"}]},
{"id": "MARITAL_STATUS", "values": [{"id": "99", "name": "total"}]}
],
"observation": [
{
"id": "TIME_PERIOD",
"values": [
{"id": "2024", "name": "2024"},
{"id": "2025", "name": "2025"}
]
}
]
}
}
}
"""

# Canned JSON returned by the fakes for the ".../all" request: no series data,
# only the ids of the six series dimensions (no TIME_PERIOD among them).
PREVIEW_JSON = """
{
"dataSets": [{"series": {}}],
"structure": {
"dimensions": {
"series": [
{"id": "FREQ"},
{"id": "REF_AREA"},
{"id": "DATA_TYPE"},
{"id": "SEX"},
{"id": "AGE"},
{"id": "MARITAL_STATUS"}
]
}
}
}
"""


def test_sdmx_fetch_normalizes_csv(monkeypatch):
    """Fetch a fully filtered series and check the CSV text that comes back.

    ``requests.get`` inside the plugin module is replaced by a fake that
    serves canned bodies and records every call for later inspection.
    """
    recorded = []
    # Canned body for each URL suffix the fake is willing to answer.
    canned = {
        "/dataflow/IT1/22_289": DATAFLOW_XML,
        "/data/IT1,22_289,1.5/all": PREVIEW_JSON,
        "/data/IT1,22_289,1.5/A.001001.JAN.9.TOTAL.99": DATA_JSON,
    }

    def _fake_get(url, params=None, timeout=None, headers=None):
        accept = headers.get("Accept") if headers else None
        recorded.append((url, params, accept))
        for suffix, body in canned.items():
            if url.endswith(suffix):
                return _FakeResponse(200, body, url)
        # Any other endpoint means the plugin issued an unexpected request.
        raise AssertionError(f"Unexpected URL {url}")

    monkeypatch.setattr("toolkit.plugins.sdmx.requests.get", _fake_get)

    filters = {
        "FREQ": "A",
        "REF_AREA": "001001",
        "DATA_TYPE": "JAN",
        "SEX": "9",
        "AGE": "TOTAL",
        "MARITAL_STATUS": "99",
    }
    payload, origin = SdmxSource().fetch("IT1", "22_289", "1.5", filters)

    text = payload.decode("utf-8")
    assert origin.endswith("/data/IT1,22_289,1.5/A.001001.JAN.9.TOTAL.99")
    for fragment in ("FREQ,FREQ_label", "A,annual", "001001,Agliè", "2024,2024,2634"):
        assert fragment in text
    # At least one request asked for JSON, and at least one used the
    # zero-observation query parameter.
    assert "application/json" in [accept for _, _, accept in recorded]
    assert {"firstNObservations": "0"} in [params for _, params, _ in recorded]


def test_sdmx_fetch_blocks_version_mismatch(monkeypatch):
    """Requesting version 1.0 while the dataflow declares 1.5 raises DownloadError."""

    def _fake_get(url, params=None, timeout=None, headers=None):
        # Only the dataflow lookup is allowed; any other request fails the test.
        if not url.endswith("/dataflow/IT1/22_289"):
            raise AssertionError(f"Unexpected URL {url}")
        return _FakeResponse(200, DATAFLOW_XML, url)

    monkeypatch.setattr("toolkit.plugins.sdmx.requests.get", _fake_get)

    caught = None
    try:
        SdmxSource().fetch("IT1", "22_289", "1.0", {"FREQ": "A"})
    except DownloadError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("Expected DownloadError")
    assert "current version is 1.5" in str(caught)


def test_sdmx_fetch_rejects_unknown_filter_dimension(monkeypatch):
    """A filter on ``TIME_PERIOD``, absent from the preview's series dimensions, is rejected."""
    # Canned body for each URL suffix the fake is willing to answer.
    canned = {
        "/dataflow/IT1/22_289": DATAFLOW_XML,
        "/data/IT1,22_289,1.5/all": PREVIEW_JSON,
    }

    def _fake_get(url, params=None, timeout=None, headers=None):
        for suffix, body in canned.items():
            if url.endswith(suffix):
                return _FakeResponse(200, body, url)
        raise AssertionError(f"Unexpected URL {url}")

    monkeypatch.setattr("toolkit.plugins.sdmx.requests.get", _fake_get)

    caught = None
    try:
        SdmxSource().fetch("IT1", "22_289", "1.5", {"TIME_PERIOD": "2024"})
    except DownloadError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("Expected DownloadError")
    assert "Unknown SDMX filter dimensions" in str(caught)
10 changes: 9 additions & 1 deletion toolkit/core/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@ def __init__(self):

def register(self, name: str, factory: Callable[..., Any], *, overwrite: bool = False) -> None:
    """Store ``factory`` under ``name`` in the plugin table.

    Args:
        name: Key under which the factory is stored.
        factory: Callable saved as the plugin factory.
        overwrite: When true, an existing entry for ``name`` is replaced.

    Raises:
        ValueError: If ``name`` is already registered and ``overwrite`` is false.
    """
    # Guard against accidental double registration; raised exactly once.
    if not overwrite and name in self._plugins:
        raise ValueError(f"Plugin già registrato: '{name}'")
    self._plugins[name] = factory

def decorator(self, name: str, *, overwrite: bool = False):
    """Build a decorator that registers the decorated factory under ``name``.

    The decorated factory is returned unchanged after ``self.register`` has
    been called with it; ``overwrite`` is forwarded to ``register``.
    """

    def _register_and_return(factory: Callable[..., Any]):
        self.register(name, factory, overwrite=overwrite)
        # Hand the factory back so the decorated name still refers to it.
        return factory

    return _register_and_return

def create(self, name: str, **kwargs):
Expand All @@ -51,6 +52,13 @@ def clear(self) -> None:
"optional": False,
"factory": lambda cls: (lambda **client: cls(**client)),
},
{
"name": "sdmx",
"module": "toolkit.plugins.sdmx",
"class_name": "SdmxSource",
"optional": False,
"factory": lambda cls: (lambda **client: cls(**client)),
},
{
"name": "http_file",
"module": "toolkit.plugins.http_file",
Expand Down
1 change: 1 addition & 0 deletions toolkit/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
- `local_file`
- `http_file`
- `ckan`
- `sdmx`
"""
Loading
Loading