From 2ea580d108545636910b510090903e76b4e7a446 Mon Sep 17 00:00:00 2001 From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com> Date: Sun, 1 Mar 2026 22:29:46 +0000 Subject: [PATCH 1/6] allineamente finale con template --- CODE_OF_CONDUCT.md | 31 +++++++ README.md | 10 +++ SECURITY.md | 25 +++--- tests/test_cli_path_contract.py | 144 ++++++++++++++++++++++++++++++++ toolkit/clean/run.py | 11 ++- toolkit/cli/cmd_run.py | 5 +- toolkit/mart/run.py | 11 ++- 7 files changed, 219 insertions(+), 18 deletions(-) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 tests/test_cli_path_contract.py diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..befd34a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,31 @@ +# Code of Conduct + +## Our Pledge + +We want this project to be a respectful, constructive, and harassment-free space for everyone involved. + +## Our Standards + +Examples of behavior that support a healthy community: + +- being respectful in disagreements +- giving and receiving feedback constructively +- focusing on the technical problem, not the person +- showing patience with contributors at different experience levels + +Examples of unacceptable behavior: + +- harassment, insults, or personal attacks +- discriminatory or hateful language +- repeated bad-faith disruption of discussions or reviews +- publishing private information without permission + +## Enforcement + +If you experience or witness unacceptable behavior, report it to the maintainers or repository owners. + +Project maintainers may remove, edit, or reject comments, issues, commits, code, or other contributions that violate this Code of Conduct. + +## Scope + +This Code of Conduct applies to repository discussions, issues, pull requests, code review, and other project spaces managed by the maintainers. 
diff --git a/README.md b/README.md index df9c119..6382c7e 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,16 @@ pip install -e .[dev] Richiede Python 3.10+. +## CLI Naming Note + +Il comando CLI canonico del progetto e' `toolkit`. + +Se nel tuo ambiente c'e' una collisione di nome o il console script non e' nel `PATH`, puoi usare direttamente il modulo Python: + +```bash +python -m toolkit.cli.app run all --config dataset.yml +``` + ## Quickstart Giro offline completo con il progetto di esempio, eseguibile in pochi minuti su una macchina pulita. diff --git a/SECURITY.md b/SECURITY.md index d9c53d4..7f618eb 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,23 +2,18 @@ ## Reporting a Vulnerability -If you discover a security issue in this repository, do not open a public issue first. +If you believe you found a security issue in this repository, please do not open a public issue first. -Report it privately to the maintainers with: +Instead: -- a clear description of the problem -- affected versions or commits, if known -- reproduction steps or a minimal proof of concept -- suggested mitigations, if available +- contact the maintainers privately, if a private contact path is available +- or ask the repository owners for a private reporting channel before sharing details publicly -If no dedicated private channel is available yet, contact the project maintainers through the repository owners and request a private disclosure path before sharing details publicly. +When possible, include: -## Response Expectations +- a short description of the issue +- affected files, versions, or commits +- steps to reproduce +- possible impact -- We will acknowledge receipt of a report as soon as practical. -- We will evaluate impact and reproduction details. -- We will coordinate remediation and disclosure timing when the report is valid. - -## Scope - -This policy applies to the source code and release artifacts of this repository. 
+There is currently no bug bounty program for this project. diff --git a/tests/test_cli_path_contract.py b/tests/test_cli_path_contract.py new file mode 100644 index 0000000..13965ee --- /dev/null +++ b/tests/test_cli_path_contract.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import json +import shutil +from pathlib import Path + +from typer.testing import CliRunner + +from toolkit.cli.app import app + + +def _copy_project_example(dst: Path) -> Path: + src = Path("project-example") + shutil.copytree(src, dst) + shutil.rmtree(dst / "_smoke_out", ignore_errors=True) + return dst / "dataset.yml" + + +def _write_failed_run_record(path: Path, run_id: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + json.dumps( + { + "dataset": "project_example", + "year": 2022, + "run_id": run_id, + "started_at": "2026-03-01T10:00:00+00:00", + "finished_at": "2026-03-01T10:01:00+00:00", + "status": "FAILED", + "layers": { + "raw": {"status": "SUCCESS", "started_at": "2026-03-01T10:00:00+00:00", "finished_at": "2026-03-01T10:00:10+00:00"}, + "clean": {"status": "FAILED", "started_at": "2026-03-01T10:00:10+00:00", "finished_at": "2026-03-01T10:00:20+00:00"}, + "mart": {"status": "PENDING", "started_at": None, "finished_at": None}, + }, + "validations": {"raw": {}, "clean": {}, "mart": {}}, + "resumed_from": None, + "error": "clean failed", + }, + indent=2, + ), + encoding="utf-8", + ) + + +def test_cli_dry_run_resolves_sql_from_config_dir_not_cwd(tmp_path: Path, monkeypatch) -> None: + project_dir = tmp_path / "project-example" + config_path = _copy_project_example(project_dir) + + monkeypatch.chdir(tmp_path) + runner = CliRunner() + result = runner.invoke( + app, + ["run", "all", "--config", str(config_path), "--dry-run", "--strict-config"], + ) + + assert result.exit_code == 0 + assert "Execution Plan" in result.output + assert "steps: raw, clean, mart" in result.output + + +def 
test_cli_commands_use_dataset_yml_dir_as_path_base(tmp_path: Path, monkeypatch) -> None: + project_dir = tmp_path / "project-example" + config_path = _copy_project_example(project_dir) + + monkeypatch.chdir(tmp_path) + runner = CliRunner() + + run_result = runner.invoke(app, ["run", "all", "--config", str(config_path), "--strict-config"]) + assert run_result.exit_code == 0, run_result.output + + validate_result = runner.invoke( + app, + ["validate", "all", "--config", str(config_path), "--strict-config"], + ) + assert validate_result.exit_code == 0, validate_result.output + + profile_result = runner.invoke( + app, + ["profile", "raw", "--config", str(config_path), "--strict-config"], + ) + assert profile_result.exit_code == 0, profile_result.output + + status_result = runner.invoke( + app, + [ + "status", + "--dataset", + "project_example", + "--year", + "2022", + "--latest", + "--config", + str(config_path), + "--strict-config", + ], + ) + assert status_result.exit_code == 0, status_result.output + assert "status: SUCCESS" in status_result.output + + root = project_dir / "_smoke_out" + raw_dir = root / "data" / "raw" / "project_example" / "2022" + clean_dir = root / "data" / "clean" / "project_example" / "2022" + mart_dir = root / "data" / "mart" / "project_example" / "2022" + + assert (raw_dir / "ispra_dettaglio_comunale_2022.csv").exists() + assert (raw_dir / "_profile" / "suggested_read.yml").exists() + assert (clean_dir / "project_example_2022_clean.parquet").exists() + assert (mart_dir / "rd_by_regione.parquet").exists() + assert (mart_dir / "rd_by_provincia.parquet").exists() + + +def test_cli_resume_from_other_cwd_falls_back_and_reuses_relative_paths(tmp_path: Path, monkeypatch) -> None: + project_dir = tmp_path / "project-example" + config_path = _copy_project_example(project_dir) + runs_dir = project_dir / "_smoke_out" / "data" / "_runs" / "project_example" / "2022" + failed_run_id = "failed-run" + _write_failed_run_record(runs_dir / 
f"{failed_run_id}.json", failed_run_id) + + monkeypatch.chdir(tmp_path) + runner = CliRunner() + result = runner.invoke( + app, + [ + "resume", + "--dataset", + "project_example", + "--year", + "2022", + "--run-id", + failed_run_id, + "--config", + str(config_path), + "--strict-config", + ], + ) + + assert result.exit_code == 0, result.output + assert "Falling back to 'raw'" in result.output + assert "starting at raw" in result.output + + root = project_dir / "_smoke_out" + assert (root / "data" / "raw" / "project_example" / "2022" / "ispra_dettaglio_comunale_2022.csv").exists() + assert (root / "data" / "clean" / "project_example" / "2022" / "project_example_2022_clean.parquet").exists() + assert (root / "data" / "mart" / "project_example" / "2022" / "rd_by_regione.parquet").exists() diff --git a/toolkit/clean/run.py b/toolkit/clean/run.py index 7c751c9..cc13afc 100644 --- a/toolkit/clean/run.py +++ b/toolkit/clean/run.py @@ -26,6 +26,15 @@ def _serialize_metadata_path(path: Path | None, rel_root: Path | None) -> str | return to_root_relative(path, rel_root) +def _resolve_sql_path(sql_ref: str | Path, *, base_dir: Path | None) -> Path: + path = Path(sql_ref) + if path.is_absolute(): + return path + if base_dir is None: + return path + return base_dir / path + + def _run_sql( input_files: list[Path], sql_query: str, @@ -78,7 +87,7 @@ def run_clean( if not sql_rel: raise ValueError("clean.sql missing in dataset.yml (expected: clean: { sql: 'sql/clean.sql' })") - sql_path_obj = Path(sql_rel) + sql_path_obj = _resolve_sql_path(sql_rel, base_dir=base_dir) if not sql_path_obj.exists(): raise FileNotFoundError(f"CLEAN SQL file not found: {sql_path_obj}") diff --git a/toolkit/cli/cmd_run.py b/toolkit/cli/cmd_run.py index b033c93..947b673 100644 --- a/toolkit/cli/cmd_run.py +++ b/toolkit/cli/cmd_run.py @@ -39,7 +39,10 @@ def _planned_layers(step: str) -> list[str]: def _resolve_sql_path(cfg, rel_path: str | None) -> Path: if not rel_path: raise ValueError("Missing SQL path 
in dataset.yml") - return Path(rel_path) + path = Path(rel_path) + if path.is_absolute(): + return path + return Path(cfg.base_dir) / path def _validate_execution_plan(cfg, step: str) -> list[str]: diff --git a/toolkit/mart/run.py b/toolkit/mart/run.py index 82e18fd..acd50ec 100644 --- a/toolkit/mart/run.py +++ b/toolkit/mart/run.py @@ -19,6 +19,15 @@ def _serialize_metadata_path(path: Path | None, rel_root: Path | None) -> str | return to_root_relative(path, rel_root) +def _resolve_sql_path(sql_ref: str | Path, *, base_dir: Path | None) -> Path: + path = Path(sql_ref) + if path.is_absolute(): + return path + if base_dir is None: + return path + return base_dir / path + + def run_mart( dataset: str, year: int, @@ -78,7 +87,7 @@ def run_mart( if not name or not sql_rel: raise ValueError("Each mart.tables entry must include: name, sql") - sql_path = Path(sql_rel) + sql_path = _resolve_sql_path(sql_rel, base_dir=base_dir) if not sql_path.exists(): raise FileNotFoundError(f"MART SQL file not found: {sql_path}") From f669c8d8ae5396c2a998f2a17aa443e62f69b55a Mon Sep 17 00:00:00 2001 From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com> Date: Mon, 2 Mar 2026 09:16:44 +0000 Subject: [PATCH 2/6] riordino documentale --- README.md | 148 +++++++++---------------------------- docs/advanced-workflows.md | 102 +++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 113 deletions(-) create mode 100644 docs/advanced-workflows.md diff --git a/README.md b/README.md index 6382c7e..4bc885e 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,13 @@ python -m toolkit.cli.app run all --config dataset.yml ## Quickstart +Il percorso canonico per i repo dataset clonati dal template e': + +1. `toolkit run all --config dataset.yml` +2. `toolkit validate all --config dataset.yml` +3. `toolkit status --dataset --year --latest --config dataset.yml` +4. 
notebook locali che leggono gli output reali sotto `root/data/...` + Giro offline completo con il progetto di esempio, eseguibile in pochi minuti su una macchina pulita. Windows PowerShell: @@ -50,6 +57,7 @@ Windows PowerShell: $env:TOOLKIT_OUTDIR = Join-Path $env:TEMP "dataciviclab-toolkit-quickstart" py -m pip install -e ".[dev]" py -m toolkit.cli.app run all -c project-example/dataset.yml +py -m toolkit.cli.app validate all -c project-example/dataset.yml py -m toolkit.cli.app status --dataset project_example --year 2022 --config project-example/dataset.yml ``` @@ -59,6 +67,7 @@ Linux/macOS: export TOOLKIT_OUTDIR="$(mktemp -d)/dataciviclab-toolkit-quickstart" python -m pip install -e ".[dev]" python -m toolkit.cli.app run all -c project-example/dataset.yml +python -m toolkit.cli.app validate all -c project-example/dataset.yml python -m toolkit.cli.app status --dataset project_example --year 2022 --config project-example/dataset.yml ``` @@ -76,6 +85,7 @@ Interpretazione errori config: - warning di deprecazione -> config ancora accettata, ma in forma legacy da migrare Schema completo e legacy supportato: [docs/config-schema.md](docs/config-schema.md) +Flow avanzati e tooling secondario: [docs/advanced-workflows.md](docs/advanced-workflows.md) Artefatti attesi: @@ -137,91 +147,47 @@ Convenzioni: ## CLI -Esecuzione step singolo: - -```bash -toolkit run raw --config dataset.yml -toolkit run clean --config dataset.yml -toolkit run mart --config dataset.yml -``` - -Esecuzione end-to-end: +Workflow canonico: ```bash toolkit run all --config dataset.yml +toolkit validate all --config dataset.yml ``` -Dry-run: - -```bash -toolkit run all --config dataset.yml --dry-run -``` - -Il dry-run: - -- valida config e path SQL richiesti -- stampa l'execution plan per dataset/year -- crea solo il run record in `data/_runs/...` -- non scarica RAW, non esegue DuckDB, non scrive artefatti nei layer - -Resume: - -```bash -toolkit resume --dataset project_example --year 2022 --config 
dataset.yml -toolkit resume --dataset project_example --year 2022 --run-id --config dataset.yml -``` - -`resume`: - -- legge un run record esistente -- trova il primo layer non `SUCCESS` -- crea un nuovo `run_id` -- salva `resumed_from=` nel nuovo record - -Status: +Per il percorso base: -```bash -toolkit status --dataset project_example --year 2022 --latest --config dataset.yml -toolkit status --dataset project_example --year 2022 --run-id --config dataset.yml -``` +- `run all` esegue RAW -> CLEAN -> MART +- `validate all` esegue i quality checks su CLEAN e MART +- `status` legge il run record e mostra lo stato piu` recente +- `--dry-run` valida config e SQL senza eseguire la pipeline -Validazione separata: +Esempi: ```bash -toolkit validate clean --config dataset.yml -toolkit validate mart --config dataset.yml -toolkit validate all --config dataset.yml +toolkit run all --config dataset.yml --strict-config +toolkit validate all --config dataset.yml --strict-config +toolkit status --dataset my_dataset --year 2024 --latest --config dataset.yml +toolkit run all --config dataset.yml --dry-run --strict-config ``` -Profilazione RAW: +`resume`, `profile raw`, `run raw|clean|mart`, `gen-sql` e la policy completa degli artifacts restano disponibili, ma sono tooling avanzato: vedi [docs/advanced-workflows.md](docs/advanced-workflows.md). -```bash -toolkit profile raw --config dataset.yml -``` +## Notebook locali -`toolkit profile raw` scrive sempre hint utilizzabili anche se il parsing DuckDB fallisce. -Tutti gli artefatti di profiling vivono in `raw///_profile/`. -Il nome canonico del profilo JSON e` `raw_profile.json`; `profile.json` resta un alias di compatibilita` opzionale. -Gli output effettivi dipendono dalla policy `output.artifacts`. -`suggested_read.yml` usa le stesse chiavi che CLEAN passa a `clean.read`, senza mapping extra. 
-Se DuckDB non riesce a sniffare il file, il profiler usa un fallback Python leggero per `header`, `delim`, `decimal`, `encoding` e aggiunge warning espliciti. -L'output resta quindi consumabile da CLEAN anche su CSV sporchi o irregolari. +Nei repo dataset clonati dal template, i notebook dovrebbero leggere gli output reali gia` scritti dal toolkit, non ricostruire logica di path. -Artifacts policy: +In pratica: -```yaml -output: - artifacts: standard # minimal | standard | debug - legacy_aliases: true # abilita l'alias legacy profile.json -``` +- RAW: `root/data/raw///` +- CLEAN: `root/data/clean///` +- MART: `root/data/mart///` +- run records: `root/data/_runs///` -`standard` resta il default compatibile. `minimal` tiene solo gli artefatti di pipeline e salta report/debug SQL. `debug` tiene tutto. +Questo mantiene il contratto semplice tra toolkit e repo dataset: -Generazione SQL CLEAN da mapping dichiarativo: - -```bash -toolkit gen-sql --config dataset.yml -``` +- il toolkit produce artefatti e metadata stabili +- i notebook li ispezionano localmente +- `dataset.yml` resta la fonte di verita` per dataset, anni e path relativi ## Run Tracking @@ -254,51 +220,6 @@ Comportamento: La CLI `validate` resta disponibile per eseguire i check separatamente. -## CLEAN Input Selection - -La selezione degli input RAW per CLEAN e` configurabile via `clean.read`. 
- -Opzioni supportate: - -- `mode: explicit` -- `mode: latest` -- `mode: largest` -- `mode: all` -- `glob: "*"` -- `include: [...]` -- `prefer_from_raw_run: true` -- `allow_ambiguous: false` - -Note operative: - -- `explicit` richiede `include` -- `latest` seleziona il file con `mtime` piu` recente -- `largest` seleziona il file piu` grande -- `all` passa tutti i candidati a DuckDB in ordine deterministico -- se `mode` non e` specificato, il toolkit usa il fallback legacy su `largest` e logga un warning di deprecazione - -CSV read mode: - -- `clean.read_mode: strict` usa solo i parametri dichiarati -- `clean.read_mode: fallback` prova strict e, se fallisce, riprova con preset robusto loggando il fallback -- `clean.read_mode: robust` usa direttamente il preset robusto -- il preset robusto mantiene `delim`/`decimal`/`encoding` noti e aggiunge poche opzioni conservative come `ignore_errors`, `null_padding`, `strict_mode: false`, `sample_size: -1` -- forma canonica: - -```yaml -clean: - read: - source: auto # oppure config_only -``` - -- `clean.read.source: auto` usa anche i format hints di `raw///_profile/suggested_read.yml`; `config_only` li ignora -- da `suggested_read.yml` vengono applicate solo chiavi di formato come `delim`, `decimal`, `encoding`, `header`, `skip`, `quote`, `escape`, `comment`, `nullstr`, `trim_whitespace`, `columns` -- le opzioni di robustezza presenti nel file suggerito non cambiano la policy di lettura: restano governate da `clean.read_mode` e dal preset robusto -- il metadata CLEAN salva anche `read_source_used` (`strict` / `robust` / `parquet`) -- il metadata CLEAN salva `read_params_used` con i parametri finali effettivamente usati dal reader -- il metadata CLEAN salva `read_params_source` con le sorgenti del merge (`defaults`, `suggested`, `config_overrides`) -- ogni `metadata.json` include `metadata_schema_version: 1` - ## Layer ### RAW @@ -429,6 +350,7 @@ Vedi [docs/conventions.md](docs/conventions.md) per: - policy di selezione input 
CLEAN - precedence del read config - metadata, manifest e validation contracts +- workflow avanzati e tooling secondario: [docs/advanced-workflows.md](docs/advanced-workflows.md) ## Testing diff --git a/docs/advanced-workflows.md b/docs/advanced-workflows.md new file mode 100644 index 0000000..5bb499a --- /dev/null +++ b/docs/advanced-workflows.md @@ -0,0 +1,102 @@ +# Advanced Workflows + +Questa nota raccoglie i flussi e le opzioni del toolkit che restano supportati, ma non fanno parte del percorso canonico dei repo dataset clonati dal template. + +Percorso canonico: + +- `toolkit run all --config dataset.yml` +- `toolkit validate all --config dataset.yml` +- `toolkit status --dataset <dataset> --year <year> --latest --config dataset.yml` +- notebook locali che leggono output e metadata sotto `root/data/...` + +## Step singoli + +Utili per debug o per ripetere solo una parte della pipeline: + +```bash +toolkit run raw --config dataset.yml +toolkit run clean --config dataset.yml +toolkit run mart --config dataset.yml +``` + +Questi comandi non sono il happy path raccomandato per i nuovi repo dataset, ma restano strumenti operativi supportati. + +## Resume + +`resume` serve quando esiste gia` un run record e vuoi ripartire dal primo layer non `SUCCESS` oppure forzare una ripartenza da `raw|clean|mart`. + +Esempi: + +```bash +toolkit resume --dataset my_dataset --year 2024 --latest --config dataset.yml +toolkit resume --dataset my_dataset --year 2024 --run-id <run_id> --from-layer clean --config dataset.yml +``` + +Il comando verifica anche gli artefatti minimi del layer precedente prima di ripartire. + +## Profile RAW + +`toolkit profile raw --config dataset.yml` genera hint utili per `clean.read` quando il RAW e` sporco, ambiguo o poco noto. + +Artefatti principali: + +- `raw/<dataset>/<year>/_profile/raw_profile.json` +- `raw/<dataset>/<year>/_profile/suggested_read.yml` + +`profile.json` resta un alias legacy opzionale e non e` il nome canonico da promuovere nei nuovi repo. 
+ +## CLEAN read e input selection + +Opzioni utili ma avanzate: + +- `clean.read.mode`: `explicit | latest | largest | all` +- `clean.read.include` +- `clean.read.glob` +- `clean.read.prefer_from_raw_run` +- `clean.read.allow_ambiguous` +- `clean.read.source`: `auto | config_only` +- `clean.read_mode`: `strict | fallback | robust` + +Uso consigliato: + +- repo dataset nuovi: configurazione esplicita e `--strict-config` +- `profile raw` solo se serve capire meglio il formato RAW + +## Artifact policy + +La policy artifacts resta disponibile per tuning operativo: + +```yaml +output: + artifacts: standard # minimal | standard | debug + legacy_aliases: true +``` + +Regola pratica: + +- `standard`: default consigliato +- `minimal`: riduce artefatti opzionali +- `debug`: conserva anche SQL renderizzate e dettagli di debug + +`legacy_aliases` resta supportato per compatibilita`, ma non va promosso nei nuovi repo dataset. + +## gen-sql + +`toolkit gen-sql --config dataset.yml` resta disponibile come bootstrap helper da mapping dichiarativo. + +Stato raccomandato: + +- supportato +- utile per bootstrap guidato +- non parte del workflow operativo standard +- da considerare congelato: bugfix si`, espansioni solo se emerge uso reale + +## Compat legacy + +Il toolkit mantiene compatibilita` con alcune forme legacy del config per facilitare la migrazione. 
+ +Per i repo nuovi: + +- usa la shape canonica documentata in [config-schema.md](./config-schema.md) +- usa `--strict-config` nei comandi CLI +- non basarti su alias o campi legacy nei notebook e negli script del repo dataset From c73296cc707a689831484362c0203b0019541274 Mon Sep 17 00:00:00 2001 From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com> Date: Mon, 2 Mar 2026 09:42:20 +0000 Subject: [PATCH 3/6] Add inspect paths helper and tighten dataset repo docs --- README.md | 10 +++ docs/feature-stability.md | 27 +++++++++ docs/notebook-contract.md | 28 +++++++++ tests/test_cli_inspect_paths.py | 60 ++++++++++++++++++ toolkit/cli/app.py | 2 + toolkit/cli/cmd_inspect.py | 104 ++++++++++++++++++++++++++++++++ 6 files changed, 231 insertions(+) create mode 100644 docs/feature-stability.md create mode 100644 docs/notebook-contract.md create mode 100644 tests/test_cli_inspect_paths.py create mode 100644 toolkit/cli/cmd_inspect.py diff --git a/README.md b/README.md index 4bc885e..dd7366b 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,8 @@ Interpretazione errori config: Schema completo e legacy supportato: [docs/config-schema.md](docs/config-schema.md) Flow avanzati e tooling secondario: [docs/advanced-workflows.md](docs/advanced-workflows.md) +Matrice di stabilita`: [docs/feature-stability.md](docs/feature-stability.md) +Contratto notebook/output: [docs/notebook-contract.md](docs/notebook-contract.md) Artefatti attesi: @@ -159,6 +161,7 @@ Per il percorso base: - `run all` esegue RAW -> CLEAN -> MART - `validate all` esegue i quality checks su CLEAN e MART - `status` legge il run record e mostra lo stato piu` recente +- `inspect paths` espone i path stabili per notebook e script locali - `--dry-run` valida config e SQL senza eseguire la pipeline Esempi: @@ -167,6 +170,7 @@ Esempi: toolkit run all --config dataset.yml --strict-config toolkit validate all --config dataset.yml --strict-config toolkit status --dataset my_dataset --year 2024 --latest --config 
dataset.yml +toolkit inspect paths --config dataset.yml --year 2024 --json toolkit run all --config dataset.yml --dry-run --strict-config ``` @@ -183,6 +187,12 @@ In pratica: - MART: `root/data/mart///` - run records: `root/data/_runs///` +Helper ufficiale per evitare path logic duplicata nei notebook: + +```bash +toolkit inspect paths --config dataset.yml --year 2024 --json +``` + Questo mantiene il contratto semplice tra toolkit e repo dataset: - il toolkit produce artefatti e metadata stabili diff --git a/docs/feature-stability.md b/docs/feature-stability.md new file mode 100644 index 0000000..b320c43 --- /dev/null +++ b/docs/feature-stability.md @@ -0,0 +1,27 @@ +# Feature Stability + +Questa matrice serve a chiarire cosa il toolkit considera percorso canonico, cosa resta supportato ma secondario, e cosa non va trattato come parte del quickstart dei repo dataset clonati dal template. + +| Area | Stato | Uso raccomandato | +|---|---|---| +| `run all` | stable | percorso canonico | +| `validate all` | stable | percorso canonico | +| `status` | stable | percorso canonico | +| path contract di `dataset.yml` | stable | percorso canonico | +| output `raw/clean/mart/_runs` | stable | percorso canonico | +| `inspect paths` | stable | helper per notebook e repo dataset | +| `resume` | supported / advanced | debug operativo e recovery | +| `profile raw` | supported / advanced | diagnostica su RAW sporchi o ambigui | +| `run raw|clean|mart` | supported / advanced | debug e re-run parziali | +| artifact policy `minimal|standard|debug` | supported / advanced | tuning operativo | +| `legacy_aliases` | compatibility only | non promuovere nei repo nuovi | +| config legacy | compatibility only | usare `--strict-config` nei repo nuovi | +| `gen-sql` | frozen helper | bootstrap guidato, non workflow standard | +| `api_json_paged` | experimental | usare solo con evidenza reale | +| `html_table` | experimental | usare solo con evidenza reale | + +Regola pratica: + +- se stai 
creando o clonando un repo dataset nuovo, resta nel percorso canonico +- se devi fare recovery, diagnostica o bootstrap, usa i comandi advanced +- non basarti su compat legacy o helper frozen come parte del contratto stabile diff --git a/docs/notebook-contract.md b/docs/notebook-contract.md new file mode 100644 index 0000000..a968b68 --- /dev/null +++ b/docs/notebook-contract.md @@ -0,0 +1,28 @@ +# Notebook Contract + +Nei repo dataset clonati dal template, i notebook non dovrebbero ricostruire la logica della pipeline. Dovrebbero leggere gli output reali e i metadata stabili prodotti dal toolkit. + +Contratto stabile: + +- RAW: `root/data/raw/<dataset>/<year>/` +- CLEAN: `root/data/clean/<dataset>/<year>/` +- MART: `root/data/mart/<dataset>/<year>/` +- run records: `root/data/_runs/<dataset>/<year>/` + +File utili: + +- RAW: `manifest.json`, `metadata.json`, `raw_validation.json` +- CLEAN: `<dataset>_<year>_clean.parquet`, `manifest.json`, `metadata.json` +- MART: `<table_name>.parquet`, `manifest.json`, `metadata.json` + +Per evitare duplicazione di path logic nei notebook: + +- leggi `dataset.yml` +- usa `toolkit inspect paths --config dataset.yml --year <year> --json` +- poi apri parquet, metadata e run record dai path restituiti + +Regola pratica: + +- il toolkit produce +- i notebook ispezionano +- `dataset.yml` resta la fonte di verita` per root, dataset, anni e path relativi diff --git a/tests/test_cli_inspect_paths.py b/tests/test_cli_inspect_paths.py new file mode 100644 index 0000000..6dc5e94 --- /dev/null +++ b/tests/test_cli_inspect_paths.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import json +import shutil +from pathlib import Path + +from typer.testing import CliRunner + +from toolkit.cli.app import app + + +def test_inspect_paths_reports_dataset_repo_layout_from_other_cwd(tmp_path: Path, monkeypatch) -> None: + src = Path("project-example") + dst = tmp_path / "project-example" + shutil.copytree(src, dst) + shutil.rmtree(dst / "_smoke_out", ignore_errors=True) + config_path = dst / "dataset.yml" + + runner = CliRunner() + 
monkeypatch.chdir(tmp_path) + + run_result = runner.invoke(app, ["run", "all", "--config", str(config_path), "--strict-config"]) + assert run_result.exit_code == 0, run_result.output + + result = runner.invoke( + app, + ["inspect", "paths", "--config", str(config_path), "--year", "2022", "--strict-config"], + ) + + assert result.exit_code == 0, result.output + assert f"config_path: {config_path}" in result.output + assert f"root: {dst / '_smoke_out'}" in result.output + assert f"raw_dir: {dst / '_smoke_out' / 'data' / 'raw' / 'project_example' / '2022'}" in result.output + assert f"clean_output: {dst / '_smoke_out' / 'data' / 'clean' / 'project_example' / '2022' / 'project_example_2022_clean.parquet'}" in result.output + assert "latest_run_status: SUCCESS" in result.output + + +def test_inspect_paths_json_is_notebook_friendly(tmp_path: Path, monkeypatch) -> None: + src = Path("project-example") + dst = tmp_path / "project-example" + shutil.copytree(src, dst) + shutil.rmtree(dst / "_smoke_out", ignore_errors=True) + config_path = dst / "dataset.yml" + + runner = CliRunner() + monkeypatch.chdir(tmp_path) + + result = runner.invoke( + app, + ["inspect", "paths", "--config", str(config_path), "--year", "2022", "--json", "--strict-config"], + ) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + assert payload["dataset"] == "project_example" + assert payload["year"] == 2022 + assert payload["config_path"] == str(config_path) + assert payload["paths"]["clean_output"].endswith("project_example_2022_clean.parquet") + assert payload["paths"]["mart_outputs"] + assert payload["latest_run"] is None diff --git a/toolkit/cli/app.py b/toolkit/cli/app.py index cb03ea8..80b4cc0 100644 --- a/toolkit/cli/app.py +++ b/toolkit/cli/app.py @@ -8,6 +8,7 @@ from toolkit.cli.cmd_status import register as register_status from toolkit.cli.cmd_validate import register as register_validate from toolkit.cli.cmd_gen_sql import register as register_gen_sql +from 
toolkit.cli.cmd_inspect import register as register_inspect app = typer.Typer(no_args_is_help=True, add_completion=False) @@ -18,6 +19,7 @@ register_status(app) register_validate(app) register_gen_sql(app) +register_inspect(app) def main(): app() diff --git a/toolkit/cli/cmd_inspect.py b/toolkit/cli/cmd_inspect.py new file mode 100644 index 0000000..dab6890 --- /dev/null +++ b/toolkit/cli/cmd_inspect.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import typer + +from toolkit.cli.common import iter_years +from toolkit.core.config import load_config +from toolkit.core.paths import layer_year_dir +from toolkit.core.run_context import get_run_dir, latest_run + + +def _clean_output_path(root: Path, dataset: str, year: int) -> Path: + return layer_year_dir(root, "clean", dataset, year) / f"{dataset}_{year}_clean.parquet" + + +def _mart_output_paths(root: Path, year_dir: Path, tables: list[dict[str, Any]]) -> list[Path]: + return [year_dir / f"{table['name']}.parquet" for table in tables if isinstance(table, dict) and table.get("name")] + + +def _payload_for_year(cfg, year: int) -> dict[str, Any]: + root = Path(cfg.root) + raw_dir = layer_year_dir(root, "raw", cfg.dataset, year) + clean_dir = layer_year_dir(root, "clean", cfg.dataset, year) + mart_dir = layer_year_dir(root, "mart", cfg.dataset, year) + run_dir = get_run_dir(root, cfg.dataset, year) + mart_tables = cfg.mart.get("tables") or [] + + latest_payload: dict[str, Any] | None = None + try: + latest_record = latest_run(run_dir) + latest_payload = { + "run_id": latest_record.get("run_id"), + "status": latest_record.get("status"), + "started_at": latest_record.get("started_at"), + "path": str(run_dir / f"{latest_record.get('run_id')}.json"), + } + except FileNotFoundError: + latest_payload = None + + return { + "dataset": cfg.dataset, + "year": year, + "config_path": str(cfg.base_dir / "dataset.yml"), + "root": str(root), + "paths": { + 
"raw_dir": str(raw_dir), + "clean_dir": str(clean_dir), + "clean_output": str(_clean_output_path(root, cfg.dataset, year)), + "mart_dir": str(mart_dir), + "mart_outputs": [str(path) for path in _mart_output_paths(root, mart_dir, mart_tables)], + "run_dir": str(run_dir), + }, + "latest_run": latest_payload, + } + + +def paths( + config: str = typer.Option(..., "--config", "-c", help="Path to dataset.yml"), + year: int | None = typer.Option(None, "--year", help="Dataset year"), + as_json: bool = typer.Option(False, "--json", help="Emit JSON output for notebooks/scripts"), + strict_config: bool = typer.Option(False, "--strict-config", help="Treat deprecated config forms as errors"), +): + """ + Mostra i path stabili di output e l'ultimo run record per dataset/year. + """ + strict_config_flag = strict_config if isinstance(strict_config, bool) else False + cfg = load_config(config, strict_config=strict_config_flag) + years = iter_years(cfg, year) + payload = [_payload_for_year(cfg, selected_year) for selected_year in years] + + if as_json: + typer.echo(json.dumps(payload if len(payload) > 1 else payload[0], indent=2, ensure_ascii=False)) + return + + for item in payload: + typer.echo(f"dataset: {item['dataset']}") + typer.echo(f"year: {item['year']}") + typer.echo(f"config_path: {item['config_path']}") + typer.echo(f"root: {item['root']}") + typer.echo(f"raw_dir: {item['paths']['raw_dir']}") + typer.echo(f"clean_dir: {item['paths']['clean_dir']}") + typer.echo(f"clean_output: {item['paths']['clean_output']}") + typer.echo(f"mart_dir: {item['paths']['mart_dir']}") + typer.echo("mart_outputs:") + for output in item["paths"]["mart_outputs"]: + typer.echo(f" - {output}") + typer.echo(f"run_dir: {item['paths']['run_dir']}") + latest_info = item.get("latest_run") + if latest_info is None: + typer.echo("latest_run: none") + else: + typer.echo(f"latest_run_id: {latest_info['run_id']}") + typer.echo(f"latest_run_status: {latest_info['status']}") + typer.echo(f"latest_run_record: 
{latest_info['path']}") + typer.echo("") + + +def register(app: typer.Typer) -> None: + inspect_app = typer.Typer(no_args_is_help=True, add_completion=False) + inspect_app.command("paths")(paths) + app.add_typer(inspect_app, name="inspect") From b1c08991b89625bf31cebd13d6bf2f51fde77d7a Mon Sep 17 00:00:00 2001 From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com> Date: Mon, 2 Mar 2026 09:47:48 +0000 Subject: [PATCH 4/6] Refine inspect paths output and trim public README --- README.md | 160 +++----------------------------- docs/notebook-contract.md | 2 +- tests/test_cli_inspect_paths.py | 10 +- toolkit/cli/cmd_inspect.py | 62 ++++++++++--- 4 files changed, 72 insertions(+), 162 deletions(-) diff --git a/README.md b/README.md index dd7366b..e6d6d5f 100644 --- a/README.md +++ b/README.md @@ -199,158 +199,26 @@ Questo mantiene il contratto semplice tra toolkit e repo dataset: - i notebook li ispezionano localmente - `dataset.yml` resta la fonte di verita` per dataset, anni e path relativi -## Run Tracking +## Operative Notes -Ogni comando `toolkit run ...` o `toolkit resume ...` scrive un record JSON in: +Run tracking: -```text -data/_runs/<dataset>/<year>/<run_id>.json -``` - -Il record contiene almeno: - -- `status`: `RUNNING`, `SUCCESS`, `FAILED`, `SUCCESS_WITH_WARNINGS`, `DRY_RUN` -- `started_at`, `finished_at` -- `layers.raw|clean|mart.status` -- `validations.raw|clean|mart` -- `error` se presente -- `resumed_from` se il run deriva da una ripresa - -Questo file e` la fonte per i comandi `status` e `resume`. - -## Validation Gate - -`toolkit run ...` esegue automaticamente la validazione dopo ogni layer completato con successo. 
- -Comportamento: - -- se la validazione passa, il run prosegue -- se la validazione fallisce e `validation.fail_on_error: true`, la pipeline si interrompe -- se la validazione fallisce e `validation.fail_on_error: false`, la pipeline continua e il run termina come `SUCCESS_WITH_WARNINGS` - -La CLI `validate` resta disponibile per eseguire i check separatamente. - -## Layer - -### RAW - -Responsabilita`: - -- legge o scarica il payload da plugin sorgente -- applica extractor opzionale -- scrive file normalizzati nel layer RAW -- produce metadata, manifest e validation report - -Output tipici: - -- file sorgente normalizzati -- `metadata.json` -- `raw_validation.json` -- `manifest.json` - -`manifest.json` nel RAW dichiara sempre il file primario da usare a valle. -Campi minimi: `dataset`, `year`, `run_id`, `created_at`, `sources`, `primary_output_file`. -`primary_output_file` e gli `output_file` delle source sono path relativi al RAW year-dir, in formato posix. -`raw.output_policy` supporta `versioned` (default, suffix `_1/_2`) e `overwrite` (stesso filename sovrascritto). -Con piu` source, si puo` fissare il primario con `primary: true`; altrimenti il toolkit usa la prima source e logga un warning. 
- -### CLEAN - -Responsabilita`: - -- seleziona gli input dal RAW year-dir -- renderizza `clean.sql` -- esegue SQL in DuckDB -- esporta un parquet clean - -Output tipici: - -- `<dataset>_<year>_clean.parquet` -- `_run/clean_rendered.sql` -- `metadata.json` -- `manifest.json` -- `_validate/clean_validation.json` - -### MART +- ogni `run` e `resume` scrive un record in `data/_runs/<dataset>/<year>/<run_id>.json` +- `status` legge questi record +- `inspect paths` espone i path stabili da usare in notebook e script -Responsabilita`: +Validation gate: -- legge parquet CLEAN -- renderizza le SQL delle tabelle finali -- esporta un parquet per tabella +- `toolkit run ...` valida automaticamente dopo ogni layer completato +- con `validation.fail_on_error: true` la pipeline si ferma +- con `validation.fail_on_error: false` il run puo` terminare come `SUCCESS_WITH_WARNINGS` -Output tipici: - -- `
<table>.parquet` -- `_run/*_rendered.sql` -- `metadata.json` -- `manifest.json` -- `_validate/mart_validation.json` - -## Validazioni - -### CLEAN - -- `required_columns` -- `min_rows` -- `not_null` -- `primary_key` -- `ranges` -- `max_null_pct` - -### MART - -- `min_rows` -- `required_columns` -- `not_null` -- `primary_key` -- `ranges` - -Le validazioni vengono eseguite automaticamente da `toolkit run ...` dopo ogni layer completato con successo. -La CLI `toolkit validate ...` resta disponibile per eseguirle separatamente. - -## Plugin sorgente - -Plugin registrati: - -- `local_file` -- `http_file` -- `api_json_paged` -- `html_table` - -Stabilita`: - -- core pipeline `raw`, `clean`, `mart`: stable -- plugin `local_file`, `http_file`: stable -- plugin `api_json_paged`, `html_table`: experimental - -Come aggiungere un plugin: - -- definisci una classe plugin in `toolkit/plugins/<name>.py` -- contratto minimo: - - `__init__(**client)` per ricevere configurazione client - - `fetch(...) -> bytes` per restituire il payload RAW -- registra il plugin in modo esplicito in `toolkit.core.registry.register_builtin_plugins()` -- se il plugin dipende da librerie opzionali, il fallimento di import deve essere trattato come plugin opzionale non disponibile: - - warning `DCLPLUGIN001` in non-strict - - errore in strict mode - -## Smoke locale - -`project-example/` e` pensato per un giro completo locale, senza rete: - -```bash -cd project-example -py -m toolkit.cli.app run all --config dataset.yml -py -m toolkit.cli.app status --dataset project_example --year 2022 --latest --config dataset.yml -``` - -Artefatti attesi: - -- `project-example/_smoke_out/data/raw/project_example/2022/raw_validation.json` -- `project-example/_smoke_out/data/clean/project_example/2022/project_example_2022_clean.parquet` -- `project-example/_smoke_out/data/mart/project_example/2022/rd_by_regione.parquet` -- 
`project-example/_smoke_out/data/_runs/project_example/2022/<run_id>.json` +- [docs/conventions.md](docs/conventions.md) +- [docs/config-schema.md](docs/config-schema.md) +- [docs/feature-stability.md](docs/feature-stability.md) +- [docs/advanced-workflows.md](docs/advanced-workflows.md) ## Conventions diff --git a/docs/notebook-contract.md b/docs/notebook-contract.md index a968b68..e1170f7 100644 --- a/docs/notebook-contract.md +++ b/docs/notebook-contract.md @@ -19,7 +19,7 @@ Per evitare duplicazione di path logic nei notebook: - leggi `dataset.yml` - usa `toolkit inspect paths --config dataset.yml --year <year> --json` -- poi apri parquet, metadata e run record dai path restituiti +- poi apri parquet, metadata, manifest, validation e run record dai path restituiti Regola pratica: diff --git a/tests/test_cli_inspect_paths.py b/tests/test_cli_inspect_paths.py index 6dc5e94..46a2f8a 100644 --- a/tests/test_cli_inspect_paths.py +++ b/tests/test_cli_inspect_paths.py @@ -31,7 +31,10 @@ def test_inspect_paths_reports_dataset_repo_layout_from_other_cwd(tmp_path: Path assert f"config_path: {config_path}" in result.output assert f"root: {dst / '_smoke_out'}" in result.output assert f"raw_dir: {dst / '_smoke_out' / 'data' / 'raw' / 'project_example' / '2022'}" in result.output + assert f"raw_manifest: {dst / '_smoke_out' / 'data' / 'raw' / 'project_example' / '2022' / 'manifest.json'}" in result.output assert f"clean_output: {dst / '_smoke_out' / 'data' / 'clean' / 'project_example' / '2022' / 'project_example_2022_clean.parquet'}" in result.output + assert f"clean_validation: {dst / '_smoke_out' / 'data' / 'clean' / 'project_example' / '2022' / '_validate' / 'clean_validation.json'}" in result.output + assert f"mart_manifest: {dst / '_smoke_out' / 'data' / 'mart' / 'project_example' / '2022' / 'manifest.json'}" in result.output assert "latest_run_status: SUCCESS" in result.output @@ -55,6 +58,9 @@ def test_inspect_paths_json_is_notebook_friendly(tmp_path: Path, monkeypatch) -> assert 
payload["dataset"] == "project_example" assert payload["year"] == 2022 assert payload["config_path"] == str(config_path) - assert payload["paths"]["clean_output"].endswith("project_example_2022_clean.parquet") - assert payload["paths"]["mart_outputs"] + assert payload["paths"]["clean"]["output"].endswith("project_example_2022_clean.parquet") + assert payload["paths"]["clean"]["validation"].endswith("clean_validation.json") + assert payload["paths"]["raw"]["manifest"].endswith("manifest.json") + assert payload["paths"]["mart"]["outputs"] + assert payload["paths"]["mart"]["metadata"].endswith("metadata.json") assert payload["latest_run"] is None diff --git a/toolkit/cli/cmd_inspect.py b/toolkit/cli/cmd_inspect.py index dab6890..6330344 100644 --- a/toolkit/cli/cmd_inspect.py +++ b/toolkit/cli/cmd_inspect.py @@ -12,19 +12,48 @@ from toolkit.core.run_context import get_run_dir, latest_run +def _raw_output_paths(root: Path, dataset: str, year: int) -> dict[str, str]: + raw_dir = layer_year_dir(root, "raw", dataset, year) + return { + "dir": str(raw_dir), + "manifest": str(raw_dir / "manifest.json"), + "metadata": str(raw_dir / "metadata.json"), + "validation": str(raw_dir / "raw_validation.json"), + } + + def _clean_output_path(root: Path, dataset: str, year: int) -> Path: return layer_year_dir(root, "clean", dataset, year) / f"{dataset}_{year}_clean.parquet" +def _clean_paths(root: Path, dataset: str, year: int) -> dict[str, str]: + clean_dir = layer_year_dir(root, "clean", dataset, year) + return { + "dir": str(clean_dir), + "output": str(_clean_output_path(root, dataset, year)), + "manifest": str(clean_dir / "manifest.json"), + "metadata": str(clean_dir / "metadata.json"), + "validation": str(clean_dir / "_validate" / "clean_validation.json"), + } + + def _mart_output_paths(root: Path, year_dir: Path, tables: list[dict[str, Any]]) -> list[Path]: return [year_dir / f"{table['name']}.parquet" for table in tables if isinstance(table, dict) and table.get("name")] +def 
_mart_paths(root: Path, dataset: str, year: int, tables: list[dict[str, Any]]) -> dict[str, Any]: + mart_dir = layer_year_dir(root, "mart", dataset, year) + return { + "dir": str(mart_dir), + "outputs": [str(path) for path in _mart_output_paths(root, mart_dir, tables)], + "manifest": str(mart_dir / "manifest.json"), + "metadata": str(mart_dir / "metadata.json"), + "validation": str(mart_dir / "_validate" / "mart_validation.json"), + } + + def _payload_for_year(cfg, year: int) -> dict[str, Any]: root = Path(cfg.root) - raw_dir = layer_year_dir(root, "raw", cfg.dataset, year) - clean_dir = layer_year_dir(root, "clean", cfg.dataset, year) - mart_dir = layer_year_dir(root, "mart", cfg.dataset, year) run_dir = get_run_dir(root, cfg.dataset, year) mart_tables = cfg.mart.get("tables") or [] @@ -46,11 +75,9 @@ def _payload_for_year(cfg, year: int) -> dict[str, Any]: "config_path": str(cfg.base_dir / "dataset.yml"), "root": str(root), "paths": { - "raw_dir": str(raw_dir), - "clean_dir": str(clean_dir), - "clean_output": str(_clean_output_path(root, cfg.dataset, year)), - "mart_dir": str(mart_dir), - "mart_outputs": [str(path) for path in _mart_output_paths(root, mart_dir, mart_tables)], + "raw": _raw_output_paths(root, cfg.dataset, year), + "clean": _clean_paths(root, cfg.dataset, year), + "mart": _mart_paths(root, cfg.dataset, year, mart_tables), "run_dir": str(run_dir), }, "latest_run": latest_payload, @@ -80,13 +107,22 @@ def paths( typer.echo(f"year: {item['year']}") typer.echo(f"config_path: {item['config_path']}") typer.echo(f"root: {item['root']}") - typer.echo(f"raw_dir: {item['paths']['raw_dir']}") - typer.echo(f"clean_dir: {item['paths']['clean_dir']}") - typer.echo(f"clean_output: {item['paths']['clean_output']}") - typer.echo(f"mart_dir: {item['paths']['mart_dir']}") + typer.echo(f"raw_dir: {item['paths']['raw']['dir']}") + typer.echo(f"raw_manifest: {item['paths']['raw']['manifest']}") + typer.echo(f"raw_metadata: {item['paths']['raw']['metadata']}") + 
typer.echo(f"raw_validation: {item['paths']['raw']['validation']}") + typer.echo(f"clean_dir: {item['paths']['clean']['dir']}") + typer.echo(f"clean_output: {item['paths']['clean']['output']}") + typer.echo(f"clean_manifest: {item['paths']['clean']['manifest']}") + typer.echo(f"clean_metadata: {item['paths']['clean']['metadata']}") + typer.echo(f"clean_validation: {item['paths']['clean']['validation']}") + typer.echo(f"mart_dir: {item['paths']['mart']['dir']}") typer.echo("mart_outputs:") - for output in item["paths"]["mart_outputs"]: + for output in item["paths"]["mart"]["outputs"]: typer.echo(f" - {output}") + typer.echo(f"mart_manifest: {item['paths']['mart']['manifest']}") + typer.echo(f"mart_metadata: {item['paths']['mart']['metadata']}") + typer.echo(f"mart_validation: {item['paths']['mart']['validation']}") typer.echo(f"run_dir: {item['paths']['run_dir']}") latest_info = item.get("latest_run") if latest_info is None: From 9e904c75b97428a222223da761024fdbdd8b6809 Mon Sep 17 00:00:00 2001 From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:51:53 +0000 Subject: [PATCH 5/6] Clarify toolkit role in the DataCivicLab ecosystem --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index e6d6d5f..c5b2661 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,20 @@ Toolkit Python per pipeline dati riproducibili `RAW -> CLEAN -> MART`, con approccio SQL-first, audit degli artefatti e run tracking persistente. +## Ruolo Nell'Ecosistema + +Questa repo e' il motore tecnico della pipeline dati di DataCivicLab. 
+ +Ruoli delle repo correlate: + +- `.github`: policy condivise, community health, template issue/PR, onboarding GitHub +- `dataciviclab`: hub pubblico e minimale dell'organizzazione +- `toolkit`: runtime, CLI, contract di config/path/output, documentazione tecnica del motore +- `project-template`: template operativo dei repo dataset +- repo dataset: progetti concreti che usano il toolkit + +Questa repo non e' l'hub dell'organizzazione e non replica la documentazione org-wide: resta focalizzata sul motore e sul suo contratto tecnico. + ## Obiettivi - mantenere una struttura progetto semplice: `dataset.yml` + `sql/` @@ -88,6 +102,7 @@ Schema completo e legacy supportato: [docs/config-schema.md](docs/config-schema. Flow avanzati e tooling secondario: [docs/advanced-workflows.md](docs/advanced-workflows.md) Matrice di stabilita`: [docs/feature-stability.md](docs/feature-stability.md) Contratto notebook/output: [docs/notebook-contract.md](docs/notebook-contract.md) +Per policy condivise e community health organizzativa, fai riferimento alla repo `.github` dell'organizzazione. Artefatti attesi: From f86a657a995a3670ba440d9f7b7d13fd485f5c2c Mon Sep 17 00:00:00 2001 From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:52:02 +0000 Subject: [PATCH 6/6] Point repository conduct and security docs to org defaults --- CODE_OF_CONDUCT.md | 2 ++ SECURITY.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index befd34a..530cf04 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,5 +1,7 @@ # Code of Conduct +This repository follows the shared collaboration baseline of the DataCivicLab organization. For organization-wide community health and default GitHub policy, refer to the `dataciviclab/.github` repository. This file keeps a short local reference for the toolkit repository. 
+ ## Our Pledge We want this project to be a respectful, constructive, and harassment-free space for everyone involved. diff --git a/SECURITY.md b/SECURITY.md index 7f618eb..6ab4be1 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,6 +4,8 @@ If you believe you found a security issue in this repository, please do not open a public issue first. +For shared organizational GitHub policy and community health defaults, see the `dataciviclab/.github` repository. This file only covers the reporting path for this technical repository. + Instead: - contact the maintainers privately, if a private contact path is available