From 3c28cb3428dfa6b267a6df85d91673acb1e0cd03 Mon Sep 17 00:00:00 2001 From: Nicholas Karlson Date: Mon, 19 Jan 2026 15:34:10 -0800 Subject: [PATCH] Feat: Track D mini-library: schema primitives (keep shims) --- scripts/_business_schema.py | 207 ++-------- src/pystatsv1/assets/workbook_track_d.zip | Bin 162838 -> 161705 bytes src/pystatsv1/trackd/__init__.py | 12 + src/pystatsv1/trackd/schema.py | 362 ++++++++++++++++++ tests/test_trackd_schema_required_columns.py | 53 +++ .../scripts/_business_schema.py | 207 ++-------- 6 files changed, 472 insertions(+), 369 deletions(-) create mode 100644 src/pystatsv1/trackd/schema.py create mode 100644 tests/test_trackd_schema_required_columns.py diff --git a/scripts/_business_schema.py b/scripts/_business_schema.py index 16a58d8..d5b0002 100644 --- a/scripts/_business_schema.py +++ b/scripts/_business_schema.py @@ -1,190 +1,29 @@ -# SPDX-License-Identifier: MIT -"""Schema contracts for Track D business datasets.""" +"""Backwards-compatible shim for Track D schema helpers. -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import pandas as pd - -DATASET_NSO_V1 = "nso_v1" +The Track D workbook template (and legacy scripts) historically imported +``scripts._business_schema``. +As of PR-1.2b, the canonical implementation lives in +``pystatsv1.trackd.schema`` so it can be reused by chapter runners and future +"bring-your-own data" pipelines. +""" -@dataclass(frozen=True) -class TableSchema: - name: str - required_columns: tuple[str, ...] - - -CONTRACT_TABLES: dict[str, TableSchema] = { - "chart_of_accounts": TableSchema( - name="chart_of_accounts.csv", - required_columns=("account_id", "account_name", "account_type", "normal_side"), - ), - "gl_journal": TableSchema( - name="gl_journal.csv", - required_columns=("txn_id", "date", "doc_id", "description", "account_id", "debit", "credit"), - ), - "trial_balance_monthly": TableSchema( - name="trial_balance_monthly.csv", - required_columns=( - "month", - "account_id", - "account_name", - "account_type", - "normal_side", - "debit", - "credit", - "ending_side", - "ending_balance", - ), - ), - "statements_is_monthly": TableSchema( - name="statements_is_monthly.csv", - required_columns=("month", "line", "amount"), - ), - "statements_bs_monthly": TableSchema( - name="statements_bs_monthly.csv", - required_columns=("month", "line", "amount"), - ), - "statements_cf_monthly": TableSchema( - name="statements_cf_monthly.csv", - required_columns=("month", "line", "amount"), - ), - "inventory_movements": TableSchema( - name="inventory_movements.csv", - required_columns=("month", "txn_id", "date", "sku", "movement_type", "qty", "unit_cost", "amount"), - ), - "fixed_assets": TableSchema( - name="fixed_assets.csv", - required_columns=( - "asset_id", - "asset_name", - "in_service_month", - "cost", - "useful_life_months", - "salvage_value", - "method", - ), - ), - "depreciation_schedule": TableSchema( - name="depreciation_schedule.csv", - required_columns=("month", "asset_id", "dep_expense", "accum_dep", "net_book_value"), - ), - # Chapter 5 - "payroll_events": TableSchema( - name="payroll_events.csv", - required_columns=( - "month", - "txn_id", - "date", - "event_type", - "gross_wages", - "employee_withholding", - "employer_tax", - "cash_paid", - "wages_payable_delta", - "payroll_taxes_payable_delta", - ), - ), - "sales_tax_events": TableSchema( - name="sales_tax_events.csv", - required_columns=( - "month", - "txn_id", - "date", - "event_type", - 
"taxable_sales", - "tax_amount", - "cash_paid", - "sales_tax_payable_delta", - ), - ), - "debt_schedule": TableSchema( - name="debt_schedule.csv", - required_columns=("month", "loan_id", "txn_id", "beginning_balance", "payment", "interest", "principal", "ending_balance"), - ), - "equity_events": TableSchema( - name="equity_events.csv", - required_columns=("month", "txn_id", "date", "event_type", "amount"), - ), - "ap_events": TableSchema( - name="ap_events.csv", - required_columns=("month", "txn_id", "date", "vendor", "invoice_id", "event_type", "amount", "ap_delta", "cash_paid"), - ), - # Chapter 6 - "ar_events": TableSchema( - name="ar_events.csv", - required_columns=("month", "txn_id", "date", "customer", "invoice_id", "event_type", "amount", "ar_delta", "cash_received"), - ), - "bank_statement": TableSchema( - name="bank_statement.csv", - required_columns=("month", "bank_txn_id", "posted_date", "description", "amount", "gl_txn_id"), - ), -} +from __future__ import annotations -NSO_V1_TABLE_ORDER: tuple[str, ...] = ( - "chart_of_accounts", - "gl_journal", - "trial_balance_monthly", - "statements_is_monthly", - "statements_bs_monthly", - "statements_cf_monthly", - "inventory_movements", - "fixed_assets", - "depreciation_schedule", - # Chapter 5 - "payroll_events", - "sales_tax_events", - "debt_schedule", - "equity_events", - "ap_events", - # Chapter 6 - "ar_events", - "bank_statement", +from pystatsv1.trackd.schema import ( + DATASET_NSO_V1, + NSO_V1, + TableSchema, + assert_schema, + validate_schema, + validate_table_map, ) -NSO_V1_TABLES: tuple[TableSchema, ...] = tuple(CONTRACT_TABLES[k] for k in NSO_V1_TABLE_ORDER) - - -def schemas_for_dataset(dataset: str) -> tuple[TableSchema, ...]: - if dataset == DATASET_NSO_V1: - return NSO_V1_TABLES - raise ValueError(f"Unknown dataset: {dataset}") - - -def validate_schema(datadir: Path, dataset: str) -> dict[str, Any]: - """Validate presence + required columns. 
Returns a report dict."""
-    report: dict[str, Any] = {
-        "dataset": dataset,
-        "datadir": str(datadir),
-        "missing_tables": [],
-        "tables": {},
-        "ok": True,
-    }
-
-    for schema in schemas_for_dataset(dataset):
-        table_path = datadir / schema.name
-        if not table_path.exists():
-            report["missing_tables"].append(schema.name)
-            report["tables"][schema.name] = {
-                "exists": False,
-                "missing_columns": list(schema.required_columns),
-            }
-            report["ok"] = False
-            continue
-
-        df = pd.read_csv(table_path)
-        cols = set(map(str, df.columns))
-        missing = [c for c in schema.required_columns if c not in cols]
-        report["tables"][schema.name] = {
-            "exists": True,
-            "n_rows": int(df.shape[0]),
-            "missing_columns": missing,
-        }
-        if missing:
-            report["ok"] = False
-
-    return report
+__all__ = [
+    "DATASET_NSO_V1",
+    "NSO_V1",
+    "TableSchema",
+    "validate_schema",
+    "validate_table_map",
+    "assert_schema",
+]
diff --git a/src/pystatsv1/assets/workbook_track_d.zip b/src/pystatsv1/assets/workbook_track_d.zip
index 1580ad5a5b4a7650df4f04f8cc4e90c6b13e4eb9..de3c49cacc74172b9e1ca1e17dc7116d57f9abed 100644
Binary files a/src/pystatsv1/assets/workbook_track_d.zip and b/src/pystatsv1/assets/workbook_track_d.zip differ
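Note: the shim keeps every legacy import path working. A minimal smoke check,
assuming the repository root is on sys.path (as the workbook runners arrange):

    # Names imported via the legacy module are the same objects as the
    # canonical ones, so identity checks hold.
    from scripts import _business_schema as legacy
    from pystatsv1.trackd import schema as canonical

    assert legacy.validate_schema is canonical.validate_schema
    assert legacy.TableSchema is canonical.TableSchema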
diff --git a/src/pystatsv1/trackd/schema.py b/src/pystatsv1/trackd/schema.py
new file mode 100644
--- /dev/null
+++ b/src/pystatsv1/trackd/schema.py
@@ -0,0 +1,362 @@
+def schemas_for_dataset(dataset: str) -> tuple[TableSchema, ...]:
+    if dataset == DATASET_NSO_V1:
+        return NSO_V1_TABLES
+    raise ValueError(f"Unknown dataset: {dataset}")
+
+
+def _read_header(path: Path) -> list[str]:
+    # Small helper to keep schema checks cheap:
+    # only the header row is needed for required-column validation.
+    df = pd.read_csv(path, nrows=0)
+    return [str(c) for c in df.columns]
+
+
+def validate_schema(datadir: Path, dataset: str) -> dict[str, Any]:
+    """Validate presence + required columns.
+
+    Returns a report dict (workbook-friendly), e.g.:
+        {
+          "ok": bool,
+          "dataset": "nso_v1",
+          "datadir": "...",
+          "missing_tables": [...],
+          "tables": {
+            "chart_of_accounts.csv": {"exists": bool, "missing_columns": [...], "n_rows": int?},
+            ...
+          }
+        }
+    """
+
+    report: dict[str, Any] = {
+        "dataset": dataset,
+        "datadir": str(datadir),
+        "missing_tables": [],
+        "tables": {},
+        "ok": True,
+    }
+
+    for schema in schemas_for_dataset(dataset):
+        table_path = datadir / schema.name
+        if not table_path.exists():
+            report["missing_tables"].append(schema.name)
+            report["tables"][schema.name] = {
+                "exists": False,
+                "missing_columns": list(schema.required_columns),
+            }
+            report["ok"] = False
+            continue
+
+        cols = set(_read_header(table_path))
+        missing = [c for c in schema.required_columns if c not in cols]
+
+        # n_rows requires a full read. Track D CSVs are small, so keep this
+        # simple; a chunked line count would add complexity for no gain here.
+        df = pd.read_csv(table_path)
+
+        report["tables"][schema.name] = {
+            "exists": True,
+            "n_rows": int(df.shape[0]),
+            "missing_columns": missing,
+        }
+
+        if missing:
+            report["ok"] = False
+
+    return report
+
+
+def validate_table_map(table_map: Mapping[str, Path], schemas: Mapping[str, TableSchema]) -> dict[str, Any]:
+    """Validate a provided mapping of logical table keys -> CSV paths.
+
+    This is useful for future BYOD adapters where files may not live in a single directory.
+    The report shape matches validate_schema(...) (minus the dataset/datadir fields
+    and n_rows); table keys are schema.name (filename).
+    """
+
+    report: dict[str, Any] = {
+        "missing_tables": [],
+        "tables": {},
+        "ok": True,
+    }
+
+    for key, schema in schemas.items():
+        path = table_map.get(key)
+        if path is None or not Path(path).exists():
+            report["missing_tables"].append(schema.name)
+            report["tables"][schema.name] = {
+                "exists": False,
+                "missing_columns": list(schema.required_columns),
+            }
+            report["ok"] = False
+            continue
+
+        cols = set(_read_header(Path(path)))
+        missing = [c for c in schema.required_columns if c not in cols]
+        report["tables"][schema.name] = {
+            "exists": True,
+            "missing_columns": missing,
+        }
+        if missing:
+            report["ok"] = False
+
+    return report
+
+
+def assert_schema(datadir: Path, dataset: str) -> None:
+    """Fail-fast wrapper around validate_schema(...).
+
+    Raises TrackDSchemaError with one friendly summary message if invalid.
+ """ + + report = validate_schema(datadir=datadir, dataset=dataset) + if report.get("ok"): + return + + missing_tables: list[str] = list(report.get("missing_tables", [])) + tables: dict[str, Any] = dict(report.get("tables", {})) + missing_cols = { + name: info.get("missing_columns", []) + for name, info in tables.items() + if info.get("exists") and info.get("missing_columns") + } + + lines: list[str] = [ + "Track D dataset schema check failed.", + f"Dataset: {dataset}", + f"Data directory: {datadir}", + "", + ] + + if missing_tables: + lines += ["Missing CSV files:", *[f" - {n}" for n in missing_tables], ""] + + if missing_cols: + lines += ["CSV files with missing required columns:"] + for name, cols in sorted(missing_cols.items()): + lines.append(f" - {name}: missing {', '.join(map(str, cols))}") + lines.append("") + + lines += [ + "Fix: ensure the required CSVs exist and match the Track D headers.", + "Tip: compare your exported CSV headers against the downloads in the workbook docs.", + ] + + raise TrackDSchemaError("\n".join(lines)) diff --git a/tests/test_trackd_schema_required_columns.py b/tests/test_trackd_schema_required_columns.py new file mode 100644 index 0000000..51f2cd8 --- /dev/null +++ b/tests/test_trackd_schema_required_columns.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from pystatsv1.trackd._errors import TrackDSchemaError +from pystatsv1.trackd.schema import CONTRACT_TABLES, DATASET_NSO_V1, assert_schema, validate_schema + + +def _write_csv(path: Path, header: list[str]) -> None: + path.write_text(",".join(header) + "\n", encoding="utf-8") + + +def test_validate_schema_reports_missing_tables_and_columns(tmp_path: Path) -> None: + # Create one required file but with missing required columns. + # chart_of_accounts requires: account_id, account_name, account_type, normal_side + _write_csv( + tmp_path / "chart_of_accounts.csv", + header=["account_id", "account_name", "account_type"], + ) + + # Create another file that is OK. + _write_csv( + tmp_path / "gl_journal.csv", + header=list(CONTRACT_TABLES["gl_journal"].required_columns), + ) + + report = validate_schema(tmp_path, dataset=DATASET_NSO_V1) + + assert report["ok"] is False + + # Missing file list should include at least one known table. + assert "trial_balance_monthly.csv" in report["missing_tables"] + + chart = report["tables"]["chart_of_accounts.csv"] + assert chart["exists"] is True + assert "normal_side" in chart["missing_columns"] + + gl = report["tables"]["gl_journal.csv"] + assert gl["exists"] is True + assert gl["missing_columns"] == [] + + +def test_assert_schema_raises_single_friendly_error(tmp_path: Path) -> None: + # Leave datadir empty so we get missing-table errors. + with pytest.raises(TrackDSchemaError) as ei: + assert_schema(tmp_path, dataset=DATASET_NSO_V1) + + msg = str(ei.value) + assert "Missing CSV files" in msg + assert "chart_of_accounts.csv" in msg + assert "Dataset: nso_v1" in msg diff --git a/workbooks/track_d_template/scripts/_business_schema.py b/workbooks/track_d_template/scripts/_business_schema.py index 16a58d8..d40c61c 100644 --- a/workbooks/track_d_template/scripts/_business_schema.py +++ b/workbooks/track_d_template/scripts/_business_schema.py @@ -1,190 +1,27 @@ -# SPDX-License-Identifier: MIT -"""Schema contracts for Track D business datasets.""" +"""Backwards-compatible shim for Track D schema helpers. 
-from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import pandas as pd - -DATASET_NSO_V1 = "nso_v1" +The shipped Track D workbook template imports ``scripts._business_schema``. +To keep all existing chapter runners working without edits, this file remains +as the import surface, but the implementation now lives in +``pystatsv1.trackd.schema``. +""" +from __future__ import annotations -@dataclass(frozen=True) -class TableSchema: - name: str - required_columns: tuple[str, ...] - - -CONTRACT_TABLES: dict[str, TableSchema] = { - "chart_of_accounts": TableSchema( - name="chart_of_accounts.csv", - required_columns=("account_id", "account_name", "account_type", "normal_side"), - ), - "gl_journal": TableSchema( - name="gl_journal.csv", - required_columns=("txn_id", "date", "doc_id", "description", "account_id", "debit", "credit"), - ), - "trial_balance_monthly": TableSchema( - name="trial_balance_monthly.csv", - required_columns=( - "month", - "account_id", - "account_name", - "account_type", - "normal_side", - "debit", - "credit", - "ending_side", - "ending_balance", - ), - ), - "statements_is_monthly": TableSchema( - name="statements_is_monthly.csv", - required_columns=("month", "line", "amount"), - ), - "statements_bs_monthly": TableSchema( - name="statements_bs_monthly.csv", - required_columns=("month", "line", "amount"), - ), - "statements_cf_monthly": TableSchema( - name="statements_cf_monthly.csv", - required_columns=("month", "line", "amount"), - ), - "inventory_movements": TableSchema( - name="inventory_movements.csv", - required_columns=("month", "txn_id", "date", "sku", "movement_type", "qty", "unit_cost", "amount"), - ), - "fixed_assets": TableSchema( - name="fixed_assets.csv", - required_columns=( - "asset_id", - "asset_name", - "in_service_month", - "cost", - "useful_life_months", - "salvage_value", - "method", - ), - ), - "depreciation_schedule": TableSchema( - name="depreciation_schedule.csv", - required_columns=("month", "asset_id", "dep_expense", "accum_dep", "net_book_value"), - ), - # Chapter 5 - "payroll_events": TableSchema( - name="payroll_events.csv", - required_columns=( - "month", - "txn_id", - "date", - "event_type", - "gross_wages", - "employee_withholding", - "employer_tax", - "cash_paid", - "wages_payable_delta", - "payroll_taxes_payable_delta", - ), - ), - "sales_tax_events": TableSchema( - name="sales_tax_events.csv", - required_columns=( - "month", - "txn_id", - "date", - "event_type", - "taxable_sales", - "tax_amount", - "cash_paid", - "sales_tax_payable_delta", - ), - ), - "debt_schedule": TableSchema( - name="debt_schedule.csv", - required_columns=("month", "loan_id", "txn_id", "beginning_balance", "payment", "interest", "principal", "ending_balance"), - ), - "equity_events": TableSchema( - name="equity_events.csv", - required_columns=("month", "txn_id", "date", "event_type", "amount"), - ), - "ap_events": TableSchema( - name="ap_events.csv", - required_columns=("month", "txn_id", "date", "vendor", "invoice_id", "event_type", "amount", "ap_delta", "cash_paid"), - ), - # Chapter 6 - "ar_events": TableSchema( - name="ar_events.csv", - required_columns=("month", "txn_id", "date", "customer", "invoice_id", "event_type", "amount", "ar_delta", "cash_received"), - ), - "bank_statement": TableSchema( - name="bank_statement.csv", - required_columns=("month", "bank_txn_id", "posted_date", "description", "amount", "gl_txn_id"), - ), -} - -NSO_V1_TABLE_ORDER: tuple[str, ...] 
= ( - "chart_of_accounts", - "gl_journal", - "trial_balance_monthly", - "statements_is_monthly", - "statements_bs_monthly", - "statements_cf_monthly", - "inventory_movements", - "fixed_assets", - "depreciation_schedule", - # Chapter 5 - "payroll_events", - "sales_tax_events", - "debt_schedule", - "equity_events", - "ap_events", - # Chapter 6 - "ar_events", - "bank_statement", +from pystatsv1.trackd.schema import ( + DATASET_NSO_V1, + NSO_V1, + TableSchema, + assert_schema, + validate_schema, + validate_table_map, ) -NSO_V1_TABLES: tuple[TableSchema, ...] = tuple(CONTRACT_TABLES[k] for k in NSO_V1_TABLE_ORDER) - - -def schemas_for_dataset(dataset: str) -> tuple[TableSchema, ...]: - if dataset == DATASET_NSO_V1: - return NSO_V1_TABLES - raise ValueError(f"Unknown dataset: {dataset}") - - -def validate_schema(datadir: Path, dataset: str) -> dict[str, Any]: - """Validate presence + required columns. Returns a report dict.""" - report: dict[str, Any] = { - "dataset": dataset, - "datadir": str(datadir), - "missing_tables": [], - "tables": {}, - "ok": True, - } - - for schema in schemas_for_dataset(dataset): - table_path = datadir / schema.name - if not table_path.exists(): - report["missing_tables"].append(schema.name) - report["tables"][schema.name] = { - "exists": False, - "missing_columns": list(schema.required_columns), - } - report["ok"] = False - continue - - df = pd.read_csv(table_path) - cols = set(map(str, df.columns)) - missing = [c for c in schema.required_columns if c not in cols] - report["tables"][schema.name] = { - "exists": True, - "n_rows": int(df.shape[0]), - "missing_columns": missing, - } - if missing: - report["ok"] = False - - return report +__all__ = [ + "DATASET_NSO_V1", + "NSO_V1", + "TableSchema", + "validate_schema", + "validate_table_map", + "assert_schema", +]
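
Usage sketch: how a chapter runner or notebook might call the new helpers
(the data directory below is illustrative, not a path this patch creates):

    from pathlib import Path

    from pystatsv1.trackd._errors import TrackDSchemaError
    from pystatsv1.trackd.schema import DATASET_NSO_V1, assert_schema, validate_schema

    datadir = Path("data/nso_v1")  # hypothetical export directory

    # Inspect the full report dict...
    report = validate_schema(datadir, dataset=DATASET_NSO_V1)
    for name, info in report["tables"].items():
        if info.get("missing_columns"):
            print(f"{name}: missing columns {info['missing_columns']}")

    # ...or fail fast with one friendly summary message.
    try:
        assert_schema(datadir, dataset=DATASET_NSO_V1)
    except TrackDSchemaError as exc:
        print(exc)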