From fb9e6d9f6ade2108b0fe1510d9861c6358b020e0 Mon Sep 17 00:00:00 2001 From: Nicholas Karlson Date: Mon, 19 Jan 2026 22:44:41 -0800 Subject: [PATCH] Track D: make template _business_etl a shim + rebuild workbook zip --- src/pystatsv1/assets/workbook_track_d.zip | Bin 161705 -> 159115 bytes .../test_trackd_business_schema_shim_smoke.py | 23 ++ .../track_d_template/scripts/_business_etl.py | 260 ++---------------- 3 files changed, 45 insertions(+), 238 deletions(-) diff --git a/src/pystatsv1/assets/workbook_track_d.zip b/src/pystatsv1/assets/workbook_track_d.zip index de3c49cacc74172b9e1ca1e17dc7116d57f9abed..a0576d0f1ea67b787d61fd117ff58c963e643078 100644 GIT binary patch delta 1258 zcmY+CX;4#F6oB(ih-|@xB!r4A0v5`eKoA8xwu7{27+TN*!%}1ogf;9~ZIMtwskA_Z zGeSqCQ@RL<)+tymi=wtJ14t_^kct*mic_eTfldVJeSsO{dq2+k&N<(`Z|;5H&KZ6( zZg@st-^i!ApIJmP823PBs&8GK5fYfD7IS+ZhqDJ-%HH4EQ5GYW8m8|)xkDhWNjhN3 zX*F*M>74E87>TJ!jIiqc?V#jBY16xly$kv?w*oFJ<239ytSwqjozkRgCa-^4 zviPCmwn4tI#4$%)t{3F&rS9)~$l#A1K9bhbAg{S{xWDyJ!SGq*`vyw);z)agsrX%& zhdR}cm6yMt^;8AOs^Z5x!-5Xk$Ll4y-P;#$yTV>h{32F)tXuSN{gmV*yW@VszLJj% zH}O4=zBlHOSN^(7KTG`(9(grb7Y>;Xvbv^kux`{|D&;tynOCx>LdUNiH@+llO7+R| z`V}i5_)Dvtwheu7tCpYIT~#c;Td}1|)DY#=s3}!G?Xfw*yY`c-2f7J=E=qV*{7qB) zr)3iPk?0@XRaWz{SEfrelFZ?ydS{2`xR}jHzuS{`t9Dw6*9RKT<#vp0T^jvyuT9P~9g>CZPFJa^M%Ftl-Tq1A2c(}n#;&jOb9g9Dj*0-Q*K4W*O3 zib)>THOYhCJvtYJZAlOYUmK|I^DJ?b44k2fM{)ti%0L3v0<4q4cDQMV!!qy#b8}UQ ztrhAifU7!Y$7M6to=z&|71fhSq3XQdMyyUItCYM-GY%&cYeJ&!iFR1xt`yRX_SbUT zJ9gMcb5}6o?b=n*UJe$}7q1=QW^Bg&axyTSOfz1TLns)gYkLs|9vG2IdVWf6uQ3$@ zAh}-Ee9{FuX{0{WPI95j_q-z((VTHr%iV*vI8JlQ1kGrb4pH#doHnmchutu;qRkzQ zC`&ZS0B7bQ9)+exlo<|LQWhwuON!UlN`0P`8~#q0b_T4KEHWX6sTa3K)tL|pzZ^iZ z0)%K^PPw9c7TNr4-D*XV1$!Z&a}~yENE)Ca3ECeZDj*m{v-p<+{F&bK6lOmsA0W*p zyQ%-9urM2ZIL;c1k(-bsSLEf262+cKDS(fiEZ~~FxSGmm6wZ-h=Hn#_U2;gW?>U9Z zB;lx6ban4lU1kirH|tB`-5l_R&j2lQNnB=P7>S%{7QEQnL%iyaa$j=W*Wm!|tT#ZT zJQDK_@of@0e;DbyUaRgF;Y+R|8+&NS(*!kij9QQ0`NTEl>bl!j-P(Ne+gvrp(_{j| z`8Y#IwE!Ioh>;}JF+A4}Nm)RQf6Va&nSkgGI932bAaz5hLXa|_gaf+9gOxBikhP9J P$v4Sc!)Gw2B7o5UMWo6L delta 3704 zcmYk8cRbbKAII|{`*MxUi)<+(T?seawX^57Wn@G|=0!%s<(i+6ac?Sd?Y$!0+;6h8 zr9xIR%F6sv{qZ~hJYTQZ`>e;~obz~`CzCX<*J#SAsA!={(2xweGiR6tQkkfD(IuRd zDU-EZw)TGS^{fY(mU?uD#!WNjo3y`=_@E{kdt3tRp)}fyDUo{ErVN?YIkkzrZg3NZ z(ptyA%Qdg6VkUb=*Lw(y0Ltu8$B1wdT1oiLybM&Wljwban!da@b|rYRz4#nUIkkz|QL9n`*_OYMWlFoZn>wu_|ClpbG zxBSsC2Pw{<`NsPjT4{zT2^gi8>|UnW=uy!WYZxEaz4TPU6_sY}F?9-mejEtV=+GU&%NgJAo%?SP zIsZik9@M0RaA)pi7UBj} zeR^c7v>nH584R;2vwAUFCFO&7r~OYc8?w%u)$=7038MmL`{eg8Gtx5R5s?i~j+n*Y z*5Xxf!yKmr@Cby@E&}91y0|%%oC(Olej=DRX%1r-ni#f-zhLU#LhlPqOUzy=sErtz z>ap0Y8n(LJ#Ip4#OwF?dTbE4T86QTn!lxG&!9=}``bc!w-5Rv@A+GD3U}+G_$L<=> zO>+tl{3%Ix7INW%*))Bs=P@~#G zFlbm^&VuX4@8uY?{$OnM9tkMno~xTvAlalpCzi{Oz%Ulv6lEX!CoM7#YuFuGsF z+i~i?#lBe2FKGSeTrH1iiP_Oe?OGNyVXLrYTOrONF>79K>qmkXQrmAN?2o^ZzKXhQMj zVb_z1D%aelU(AQ25+S0zm-Z`ajMkbnqq6LuBLlZSu@s`mH#j=0P-y{|kA;$3x{jh^ zevgG94N4az9m^CNqkbc*X!jKTR)wb>9m}=w--_jYjjVicyjNUDJ=HY;Z7wtSsXXAaODdr{LdlRO7hjms*VwhS8 z&^v>X%2PftYS*Ehy~7~%+jdh>Amq;d)w<5!6@frj&-pNPWB?LI?v^B1lfQehlKNOi zhK(1!ll1hHRHMkcC&vh26J_t+U!PhNg)H?+VPazCIo7D8sLfiLEFpM(lnz|E13OBu zE9Ao)pPR-b`(*gan^5c+xs*Fp=fBQO(XDV+OOiX9Fe7U5wW@cse|27s%+6;uyn4oJ z+*Qlf4XTkBkr!c&6wG4Hpzp^bv;{(^-}#{ZA!(a{s_e@g^1DsE;^)mcpuzM3pNEcV zCHu9?n7o=l4^>i?lG2_R1QmYwH@bITA?BMQcgAh`H`uP*#iW2xos!XlIr!5T2u=%2 zgKpB`ULxK5#E)WHv5!S-Tr5BT6k%}T_MM<>$CGxm?dlh)msak6{~aEBy`}O(@)evI z8s7Oad!_Mx4AvN-H5XeoA*7l^c-M=o;2=rw-9vTt5)5g6*j%R>nq%TRXS52!7Rp3P*nQ5N zyiv`2%oVyvHIn9mTNd1QopZDllA0v5U6r>&O&jb*+gMEP9W4fxB8GfJn@BT1yk&u1 zKz?~wE9`8lj&UBu1$gM7(;c;KhFiPuTJpGITUAtMi8Mpp{?QznYvW6=U^dL7@*}Ki zW-?7P-PUnp4X2X?8K^0{n;LrqT|hMFfG6-=l$vCp*t7hZL2J{n0R23p=ELWc%2<>y z+CDK!nE@e0vevo&26zBSBC(1X!o{}~{#rq9${}+z({aA#O@*py+T!ZM& z@wPt)1B!efa1@;C!LA8@+}TVxcPK(y(|tvWo7LuHb`>Rm*D;5UA^t2t%hT|Q9tNT1 z3w1U77J3;!Fi3r!sP5xKW@_7^Djy_i#}^wt6Ikf;AVpJd~-uBQ@Tg$Kf$Z61TA!B%~mR>X=Vf7uSzXm|E7+_lh=Yy2`Xy8 z4GqhAil{9pYQl;?<``T--=AP!M%HhVn8iv&eFm5u-FV99C1eQSGboMp8(@tQD-!W{ zyR}WEE?r>U%q?1DPu&q#FQvqv<3crNN_5I*zLI<(I0<8r?FOu~ zxTQI+TD|E8rmIMsZ{ErJSV}VnrwB|iS9A?-Xlv++ObxW%Zh%A3tm$KtG`5U5E5dLY z*cHz;xD=g*#pI*Ws>gRtt}!~l!UgIy$GQ3Jg<5x9bKPx<0|WP-&;>>-vZ+6%z_!_! zT=cRcy1dmlSpGC>QtdC4UrB=WE(~+yuY@P(HH5<6{hXyM$}9JMx;_&v{D{Q0(RIcd zqOI;RRK9+0`?keP1DX0oKbvH$FN5Ydh*Nmf;n&Qf6haT0e$T{aBiOtj-fJ)*Daumk zg_CXBT%%HKXM9b2KDRnt_cHW-vfMn*!x)Kqo~Aqe4)B7tss7g^KA^a>AGbE3rn?^z zN8%2M2)j*}Qeg|xPFS59j;Y9#p#80@HQj`AnI<;Nq~}Y}_Ru{%_N#6=Sp7n)yI7j@ z>Vm?Znn#Q1I~GT0+g4%$h<-1_7xvvsvq+O}j$OT;?>}2j*rM}gXC8M)DEaMkv>eu+ zJOtx$?TweSekmNtyCnCpv+N9S*s|>Ww+IG)7=a0Wk*hsDSLS3_t;VjHsH}4mvS~1m zsN!w-5T4pz$tl1M<51ZmRFO(Ep!?<#mG$$%e8wVlv5wBtaWy+XTK1kihP@}Ia6#ai zp`6J4AW5=0!d9`M+d;#K=v6$+8w#|>_XY3S1Yu+@KZI^}o&U&l-w$~tOUT@cG{3Q~ z6^Ct|h1~l7rL%q&v$YphY@2_7xTC9w?fU2rj>6%tMrvzn~w1Jod#CX z14*tN7-ZjigWnPozh?;VIUib)7^`^wnyiB21ovL0fJit1JX_+^G>iY{gqKIM&A!35 z54+ChUXjy0+v3?+lf|IPE;HADar-iV)~qz+G(Jj9BEo=8nrN2CT?jsNdU4<_0G<^Yiy z%(nqp!95$`3eW=kV^aH=X+T{RAPy+yxLumd!K5Q9G^HBnsvOtd?hYS{X3wcif72~2x|{PutguITQ(>i|pta*k$0vP%d>)Y> zY&rFDOMm(vJDzOO`S*#2gCu(ZnmTt3g`D}!F+<`M-M=s#`$7z5CZU7XlbOE@?H)8)ptRO`gnFI*E zbuvR=-oI!jolG1IJ&kU){fnA!0jA^+#{M}`SHKYHUIT+qslN+$y8`Nf=@H1}256A) z0RU*_23#lCq5#13Q&D&p00&M54HW=F+)qR}H2`X#iai7 Pc+VSvQl8v|lT-O07WcaG diff --git a/tests/test_trackd_business_schema_shim_smoke.py b/tests/test_trackd_business_schema_shim_smoke.py index 78cec48..ece5458 100644 --- a/tests/test_trackd_business_schema_shim_smoke.py +++ b/tests/test_trackd_business_schema_shim_smoke.py @@ -5,7 +5,9 @@ import pytest import scripts._business_schema as shim +import scripts._business_etl as etl_shim from pystatsv1.trackd._errors import TrackDSchemaError +from pystatsv1.trackd import etl as trackd_etl from pystatsv1.trackd import schema as trackd_schema @@ -15,6 +17,14 @@ def test_business_schema_shim_exports_trackd_schema() -> None: assert shim.assert_schema is trackd_schema.assert_schema +def test_business_etl_shim_exports_trackd_etl() -> None: + # The shim should re-export the package implementation (same function objects). + assert etl_shim.prepare_gl_tidy is trackd_etl.prepare_gl_tidy + assert etl_shim.build_gl_tidy_dataset is trackd_etl.build_gl_tidy_dataset + assert etl_shim.prepare_gl_monthly_summary is trackd_etl.prepare_gl_monthly_summary + assert etl_shim.analyze_gl_preparation is trackd_etl.analyze_gl_preparation + + def test_business_schema_shim_validate_schema_report_shape(tmp_path: Path) -> None: report = shim.validate_schema(tmp_path, dataset=trackd_schema.DATASET_NSO_V1) @@ -42,3 +52,16 @@ def test_track_d_template_business_schema_is_a_shim() -> None: assert "pystatsv1.trackd.schema" in text assert "validate_schema" in text assert "assert_schema" in text + + +def test_track_d_template_business_etl_is_a_shim() -> None: + root = Path(__file__).resolve().parents[1] + template = root / "workbooks" / "track_d_template" / "scripts" / "_business_etl.py" + assert template.exists() + + text = template.read_text(encoding="utf-8") + assert "pystatsv1.trackd.etl" in text + assert "prepare_gl_tidy" in text + assert "build_gl_tidy_dataset" in text + assert "prepare_gl_monthly_summary" in text + assert "analyze_gl_preparation" in text diff --git a/workbooks/track_d_template/scripts/_business_etl.py b/workbooks/track_d_template/scripts/_business_etl.py index eaa8032..8d08691 100644 --- a/workbooks/track_d_template/scripts/_business_etl.py +++ b/workbooks/track_d_template/scripts/_business_etl.py @@ -1,245 +1,29 @@ # SPDX-License-Identifier: MIT -"""ETL helpers for Track D (Business). +"""Backwards-compatible shim for Track D ETL helpers. -Chapter 7: preparing accounting data for analysis. - -This module provides small, testable transformations that turn the synthetic -general ledger into analysis-friendly (“tidy”) tables. - -Core idea: -- A raw GL export typically has *two amount columns* (debit, credit). -- Many analytic workflows prefer a *single signed amount* column. -- "Signed" here means "positive when the account increases on its normal side". - (Assets/expenses normally increase with debits; liabilities/equity/revenue - normally increase with credits.) +The shipped Track D workbook template imports ``scripts._business_etl``. +To keep all existing chapter runners working without edits, this file remains +as the import surface, but the implementation now lives in +``pystatsv1.trackd.etl``. """ from __future__ import annotations -from dataclasses import dataclass -from typing import Any - -import numpy as np -import pandas as pd - - -@dataclass(frozen=True) -class GLPrepOutputs: - """Outputs for Chapter 7 ETL.""" - - gl_tidy: pd.DataFrame - gl_monthly_summary: pd.DataFrame - summary: dict[str, Any] - - -def _to_float(series: pd.Series) -> pd.Series: - return pd.to_numeric(series, errors="coerce").fillna(0.0).astype(float) - - -def prepare_gl_tidy(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> pd.DataFrame: - """Return a line-level tidy GL dataset. - - Parameters - ---------- - gl_journal: - Raw journal export with debit/credit columns. - chart_of_accounts: - COA mapping account_id -> account_name/account_type/normal_side. - - Returns - ------- - pd.DataFrame - A normalized table with one row per GL line, plus: - - joined account labels - - parsed dates + a month key - - `raw_amount = debit - credit` (debit-positive convention) - - `amount` where sign is aligned to the account's normal side - """ - - gl = gl_journal.copy() - coa = chart_of_accounts.copy() - - gl["account_id"] = gl["account_id"].astype(str) - coa["account_id"] = coa["account_id"].astype(str) - - coa_cols = ["account_id", "account_name", "account_type", "normal_side"] - out = gl.merge( - coa[coa_cols], - on="account_id", - how="left", - validate="many_to_one", - suffixes=("", "_coa"), - ) - - # If the GL already carried labels, keep them; otherwise fill from the COA. - for col in ("account_name", "account_type", "normal_side"): - rhs = f"{col}_coa" - if rhs in out.columns: - if col in out.columns: - out[col] = out[col].where(out[col].notna() & (out[col].astype(str) != ""), out[rhs]) - else: - out[col] = out[rhs] - out = out.drop(columns=[rhs]) - - - out["doc_id"] = out["doc_id"].astype(str) - out["description"] = out["description"].astype(str) - - out["date"] = pd.to_datetime(out["date"], errors="coerce") - out["month"] = out["date"].dt.strftime("%Y-%m") - - out["debit"] = _to_float(out.get("debit", 0.0)) - out["credit"] = _to_float(out.get("credit", 0.0)) - - out["dc"] = np.where(out["debit"] > 0, "D", np.where(out["credit"] > 0, "C", "")) - - # Debit-positive convention - out["raw_amount"] = out["debit"] - out["credit"] - - # Signed-by-normal-side: positive means "account increased" - normal = out["normal_side"].astype(str).str.lower() - out["amount"] = np.where(normal.eq("credit"), -out["raw_amount"], out["raw_amount"]) - - # Stable row ids (helpful for downstream joins) - out = out.sort_values(["date", "txn_id", "account_id"], kind="mergesort").reset_index(drop=True) - out["line_no"] = out.groupby("txn_id").cumcount() + 1 - out["gl_line_id"] = out["txn_id"].astype(str) + "-" + out["line_no"].astype(str) - - cols = [ - "gl_line_id", - "txn_id", - "line_no", - "date", - "month", - "doc_id", - "description", - "account_id", - "account_name", - "account_type", - "normal_side", - "dc", - "debit", - "credit", - "raw_amount", - "amount", - ] - - # Keep any extra columns at the end (future-proof) - extra = [c for c in out.columns if c not in cols] - return out[cols + extra] - - -def build_gl_tidy_dataset(gl: pd.DataFrame, coa: pd.DataFrame) -> pd.DataFrame: - """Backward-compatible alias for :func:`prepare_gl_tidy`. - - Chapter 8 imports ``build_gl_tidy_dataset``. - Chapter 7 uses the canonical name ``prepare_gl_tidy``. - """ - - return prepare_gl_tidy(gl, coa) - - -def prepare_gl_monthly_summary(gl_tidy: pd.DataFrame) -> pd.DataFrame: - """Monthly rollup of tidy GL. - - Produces one row per (month, account) with debit/credit totals and a - signed net change (`net_change`) aligned to the account's normal side. - """ - - g = gl_tidy.copy() - - group_cols = ["month", "account_id", "account_name", "account_type", "normal_side"] - out = ( - g.groupby(group_cols, dropna=False) - .agg( - n_lines=("gl_line_id", "count"), - debit=("debit", "sum"), - credit=("credit", "sum"), - net_change=("amount", "sum"), - ) - .reset_index() - ) - - out["debit"] = out["debit"].astype(float) - out["credit"] = out["credit"].astype(float) - out["net_change"] = out["net_change"].astype(float) - - return out.sort_values(["month", "account_id"], kind="mergesort").reset_index(drop=True) - - - - -def build_data_dictionary() -> dict[str, str]: - """A lightweight data dictionary for the Chapter 7 output tables. - - This is intentionally small and human-readable (useful for docs + downstream - notebooks). It is *not* intended to be a formal metadata standard. - """ - - return { - # Keys used in gl_tidy.csv - "gl_line_id": "Stable line identifier (txn_id-line_no).", - "txn_id": "Journal transaction id (groups debit/credit lines for one event).", - "line_no": "Line number within txn_id (1..k).", - "date": "Journal posting date (YYYY-MM-DD).", - "month": "Month key derived from date (YYYY-MM).", - "doc_id": "Source document id (invoice, payroll run, bank transfer, etc.).", - "description": "Text description from the journal.", - "account_id": "Chart-of-accounts id.", - "account_name": "Chart-of-accounts account name.", - "account_type": "High-level account class (Asset, Liability, Equity, Revenue, Expense).", - "normal_side": "Normal balance side for the account (debit or credit).", - "debit": "Debit amount for the line (0 if none).", - "credit": "Credit amount for the line (0 if none).", - "dc": "D if debit>0, C if credit>0, blank if both are 0.", - "raw_amount": "Single-column amount in debit-positive convention: debit - credit.", - "amount": "Signed amount aligned to the account's normal side (positive means the account increased).", - # Keys used in gl_monthly_summary.csv - "n_lines": "Number of GL lines aggregated into the month/account group.", - "net_change": "Sum of `amount` in the month/account group.", - } - - -def analyze_gl_preparation(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> GLPrepOutputs: - """Compute Chapter 7 outputs + a small QC summary.""" - - gl_tidy = prepare_gl_tidy(gl_journal, chart_of_accounts) - monthly = prepare_gl_monthly_summary(gl_tidy) - - n_lines = int(len(gl_tidy)) - n_txns = int(gl_tidy["txn_id"].nunique()) if n_lines else 0 - n_missing_accounts = int(gl_tidy["account_name"].isna().sum()) - n_bad_dates = int(gl_tidy["date"].isna().sum()) - - # Basic accounting invariant: sum of raw debit-positive amounts should be ~0 - raw_total = float(gl_tidy["raw_amount"].sum()) if n_lines else 0.0 - gl_balances = bool(abs(raw_total) < 1e-6) - - summary: dict[str, Any] = { - "checks": { - "gl_balances_raw_amount_sum_zero": gl_balances, - "coa_join_coverage_ok": n_missing_accounts == 0, - "no_missing_coa_mappings": n_missing_accounts == 0, - "all_gl_dates_parse": n_bad_dates == 0, - "no_unparseable_dates": n_bad_dates == 0, - }, - "metrics": { - "n_gl_lines": n_lines, - "n_txns": n_txns, - "n_accounts": int(gl_tidy["account_id"].nunique()) if n_lines else 0, - "n_months": int(gl_tidy["month"].nunique()) if n_lines else 0, - "n_missing_coa_mappings": n_missing_accounts, - "n_bad_dates": n_bad_dates, - "raw_amount_sum": raw_total, - }, - "data_dictionary": build_data_dictionary(), - "notes": { - "amount_definition": ( - "amount is signed so positive means the account increased on its normal side; " - "raw_amount uses debit-positive convention (debit - credit)." - ) - }, - } - - return GLPrepOutputs(gl_tidy=gl_tidy, gl_monthly_summary=monthly, summary=summary) +from pystatsv1.trackd.etl import ( + GLPrepOutputs, + analyze_gl_preparation, + build_data_dictionary, + build_gl_tidy_dataset, + prepare_gl_monthly_summary, + prepare_gl_tidy, +) + +__all__ = [ + "GLPrepOutputs", + "prepare_gl_tidy", + "build_gl_tidy_dataset", + "prepare_gl_monthly_summary", + "build_data_dictionary", + "analyze_gl_preparation", +]