def _zip_payload_hashes(zip_path: Path) -> dict[str, str]:
    """Map each file member of *zip_path* to the sha256 of its decompressed bytes.

    Only payload bytes are hashed, so ZIP metadata (timestamps, compression
    level, permission bits) never influences the result. Directory entries
    (names ending in ``/``) are skipped. Keys are inserted in sorted order.
    """
    with ZipFile(zip_path, "r") as archive:
        members = sorted(
            member for member in archive.namelist() if not member.endswith("/")
        )
        return {
            member: hashlib.sha256(archive.read(member)).hexdigest()
            for member in members
        }


def test_workbook_track_d_zip_is_current() -> None:
    """Guardrail: committed Track D workbook ZIP must match the template source."""
    repo_root = Path(__file__).resolve().parents[1]
    template_dir = repo_root / "workbooks" / "track_d_template"
    committed_zip = repo_root / "src" / "pystatsv1" / "assets" / "workbook_track_d.zip"
    builder = repo_root / "tools" / "build_workbook_zip.py"

    # Fail early with an actionable message if any required input is missing.
    assert template_dir.exists(), (
        f"Missing template source directory: {template_dir}. "
        "Did you forget to apply the Track D template source-of-truth patch?"
    )
    assert committed_zip.exists(), f"Missing committed ZIP: {committed_zip}"
    assert builder.exists(), f"Missing ZIP builder script: {builder}"

    with tempfile.TemporaryDirectory() as scratch:
        built_zip = Path(scratch) / "workbook_track_d.zip"
        # Rebuild from the template with the same interpreter running the tests.
        command = [
            sys.executable,
            str(builder),
            "--src",
            str(template_dir),
            "--dest",
            str(built_zip),
        ]
        subprocess.run(command, check=True)

        built = _zip_payload_hashes(built_zip)
        committed = _zip_payload_hashes(committed_zip)

    assert built == committed, (
        "Committed Track D workbook ZIP is stale or mismatched vs template source-of-truth.\n\n"
        "Fix: run `python tools/build_workbook_zip.py` and commit the updated ZIP."
    )
_FIXED_ZIP_DT = (2020, 1, 1, 0, 0, 0)
_SKIP_BASENAMES = {".DS_Store", "Thumbs.db"}
# ZIP "host system" code for Unix. Pinned so the permission bits stored in
# ``external_attr`` mean the same thing no matter which OS built the archive.
_ZIP_CREATE_SYSTEM_UNIX = 3


def _iter_files(src_dir: Path) -> Iterable[tuple[Path, str]]:
    """Yield ``(path, zip_relpath)`` for every archivable file under *src_dir*.

    OS junk files (``.DS_Store``, ``Thumbs.db``) and anything inside a
    ``__pycache__`` directory *below src_dir* are skipped. Results are ordered
    by the components of the path relative to *src_dir*, compared
    case-sensitively, so the order does not depend on the platform's ``Path``
    comparison rules.
    """
    entries: list[tuple[tuple[str, ...], Path, str]] = []
    for path in src_dir.rglob("*"):
        if not path.is_file() or path.name in _SKIP_BASENAMES:
            continue
        rel = path.relative_to(src_dir)
        # Check only the relative parts: a parent directory of src_dir that
        # happens to be called __pycache__ must not exclude the whole template.
        if "__pycache__" in rel.parts:
            continue
        entries.append((rel.parts, path, rel.as_posix()))

    for _, path, arc_path in sorted(entries, key=lambda entry: entry[0]):
        yield path, arc_path


def _normalized_mode(fs_path: Path) -> int:
    """Return 0o755 for owner-executable files and 0o644 for everything else.

    Raw ``st_mode`` bits vary with umask and OS, which would make the archive
    bytes differ between machines even when the template is unchanged.
    """
    # 0o100 is the owner-execute bit.
    return 0o755 if fs_path.stat().st_mode & 0o100 else 0o644


def build_zip(src_dir: Path, dest_zip: Path) -> list[str]:
    """Build a deterministic ZIP from *src_dir* and write it to *dest_zip*.

    Every member gets a fixed timestamp, DEFLATE compression, a pinned Unix
    host-system code and normalized permission bits, so the same template
    content gives the same member metadata on any machine.

    Parameters
    ----------
    src_dir:
        Template directory to archive.
    dest_zip:
        Output path. Parent directories are created as needed. If it lies
        inside *src_dir*, it is never archived into itself.

    Returns
    -------
    list[str]
        Archived member paths (POSIX style), in archive order.

    Raises
    ------
    FileNotFoundError
        If *src_dir* does not exist.
    NotADirectoryError
        If *src_dir* exists but is not a directory.
    ValueError
        If *src_dir* contains no archivable files.
    """
    if not src_dir.exists():
        raise FileNotFoundError(f"Template directory not found: {src_dir}")
    if not src_dir.is_dir():
        raise NotADirectoryError(f"Template path is not a directory: {src_dir}")

    # A previous build may have left dest_zip inside the template tree.
    dest_resolved = dest_zip.resolve()
    files = [
        (fs_path, arc_path)
        for fs_path, arc_path in _iter_files(src_dir)
        if fs_path.resolve() != dest_resolved
    ]
    if not files:
        raise ValueError(f"Template directory is empty: {src_dir}")

    dest_zip.parent.mkdir(parents=True, exist_ok=True)
    archived: list[str] = []
    with ZipFile(dest_zip, "w") as zf:
        for fs_path, arc_path in files:
            info = ZipInfo(arc_path, date_time=_FIXED_ZIP_DT)
            info.compress_type = ZIP_DEFLATED
            info.create_system = _ZIP_CREATE_SYSTEM_UNIX
            # Unix permission bits live in the high 16 bits of external_attr.
            info.external_attr = _normalized_mode(fs_path) << 16

            zf.writestr(info, fs_path.read_bytes(), compress_type=ZIP_DEFLATED)
            archived.append(arc_path)

    return archived


def _sha256(path: Path) -> str:
    """Return the hex sha256 digest of the file at *path*, read in 1 MiB chunks."""
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: build the workbook ZIP and print a one-line summary.

    Defaults to the Track D template directory and the committed asset path,
    both resolved relative to the repository root (the parent of ``tools/``).
    Returns the process exit code (always 0; failures raise).
    """
    root = Path(__file__).resolve().parents[1]
    default_src = root / "workbooks" / "track_d_template"
    default_dest = root / "src" / "pystatsv1" / "assets" / "workbook_track_d.zip"

    p = argparse.ArgumentParser(description="Build workbook ZIP templates from source directories.")
    p.add_argument("--src", type=Path, default=default_src, help=f"Template directory (default: {default_src})")
    p.add_argument("--dest", type=Path, default=default_dest, help=f"Destination ZIP path (default: {default_dest})")
    p.add_argument(
        "--list",
        action="store_true",
        help="Print archived file list after building.",
    )

    ns = p.parse_args(argv)

    archived = build_zip(ns.src, ns.dest)
    print(f"Wrote: {ns.dest} ({len(archived)} files, sha256={_sha256(ns.dest)[:12]}…)")
    if ns.list:
        for name in archived:
            print(name)

    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
PYTHON ?= python

# One wrapper target per Track D chapter script (scripts/d01.py ... scripts/d23.py).
TRACK_D_CHAPTERS := d01 d02 d03 d04 d05 d06 d07 d08 d09 d10 d11 d12 \
                    d13 d14 d15 d16 d17 d18 d19 d20 d21 d22 d23

.PHONY: help
help:
	@echo "Targets:"
	@echo "  setup    - (re)generate deterministic datasets under data/synthetic (seed=123)"
	@echo "  peek     - print a quick preview of Track D datasets"
	@echo "  test     - run workbook smoke tests"
	@echo "  d01..d23 - run a Track D chapter via wrapper (example: make d14)"

.PHONY: setup
setup:
	$(PYTHON) scripts/d00_setup_data.py

.PHONY: peek
peek:
	$(PYTHON) scripts/d00_peek_data.py

.PHONY: test
test:
	pytest -q

# make d14 -> runs scripts/d14.py
#
# This must be an explicit rule. GNU make skips implicit (pattern) rule search
# for .PHONY targets, so pairing ".PHONY: d01 ... d23" with a "d%:" pattern
# rule makes "make d14" do nothing. An explicit multi-target rule keeps the
# targets phony and still runs the recipe; $@ expands to the requested target.
.PHONY: $(TRACK_D_CHAPTERS)
$(TRACK_D_CHAPTERS):
	$(PYTHON) scripts/$@.py
+ +```bash +pystatsv1 workbook run d00_setup_data +# or (clean reset) +pystatsv1 workbook run d00_setup_data --force +``` + +## 4) Run a Track D chapter + +First, see the available Track D chapters: + +```bash +pystatsv1 workbook list --track d +``` + +Then run a chapter using the short wrapper names `d01` ... `d23`. +For example: + +```bash +pystatsv1 workbook run d01 +pystatsv1 workbook run d14 +pystatsv1 workbook run d23 +``` + +You can also run the full script names directly (same result): + +```bash +pystatsv1 workbook run business_ch01_accounting_measurement +pystatsv1 workbook run business_ch14_regression_driver_analysis +``` + +## 5) Check your environment (smoke test) + +```bash +pystatsv1 workbook check business_smoke +``` + +If you ever get stuck, see the PyStatsV1 docs on ReadTheDocs. diff --git a/workbooks/track_d_template/data/synthetic/.gitkeep b/workbooks/track_d_template/data/synthetic/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workbooks/track_d_template/data/synthetic/ledgerlab_ch01/.gitkeep b/workbooks/track_d_template/data/synthetic/ledgerlab_ch01/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workbooks/track_d_template/data/synthetic/nso_v1/.gitkeep b/workbooks/track_d_template/data/synthetic/nso_v1/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workbooks/track_d_template/requirements.txt b/workbooks/track_d_template/requirements.txt new file mode 100644 index 0000000..3f729bb --- /dev/null +++ b/workbooks/track_d_template/requirements.txt @@ -0,0 +1,14 @@ +# Track D workbook dependencies (optional) +# +# If you installed PyStatsV1 via: +# pip install pystatsv1[workbook] +# you already have everything you need. 
@dataclass(frozen=True)
class GLPrepOutputs:
    """Outputs for Chapter 7 ETL."""

    # Line-level tidy GL, as returned by prepare_gl_tidy.
    gl_tidy: pd.DataFrame
    # One row per (month, account), as returned by prepare_gl_monthly_summary.
    gl_monthly_summary: pd.DataFrame
    # QC checks, metrics, data dictionary and notes.
    summary: dict[str, Any]


def _to_float(series: pd.Series) -> pd.Series:
    """Coerce *series* to float; unparseable or missing values become 0.0."""
    return pd.to_numeric(series, errors="coerce").fillna(0.0).astype(float)


def prepare_gl_tidy(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> pd.DataFrame:
    """Return a line-level tidy GL dataset.

    Parameters
    ----------
    gl_journal:
        Raw journal export with debit/credit columns. A missing ``debit`` or
        ``credit`` column is treated as all zeros.
    chart_of_accounts:
        COA mapping account_id -> account_name/account_type/normal_side.

    Returns
    -------
    pd.DataFrame
        A normalized table with one row per GL line, plus:
        - joined account labels
        - parsed dates + a month key
        - `raw_amount = debit - credit` (debit-positive convention)
        - `amount` where sign is aligned to the account's normal side

    Raises
    ------
    pandas.errors.MergeError
        If ``chart_of_accounts`` has duplicate ``account_id`` values
        (the join is validated as many-to-one).
    """

    gl = gl_journal.copy()
    coa = chart_of_accounts.copy()

    # Join on string ids so "1000" and 1000 refer to the same account.
    gl["account_id"] = gl["account_id"].astype(str)
    coa["account_id"] = coa["account_id"].astype(str)

    coa_cols = ["account_id", "account_name", "account_type", "normal_side"]
    out = gl.merge(
        coa[coa_cols],
        on="account_id",
        how="left",
        validate="many_to_one",
        suffixes=("", "_coa"),
    )

    # If the GL already carried labels, keep them; otherwise fill from the COA.
    for col in ("account_name", "account_type", "normal_side"):
        rhs = f"{col}_coa"
        if rhs in out.columns:
            if col in out.columns:
                out[col] = out[col].where(out[col].notna() & (out[col].astype(str) != ""), out[rhs])
            else:
                out[col] = out[rhs]
            out = out.drop(columns=[rhs])

    out["doc_id"] = out["doc_id"].astype(str)
    out["description"] = out["description"].astype(str)

    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    out["month"] = out["date"].dt.strftime("%Y-%m")

    # A missing amount column means "no activity on that side". The previous
    # scalar fallback (`out.get(col, 0.0)`) crashed in _to_float, because a
    # plain float has no .fillna().
    for side in ("debit", "credit"):
        out[side] = _to_float(out[side]) if side in out.columns else 0.0

    out["dc"] = np.where(out["debit"] > 0, "D", np.where(out["credit"] > 0, "C", ""))

    # Debit-positive convention
    out["raw_amount"] = out["debit"] - out["credit"]

    # Signed-by-normal-side: positive means "account increased"
    normal = out["normal_side"].astype(str).str.lower()
    out["amount"] = np.where(normal.eq("credit"), -out["raw_amount"], out["raw_amount"])

    # Stable row ids (helpful for downstream joins)
    out = out.sort_values(["date", "txn_id", "account_id"], kind="mergesort").reset_index(drop=True)
    out["line_no"] = out.groupby("txn_id").cumcount() + 1
    out["gl_line_id"] = out["txn_id"].astype(str) + "-" + out["line_no"].astype(str)

    cols = [
        "gl_line_id",
        "txn_id",
        "line_no",
        "date",
        "month",
        "doc_id",
        "description",
        "account_id",
        "account_name",
        "account_type",
        "normal_side",
        "dc",
        "debit",
        "credit",
        "raw_amount",
        "amount",
    ]

    # Keep any extra columns at the end (future-proof)
    extra = [c for c in out.columns if c not in cols]
    return out[cols + extra]


def build_gl_tidy_dataset(gl: pd.DataFrame, coa: pd.DataFrame) -> pd.DataFrame:
    """Backward-compatible alias for :func:`prepare_gl_tidy`.

    Chapter 8 imports ``build_gl_tidy_dataset``.
    Chapter 7 uses the canonical name ``prepare_gl_tidy``.
    """

    return prepare_gl_tidy(gl, coa)


def prepare_gl_monthly_summary(gl_tidy: pd.DataFrame) -> pd.DataFrame:
    """Monthly rollup of tidy GL.

    Produces one row per (month, account) with debit/credit totals and a
    signed net change (`net_change`) aligned to the account's normal side.
    Groups with missing labels are kept (``dropna=False``), so lines with no
    COA match still appear in the rollup.
    """

    g = gl_tidy.copy()

    group_cols = ["month", "account_id", "account_name", "account_type", "normal_side"]
    out = (
        g.groupby(group_cols, dropna=False)
        .agg(
            n_lines=("gl_line_id", "count"),
            debit=("debit", "sum"),
            credit=("credit", "sum"),
            net_change=("amount", "sum"),
        )
        .reset_index()
    )

    out["debit"] = out["debit"].astype(float)
    out["credit"] = out["credit"].astype(float)
    out["net_change"] = out["net_change"].astype(float)

    return out.sort_values(["month", "account_id"], kind="mergesort").reset_index(drop=True)
def build_data_dictionary() -> dict[str, str]:
    """Return column descriptions for the Chapter 7 output tables.

    Covers the columns of the tidy GL and the monthly summary. The mapping is
    deliberately small and human-readable (handy for docs and notebooks); it
    is not meant to be a formal metadata standard.
    """

    tidy_columns = {
        # Keys used in gl_tidy.csv
        "gl_line_id": "Stable line identifier (txn_id-line_no).",
        "txn_id": "Journal transaction id (groups debit/credit lines for one event).",
        "line_no": "Line number within txn_id (1..k).",
        "date": "Journal posting date (YYYY-MM-DD).",
        "month": "Month key derived from date (YYYY-MM).",
        "doc_id": "Source document id (invoice, payroll run, bank transfer, etc.).",
        "description": "Text description from the journal.",
        "account_id": "Chart-of-accounts id.",
        "account_name": "Chart-of-accounts account name.",
        "account_type": "High-level account class (Asset, Liability, Equity, Revenue, Expense).",
        "normal_side": "Normal balance side for the account (debit or credit).",
        "debit": "Debit amount for the line (0 if none).",
        "credit": "Credit amount for the line (0 if none).",
        "dc": "D if debit>0, C if credit>0, blank if both are 0.",
        "raw_amount": "Single-column amount in debit-positive convention: debit - credit.",
        "amount": "Signed amount aligned to the account's normal side (positive means the account increased).",
    }
    summary_columns = {
        # Keys used in gl_monthly_summary.csv
        "n_lines": "Number of GL lines aggregated into the month/account group.",
        "net_change": "Sum of `amount` in the month/account group.",
    }
    return {**tidy_columns, **summary_columns}


def analyze_gl_preparation(gl_journal: pd.DataFrame, chart_of_accounts: pd.DataFrame) -> GLPrepOutputs:
    """Build the Chapter 7 tables and a small quality-control summary.

    Runs the tidy-GL and monthly-rollup steps, then records pass/fail checks
    (ledger balances, COA join coverage, date parsing) alongside the metrics
    they were derived from.
    """

    tidy = prepare_gl_tidy(gl_journal, chart_of_accounts)
    monthly = prepare_gl_monthly_summary(tidy)

    line_count = int(len(tidy))
    has_lines = bool(line_count)

    def _nunique(column: str) -> int:
        # Distinct-value count; reported as 0 for an empty ledger.
        return int(tidy[column].nunique()) if has_lines else 0

    unmapped_accounts = int(tidy["account_name"].isna().sum())
    unparsed_dates = int(tidy["date"].isna().sum())

    # Basic accounting invariant: sum of raw debit-positive amounts should be ~0
    raw_total = float(tidy["raw_amount"].sum()) if has_lines else 0.0
    balances = bool(abs(raw_total) < 1e-6)

    coa_ok = unmapped_accounts == 0
    dates_ok = unparsed_dates == 0

    # Some checks appear under two names; both spellings are kept.
    checks = {
        "gl_balances_raw_amount_sum_zero": balances,
        "coa_join_coverage_ok": coa_ok,
        "no_missing_coa_mappings": coa_ok,
        "all_gl_dates_parse": dates_ok,
        "no_unparseable_dates": dates_ok,
    }
    metrics = {
        "n_gl_lines": line_count,
        "n_txns": _nunique("txn_id"),
        "n_accounts": _nunique("account_id"),
        "n_months": _nunique("month"),
        "n_missing_coa_mappings": unmapped_accounts,
        "n_bad_dates": unparsed_dates,
        "raw_amount_sum": raw_total,
    }
    notes = {
        "amount_definition": (
            "amount is signed so positive means the account increased on its normal side; "
            "raw_amount uses debit-positive convention (debit - credit)."
        )
    }

    summary: dict[str, Any] = {
        "checks": checks,
        "metrics": metrics,
        "data_dictionary": build_data_dictionary(),
        "notes": notes,
    }

    return GLPrepOutputs(gl_tidy=tidy, gl_monthly_summary=monthly, summary=summary)
def write_json(obj: Any, path: str | Path) -> Path:
    """Write a JSON file (pretty-printed, sorted keys) and return the written path.

    Parent directories are created if they do not exist.
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open("w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, sort_keys=True)
        f.write("\n")
    return p


def build_cash_txns_from_gl(gl: pd.DataFrame, cash_account_id: str = "1000") -> pd.DataFrame:
    """Group cash lines in GL into one row per txn_id with net cash impact.

    Parameters
    ----------
    gl:
        Journal lines with ``txn_id``, ``date``, ``description``,
        ``account_id``, ``debit`` and ``credit`` columns.
    cash_account_id:
        Account id of the cash account (default ``"1000"``).

    Returns
    -------
    pd.DataFrame
        Columns ``txn_id``, ``date``, ``description``, ``amount`` where
        ``amount`` is debit minus credit on the cash account. Transactions
        whose cash lines net to zero are dropped. Sorted by date, then txn_id.
    """
    is_cash = gl["account_id"].astype(str) == str(cash_account_id)
    cash_lines = gl.loc[is_cash, ["txn_id", "date", "description", "debit", "credit"]].copy()
    if cash_lines.empty:
        return pd.DataFrame(columns=["txn_id", "date", "description", "amount"])

    cash_lines["cash_net"] = cash_lines["debit"].astype(float) - cash_lines["credit"].astype(float)

    cash_txn = (
        cash_lines.groupby("txn_id", observed=True)
        .agg(date=("date", "min"), description=("description", "first"), amount=("cash_net", "sum"))
        .reset_index()
    )
    cash_txn = cash_txn.loc[cash_txn["amount"].abs() > 1e-9].copy()
    cash_txn = cash_txn.sort_values(["date", "txn_id"], kind="mergesort").reset_index(drop=True)
    return cash_txn


def build_cash_txn_from_gl(gl: pd.DataFrame, cash_account_id: str = "1000") -> pd.DataFrame:
    """Alias for build_cash_txns_from_gl (keeps chapter script imports stable)."""
    return build_cash_txns_from_gl(gl, cash_account_id)


@dataclass(frozen=True)
class BankReconOutputs:
    """Result tables of a bank reconciliation."""

    # Book-side cash transactions that were reconciled against the bank feed.
    cash_txns: pd.DataFrame
    # Bank lines left-joined to book cash transactions, plus an is_matched flag.
    matches: pd.DataFrame
    # One row per reconciliation exception (empty when there are none).
    exceptions: pd.DataFrame


def _bank_exception_row(exception_type: str, r: pd.Series, details: str, *, include_gl_amount: bool) -> dict[str, Any]:
    """Build one exception record from a row of the bank/book ``matches`` table."""
    gl_txn_id = r["gl_txn_id"]
    gl_amount = r.get("gl_amount", np.nan)
    return {
        "exception_type": exception_type,
        "month": str(r.get("month", "")),
        "bank_txn_id": str(r.get("bank_txn_id")),
        "posted_date": str(r.get("posted_date")),
        "gl_txn_id": int(gl_txn_id) if pd.notna(gl_txn_id) else np.nan,
        "bank_amount": float(r.get("amount", np.nan)),
        "gl_amount": float(gl_amount) if include_gl_amount and pd.notna(gl_amount) else np.nan,
        "details": details,
    }


def bank_reconcile(*, bank_statement: pd.DataFrame, cash_txns: pd.DataFrame, amount_tol: float = 0.01) -> BankReconOutputs:
    """Reconcile bank statement lines against book cash transactions.

    Bank lines are matched to book transactions through ``gl_txn_id``. Four
    kinds of exception are reported:

    - ``bank_duplicate_txn_id``: a ``bank_txn_id`` occurs more than once
    - ``bank_unmatched_item``: a bank line has no matching book transaction
    - ``amount_mismatch``: matched amounts differ by more than ``amount_tol``
    - ``book_unmatched_cash_txn``: a book cash transaction is absent from the bank feed

    An empty ``bank_statement`` short-circuits and returns empty ``matches``
    and ``exceptions`` tables.

    Raises
    ------
    pandas.errors.MergeError
        If ``cash_txns`` contains duplicate ``txn_id`` values.
    """
    bank = bank_statement.copy()
    if bank.empty:
        # Distinct frames, so mutating one result never affects the other.
        return BankReconOutputs(cash_txns=cash_txns.copy(), matches=pd.DataFrame(), exceptions=pd.DataFrame())

    # Nullable integer keeps bank lines without a GL reference as <NA>.
    bank["gl_txn_id"] = pd.to_numeric(bank["gl_txn_id"], errors="coerce").astype("Int64")
    bank["amount"] = bank["amount"].astype(float)

    cash = cash_txns.copy()
    cash["txn_id"] = cash["txn_id"].astype(int)
    cash["amount"] = cash["amount"].astype(float)

    matches = bank.merge(
        cash.rename(columns={"txn_id": "gl_txn_id", "amount": "gl_amount", "date": "gl_date"}),
        on="gl_txn_id",
        how="left",
        validate="m:1",
    )

    # convenience flag for summaries
    matches["is_matched"] = matches["gl_amount"].notna()

    exceptions: list[dict[str, Any]] = []

    # 1) duplicate bank_txn_id
    dup_mask = matches["bank_txn_id"].astype(str).duplicated(keep=False)
    for _, r in matches.loc[dup_mask].iterrows():
        exceptions.append(
            _bank_exception_row(
                "bank_duplicate_txn_id",
                r,
                "Duplicate bank_txn_id appears multiple times in bank feed.",
                include_gl_amount=True,
            )
        )

    # 2) unmatched bank item
    unmatched_bank = matches["gl_txn_id"].isna() | matches["gl_amount"].isna()
    for _, r in matches.loc[unmatched_bank].iterrows():
        exceptions.append(
            _bank_exception_row(
                "bank_unmatched_item",
                r,
                "Bank statement line has no matching GL cash transaction.",
                include_gl_amount=False,
            )
        )

    # 3) amount mismatch
    matched = matches.loc[matches["gl_amount"].notna()].copy()
    mism = matched.loc[(matched["amount"] - matched["gl_amount"]).abs() > float(amount_tol)]
    for _, r in mism.iterrows():
        exceptions.append(
            _bank_exception_row(
                "amount_mismatch",
                r,
                f"Bank amount differs from book by > {amount_tol}.",
                include_gl_amount=True,
            )
        )

    # 4) book-only transactions (cash txn not seen on bank)
    bank_gl_ids = set(matches.loc[matches["gl_txn_id"].notna(), "gl_txn_id"].astype(int).tolist())
    book_only = cash.loc[~cash["txn_id"].isin(bank_gl_ids)]
    for _, r in book_only.iterrows():
        exceptions.append(
            {
                "exception_type": "book_unmatched_cash_txn",
                # First 7 characters of the date's string form, i.e. YYYY-MM.
                "month": str(r["date"])[:7],
                "bank_txn_id": np.nan,
                "posted_date": np.nan,
                "gl_txn_id": int(r["txn_id"]),
                "bank_amount": np.nan,
                "gl_amount": float(r["amount"]),
                "details": "Cash transaction in GL does not appear in bank feed.",
            }
        )

    exc_df = pd.DataFrame(exceptions)
    if not exc_df.empty:
        exc_df = exc_df.sort_values(["exception_type", "month", "bank_txn_id"], kind="mergesort").reset_index(drop=True)

    return BankReconOutputs(cash_txns=cash, matches=matches, exceptions=exc_df)


def reconcile_bank_statement(
    bank_statement: pd.DataFrame,
    gl_journal: pd.DataFrame,
    *,
    amount_tol: float = 0.01,
    cash_account_id: str = "1000",
) -> BankReconOutputs:
    """Wrapper used by chapter script: bank feed vs GL.

    Derives book cash transactions from ``gl_journal`` (lines on
    ``cash_account_id``) and reconciles them against ``bank_statement``.
    """
    cash_txns = build_cash_txns_from_gl(gl_journal, cash_account_id)
    return bank_reconcile(bank_statement=bank_statement, cash_txns=cash_txns, amount_tol=amount_tol)


def _ending_balance_from_tb(tb_month: pd.DataFrame, account_id: str) -> float:
    """Return balance in its normal direction (positive if normal-side).

    Returns 0.0 when the account has no row in ``tb_month``; if there are
    several rows, the first one is used. Side labels are compared ignoring
    case and surrounding whitespace, so "Debit" and "debit" agree.
    """
    hit = tb_month.loc[tb_month["account_id"].astype(str) == str(account_id)]
    if hit.empty:
        return 0.0
    row = hit.iloc[0]
    normal = str(row["normal_side"]).strip().lower()
    ending_side = str(row["ending_side"]).strip().lower()
    bal = float(row["ending_balance"])
    return bal if ending_side == normal else -bal


def ar_rollforward_vs_tb(
    trial_balance_monthly: pd.DataFrame,
    ar_events: pd.DataFrame,
    *,
    ar_account_id: str = "1100",
    opening_balance: float = 0.0,
) -> pd.DataFrame:
    """Compute AR rollforward (begin + activity = end) and compare to TB.

    Parameters
    ----------
    trial_balance_monthly:
        Monthly trial balance; its distinct ``month`` values define the
        periods of the rollforward.
    ar_events:
        AR subledger events with ``month`` and ``ar_delta`` columns.
    ar_account_id:
        Account id of the AR control account in the trial balance.
    opening_balance:
        AR balance before the first month (default 0.0).

    Returns
    -------
    pd.DataFrame
        One row per month with ``ar_begin``, ``ar_delta``,
        ``ar_end_from_events``, ``ar_end_from_tb`` and
        ``diff`` (events minus trial balance).
    """
    tb = trial_balance_monthly.copy()
    tb["month"] = tb["month"].astype(str)

    months = sorted(tb["month"].unique().tolist())

    # Months without AR activity get an explicit zero delta.
    ar_monthly = (
        ar_events.assign(month=lambda d: d["month"].astype(str))
        .groupby("month", observed=True)["ar_delta"]
        .sum()
        .reindex(months, fill_value=0.0)
    )

    rows: list[dict[str, Any]] = []
    ar_begin = float(opening_balance)
    for m in months:
        ar_delta = float(ar_monthly.loc[m])
        ar_end_events = float(ar_begin + ar_delta)

        tb_m = tb.loc[tb["month"] == m]
        ar_end_tb = float(_ending_balance_from_tb(tb_m, ar_account_id))

        rows.append(
            {
                "month": m,
                "ar_begin": float(ar_begin),
                "ar_delta": ar_delta,
                "ar_end_from_events": ar_end_events,
                "ar_end_from_tb": ar_end_tb,
                "diff": float(ar_end_events - ar_end_tb),
            }
        )
        # This month's ending balance is next month's opening balance.
        ar_begin = ar_end_events

    return pd.DataFrame(rows)


def build_ar_rollforward(
    trial_balance_monthly: pd.DataFrame,
    ar_events: pd.DataFrame,
    *,
    ar_account_id: str = "1100",
    opening_balance: float = 0.0,
) -> pd.DataFrame:
    """Alias for ar_rollforward_vs_tb."""
    return ar_rollforward_vs_tb(
        trial_balance_monthly,
        ar_events,
        ar_account_id=ar_account_id,
        opening_balance=opening_balance,
    )
DATASET_NSO_V1 = "nso_v1"


@dataclass(frozen=True)
class TableSchema:
    """Contract for one CSV table: its file name and the columns it must have."""

    # File name of the table inside the dataset directory.
    name: str
    # Column names that must be present (extra columns are allowed).
    required_columns: tuple[str, ...]


CONTRACT_TABLES: dict[str, TableSchema] = {
    "chart_of_accounts": TableSchema(
        name="chart_of_accounts.csv",
        required_columns=("account_id", "account_name", "account_type", "normal_side"),
    ),
    "gl_journal": TableSchema(
        name="gl_journal.csv",
        required_columns=("txn_id", "date", "doc_id", "description", "account_id", "debit", "credit"),
    ),
    "trial_balance_monthly": TableSchema(
        name="trial_balance_monthly.csv",
        required_columns=(
            "month",
            "account_id",
            "account_name",
            "account_type",
            "normal_side",
            "debit",
            "credit",
            "ending_side",
            "ending_balance",
        ),
    ),
    "statements_is_monthly": TableSchema(
        name="statements_is_monthly.csv",
        required_columns=("month", "line", "amount"),
    ),
    "statements_bs_monthly": TableSchema(
        name="statements_bs_monthly.csv",
        required_columns=("month", "line", "amount"),
    ),
    "statements_cf_monthly": TableSchema(
        name="statements_cf_monthly.csv",
        required_columns=("month", "line", "amount"),
    ),
    "inventory_movements": TableSchema(
        name="inventory_movements.csv",
        required_columns=("month", "txn_id", "date", "sku", "movement_type", "qty", "unit_cost", "amount"),
    ),
    "fixed_assets": TableSchema(
        name="fixed_assets.csv",
        required_columns=(
            "asset_id",
            "asset_name",
            "in_service_month",
            "cost",
            "useful_life_months",
            "salvage_value",
            "method",
        ),
    ),
    "depreciation_schedule": TableSchema(
        name="depreciation_schedule.csv",
        required_columns=("month", "asset_id", "dep_expense", "accum_dep", "net_book_value"),
    ),
    # Chapter 5
    "payroll_events": TableSchema(
        name="payroll_events.csv",
        required_columns=(
            "month",
            "txn_id",
            "date",
            "event_type",
            "gross_wages",
            "employee_withholding",
            "employer_tax",
            "cash_paid",
            "wages_payable_delta",
            "payroll_taxes_payable_delta",
        ),
    ),
    "sales_tax_events": TableSchema(
        name="sales_tax_events.csv",
        required_columns=(
            "month",
            "txn_id",
            "date",
            "event_type",
            "taxable_sales",
            "tax_amount",
            "cash_paid",
            "sales_tax_payable_delta",
        ),
    ),
    "debt_schedule": TableSchema(
        name="debt_schedule.csv",
        required_columns=("month", "loan_id", "txn_id", "beginning_balance", "payment", "interest", "principal", "ending_balance"),
    ),
    "equity_events": TableSchema(
        name="equity_events.csv",
        required_columns=("month", "txn_id", "date", "event_type", "amount"),
    ),
    "ap_events": TableSchema(
        name="ap_events.csv",
        required_columns=("month", "txn_id", "date", "vendor", "invoice_id", "event_type", "amount", "ap_delta", "cash_paid"),
    ),
    # Chapter 6
    "ar_events": TableSchema(
        name="ar_events.csv",
        required_columns=("month", "txn_id", "date", "customer", "invoice_id", "event_type", "amount", "ar_delta", "cash_received"),
    ),
    "bank_statement": TableSchema(
        name="bank_statement.csv",
        required_columns=("month", "bank_txn_id", "posted_date", "description", "amount", "gl_txn_id"),
    ),
}

# Order in which the nso_v1 tables are validated and reported.
NSO_V1_TABLE_ORDER: tuple[str, ...] = (
    "chart_of_accounts",
    "gl_journal",
    "trial_balance_monthly",
    "statements_is_monthly",
    "statements_bs_monthly",
    "statements_cf_monthly",
    "inventory_movements",
    "fixed_assets",
    "depreciation_schedule",
    # Chapter 5
    "payroll_events",
    "sales_tax_events",
    "debt_schedule",
    "equity_events",
    "ap_events",
    # Chapter 6
    "ar_events",
    "bank_statement",
)

NSO_V1_TABLES: tuple[TableSchema, ...] = tuple(CONTRACT_TABLES[k] for k in NSO_V1_TABLE_ORDER)


def schemas_for_dataset(dataset: str) -> tuple[TableSchema, ...]:
    """Return the table schemas for *dataset*.

    Raises
    ------
    ValueError
        If *dataset* is not a known dataset name.
    """
    if dataset == DATASET_NSO_V1:
        return NSO_V1_TABLES
    raise ValueError(f"Unknown dataset: {dataset}")


def validate_schema(datadir: Path, dataset: str) -> dict[str, Any]:
    """Validate presence + required columns. Returns a report dict.

    Parameters
    ----------
    datadir:
        Directory holding the dataset's CSV files (a ``str`` path also works).
    dataset:
        Dataset name understood by :func:`schemas_for_dataset`.

    Returns
    -------
    dict
        ``dataset``, ``datadir``, ``missing_tables`` (file names),
        ``tables`` (per-file ``exists`` / ``n_rows`` / ``missing_columns``)
        and an overall ``ok`` flag. A table file that exists but is completely
        empty is reported with zero rows and every required column missing,
        instead of raising.

    Raises
    ------
    ValueError
        If *dataset* is unknown.
    """
    base = Path(datadir)
    report: dict[str, Any] = {
        "dataset": dataset,
        "datadir": str(datadir),
        "missing_tables": [],
        "tables": {},
        "ok": True,
    }

    for schema in schemas_for_dataset(dataset):
        table_path = base / schema.name
        if not table_path.exists():
            report["missing_tables"].append(schema.name)
            report["tables"][schema.name] = {
                "exists": False,
                "missing_columns": list(schema.required_columns),
            }
            report["ok"] = False
            continue

        try:
            df = pd.read_csv(table_path)
        except pd.errors.EmptyDataError:
            # Zero-byte file: no header at all. Report it as a schema failure
            # so one bad file cannot abort validation of the other tables.
            df = pd.DataFrame()

        cols = set(map(str, df.columns))
        missing = [c for c in schema.required_columns if c not in cols]
        report["tables"][schema.name] = {
            "exists": True,
            "n_rows": int(df.shape[0]),
            "missing_columns": missing,
        }
        if missing:
            report["ok"] = False

    return report
Default: ./outputs/track_d/track_d", + ) + p.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility") + return p + + +def apply_seed(seed: int | None) -> None: + if seed is None: + return + np.random.seed(seed) + random.seed(seed) + try: # optional: seed torch if present + import torch # pragma: no cover + torch.manual_seed(seed) + except Exception: + pass diff --git a/workbooks/track_d_template/scripts/_mpl_compat.py b/workbooks/track_d_template/scripts/_mpl_compat.py new file mode 100644 index 0000000..48687ff --- /dev/null +++ b/workbooks/track_d_template/scripts/_mpl_compat.py @@ -0,0 +1,34 @@ +"""Matplotlib compatibility helpers for workbook scripts. + +Matplotlib 3.9 renamed the Axes.boxplot keyword argument "labels" to +"tick_labels". The old name is deprecated and scheduled for removal. + +These helpers keep our educational scripts working on Matplotlib 3.8+ +while avoiding deprecation warnings on newer versions. +""" + +from __future__ import annotations + +from typing import Any, Sequence + + +def ax_boxplot( + ax: Any, + *args: Any, + tick_labels: Sequence[str] | None = None, + **kwargs: Any, +): + """Call ``ax.boxplot`` with a 3.8/3.9+ compatible keyword. + + Prefer ``tick_labels`` (Matplotlib >= 3.9). If that keyword is not + supported (Matplotlib <= 3.8), fall back to the legacy ``labels``. + """ + + if tick_labels is None: + return ax.boxplot(*args, **kwargs) + + try: + return ax.boxplot(*args, tick_labels=tick_labels, **kwargs) + except TypeError: + # Older Matplotlib: the new keyword doesn't exist. + return ax.boxplot(*args, labels=tick_labels, **kwargs) diff --git a/workbooks/track_d_template/scripts/_reporting_style.py b/workbooks/track_d_template/scripts/_reporting_style.py new file mode 100644 index 0000000..2338076 --- /dev/null +++ b/workbooks/track_d_template/scripts/_reporting_style.py @@ -0,0 +1,506 @@ +# SPDX-License-Identifier: MIT +"""Shared plotting/reporting helpers. 
+ +Track D Chapter 9 introduces a *style contract* for figures and small reports. +This module centralizes the rules so later chapters can reuse them. + +Design goals +------------ +- Matplotlib-only (no seaborn) +- Deterministic output filenames and metadata +- Guardrails against misleading axes (especially for bar charts) +- Simple defaults suitable for ReadTheDocs screenshots and printing + +The "style contract" is intentionally conservative; it favors clarity over +flash. Downstream chapters can extend it, but should keep the core rules. +""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Iterable +from contextlib import contextmanager +import matplotlib as mpl +import numpy as np + +# Matplotlib is an optional dependency for some repo users. +# Track D chapters require it, so we import lazily in functions where possible. + +STYLE_CONTRACT: dict[str, Any] = { + "version": "1.0", + "allowed_chart_types": [ + "line", + "bar", + "histogram", + "ecdf", + "box", + "scatter", + "waterfall_bridge", + ], + "labeling_rules": { + "title_required": True, + "axis_labels_required": True, + "units_in_labels": True, + "use_month_tick_labels": "YYYY-MM", + "legend_only_if_multiple_series": True, + "caption_required_in_manifest": True, + }, + "anti_misleading_axes": { + "bar_charts_start_at_zero": True, + "explicit_note_if_y_truncated": True, + "show_zero_line_for_ratios": True, + "avoid_dual_axes": True, + }, + "distribution_guidance": { + "for_skewed_distributions": [ + "histogram + vertical lines for mean and median", + "ECDF (or quantile plot) to reveal tails", + "report key quantiles (p50, p75, p90, p95 if available)", + ] + }, + "file_format": {"type": "png", "dpi": 150}, + "figure_sizes": { + "time_series": [10.0, 4.0], + "distribution": [7.5, 4.5], + }, +} + + +# Minimal matplotlib rcParams for a consistent, non-misleading reporting look. 
def write_contract_json(outpath: Path) -> None:
    """Write the global style contract to a JSON file.

    Backward-compatible alias of ``write_style_contract_json``. The two
    functions used to carry identical copies of the serialization code;
    delegating keeps a single implementation so the output of both names
    cannot drift apart.

    Parameters
    ----------
    outpath:
        Destination file; it is overwritten if it already exists.
    """
    write_style_contract_json(outpath)
def _format_month_ticks(ax, months: list[str]) -> None:
    """Format x-axis ticks for YYYY-MM month labels.

    Tick positions are the integer indices ``0, step, 2*step, ...`` into
    ``months``; the labels are the corresponding entries, rotated 45 degrees
    and right-aligned.

    Parameters
    ----------
    ax:
        Object exposing ``set_xticks`` and ``set_xticklabels`` (a Matplotlib
        Axes in practice).
    months:
        Labels in plotting order; an empty list yields no ticks.
    """

    # Show at most ~8 ticks; for longer series, reduce tick density.
    max_ticks = 8
    n = len(months)
    if n <= 8:
        step = 1
    elif n <= 18:
        step = 2
    elif n <= 24:
        step = 3
    else:
        # A fixed step of 3 lets the tick count grow without bound for long
        # series (e.g. 20 ticks for 60 months). Scale the step instead so the
        # ~8-tick cap keeps holding. Ceiling division, integer-only.
        step = (n + max_ticks - 1) // max_ticks

    ticks = list(range(0, n, step))
    ax.set_xticks(ticks)
    ax.set_xticklabels([months[i] for i in ticks], rotation=45, ha="right")
def _ecdf(values: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return the empirical CDF coordinates of ``values``.

    NaN entries are discarded. The first array holds the remaining values in
    ascending order; the second holds the matching cumulative fractions
    ``1/k, 2/k, ..., 1`` where ``k`` is the number of values kept. Two empty
    arrays are returned when nothing remains.
    """
    data = np.asarray(values, dtype=float)
    keep = np.logical_not(np.isnan(data))
    ordered = np.sort(data[keep])

    count = ordered.size
    if not count:
        return np.array([]), np.array([])

    ranks = np.arange(1, count + 1, dtype=float)
    return ordered, ranks / float(count)
def plot_ecdf(
    values: Iterable[float],
    title: str,
    x_label: str,
    y_label: str,
    markers: dict[str, float] | None = None,
    figsize: tuple[float, float] | None = None,
):
    """ECDF plot with optional vertical markers.

    Parameters
    ----------
    values:
        Observations to plot; NaNs are dropped by ``_ecdf``.
    title, x_label, y_label:
        Text applied to the axes.
    markers:
        Mapping of legend label -> x position for dashed vertical lines.
        Entries whose position is not finite are skipped.
    figsize:
        Figure size in inches; defaults to the style contract's
        ``distribution`` size.

    Returns
    -------
    The Matplotlib figure (not saved, not closed).
    """

    _, plt = _mpl()

    if figsize is None:
        w, h = STYLE_CONTRACT["figure_sizes"]["distribution"]
        figsize = (float(w), float(h))

    v = np.asarray(list(values), dtype=float)
    x, y = _ecdf(v)

    fig, ax = plt.subplots(figsize=figsize)

    if x.size > 0:
        ax.plot(x, y, marker=".", linestyle="none")

    # Track whether any marker was actually drawn: asking for a legend when
    # every marker was skipped (all non-finite) yields an empty legend box
    # and a Matplotlib "no artists with labels" warning.
    drew_marker = False
    for label, x0 in (markers or {}).items():
        if np.isfinite(x0):
            ax.axvline(float(x0), linestyle="--", linewidth=1.2, label=label)
            drew_marker = True

    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    # An ECDF is a cumulative fraction, so the y-axis is pinned to [0, 1].
    ax.set_ylim(0.0, 1.0)

    if drew_marker:
        ax.legend(loc="best")

    return fig
+ - Zero line included; y-limits padded to reduce truncation temptation. + + Notes + ----- + The caller is responsible for choosing defensible components. Any residual + can be included as an "Other / rounding" component to reconcile exactly. + """ + + _, plt = _mpl() + + if figsize is None: + w, h = STYLE_CONTRACT["figure_sizes"]["time_series"] + figsize = (float(w), float(h)) + + labels = [start_label] + [name for name, _ in components] + [end_label] + + # Running totals after each component (for connectors and y-range). + running = float(start_value) + totals = [running] + for _, delta in components: + running += float(delta) + totals.append(running) + totals.append(float(end_value)) + + fig, ax = plt.subplots(figsize=figsize) + + # Start total + ax.bar(0, float(start_value), edgecolor="black", linewidth=0.8) + + # Component deltas + running = float(start_value) + for i, (_, delta) in enumerate(components, start=1): + d = float(delta) + new_total = running + d + + if d >= 0: + bottom = running + height = d + hatch = "//" + else: + bottom = new_total + height = -d + hatch = "\\" + + ax.bar(i, height, bottom=bottom, hatch=hatch, edgecolor="black", linewidth=0.8) + running = new_total + + # End total + ax.bar(len(labels) - 1, float(end_value), edgecolor="black", linewidth=0.8) + + # Connectors between bars (running totals) + running = float(start_value) + for i, (_, delta) in enumerate(components, start=1): + ax.plot([i - 0.4, i + 0.4], [running, running], linewidth=1.0) + running += float(delta) + + ax.set_title(title) + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_xticks(list(range(len(labels)))) + ax.set_xticklabels(labels, rotation=0) + ax.axhline(0.0, linewidth=1.0) + + def _fmt(v: float) -> str: + return f"{v:,.0f}" + + # Annotate start/end totals + ax.text(0, float(start_value), _fmt(float(start_value)), ha="center", va="bottom") + ax.text(len(labels) - 1, float(end_value), _fmt(float(end_value)), ha="center", va="bottom") + + # Annotate 
def figure_manifest_to_frame(specs: list[FigureSpec]):
    """Convert figure specs into a pandas DataFrame, one row per spec.

    Each spec is expanded with ``dataclasses.asdict``, so the frame's columns
    are the dataclass field names. pandas is imported inside the function
    rather than at module import time.
    """
    import pandas as pd

    records = list(map(asdict, specs))
    return pd.DataFrame(records)
+ +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch01_cash_balance.png +* business_ch01_balance_sheet_bar.png +* business_ch01_summary.json + +The goal is to model a reproducible "mini-close" workflow: + +1) simulate bookkeeping data +2) derive statements +3) validate core accounting identities +4) produce a short, decision-oriented summary""" + +from __future__ import annotations + +import json +import pathlib +from dataclasses import dataclass +from typing import Any + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt # noqa: E402 +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed, base_parser + + +@dataclass(frozen=True) +class Ch01Summary: + checks: dict[str, Any] + metrics: dict[str, Any] + + +def load_ledgerlab(datadir: pathlib.Path) -> dict[str, pd.DataFrame]: + """Load required LedgerLab tables for Chapter 1.""" + tables = { + "chart_of_accounts": datadir / "chart_of_accounts.csv", + "gl_journal": datadir / "gl_journal.csv", + "trial_balance_monthly": datadir / "trial_balance_monthly.csv", + "statements_is_monthly": datadir / "statements_is_monthly.csv", + "statements_bs_monthly": datadir / "statements_bs_monthly.csv", + } + missing = [name for name, path in tables.items() if not path.exists()] + if missing: + raise FileNotFoundError( + f"Missing required LedgerLab tables in {datadir}: {', '.join(missing)}" + ) + + # Ensure key identifiers stay as strings (CSV auto-inference would turn + # account_id 1000 into int, which breaks joins and filters). 
def check_accounting_equation(bs: pd.DataFrame, tol: float = 1e-6) -> dict[str, Any]:
    """Validate Assets == Liabilities + Equity using the statement totals.

    Parameters
    ----------
    bs:
        Balance-sheet table with ``line`` and ``amount`` columns. The first
        row matching each total line is used.
    tol:
        Largest absolute difference still treated as balanced.

    Returns
    -------
    dict
        ``accounting_equation_balances`` (bool), ``total_assets``,
        ``total_liabilities_plus_equity`` and ``abs_diff`` (floats). A total
        line that is absent is reported as ``0.0``, as before.
    """
    def _find(line: str) -> tuple[bool, float]:
        row = bs.loc[bs["line"] == line]
        if row.empty:
            return False, 0.0
        return True, float(row.iloc[0]["amount"])

    has_assets, assets = _find("Total Assets")
    has_le, le = _find("Total Liabilities + Equity")
    diff = float(abs(assets - le))

    # A control must not pass on missing evidence: with both totals absent the
    # amounts default to 0.0 == 0.0, which used to report a vacuous success.
    balances = bool(has_assets and has_le and diff <= tol)

    return {
        "accounting_equation_balances": balances,
        "total_assets": assets,
        "total_liabilities_plus_equity": le,
        "abs_diff": diff,
    }
avg_sale = float(sales_total / n_sales) if n_sales else 0.0 + + # Cash vs AR sales (from the revenue-side entry) + rev_side = gl.loc[(gl["doc_id"].str.startswith("SALE")) & (gl["account_id"] == "4000")] + # match revenue-side by txn_id: the debit line account_id will be Cash or AR + rev_txn_ids = rev_side["txn_id"].unique() + rev_txn_lines = gl.loc[gl["txn_id"].isin(rev_txn_ids)].copy() + debit_by_txn = ( + rev_txn_lines.loc[rev_txn_lines["debit"] > 0] + .groupby("txn_id", as_index=False)["account_id"] + .first() + ) + pct_on_account = 0.0 + if not debit_by_txn.empty: + pct_on_account = float((debit_by_txn["account_id"] == "1100").mean()) + + # Gross margin proxy (Sales - COGS) + cogs_total = float(gl.loc[gl["account_id"] == "5000", "debit"].sum()) + gross_profit = sales_total - cogs_total + gross_margin_pct = float(gross_profit / sales_total) if sales_total else 0.0 + + # Net income from statement + ni_row = is_stmt.loc[is_stmt["line"] == "Net Income"] + net_income = float(ni_row.iloc[0]["amount"]) if not ni_row.empty else float("nan") + + # Ending cash from balance sheet + cash_row = bs.loc[bs["line"] == "Cash"] + ending_cash = float(cash_row.iloc[0]["amount"]) if not cash_row.empty else float("nan") + + return { + "sales_total": sales_total, + "n_sales": n_sales, + "avg_sale": avg_sale, + "pct_sales_on_account": pct_on_account, + "cogs_total": cogs_total, + "gross_profit": gross_profit, + "gross_margin_pct": gross_margin_pct, + "net_income": net_income, + "ending_cash": ending_cash, + } + + +def plot_cash_balance(gl: pd.DataFrame, outpath: pathlib.Path) -> None: + """Plot daily cash balance (simple teaching calendar: 28 days).""" + cash_lines = gl.loc[gl["account_id"] == "1000", ["date", "debit", "credit"]].copy() + if cash_lines.empty: + return + cash_lines["date"] = pd.to_datetime(cash_lines["date"]).dt.date + cash_lines["delta"] = cash_lines["debit"].astype(float) - cash_lines["credit"].astype(float) + daily = cash_lines.groupby("date", 
def plot_balance_sheet(bs: pd.DataFrame, outpath: pathlib.Path) -> None:
    """Save a three-bar chart of total assets, liabilities and equity.

    Each bar height is the ``amount`` of the first ``bs`` row whose ``line``
    equals the corresponding total; a total that is not present is drawn as
    0.0. The figure is written to ``outpath`` at 150 dpi and then closed.
    """
    # Bar label -> statement line it is read from (insertion order = bar order).
    bar_sources = {
        "Assets": "Total Assets",
        "Liabilities": "Total Liabilities",
        "Equity": "Total Equity",
    }

    heights = []
    for statement_line in bar_sources.values():
        matches = bs.loc[bs["line"] == statement_line]
        if matches.empty:
            heights.append(0.0)
        else:
            heights.append(float(matches.iloc[0]["amount"]))

    figure, axes = plt.subplots(figsize=(6, 4))
    axes.bar(list(bar_sources), heights)
    axes.set_title("Balance Sheet Snapshot (Month End)")
    axes.set_ylabel("Amount")
    axes.grid(axis="y", linestyle=":", alpha=0.7)
    figure.tight_layout()
    figure.savefig(outpath, dpi=150)
    plt.close(figure)
"business_ch01_summary.json").write_text( + json.dumps(payload, indent=2), encoding="utf-8" + ) + + return Ch01Summary(checks=checks, metrics=metrics) + + +def main() -> None: + parser = base_parser("Track D Chapter 1: Accounting as measurement") + parser.add_argument( + "--datadir", + type=pathlib.Path, + default=pathlib.Path("data/synthetic/ledgerlab_ch01"), + help="Directory containing LedgerLab core tables", + ) + parser.set_defaults(outdir=pathlib.Path("outputs/track_d")) + args = parser.parse_args() + + try: + summary = analyze_ch01(args.datadir, args.outdir, seed=args.seed) + except FileNotFoundError as e: + print(str(e)) + print("Hint: run `python -m scripts.sim_business_ledgerlab --outdir data/synthetic/ledgerlab_ch01 --seed 123`. ") + return + + print("\nChecks:") + for k, v in summary.checks.items(): + print(f"- {k}: {v}") + + print("\nKey metrics:") + for k, v in summary.metrics.items(): + if isinstance(v, float): + print(f"- {k}: {v:.4f}") + else: + print(f"- {k}: {v}") + + print(f"\nWrote outputs -> {args.outdir}") + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/business_ch02_double_entry_and_gl.py b/workbooks/track_d_template/scripts/business_ch02_double_entry_and_gl.py new file mode 100644 index 0000000..352d310 --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch02_double_entry_and_gl.py @@ -0,0 +1,313 @@ +# SPDX-License-Identifier: MIT +"""Track D – Chapter 2: Double-entry and the general ledger as a database. 
def _require_columns(df: pd.DataFrame, cols: Iterable[str], name: str) -> None:
    """Raise ``ValueError`` if ``df`` lacks any of the columns in ``cols``.

    The error message starts with ``name`` and lists the absent columns in
    the order they were requested. Returns ``None`` when all are present.
    """
    absent = []
    for column in cols:
        if column not in df.columns:
            absent.append(column)

    if not absent:
        return

    listing = ", ".join(absent)
    raise ValueError(f"{name} missing required columns: {listing}")
"normal_side"], + "chart_of_accounts", + ) + _require_columns( + gl, + ["txn_id", "date", "doc_id", "description", "account_id", "debit", "credit"], + "gl_journal", + ) + + checks: dict[str, Any] = {} + + checks["coa_account_ids_unique"] = bool(coa["account_id"].is_unique) + + bad_types = sorted(set(coa["account_type"]) - ALLOWED_ACCOUNT_TYPES) + checks["coa_account_types_valid"] = len(bad_types) == 0 + checks["coa_bad_account_types"] = bad_types + + bad_sides = sorted(set(coa["normal_side"]) - {"Debit", "Credit"}) + checks["coa_normal_sides_valid"] = len(bad_sides) == 0 + checks["coa_bad_normal_sides"] = bad_sides + + coa_ids = set(coa["account_id"].astype(str)) + gl_ids = set(gl["account_id"].astype(str)) + missing_ids = sorted(gl_ids - coa_ids) + checks["gl_account_ids_in_coa"] = len(missing_ids) == 0 + checks["gl_missing_account_ids"] = missing_ids[:20] + + checks["gl_debits_nonnegative"] = bool((gl["debit"].astype(float) >= 0).all()) + checks["gl_credits_nonnegative"] = bool((gl["credit"].astype(float) >= 0).all()) + + return checks + + +def check_transactions_balance(gl: pd.DataFrame, tol: float = 1e-6) -> dict[str, Any]: + by_txn = gl.groupby("txn_id", as_index=False)[["debit", "credit"]].sum() + by_txn["diff"] = (by_txn["debit"] - by_txn["credit"]).abs() + bad = by_txn.loc[by_txn["diff"] > tol] + return { + "transactions_balanced": bool(bad.empty), + "n_transactions": int(by_txn.shape[0]), + "n_unbalanced": int(bad.shape[0]), + "max_abs_diff": float(by_txn["diff"].max()) if not by_txn.empty else 0.0, + } + + +def make_tidy_gl(gl: pd.DataFrame, coa: pd.DataFrame) -> pd.DataFrame: + """Analysis-ready GL export with explicit signed-amount conventions. + + Handles cases where gl_journal already includes COA-like columns by + coalescing values instead of creating _x/_y suffix confusion. 
def compute_trial_balance(tidy: pd.DataFrame, month: str) -> pd.DataFrame:
    """Aggregate a tidy GL into a one-row-per-account trial balance.

    Parameters
    ----------
    tidy:
        GL lines with ``account_id``, ``account_name``, ``account_type``,
        ``normal_side``, ``debit`` and ``credit`` columns.
    month:
        Label written to the leading ``month`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns: ``month``, the four account columns, summed ``debit`` and
        ``credit``, ``ending_side`` ("Debit" when debits >= credits, else
        "Credit") and ``ending_balance`` (absolute net amount).
    """
    tb = (
        tidy.groupby(
            ["account_id", "account_name", "account_type", "normal_side"],
            observed=True,
            # Keep groups whose descriptive keys are NaN. By default groupby
            # drops them, so a GL line whose account attributes are missing
            # would silently vanish and the trial balance would stop footing.
            dropna=False,
        )[["debit", "credit"]]
        .sum()
        .reset_index()
    )
    tb["net"] = tb["debit"] - tb["credit"]
    tb["ending_side"] = np.where(tb["net"] >= 0, "Debit", "Credit")
    tb["ending_balance"] = tb["net"].abs()
    tb = tb.drop(columns=["net"])
    tb.insert(0, "month", month)
    return tb
def analyze_ch02(datadir: pathlib.Path, outdir: pathlib.Path, seed: int | None = None) -> Ch02Summary:
    """Run the Chapter 2 analysis and write its artifacts to ``outdir``.

    Loads the core LedgerLab tables, runs the schema and transaction-balance
    checks, builds the tidy GL, recomputes the trial balance, compares it to
    the source trial balance and writes three CSVs, one PNG and a JSON
    summary.

    Parameters
    ----------
    datadir:
        Directory holding the LedgerLab CSV tables.
    outdir:
        Output directory; created (with parents) when missing.
    seed:
        Passed to ``apply_seed`` and recorded in the JSON summary.

    Returns
    -------
    Ch02Summary
        The ``checks`` and ``metrics`` dictionaries also written to JSON.
    """
    apply_seed(seed)
    outdir.mkdir(parents=True, exist_ok=True)

    tables = load_ledgerlab_core(datadir)
    coa = tables["chart_of_accounts"]
    gl = tables["gl_journal"]
    tb_src = tables["trial_balance_monthly"]

    # month: prefer provided TB month column, else infer from first GL date
    if "month" in tb_src.columns and not tb_src.empty:
        month = str(tb_src.iloc[0]["month"])
    else:
        first_date = pd.to_datetime(gl["date"].iloc[0]).to_period("M")
        month = f"{first_date.year:04d}-{first_date.month:02d}"

    checks: dict[str, Any] = {}
    checks.update(check_schema(coa, gl))
    checks.update(check_transactions_balance(gl))

    tidy = make_tidy_gl(gl, coa)
    tb = compute_trial_balance(tidy, month=month)

    # Compare recomputed TB to source TB (lightweight consistency check)
    if {"account_id", "ending_balance", "ending_side"}.issubset(tb_src.columns):
        tb_merge = tb.merge(
            tb_src[["account_id", "ending_balance", "ending_side"]].rename(
                columns={
                    "ending_balance": "ending_balance_src",
                    "ending_side": "ending_side_src",
                }
            ),
            on="account_id",
            how="left",
        )
        tb_merge["diff"] = (
            tb_merge["ending_balance"].astype(float)
            - tb_merge["ending_balance_src"].astype(float)
        ).abs()
        max_diff = float(tb_merge["diff"].max()) if not tb_merge.empty else 0.0
        # The left merge leaves ending_balance_src as NaN for any recomputed
        # account without a source balance. max() skips NaN, so those rows
        # must be detected explicitly or the check reports a false match.
        has_unmatched = bool(tb_merge["ending_balance_src"].isna().any())
        checks["trial_balance_matches_source"] = bool(
            not has_unmatched and max_diff <= 1e-6
        )
        checks["trial_balance_max_abs_diff"] = max_diff
    else:
        checks["trial_balance_matches_source"] = False
        checks["trial_balance_max_abs_diff"] = None

    rollup = (
        tidy.groupby(["account_type", "statement"], observed=True)["normal_amount"]
        .sum()
        .reset_index()
        .rename(columns={"normal_amount": "total_normal_amount"})
        .sort_values(["statement", "account_type"])
        .reset_index(drop=True)
    )

    metrics = {
        "month": month,
        "n_gl_rows": int(tidy.shape[0]),
        "n_transactions": int(tidy["txn_id"].nunique()),
        "n_accounts_used": int(tidy["account_id"].nunique()),
    }

    tidy.to_csv(outdir / "business_ch02_gl_tidy.csv", index=False)
    tb.to_csv(outdir / "business_ch02_trial_balance.csv", index=False)
    rollup.to_csv(outdir / "business_ch02_account_rollup.csv", index=False)
    plot_tb_by_account(tb, outdir / "business_ch02_tb_by_account.png")

    payload = {
        "chapter": "business_ch02_double_entry_and_gl",
        "datadir": str(datadir),
        "seed": seed,
        "checks": checks,
        "metrics": metrics,
    }
    (outdir / "business_ch02_summary.json").write_text(
        json.dumps(payload, indent=2), encoding="utf-8"
    )

    return Ch02Summary(checks=checks, metrics=metrics)
"__main__": + main() diff --git a/workbooks/track_d_template/scripts/business_ch03_statements_as_summaries.py b/workbooks/track_d_template/scripts/business_ch03_statements_as_summaries.py new file mode 100644 index 0000000..07a3749 --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch03_statements_as_summaries.py @@ -0,0 +1,401 @@ +# SPDX-License-Identifier: MIT +""" +Track D - Chapter 3 +Financial statements as summary statistics. + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch03_summary.json +* business_ch03_statement_bridge.csv +* business_ch03_trial_balance.csv +* business_ch03_net_income_vs_cash_change.png + +Reads LedgerLab tables, recomputes a TB from the GL, reconciles statements, +and builds a simple net-income -> cash-change bridge (cash flow style).""" + +from __future__ import annotations + +import argparse +import json +import pathlib +from dataclasses import dataclass +from typing import Any + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed + + +@dataclass(frozen=True) +class Ch03Summary: + checks: dict[str, Any] + metrics: dict[str, Any] + + +# ----------------------------- +# I/O helpers +# ----------------------------- +def _read_csv(path: pathlib.Path, **kwargs: Any) -> pd.DataFrame: + if not path.exists(): + raise FileNotFoundError(f"Missing required table: {path}") + return pd.read_csv(path, **kwargs) + + +def load_ledgerlab_tables(datadir: pathlib.Path) -> dict[str, pd.DataFrame]: + tables = { + "chart_of_accounts": _read_csv( + datadir / "chart_of_accounts.csv", + dtype={"account_id": str}, + ), + "gl_journal": _read_csv( + datadir / "gl_journal.csv", + dtype={"txn_id": str, "doc_id": str, "account_id": str}, + ), + "trial_balance_monthly": _read_csv( + datadir / "trial_balance_monthly.csv", + dtype={"account_id": str}, + ), + "statements_is_monthly": _read_csv(datadir / "statements_is_monthly.csv"), + "statements_bs_monthly": 
def check_transactions_balance(gl: pd.DataFrame) -> dict[str, Any]:
    """Check that debits equal credits within every transaction of the GL.

    Rows are grouped by ``txn_id``; a transaction counts as unbalanced when the
    absolute difference between its summed debits and credits exceeds 1e-9.

    Args:
        gl: General-ledger journal with ``txn_id``, ``debit`` and ``credit``.

    Returns:
        A dict with ``transactions_balanced``, ``n_transactions``,
        ``n_unbalanced`` and ``max_abs_diff``. When a required column is
        missing the flag is ``False`` and the other three values are ``None``.
    """
    absent = {"txn_id", "debit", "credit"} - set(gl.columns)
    if absent:
        return {
            "transactions_balanced": False,
            "n_transactions": None,
            "n_unbalanced": None,
            "max_abs_diff": None,
        }

    per_txn = gl.groupby("txn_id", observed=True)[["debit", "credit"]].sum()
    imbalance = (per_txn["debit"].astype(float) - per_txn["credit"].astype(float)).abs()

    txn_count = int(per_txn.shape[0])
    unbalanced_count = int((imbalance > 1e-9).sum())
    # max() of an empty series is NaN, so report 0.0 when there are no transactions.
    worst = float(imbalance.max()) if txn_count else 0.0

    return {
        "transactions_balanced": bool(unbalanced_count == 0),
        "n_transactions": txn_count,
        "n_unbalanced": unbalanced_count,
        "max_abs_diff": worst,
    }
def compute_trial_balance(tidy: pd.DataFrame, month: str) -> pd.DataFrame:
    """Aggregate a tidy GL into one trial-balance row per account.

    Debits and credits are summed per
    (``account_id``, ``account_name``, ``account_type``, ``normal_side``).
    ``ending_side`` is "Debit" when debits minus credits is >= 0, otherwise
    "Credit"; ``ending_balance`` is the absolute value of that net amount.

    Args:
        tidy: Tidy GL containing the four account columns plus ``debit`` and
            ``credit``.
        month: Label written into the leading ``month`` column of every row.

    Returns:
        DataFrame with columns ``month``, ``account_id``, ``account_name``,
        ``account_type``, ``normal_side``, ``debit``, ``credit``,
        ``ending_side``, ``ending_balance``.
    """
    account_keys = ["account_id", "account_name", "account_type", "normal_side"]
    tb = (
        # dropna=False keeps accounts whose descriptive columns are NaN.
        # With the default, groupby drops those rows, so their postings
        # disappear from the trial balance and its totals stop footing.
        tidy.groupby(account_keys, observed=True, dropna=False)[["debit", "credit"]]
        .sum()
        .reset_index()
    )
    tb["net"] = tb["debit"] - tb["credit"]
    tb["ending_side"] = np.where(tb["net"] >= 0, "Debit", "Credit")
    tb["ending_balance"] = tb["net"].abs()
    tb = tb.drop(columns=["net"])
    tb.insert(0, "month", month)
    return tb
COGS)") + return revenue - expenses + + cogs = _get_stmt_amount_any(is_stmt, ["Cost of Goods Sold", "COGS"]) + opx = _get_stmt_amount_any(is_stmt, ["Operating Expenses", "OpEx", "OPEX"]) + return revenue - (cogs + opx) + + +def _net_income_from_tb(tb: pd.DataFrame) -> float: + """Compute net income from TB as Revenue - Expenses (including COGS).""" + revenue_tb = float( + tb.loc[tb["account_type"] == "Revenue", "credit"].sum() + - tb.loc[tb["account_type"] == "Revenue", "debit"].sum() + ) + expenses_tb = float( + tb.loc[tb["account_type"] == "Expense", "debit"].sum() + - tb.loc[tb["account_type"] == "Expense", "credit"].sum() + ) + return revenue_tb - expenses_tb + + +def build_statement_bridge(is_stmt: pd.DataFrame, bs_stmt: pd.DataFrame) -> pd.DataFrame: + """ + Simple operating cash bridge (startup-month friendly): + CFO ≈ Net Income - ΔAR - ΔInv + ΔAP + Financing includes owner contribution. + Beginning cash assumed 0 for this synthetic startup month. + """ + net_income = _net_income_from_is_stmt(is_stmt) + cash_end = _get_stmt_amount(bs_stmt, "Cash") + ar_end = _get_stmt_amount(bs_stmt, "Accounts Receivable") + inv_end = _get_stmt_amount(bs_stmt, "Inventory") + ap_end = _get_stmt_amount(bs_stmt, "Accounts Payable") + owner_cap = _get_stmt_amount(bs_stmt, "Owner Capital") + + cfo = net_income - ar_end - inv_end + ap_end + cff = owner_cap + net_change_cash = cfo + cff + cash_begin = 0.0 + cash_end_from_bridge = cash_begin + net_change_cash + + rows = [ + ("Net Income", net_income), + ("Change in Accounts Receivable", -ar_end), + ("Change in Inventory", -inv_end), + ("Change in Accounts Payable", ap_end), + ("Net Cash from Operations", cfo), + ("Owner Contribution", owner_cap), + ("Net Cash from Financing", cff), + ("Net Change in Cash", net_change_cash), + ("Beginning Cash (assumed)", cash_begin), + ("Ending Cash (from bridge)", cash_end_from_bridge), + ("Ending Cash (balance sheet)", cash_end), + ("Bridge Diff (abs)", abs(cash_end_from_bridge - cash_end)), + ] 
def analyze_ch03(datadir: pathlib.Path, outdir: pathlib.Path, seed: int | None = None) -> Ch03Summary:
    """Reconcile the LedgerLab statements against the GL and write artifacts.

    Recomputes a trial balance from the GL, compares it with the source trial
    balance, ties income-statement net income to the trial balance, checks the
    balance-sheet equation from the statement lines, and builds the net-income
    to cash bridge. Writes the summary JSON, bridge CSV, trial-balance CSV and
    the comparison PNG to ``outdir`` and prints the checks and metrics.

    Args:
        datadir: Directory holding the LedgerLab CSV tables.
        outdir: Output directory; created (with parents) if missing.
        seed: Passed to ``apply_seed``.

    Returns:
        A ``Ch03Summary`` carrying the ``checks`` and ``metrics`` dictionaries.

    Raises:
        ValueError: If the month can be taken neither from the source trial
            balance nor from the first GL row (both tables are empty).
    """
    apply_seed(seed)
    outdir.mkdir(parents=True, exist_ok=True)

    tables = load_ledgerlab_tables(datadir)
    coa = tables["chart_of_accounts"]
    gl = tables["gl_journal"]
    tb_src = tables["trial_balance_monthly"]
    is_stmt = tables["statements_is_monthly"]
    bs_stmt = tables["statements_bs_monthly"]

    # Month: prefer TB month, else infer from GL date.
    if "month" in tb_src.columns and not tb_src.empty:
        month = str(tb_src.iloc[0]["month"])
    elif not gl.empty:
        first_date = pd.to_datetime(gl["date"].iloc[0]).to_period("M")
        month = f"{first_date.year:04d}-{first_date.month:02d}"
    else:
        # Previously this surfaced as an opaque IndexError from ``.iloc[0]``.
        raise ValueError(
            "Cannot determine the month: trial_balance_monthly has no usable "
            "'month' value and gl_journal is empty."
        )

    checks: dict[str, Any] = {}
    checks.update(check_transactions_balance(gl))

    tidy = make_tidy_gl(gl, coa)
    tb = compute_trial_balance(tidy, month=month)

    # TB reconciliation: recomputed ending balances vs the source table.
    if {"account_id", "ending_balance", "ending_side"}.issubset(tb_src.columns):
        src = tb_src[["account_id", "ending_balance", "ending_side"]].rename(
            columns={
                "ending_balance": "ending_balance_src",
                "ending_side": "ending_side_src",
            }
        )
        tb_merge = tb.merge(src, on="account_id", how="left")
        diff = (
            tb_merge["ending_balance"].astype(float)
            - tb_merge["ending_balance_src"].astype(float)
        ).abs()

        # A NaN difference means the recomputed account has no usable source
        # balance. Series.max() skips NaN, so such rows must be counted
        # explicitly or they would never fail the check.
        n_unmatched = int(diff.isna().sum())
        comparable = diff.dropna()
        max_diff = float(comparable.max()) if not comparable.empty else 0.0

        checks["trial_balance_matches_source"] = bool(n_unmatched == 0 and max_diff <= 1e-6)
        checks["trial_balance_max_abs_diff"] = max_diff
        checks["trial_balance_n_unmatched_accounts"] = n_unmatched
    else:
        checks["trial_balance_matches_source"] = False
        checks["trial_balance_max_abs_diff"] = None

    # Income statement tie-out (robust to LedgerLab IS schema changes).
    ni_tb = _net_income_from_tb(tb)
    ni_is = _net_income_from_is_stmt(is_stmt)
    ni_abs_diff = float(abs(ni_tb - ni_is))
    checks["income_statement_ties_to_trial_balance"] = bool(ni_abs_diff <= 1e-6)
    checks["income_statement_max_abs_diff"] = ni_abs_diff

    # Balance sheet equation check (from statement lines).
    total_assets = _get_stmt_amount(bs_stmt, "Total Assets")
    total_l_plus_e = _get_stmt_amount(bs_stmt, "Total Liabilities + Equity")
    bs_abs_diff = abs(total_assets - total_l_plus_e)
    # _get_stmt_amount returns 0.0 for a missing line, so two missing totals
    # would compare equal; require both lines to be present for a pass.
    bs_lines = set(bs_stmt["line"].astype(str)) if "line" in bs_stmt.columns else set()
    bs_totals_present = {"Total Assets", "Total Liabilities + Equity"} <= bs_lines
    checks["balance_sheet_equation_balances"] = bool(bs_totals_present and bs_abs_diff <= 1e-6)
    checks["balance_sheet_abs_diff"] = float(bs_abs_diff)

    # Cash flow bridge & tie-out.
    bridge = build_statement_bridge(is_stmt, bs_stmt)
    cash_end_from_bridge = float(
        bridge.loc[bridge["line"] == "Ending Cash (from bridge)", "amount"].iloc[0]
    )
    cash_end_bs = float(
        bridge.loc[bridge["line"] == "Ending Cash (balance sheet)", "amount"].iloc[0]
    )
    cash_diff = abs(cash_end_from_bridge - cash_end_bs)
    checks["cash_flow_ties_to_balance_sheet_cash"] = bool(cash_diff <= 1e-6)
    checks["cash_flow_cash_abs_diff"] = float(cash_diff)

    # Metrics.
    cash_change = cash_end_bs  # beginning cash assumed 0 for this synthetic startup month
    metrics = {
        "month": month,
        "n_gl_rows": int(gl.shape[0]),
        "n_transactions": int(gl["txn_id"].nunique()),
        "net_income": float(ni_is),
        "cash_change": float(cash_change),
        "net_income_minus_cash_change": float(ni_is - cash_change),
    }

    # Write outputs.
    (outdir / "business_ch03_summary.json").write_text(
        json.dumps({"checks": checks, "metrics": metrics}, indent=2),
        encoding="utf-8",
    )

    bridge_out = bridge.copy()
    bridge_out.insert(0, "month", month)
    bridge_out.to_csv(outdir / "business_ch03_statement_bridge.csv", index=False)

    tb.to_csv(outdir / "business_ch03_trial_balance.csv", index=False)

    plot_net_income_vs_cash_change(
        net_income=float(ni_is),
        cash_change=float(cash_change),
        outpath=outdir / "business_ch03_net_income_vs_cash_change.png",
    )

    # Console output (matches Ch01/Ch02 style).
    print("\nChecks:")
    for k, v in checks.items():
        print(f"- {k}: {v}")

    print("\nMetrics:")
    for k, v in metrics.items():
        print(f"- {k}: {v}")

    print(f"\nWrote outputs -> {outdir}")

    return Ch03Summary(checks=checks, metrics=metrics)
+ +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch04_inventory_rollforward.csv +* business_ch04_margin_bridge.csv +* business_ch04_depreciation_rollforward.csv +* business_ch04_summary.json +* business_ch04_gross_margin_over_time.png +* business_ch04_depreciation_over_time.png + +Reads NSO v1 tables, builds rollforwards and tie-outs: +- Inventory movements -> Inventory ending balance +- Inventory movements -> COGS (sale issues + count adjustments) +- Depreciation schedule -> Depreciation Expense + Accumulated Depreciation + +Outputs: +- business_ch04_summary.json +- business_ch04_inventory_rollforward.csv +- business_ch04_margin_bridge.csv +- business_ch04_depreciation_rollforward.csv +- (optional) plots""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import matplotlib.pyplot as plt +import pandas as pd + +from scripts._cli import apply_seed + + +@dataclass(frozen=True) +class Ch04Summary: + checks: dict[str, Any] + metrics: dict[str, Any] + + +def _read_csv(path: Path, **kwargs: Any) -> pd.DataFrame: + if not path.exists(): + raise FileNotFoundError(f"Missing required table: {path}") + return pd.read_csv(path, **kwargs) + + +def _months_from_table(df: pd.DataFrame) -> list[str]: + if df.empty or "month" not in df.columns: + return [] + months = sorted(set(df["month"].astype(str))) + return months + + +def _get_stmt_amount(stmt: pd.DataFrame, month: str, line: str) -> float: + hit = stmt.loc[(stmt["month"].astype(str) == month) & (stmt["line"].astype(str) == line), "amount"] + if hit.empty: + return 0.0 + return float(hit.iloc[0]) + + +def plot_series(df: pd.DataFrame, x: str, y: str, title: str, outpath: Path) -> None: + fig, ax = plt.subplots(figsize=(7.0, 4.0)) + ax.plot(df[x], df[y], marker="o") + ax.set_title(title) + ax.set_xlabel(x) + ax.set_ylabel(y) + ax.grid(True, linestyle=":", alpha=0.7) + fig.tight_layout() 
def analyze_ch04(datadir: Path, outdir: Path, seed: int | None = None) -> Ch04Summary:
    """Build the Chapter 4 rollforwards and tie-outs and write the artifacts.

    Reads the GL, inventory movements, fixed assets, depreciation schedule and
    the monthly income-statement / balance-sheet tables from ``datadir``. For
    every month found in the income statement it builds:

    * an inventory rollforward compared with the balance-sheet "Inventory" line,
    * a COGS tie (sale issues + count adjustments vs "Cost of Goods Sold"),
    * a depreciation tie (schedule vs GL account 6400 and vs the balance-sheet
      "Accumulated Depreciation" line),
    * a sales / COGS / gross-margin table.

    Three CSVs, the summary JSON and two PNG plots are written to ``outdir``
    and the checks and metrics are printed.

    Args:
        datadir: Directory holding the input CSV tables.
        outdir: Output directory; created (with parents) if missing.
        seed: Passed to ``apply_seed``.

    Returns:
        A ``Ch04Summary`` carrying the ``checks`` and ``metrics`` dictionaries.

    Raises:
        FileNotFoundError: From ``_read_csv`` when a required table is missing.
        ValueError: If the income statement yields no months.
    """
    apply_seed(seed)
    outdir.mkdir(parents=True, exist_ok=True)

    gl = _read_csv(datadir / "gl_journal.csv", dtype={"txn_id": str, "account_id": str, "doc_id": str})
    inv = _read_csv(datadir / "inventory_movements.csv", dtype={"txn_id": str})
    fa = _read_csv(datadir / "fixed_assets.csv")
    dep = _read_csv(datadir / "depreciation_schedule.csv")
    is_stmt = _read_csv(datadir / "statements_is_monthly.csv")
    bs_stmt = _read_csv(datadir / "statements_bs_monthly.csv")

    months = _months_from_table(is_stmt)
    if not months:
        raise ValueError("No months found in statements_is_monthly.csv")

    # --- Inventory rollforward + COGS tie (both driven by inventory movements) ---
    inv["amount"] = inv["amount"].astype(float)
    inv["qty"] = inv["qty"].astype(float)
    inv_month = inv["month"].astype(str)  # computed once, reused for every month

    roll_rows: list[dict[str, Any]] = []
    cogs_rows: list[dict[str, Any]] = []
    beg_inv = 0.0

    for m in months:
        mdf = inv.loc[inv_month == m]
        movement = mdf["movement_type"]
        purchases = float(mdf.loc[movement == "purchase", "amount"].sum())
        issues = float(mdf.loc[movement == "sale_issue", "amount"].sum())  # negative
        adjusts = float(mdf.loc[movement == "count_adjustment", "amount"].sum())

        end_sub = float(beg_inv + purchases + issues + adjusts)
        end_bs = float(_get_stmt_amount(bs_stmt, m, "Inventory"))

        roll_rows.append(
            {
                "month": m,
                "begin_inventory": beg_inv,
                "purchases": purchases,
                "sale_issues": issues,
                "count_adjustments": adjusts,
                "end_inventory_subledger": end_sub,
                "end_inventory_balance_sheet": end_bs,
                "abs_diff": abs(end_sub - end_bs),
            }
        )
        beg_inv = end_sub

        # COGS expense should equal -(inventory deltas from issues + adjustments).
        inv_delta_for_cogs = float(
            mdf.loc[movement.isin(["sale_issue", "count_adjustment"]), "amount"].sum()
        )
        cogs_from_subledger = float(-inv_delta_for_cogs)
        cogs_stmt = float(_get_stmt_amount(is_stmt, m, "Cost of Goods Sold"))
        cogs_rows.append(
            {
                "month": m,
                "cogs_from_subledger": cogs_from_subledger,
                "cogs_from_income_statement": cogs_stmt,
                "abs_diff": abs(cogs_from_subledger - cogs_stmt),
            }
        )

    inv_roll = pd.DataFrame(roll_rows)
    cogs_tie = pd.DataFrame(cogs_rows)

    # --- Depreciation tie: schedule -> GL/statement ---
    dep["dep_expense"] = dep["dep_expense"].astype(float)
    dep_by_month = dep.groupby("month", observed=True)["dep_expense"].sum().reset_index()
    dep_by_month_key = dep_by_month["month"].astype(str)
    dep_month = dep["month"].astype(str)

    # Loop-invariant GL masks: parse the dates and match the account once.
    gl_month = pd.to_datetime(gl["date"]).dt.to_period("M").astype(str)
    is_dep_expense_account = gl["account_id"].astype(str) == "6400"

    dep_rows: list[dict[str, Any]] = []
    for m in months:
        dep_exp = float(dep_by_month.loc[dep_by_month_key == m, "dep_expense"].sum())

        # GL depreciation expense (6400) in the month: debits minus credits.
        gl_dep = gl.loc[(gl_month == m) & is_dep_expense_account]
        dep_gl = float(gl_dep["debit"].astype(float).sum() - gl_dep["credit"].astype(float).sum())

        # The balance sheet carries accumulated depreciation as a negative
        # line, so compare using its absolute value.
        accum_bs_line = float(_get_stmt_amount(bs_stmt, m, "Accumulated Depreciation"))
        accum_bs_abs = float(abs(accum_bs_line))

        # Sum across assets for the month.
        accum_sched = float(dep.loc[dep_month == m, "accum_dep"].sum())

        dep_rows.append(
            {
                "month": m,
                "dep_expense_schedule": dep_exp,
                "dep_expense_gl": dep_gl,
                "dep_expense_abs_diff": abs(dep_exp - dep_gl),
                "accum_dep_schedule": accum_sched,
                "accum_dep_balance_sheet_abs": accum_bs_abs,
                "accum_dep_abs_diff": abs(accum_sched - accum_bs_abs),
            }
        )
    dep_roll = pd.DataFrame(dep_rows)

    # --- Margin bridge (Sales, COGS, GM%) ---
    margin_rows: list[dict[str, Any]] = []
    for m in months:
        sales = float(_get_stmt_amount(is_stmt, m, "Sales Revenue"))
        cogs = float(_get_stmt_amount(is_stmt, m, "Cost of Goods Sold"))
        gp = float(sales - cogs)
        gm_pct = float(gp / sales) if sales != 0 else 0.0
        margin_rows.append(
            {"month": m, "sales": sales, "cogs": cogs, "gross_profit": gp, "gross_margin_pct": gm_pct}
        )
    margin = pd.DataFrame(margin_rows)

    # --- Checks ---
    checks: dict[str, Any] = {}

    inv_max_diff = float(inv_roll["abs_diff"].max()) if not inv_roll.empty else 0.0
    checks["inventory_subledger_ties_to_gl_inventory"] = bool(inv_max_diff <= 1e-6)
    checks["inventory_max_abs_diff"] = inv_max_diff

    cogs_max_diff = float(cogs_tie["abs_diff"].max()) if not cogs_tie.empty else 0.0
    checks["cogs_subledger_ties_to_gl_cogs"] = bool(cogs_max_diff <= 1e-6)
    checks["cogs_max_abs_diff"] = cogs_max_diff

    dep_exp_max = float(dep_roll["dep_expense_abs_diff"].max()) if not dep_roll.empty else 0.0
    checks["depreciation_schedule_ties_to_gl_dep_expense"] = bool(dep_exp_max <= 1e-6)
    checks["depreciation_expense_max_abs_diff"] = dep_exp_max

    accum_max = float(dep_roll["accum_dep_abs_diff"].max()) if not dep_roll.empty else 0.0
    checks["accum_dep_ties_to_gl_accum_dep"] = bool(accum_max <= 1e-6)
    checks["accum_dep_max_abs_diff"] = accum_max

    # --- Metrics ---
    metrics: dict[str, Any] = {
        "n_months": int(len(months)),
        "n_gl_rows": int(gl.shape[0]),
        "n_inventory_movements": int(inv.shape[0]),
        "n_fixed_assets": int(fa.shape[0]),
    }

    # --- Write outputs ---
    # Let pandas write the files itself. to_csv() without a path returns text
    # that already uses the platform line terminator, and writing that through
    # text-mode write_text() translates newlines a second time on Windows.
    inv_roll.to_csv(outdir / "business_ch04_inventory_rollforward.csv", index=False)
    margin.to_csv(outdir / "business_ch04_margin_bridge.csv", index=False)
    dep_roll.to_csv(outdir / "business_ch04_depreciation_rollforward.csv", index=False)

    (outdir / "business_ch04_summary.json").write_text(
        json.dumps({"checks": checks, "metrics": metrics}, indent=2),
        encoding="utf-8",
    )

    # Optional plots (keep simple).
    plot_series(
        margin,
        x="month",
        y="gross_margin_pct",
        title="Gross Margin % over time",
        outpath=outdir / "business_ch04_gross_margin_over_time.png",
    )

    plot_series(
        dep_roll,
        x="month",
        y="dep_expense_gl",
        title="Depreciation Expense (GL) over time",
        outpath=outdir / "business_ch04_depreciation_over_time.png",
    )

    # Console output.
    print("\nChecks:")
    for k, v in checks.items():
        print(f"- {k}: {v}")

    print("\nMetrics:")
    for k, v in metrics.items():
        print(f"- {k}: {v}")

    print(f"\nWrote outputs -> {outdir}")

    return Ch04Summary(checks=checks, metrics=metrics)
def load_nso_v1_tables(datadir: pathlib.Path) -> dict[str, pd.DataFrame]:
    """Load every NSO v1 table from ``datadir`` into a dict keyed by table name.

    Each table is read from ``<name>.csv`` through ``_read_csv``, so a missing
    file raises ``FileNotFoundError``. The listed identifier columns are read
    as ``str``; tables with no listed columns are read with pandas defaults.
    """
    # Table name -> identifier columns forced to ``str``. Dict order is the
    # read order and the key order of the returned mapping.
    str_columns_by_table: dict[str, tuple[str, ...]] = {
        # Core
        "chart_of_accounts": ("account_id",),
        "gl_journal": ("txn_id", "doc_id", "account_id"),
        "trial_balance_monthly": ("account_id",),
        "statements_is_monthly": (),
        "statements_bs_monthly": (),
        "statements_cf_monthly": (),
        # Subledgers / events (Ch04/Ch05)
        "inventory_movements": ("txn_id",),
        "fixed_assets": ("asset_id",),
        "depreciation_schedule": ("asset_id",),
        # Ch05 additions
        "payroll_events": ("txn_id",),
        "sales_tax_events": ("txn_id",),
        "debt_schedule": ("loan_id", "txn_id"),
        "equity_events": ("txn_id",),
        "ap_events": ("txn_id", "invoice_id"),
    }

    tables: dict[str, pd.DataFrame] = {}
    for name, str_columns in str_columns_by_table.items():
        read_options: dict[str, Any] = {}
        if str_columns:
            read_options["dtype"] = dict.fromkeys(str_columns, str)
        tables[name] = _read_csv(datadir / f"{name}.csv", **read_options)
    return tables
pd.Series: + return pd.to_datetime(gl["date"]).dt.to_period("M").astype(str) + + +def gl_monthly_amount(gl: pd.DataFrame, account_id: str) -> pd.Series: + """ + Monthly signed amount in debit-credit space: debit - credit. + For expense accounts, this is typically positive. + """ + df = gl.copy() + df["month"] = _month_series_from_date(df) + g = df.loc[df["account_id"].astype(str) == str(account_id)].groupby("month", observed=True)[["debit", "credit"]].sum() + amt = (g["debit"].astype(float) - g["credit"].astype(float)) + return amt.sort_index() + + +def tb_signed_normal_balance(tb: pd.DataFrame, account_id: str) -> pd.Series: + """ + Monthly ending balance signed in the account's normal direction: + + means "in normal_side", - means opposite. + + tb has: month, account_id, normal_side, ending_side, ending_balance + """ + df = tb.loc[tb["account_id"].astype(str) == str(account_id)].copy() + if df.empty: + return pd.Series(dtype=float) + + df["ending_balance"] = df["ending_balance"].astype(float) + same = df["ending_side"].astype(str) == df["normal_side"].astype(str) + df["signed_normal"] = df["ending_balance"].where(same, -df["ending_balance"]) + out = df.set_index(df["month"].astype(str))["signed_normal"].sort_index() + out.index.name = "month" + return out + + +def _rollforward_from_deltas( + ending_balance: pd.Series, + delta: pd.Series, +) -> pd.DataFrame: + """ + Build a rollforward table with: + begin + delta = end + using ending balances as source of truth. 
+ """ + months = sorted(set(ending_balance.index.astype(str)) | set(delta.index.astype(str))) + end = ending_balance.reindex(months).fillna(0.0).astype(float) + d = delta.reindex(months).fillna(0.0).astype(float) + begin = end.shift(1).fillna(0.0) + rf = pd.DataFrame({"begin": begin, "delta": d, "end": end}, index=pd.Index(months, name="month")) + rf["calc_end"] = rf["begin"] + rf["delta"] + rf["diff"] = (rf["calc_end"] - rf["end"]).abs() + return rf.reset_index() + + +def plot_liabilities_over_time( + outpath: pathlib.Path, + wages_payable_end: pd.Series, + payroll_tax_payable_end: pd.Series, + sales_tax_payable_end: pd.Series, + notes_payable_end: pd.Series, +) -> None: + df = pd.DataFrame( + { + "Wages Payable": wages_payable_end, + "Payroll Taxes Payable": payroll_tax_payable_end, + "Sales Tax Payable": sales_tax_payable_end, + "Notes Payable": notes_payable_end, + } + ).fillna(0.0) + + fig, ax = plt.subplots(figsize=(8.5, 4.5)) + df.plot(ax=ax) # default colors + ax.set_title("Key Liabilities (Ending Balances) Over Time") + ax.set_xlabel("Month") + ax.set_ylabel("Ending Balance (normal-direction signed)") + ax.grid(axis="y", linestyle=":", alpha=0.6) + fig.tight_layout() + fig.savefig(outpath, dpi=150) + plt.close(fig) + + +def analyze_ch05(datadir: pathlib.Path, outdir: pathlib.Path, seed: int | None = None) -> Ch05Summary: + apply_seed(seed) + outdir.mkdir(parents=True, exist_ok=True) + + t = load_nso_v1_tables(datadir) + gl = t["gl_journal"] + tb = t["trial_balance_monthly"] + + payroll = t["payroll_events"].copy() + sales_tax = t["sales_tax_events"].copy() + debt = t["debt_schedule"].copy() + equity = t["equity_events"].copy() + ap = t["ap_events"].copy() + + checks: dict[str, Any] = {} + checks.update(check_transactions_balance(gl)) + + # ---- Accounts (match NSO v1 COA) ---- + ACCTS = { + "wages_expense": "6300", + "payroll_tax_expense": "6500", + "interest_expense": "6600", + "wages_payable": "2110", + "payroll_taxes_payable": "2120", + 
"sales_tax_payable": "2100", + "notes_payable": "2200", + "owner_capital": "3000", + "owner_draw": "3200", + "accounts_payable": "2000", + } + + # ---- Monthly GL totals ---- + wages_exp_gl = gl_monthly_amount(gl, ACCTS["wages_expense"]) + pr_tax_exp_gl = gl_monthly_amount(gl, ACCTS["payroll_tax_expense"]) + interest_exp_gl = gl_monthly_amount(gl, ACCTS["interest_expense"]) + + # ---- Payroll tie-outs ---- + # payroll_events schema: + # month, txn_id, date, event_type, + # gross_wages, employee_withholding, employer_tax, + # cash_paid, wages_payable_delta, payroll_taxes_payable_delta + payroll["month"] = payroll["month"].astype(str) + for c in [ + "gross_wages", + "employee_withholding", + "employer_tax", + "cash_paid", + "wages_payable_delta", + "payroll_taxes_payable_delta", + ]: + payroll[c] = payroll[c].astype(float) + + wages_exp_sub = payroll.groupby("month", observed=True)["gross_wages"].sum().sort_index() + pr_tax_exp_sub = payroll.groupby("month", observed=True)["employer_tax"].sum().sort_index() + + wages_exp_diff = (wages_exp_sub.reindex(wages_exp_gl.index).fillna(0.0) - wages_exp_gl.fillna(0.0)).abs() + pr_tax_exp_diff = (pr_tax_exp_sub.reindex(pr_tax_exp_gl.index).fillna(0.0) - pr_tax_exp_gl.fillna(0.0)).abs() + + checks["payroll_expense_ties_to_gl"] = bool(float(wages_exp_diff.max() if not wages_exp_diff.empty else 0.0) <= 1e-6) + checks["payroll_expense_max_abs_diff"] = float(wages_exp_diff.max()) if not wages_exp_diff.empty else 0.0 + + checks["payroll_tax_expense_ties_to_gl"] = bool(float(pr_tax_exp_diff.max() if not pr_tax_exp_diff.empty else 0.0) <= 1e-6) + checks["payroll_tax_expense_max_abs_diff"] = float(pr_tax_exp_diff.max()) if not pr_tax_exp_diff.empty else 0.0 + + # Wages payable rollforward: ending = begin + delta (deltas from payroll_events) + wages_payable_end = tb_signed_normal_balance(tb, ACCTS["wages_payable"]) + wages_payable_delta = payroll.groupby("month", observed=True)["wages_payable_delta"].sum().sort_index() + wages_rf = 
_rollforward_from_deltas(wages_payable_end, wages_payable_delta) + checks["wages_payable_rollforward_ties"] = bool(float(wages_rf["diff"].max() if not wages_rf.empty else 0.0) <= 1e-6) + checks["wages_payable_max_abs_diff"] = float(wages_rf["diff"].max()) if not wages_rf.empty else 0.0 + + # Payroll taxes payable rollforward + payroll_tax_payable_end = tb_signed_normal_balance(tb, ACCTS["payroll_taxes_payable"]) + payroll_tax_payable_delta = payroll.groupby("month", observed=True)["payroll_taxes_payable_delta"].sum().sort_index() + prtax_rf = _rollforward_from_deltas(payroll_tax_payable_end, payroll_tax_payable_delta) + checks["payroll_taxes_payable_rollforward_ties"] = bool(float(prtax_rf["diff"].max() if not prtax_rf.empty else 0.0) <= 1e-6) + checks["payroll_taxes_payable_max_abs_diff"] = float(prtax_rf["diff"].max()) if not prtax_rf.empty else 0.0 + + # ---- Sales tax tie-outs ---- + sales_tax["month"] = sales_tax["month"].astype(str) + for c in ["taxable_sales", "tax_amount", "cash_paid", "sales_tax_payable_delta"]: + sales_tax[c] = sales_tax[c].astype(float) + + sales_tax_payable_end = tb_signed_normal_balance(tb, ACCTS["sales_tax_payable"]) + sales_tax_delta = sales_tax.groupby("month", observed=True)["sales_tax_payable_delta"].sum().sort_index() + st_rf = _rollforward_from_deltas(sales_tax_payable_end, sales_tax_delta) + checks["sales_tax_payable_rollforward_ties"] = bool(float(st_rf["diff"].max() if not st_rf.empty else 0.0) <= 1e-6) + checks["sales_tax_payable_max_abs_diff"] = float(st_rf["diff"].max()) if not st_rf.empty else 0.0 + + # ---- Debt tie-outs ---- + # debt_schedule schema: month, loan_id, txn_id, beginning_balance, payment, interest, principal, ending_balance + debt["month"] = debt["month"].astype(str) + for c in ["beginning_balance", "payment", "interest", "principal", "ending_balance"]: + debt[c] = debt[c].astype(float) + + # Interest ties: sum schedule interest per month == GL interest expense + debt_interest = debt.groupby("month", 
observed=True)["interest"].sum().sort_index() + interest_diff = (debt_interest.reindex(interest_exp_gl.index).fillna(0.0) - interest_exp_gl.fillna(0.0)).abs() + checks["interest_expense_ties_to_gl"] = bool(float(interest_diff.max() if not interest_diff.empty else 0.0) <= 1e-6) + checks["interest_expense_max_abs_diff"] = float(interest_diff.max()) if not interest_diff.empty else 0.0 + + # Notes payable rollforward: ending balances vs deltas (delta = +new borrowing - principal) + notes_payable_end = tb_signed_normal_balance(tb, ACCTS["notes_payable"]) + # Use schedule principal as reduction; borrowing may appear as a one-time +delta via a special "originations" row (principal negative) + # We store deltas explicitly in the simulator; here we infer: + # month_delta = (begin->end) from schedule (end - begin) + debt_delta = (debt.groupby("month", observed=True)["ending_balance"].sum() - debt.groupby("month", observed=True)["beginning_balance"].sum()).sort_index() + np_rf = _rollforward_from_deltas(notes_payable_end, debt_delta) + checks["notes_payable_rollforward_ties"] = bool(float(np_rf["diff"].max() if not np_rf.empty else 0.0) <= 1e-6) + checks["notes_payable_max_abs_diff"] = float(np_rf["diff"].max()) if not np_rf.empty else 0.0 + + # ---- Accounts payable rollforward (optional tie-out from ap_events) ---- + ap["month"] = ap["month"].astype(str) + ap["ap_delta"] = ap["ap_delta"].astype(float) + ap_end = tb_signed_normal_balance(tb, ACCTS["accounts_payable"]) + ap_delta = ap.groupby("month", observed=True)["ap_delta"].sum().sort_index() + ap_rf = _rollforward_from_deltas(ap_end, ap_delta) + checks["accounts_payable_rollforward_ties"] = bool(float(ap_rf["diff"].max() if not ap_rf.empty else 0.0) <= 1e-6) + checks["accounts_payable_max_abs_diff"] = float(ap_rf["diff"].max()) if not ap_rf.empty else 0.0 + + # ---- Equity event ties (simple) ---- + equity["month"] = equity["month"].astype(str) + equity["amount"] = equity["amount"].astype(float) + + # Contributions 
should match GL credit to owner_capital (in debit-credit space, credit is negative), + # but we compare absolute “economic” amounts: + contrib_sub = equity.loc[equity["event_type"] == "contribution"].groupby("month", observed=True)["amount"].sum().sort_index() + draw_sub = equity.loc[equity["event_type"] == "draw"].groupby("month", observed=True)["amount"].sum().sort_index() + + owner_cap_gl = gl_monthly_amount(gl, ACCTS["owner_capital"]) # debit-credit + owner_draw_gl = gl_monthly_amount(gl, ACCTS["owner_draw"]) + + # owner_capital postings are credits, so owner_cap_gl is usually negative; compare -owner_cap_gl to contrib amounts + contrib_diff = (contrib_sub.reindex(owner_cap_gl.index).fillna(0.0) - (-owner_cap_gl).fillna(0.0)).abs() + draw_diff = (draw_sub.reindex(owner_draw_gl.index).fillna(0.0) - owner_draw_gl.fillna(0.0)).abs() + + checks["owner_contributions_tie_to_gl"] = bool(float(contrib_diff.max() if not contrib_diff.empty else 0.0) <= 1e-6) + checks["owner_contributions_max_abs_diff"] = float(contrib_diff.max()) if not contrib_diff.empty else 0.0 + checks["owner_draws_tie_to_gl"] = bool(float(draw_diff.max() if not draw_diff.empty else 0.0) <= 1e-6) + checks["owner_draws_max_abs_diff"] = float(draw_diff.max()) if not draw_diff.empty else 0.0 + + # ---- Outputs ---- + metrics = { + "n_months": int(tb["month"].nunique()) if "month" in tb.columns else None, + "n_gl_rows": int(gl.shape[0]), + "n_payroll_events": int(payroll.shape[0]), + "n_sales_tax_events": int(sales_tax.shape[0]), + "n_debt_rows": int(debt.shape[0]), + "n_equity_events": int(equity.shape[0]), + } + + # Write summary + rollforwards + (outdir / "business_ch05_summary.json").write_text( + json.dumps({"checks": checks, "metrics": metrics}, indent=2), + encoding="utf-8", + ) + + wages_rf.to_csv(outdir / "business_ch05_wages_payable_rollforward.csv", index=False) + prtax_rf.to_csv(outdir / "business_ch05_payroll_taxes_payable_rollforward.csv", index=False) + st_rf.to_csv(outdir / 
"business_ch05_sales_tax_payable_rollforward.csv", index=False) + np_rf.to_csv(outdir / "business_ch05_notes_payable_rollforward.csv", index=False) + ap_rf.to_csv(outdir / "business_ch05_accounts_payable_rollforward.csv", index=False) + + # Optional plot (lightweight and useful) + plot_liabilities_over_time( + outpath=outdir / "business_ch05_liabilities_over_time.png", + wages_payable_end=wages_payable_end, + payroll_tax_payable_end=payroll_tax_payable_end, + sales_tax_payable_end=sales_tax_payable_end, + notes_payable_end=notes_payable_end, + ) + + # Console output (match style) + print("\nChecks:") + for k, v in checks.items(): + print(f"- {k}: {v}") + + print("\nMetrics:") + for k, v in metrics.items(): + print(f"- {k}: {v}") + + print(f"\nWrote outputs -> {outdir}") + + return Ch05Summary(checks=checks, metrics=metrics) + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser(description="Track D Chapter 5: liabilities, payroll, taxes, equity (tie-outs).") + p.add_argument("--datadir", type=pathlib.Path, required=True) + p.add_argument("--outdir", type=pathlib.Path, required=True) + p.add_argument("--seed", type=int, default=None) + return p.parse_args(argv) + + +def main(argv: list[str] | None = None) -> None: + args = parse_args(argv) + analyze_ch05(args.datadir, args.outdir, seed=args.seed) + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/business_ch06_reconciliations_quality_control.py b/workbooks/track_d_template/scripts/business_ch06_reconciliations_quality_control.py new file mode 100644 index 0000000..7ede31a --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch06_reconciliations_quality_control.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: MIT +# business_ch06_reconciliations_quality_control.py +"""Chapter 6 (Track D): Reconciliations as quality control. 
def analyze_ch06(datadir: Path) -> Ch06Outputs:
    """Run the Chapter 6 reconciliations and collect their results.

    Reads ``gl_journal.csv``, ``trial_balance_monthly.csv``, ``ar_events.csv``
    and ``bank_statement.csv`` from ``datadir``, builds the A/R rollforward,
    reconciles the bank statement against the GL, and assembles a summary dict
    with ``checks``, ``metrics`` and ``exception_counts``.

    Args:
        datadir: Folder containing the four input CSVs.

    Returns:
        Ch06Outputs holding the rollforward, bank matches, bank exceptions and
        the summary dict.

    Raises:
        FileNotFoundError: Propagated from ``_read_csv`` for a missing input.
    """
    # Loaded lazily in a fixed order so the first missing file is the one reported.
    gl, tb, ar_events, bank = (
        _read_csv(datadir / fname)
        for fname in (
            "gl_journal.csv",
            "trial_balance_monthly.csv",
            "ar_events.csv",
            "bank_statement.csv",
        )
    )

    # A/R rollforward tie-out. An empty or all-NaN "diff" column gives a NaN
    # max, which compares False, so the check fails in that case.
    rollforward = build_ar_rollforward(tb, ar_events)
    largest_abs_diff = rollforward["diff"].abs().max()
    rollforward_ties = bool(largest_abs_diff < 1e-6)

    # Bank reconciliation + exception report.
    recon = reconcile_bank_statement(bank, gl)
    matches = recon.matches
    exceptions = recon.exceptions

    if exceptions.empty:
        exception_counts = {}
    else:
        exception_counts = exceptions["exception_type"].astype(str).value_counts().to_dict()

    # A missing "is_matched" column counts as zero matches.
    matched_flags = matches.get("is_matched", pd.Series(dtype=bool))

    checks = {"ar_rollforward_ties_to_tb": rollforward_ties}
    metrics = {
        "n_bank_lines": int(len(bank)),
        "n_cash_txns_in_gl": int(len(recon.cash_txns)),
        "n_bank_matches": int(matched_flags.fillna(False).sum()),
        "n_bank_exceptions": int(len(exceptions)),
    }

    return Ch06Outputs(
        ar_rollforward=rollforward,
        bank_matches=matches,
        bank_exceptions=exceptions,
        summary={
            "checks": checks,
            "metrics": metrics,
            "exception_counts": exception_counts,
        },
    )
+ +Inputs (from the dataset folder, e.g., ``data/synthetic/nso_v1``): +- ``gl_journal.csv`` +- ``chart_of_accounts.csv`` + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* gl_tidy.csv +* gl_monthly_summary.csv +* ch07_summary.json +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import pandas as pd + +from scripts._business_etl import GLPrepOutputs, analyze_gl_preparation +from scripts._cli import base_parser +from scripts._business_recon import write_json + + +@dataclass(frozen=True) +class Ch07Outputs: + gl_tidy: pd.DataFrame + gl_monthly_summary: pd.DataFrame + summary: dict[str, Any] + + +def _read_csv(path: Path) -> pd.DataFrame: + if not path.exists(): + raise FileNotFoundError(path) + return pd.read_csv(path) + + +def analyze_ch07(datadir: Path) -> Ch07Outputs: + gl = _read_csv(datadir / "gl_journal.csv") + coa = _read_csv(datadir / "chart_of_accounts.csv") + + out: GLPrepOutputs = analyze_gl_preparation(gl, coa) + return Ch07Outputs(gl_tidy=out.gl_tidy, gl_monthly_summary=out.gl_monthly_summary, summary=out.summary) + + +def write_ch07_outputs(result: Ch07Outputs, outdir: Path) -> None: + outdir.mkdir(parents=True, exist_ok=True) + + result.gl_tidy.to_csv(outdir / "gl_tidy.csv", index=False) + result.gl_monthly_summary.to_csv(outdir / "gl_monthly_summary.csv", index=False) + write_json(result.summary, outdir / "ch07_summary.json") + + +def main() -> None: + p = base_parser("Track D Chapter 7: Preparing accounting data for analysis") + p.add_argument("--datadir", type=Path, default=Path("data/synthetic/nso_v1")) + args = p.parse_args() + + result = analyze_ch07(args.datadir) + write_ch07_outputs(result, args.outdir) + print(f"Wrote Chapter 7 artifacts -> {args.outdir}") + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/business_ch08_descriptive_statistics_financial_performance.py 
b/workbooks/track_d_template/scripts/business_ch08_descriptive_statistics_financial_performance.py new file mode 100644 index 0000000..97a824e --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch08_descriptive_statistics_financial_performance.py @@ -0,0 +1,491 @@ +# SPDX-License-Identifier: MIT +# business_ch08_descriptive_statistics_financial_performance.py +"""Track D Business Chapter 8: Descriptive statistics for financial performance. + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* gl_kpi_monthly.csv +* ar_monthly_metrics.csv +* ar_payment_slices.csv +* ar_days_stats.csv +* ch08_summary.json + +This chapter takes the NSO v1 synthetic bookkeeping dataset and produces +analysis-ready *descriptive* tables that accountants and analysts use to +understand performance variability. + +Outputs +------- +- gl_kpi_monthly.csv + Monthly income-statement KPIs + ratios + small rolling volatility signals. +- ar_monthly_metrics.csv + Accounts Receivable (A/R) roll-forward metrics such as collections rate and + an approximate Days Sales Outstanding (DSO). +- ar_payment_slices.csv + A small “payment lag” distribution built by applying cash collections to + invoices using a FIFO rule (good for mean vs median demonstrations). +- ar_days_stats.csv + Overall + per-customer summary stats (mean/median/quantiles/std) for the + payment-lag distribution. +- ch08_summary.json + Run report: row counts, checks, and a short data dictionary. 
+ +Design goals +------------ +- Deterministic (seeded) +- Small, readable CSVs +- Chapter runs standalone from the raw NSO v1 folder""" + +from __future__ import annotations + +import json +from collections import deque +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd + +from ._business_etl import build_gl_tidy_dataset +from ._cli import base_parser + + +@dataclass(frozen=True) +class Ch08Outputs: + gl_kpi_monthly: pd.DataFrame + ar_monthly_metrics: pd.DataFrame + ar_payment_slices: pd.DataFrame + ar_days_stats: pd.DataFrame + summary: dict[str, Any] + + +def _read_csv_required(datadir: Path, filename: str, *, fallbacks: list[str] | None = None) -> pd.DataFrame: + """Read a required CSV, optionally trying fallback filenames. + + This keeps chapters robust when the simulator/export names evolve. + """ + candidates = [filename] + (fallbacks or []) + for name in candidates: + path = datadir / name + if path.exists(): + return pd.read_csv(path) + # If none found, raise using the primary expected name (so error is clear) + raise FileNotFoundError(datadir / filename) + + +def _pivot_statement(df: pd.DataFrame) -> pd.DataFrame: + """Return a wide statement frame: index month, columns = line.""" + out = df.pivot_table(index="month", columns="line", values="amount", aggfunc="sum") + out = out.sort_index() + out.columns = [str(c) for c in out.columns] + return out.reset_index() + + +def _col(df: pd.DataFrame, name: str, default: float = 0.0) -> pd.Series: + """Return df[name] as a float Series, or a default-valued Series if missing.""" + if name in df.columns: + return df[name].astype(float) + return pd.Series([float(default)] * len(df), index=df.index, dtype=float) + + +def _days_in_month(month: str) -> int: + # month like "2025-01" + p = pd.Period(month, freq="M") + return int(p.days_in_month) + + +def _safe_div(numer: pd.Series, denom: pd.Series) -> pd.Series: + return numer.where(denom.abs() 
> 1e-12, np.nan) / denom.where(denom.abs() > 1e-12, np.nan) + + +def _describe_numeric(values: np.ndarray) -> dict[str, float]: + if values.size == 0: + return { + "n": 0.0, + "mean": np.nan, + "median": np.nan, + "std": np.nan, + "min": np.nan, + "p25": np.nan, + "p75": np.nan, + "p90": np.nan, + "max": np.nan, + } + v = values.astype(float) + return { + "n": float(v.size), + "mean": float(np.nanmean(v)), + "median": float(np.nanmedian(v)), + "std": float(np.nanstd(v, ddof=1)) if v.size > 1 else 0.0, + "min": float(np.nanmin(v)), + "p25": float(np.nanpercentile(v, 25)), + "p75": float(np.nanpercentile(v, 75)), + "p90": float(np.nanpercentile(v, 90)), + "max": float(np.nanmax(v)), + } + + +def _weighted_median(values: np.ndarray, weights: np.ndarray) -> float: + """Weighted median of values using non-negative weights.""" + if values.size == 0: + return float("nan") + w = np.asarray(weights, dtype=float) + v = np.asarray(values, dtype=float) + if np.any(w < 0): + raise ValueError("weights must be non-negative") + if np.all(w == 0): + return float(np.nanmedian(v)) + order = np.argsort(v) + v_sorted = v[order] + w_sorted = w[order] + cum = np.cumsum(w_sorted) + cutoff = 0.5 * float(np.sum(w_sorted)) + idx = int(np.searchsorted(cum, cutoff, side="left")) + return float(v_sorted[min(idx, v_sorted.size - 1)]) + + +def _ar_payment_slices(ar_events: pd.DataFrame) -> tuple[pd.DataFrame, dict[str, Any]]: + """Build a payment-lag distribution by applying collections to invoices. + + We treat invoices as positive A/R increases and collections as positive cash + received. Collections are applied FIFO to the oldest open invoices. + + Returns (slices_df, diagnostics). 
+ """ + required = {"date", "customer", "event_type", "amount"} + missing = required - set(map(str, ar_events.columns)) + if missing: + raise ValueError(f"ar_events is missing required columns: {sorted(missing)}") + + df = ar_events.copy() + df["date"] = pd.to_datetime(df["date"], errors="coerce") + df = df.dropna(subset=["date", "customer", "event_type", "amount"]) + df = df.sort_values(["customer", "date", "txn_id"], kind="mergesort") + + rows: list[dict[str, Any]] = [] + unapplied_total = 0.0 + open_invoices_end: list[dict[str, Any]] = [] + + for customer, sub in df.groupby("customer", sort=False): + open_q: deque[dict[str, Any]] = deque() + for _, r in sub.iterrows(): + et = str(r["event_type"]).lower().strip() + amt = float(r["amount"]) + if amt <= 0: + continue + + if et == "invoice": + open_q.append( + { + "invoice_id": str(r.get("invoice_id", "")), + "invoice_date": pd.Timestamp(r["date"]).normalize(), + "remaining": amt, + } + ) + continue + + if et != "collection": + continue + + pay_date = pd.Timestamp(r["date"]).normalize() + remaining = amt + + while remaining > 1e-9 and len(open_q) > 0: + inv = open_q[0] + applied = min(float(inv["remaining"]), remaining) + days = int((pay_date - pd.Timestamp(inv["invoice_date"])).days) + rows.append( + { + "customer": customer, + "invoice_id": inv["invoice_id"], + "invoice_date": pd.Timestamp(inv["invoice_date"]).strftime("%Y-%m-%d"), + "payment_date": pay_date.strftime("%Y-%m-%d"), + "month_paid": pay_date.strftime("%Y-%m"), + "amount_applied": float(applied), + "days_outstanding": float(days), + } + ) + inv["remaining"] = float(inv["remaining"]) - float(applied) + remaining -= float(applied) + if inv["remaining"] <= 1e-9: + open_q.popleft() + + if remaining > 1e-9: + unapplied_total += float(remaining) + + # keep open invoices at the end (diagnostic only) + if len(open_q) > 0: + asof = sub["date"].max().normalize() + for inv in list(open_q): + open_invoices_end.append( + { + "customer": customer, + "invoice_id": 
inv["invoice_id"], + "invoice_date": pd.Timestamp(inv["invoice_date"]).strftime("%Y-%m-%d"), + "remaining_amount": float(inv["remaining"]), + "age_days_asof_end": float((asof - pd.Timestamp(inv["invoice_date"])).days), + } + ) + + slices = pd.DataFrame(rows) + if slices.empty: + slices = pd.DataFrame( + columns=[ + "customer", + "invoice_id", + "invoice_date", + "payment_date", + "month_paid", + "amount_applied", + "days_outstanding", + ] + ) + + diagnostics = { + "unapplied_collections_total": float(unapplied_total), + "open_invoices_end": open_invoices_end, + } + return slices, diagnostics + + +def _ar_days_stats(slices: pd.DataFrame) -> pd.DataFrame: + """Summarize days outstanding overall + by customer.""" + if slices.empty: + return pd.DataFrame( + columns=[ + "customer", + "n", + "mean_days", + "median_days", + "weighted_mean_days", + "weighted_median_days", + "std_days", + "p25_days", + "p75_days", + "p90_days", + "min_days", + "max_days", + "total_paid", + ] + ) + + def _one(g: pd.DataFrame) -> dict[str, Any]: + days = g["days_outstanding"].to_numpy(dtype=float) + amt = g["amount_applied"].to_numpy(dtype=float) + desc = _describe_numeric(days) + weighted_mean = float(np.sum(days * amt) / np.sum(amt)) if np.sum(amt) > 1e-12 else float(np.nan) + weighted_med = _weighted_median(days, amt) + return { + "n": int(desc["n"]), + "mean_days": desc["mean"], + "median_days": desc["median"], + "weighted_mean_days": weighted_mean, + "weighted_median_days": weighted_med, + "std_days": desc["std"], + "p25_days": desc["p25"], + "p75_days": desc["p75"], + "p90_days": desc["p90"], + "min_days": desc["min"], + "max_days": desc["max"], + "total_paid": float(np.sum(amt)), + } + + out_rows: list[dict[str, Any]] = [] + for customer, g in slices.groupby("customer", sort=False): + row = {"customer": str(customer)} + row.update(_one(g)) + out_rows.append(row) + + overall = {"customer": "ALL"} + overall.update(_one(slices)) + out_rows.append(overall) + + return pd.DataFrame(out_rows) 
def analyze_ch08(datadir: Path, outdir: Path | None = None, seed: int = 123) -> Ch08Outputs:
    """Run Chapter 8 analysis and return outputs as dataframes.

    Builds the monthly KPI table from the income-statement and balance-sheet
    exports, derives monthly A/R metrics from GL activity on account "1100",
    builds the FIFO payment-lag slices from ``ar_events.csv`` when that file
    exists, and assembles the run summary.

    Args:
        datadir: Folder containing the raw dataset CSVs.
        outdir: When given, all artifacts are also written there via
            ``write_ch08``.
        seed: Recorded in the summary; not otherwise used by this function.

    Returns:
        Ch08Outputs with the KPI table, A/R monthly metrics, payment slices,
        days-outstanding stats and the summary dict.

    Raises:
        FileNotFoundError: Propagated from ``_read_csv_required`` for a missing
            required input.
    """
    # Build analysis-ready GL (Chapter 7 logic) directly from raw exports
    gl = _read_csv_required(datadir, "gl_journal.csv", fallbacks=["gl.csv", "general_ledger.csv"])
    coa = _read_csv_required(datadir, "chart_of_accounts.csv", fallbacks=["coa.csv"])
    gl_tidy = build_gl_tidy_dataset(gl, coa)

    # Statements are already monthly and are a stable "accounting truth" for KPIs
    is_df = _read_csv_required(datadir, "statements_is_monthly.csv")
    bs_df = _read_csv_required(datadir, "statements_bs_monthly.csv")
    is_w = _pivot_statement(is_df)
    bs_w = _pivot_statement(bs_df)

    # Ensure month alignment: keep only months present in both statements
    months = sorted(set(is_w["month"]).intersection(set(bs_w["month"])))
    is_w = is_w.loc[is_w["month"].isin(months)].copy()
    bs_w = bs_w.loc[bs_w["month"].isin(months)].copy()
    is_w = is_w.sort_values("month").reset_index(drop=True)
    bs_w = bs_w.sort_values("month").reset_index(drop=True)

    # KPIs
    roll_window = 3  # small default window for "volatility" signals

    kpi = pd.DataFrame({"month": is_w["month"].astype(str)})
    kpi["revenue"] = _col(is_w, "Sales Revenue", 0.0)
    kpi["cogs"] = _col(is_w, "Cost of Goods Sold", 0.0)

    if "Gross Profit" in is_w.columns:
        kpi["gross_profit"] = is_w["Gross Profit"].astype(float)
    else:
        kpi["gross_profit"] = (kpi["revenue"] - kpi["cogs"]).astype(float)

    kpi["operating_expenses"] = _col(is_w, "Total Operating Expenses", 0.0)
    kpi["net_income"] = _col(is_w, "Net Income", 0.0)

    kpi["gross_margin_pct"] = _safe_div(kpi["gross_profit"], kpi["revenue"]).replace([np.inf, -np.inf], np.nan)
    kpi["net_margin_pct"] = _safe_div(kpi["net_income"], kpi["revenue"]).replace([np.inf, -np.inf], np.nan)
    kpi["revenue_growth_pct"] = kpi["revenue"].pct_change().replace([np.inf, -np.inf], np.nan)

    for col in ["gross_margin_pct", "net_margin_pct", "revenue_growth_pct"]:
        mean_name = f"{col}_roll_mean_w{roll_window}"
        std_name = f"{col}_roll_std_w{roll_window}"
        kpi[mean_name] = kpi[col].rolling(window=roll_window, min_periods=1).mean()
        kpi[std_name] = kpi[col].rolling(window=roll_window, min_periods=2).std(ddof=1)
        kpi[f"{col}_zscore"] = _safe_div(kpi[col] - kpi[mean_name], kpi[std_name])

    # Add a few balance-sheet anchors (useful for "ratio thinking").
    # _col yields an all-NaN column when a line is absent; DataFrame.get with a
    # scalar default would hand back a bare float, which has no .astype.
    kpi["cash_end"] = _col(bs_w, "Cash", np.nan)
    kpi["ar_end"] = _col(bs_w, "Accounts Receivable", np.nan)
    kpi["inventory_end"] = _col(bs_w, "Inventory", np.nan)
    kpi["ap_end"] = _col(bs_w, "Accounts Payable", np.nan)

    # A/R monthly metrics from tidy GL (credit sales, collections) + BS balances
    ar_lines = gl_tidy.loc[gl_tidy["account_id"].astype(str) == "1100"].copy()
    ar_lines["month"] = ar_lines["date"].astype(str).str.slice(0, 7)
    ar_lines["signed"] = ar_lines["debit"].astype(float) - ar_lines["credit"].astype(float)

    ar_month = (
        ar_lines.groupby("month", observed=True)
        .agg(
            credit_sales=("signed", lambda s: float(np.sum(np.clip(s.to_numpy(dtype=float), 0, None)))),
            collections=("signed", lambda s: float(np.sum(np.clip(-s.to_numpy(dtype=float), 0, None)))),
        )
        .reset_index()
    )
    ar_month = ar_month.loc[ar_month["month"].isin(months)].copy()
    ar_month = ar_month.sort_values("month").reset_index(drop=True)

    # Look balances up by month, not by row position: a month with no A/R
    # activity has no row in ar_month, and positional assignment would shift
    # every later balance onto the wrong month. The beginning balance is the
    # previous statement month's ending balance (the first month reuses its own).
    ar_end_by_month = kpi.set_index("month")["ar_end"].astype(float)
    ar_begin_by_month = ar_end_by_month.shift(1)
    if not ar_begin_by_month.empty:
        ar_begin_by_month.iloc[0] = ar_end_by_month.iloc[0]
    ar_month_key = ar_month["month"].astype(str)
    ar_month["ar_end"] = ar_month_key.map(ar_end_by_month).astype(float)
    ar_month["ar_begin"] = ar_month_key.map(ar_begin_by_month).astype(float)

    ar_month["avg_ar"] = 0.5 * (ar_month["ar_begin"] + ar_month["ar_end"])
    ar_month["days_in_month"] = ar_month["month"].astype(str).apply(_days_in_month).astype(int)
    ar_month["ar_turnover"] = _safe_div(ar_month["credit_sales"], ar_month["avg_ar"]).replace([np.inf, -np.inf], np.nan)
    ar_month["dso"] = (
        _safe_div(ar_month["avg_ar"], ar_month["credit_sales"]).replace([np.inf, -np.inf], np.nan)
        * ar_month["days_in_month"]
    )
    ar_month["collections_rate"] = _safe_div(ar_month["collections"], ar_month["credit_sales"]).replace(
        [np.inf, -np.inf], np.nan
    )

    # A/R "days outstanding" distribution (payment slices); ar_events.csv is optional
    if (datadir / "ar_events.csv").exists():
        ar_events = _read_csv_required(datadir, "ar_events.csv")
    else:
        ar_events = pd.DataFrame()

    if not ar_events.empty:
        slices, ar_diag = _ar_payment_slices(ar_events)
    else:
        # No events: emit an empty frame with the same schema and zeroed diagnostics.
        slices = pd.DataFrame(
            columns=[
                "customer",
                "invoice_id",
                "invoice_date",
                "payment_date",
                "month_paid",
                "amount_applied",
                "days_outstanding",
            ]
        )
        ar_diag = {"unapplied_collections_total": 0.0, "open_invoices_end": []}

    ar_days_stats = _ar_days_stats(slices)

    # Summary report (minimal, consistent with earlier chapters)
    checks = {
        "months": months,
        "n_months": int(len(months)),
        "kpi_rows": int(len(kpi)),
        "ar_monthly_rows": int(len(ar_month)),
        "ar_payment_slices_rows": int(len(slices)),
        "gross_margin_pct_in_range": bool(
            kpi["gross_margin_pct"].dropna().between(-1.0, 1.0).all() if not kpi.empty else True
        ),
        "dso_nonnegative": bool(ar_month["dso"].dropna().ge(0.0).all() if not ar_month.empty else True),
    }

    data_dictionary = {
        "gl_kpi_monthly.csv": {
            "grain": "one row per month",
            "notes": "Income statement KPIs + ratios + rolling mean/std + z-scores.",
        },
        "ar_monthly_metrics.csv": {
            "grain": "one row per month",
            "notes": "Credit sales and collections inferred from AR account activity; includes DSO approximation.",
        },
        "ar_payment_slices.csv": {
            "grain": "one row per payment slice",
            "notes": "FIFO allocation of collections to invoices; used to illustrate skew (mean vs median).",
        },
        "ar_days_stats.csv": {
            "grain": "one row per customer plus ALL",
            "notes": "Summary stats for days outstanding (unweighted + amount-weighted).",
        },
        "ch08_summary.json": {
            "grain": "one JSON document",
            "notes": "Row counts, checks, and A/R diagnostics.",
        },
    }

    summary: dict[str, Any] = {
        "chapter": "business_ch08_descriptive_statistics_financial_performance",
        "seed": int(seed),
        "checks": checks,
        "ar_diagnostics": {
            "unapplied_collections_total": float(ar_diag.get("unapplied_collections_total", 0.0)),
            "open_invoices_end_count": int(len(ar_diag.get("open_invoices_end", []))),
        },
        "data_dictionary": data_dictionary,
    }

    outputs = Ch08Outputs(
        gl_kpi_monthly=kpi,
        ar_monthly_metrics=ar_month,
        ar_payment_slices=slices,
        ar_days_stats=ar_days_stats,
        summary=summary,
    )

    # If outdir is provided, write artifacts (keeps CLI + tests simple)
    if outdir is not None:
        write_ch08(outputs, Path(outdir))

    return outputs
b/workbooks/track_d_template/scripts/business_ch09_reporting_style_contract.py @@ -0,0 +1,463 @@ +# SPDX-License-Identifier: MIT +# business_ch09_reporting_style_contract.py +"""Track D Business Chapter 9: Plotting/reporting style contract + example outputs. + +Chapter 9 defines a small, reusable plotting/reporting *style contract* for +Track D and produces example figures that comply with it. + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch09_style_contract.json +* ch09_figures_manifest.csv +* ch09_executive_memo.md +* ch09_summary.json + +Design notes +------------ +- Matplotlib only (no seaborn) +- Deterministic file names +- “Axis guardrails”: bar charts start at 0; truncated axes must be explicit""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from ._cli import base_parser +from ._reporting_style import ( + FigureManifestRow, + FigureSpec, + plot_ecdf, + plot_histogram_with_markers, + plot_waterfall_bridge, + save_figure, + style_context, + write_contract_json, +) +from .business_ch08_descriptive_statistics_financial_performance import analyze_ch08 + + +@dataclass(frozen=True) +class Ch09Outputs: + contract_path: Path + manifest_path: Path + memo_path: Path + summary_path: Path + figures_dir: Path + figure_paths: list[Path] + + +def _ensure_outdir(outdir: Path) -> tuple[Path, Path, Path, Path, Path]: + outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + contract_path = outdir / "ch09_style_contract.json" + manifest_path = outdir / "ch09_figures_manifest.csv" + memo_path = outdir / "ch09_executive_memo.md" + summary_path = outdir / "ch09_summary.json" + return contract_path, manifest_path, memo_path, summary_path, figures_dir + + +def _make_executive_memo(kpi: pd.DataFrame, 
ar_monthly: pd.DataFrame, ar_days_stats: pd.DataFrame) -> str: + """Create a compact, deterministic markdown memo (10 bullets max). + + Chapter 9 embeds *ethics guardrails* directly in the memo generator so the + output is harder to misuse in executive settings. + """ + lines: list[str] = [] + lines.append("# Chapter 9 — Executive memo (example)\n") + lines.append( + "This memo is generated from the synthetic NSO v1 data and exists to demonstrate " + "consistent reporting style (not to make real-world claims).\n" + ) + + # --- Ethics guardrails (deterministic) --- + # Percent metrics (margins, growth) can be misleading when the denominator is tiny. + # We flag percent metrics when the revenue base is "small" relative to the series. + PCT_DENOM_ABS_MIN: float = 1_000.0 + PCT_DENOM_REL_MIN: float = 0.05 # 5% of typical (median) revenue + + # DSO is a risk metric; extreme values are flagged neutrally as "needs investigation". + DSO_NEEDS_INVESTIGATION_DAYS: float = 75.0 + + typical_revenue = 0.0 + if not kpi.empty and "revenue" in kpi.columns: + s = ( + pd.to_numeric(kpi["revenue"], errors="coerce") + .replace([np.inf, -np.inf], np.nan) + .dropna() + .astype(float) + ) + if len(s) > 0: + typical_revenue = float(s.median()) + + def _is_small_denominator(denom: float | int | None, typical: float) -> bool: + if denom is None: + return True + d = float(denom) + if not np.isfinite(d): + return True + floor = max(PCT_DENOM_ABS_MIN, PCT_DENOM_REL_MIN * float(typical)) + return d < floor + + def _pct_note_if_unstable(denom: float | int | None, typical: float) -> str: + if _is_small_denominator(denom=denom, typical=typical): + return " (flag: denominator small; interpret with caution)" + return "" + + if not kpi.empty: + latest = kpi.iloc[-1] + prev = kpi.iloc[-2] if len(kpi) > 1 else latest + lines.append(f"- Latest month: **{latest['month']}**") + lines.append(f"- Revenue: **{latest['revenue']:.0f}** (prev {prev['revenue']:.0f})") + lines.append(f"- Net income: 
**{latest['net_income']:.0f}** (prev {prev['net_income']:.0f})") + if pd.notna(latest.get("gross_margin_pct", pd.NA)): + note = _pct_note_if_unstable(denom=latest.get("revenue"), typical=typical_revenue) + lines.append(f"- Gross margin: **{latest['gross_margin_pct']:.1%}**{note}") + if pd.notna(latest.get("net_margin_pct", pd.NA)): + note = _pct_note_if_unstable(denom=latest.get("revenue"), typical=typical_revenue) + lines.append(f"- Net margin: **{latest['net_margin_pct']:.1%}**{note}") + if pd.notna(latest.get("revenue_growth_pct", pd.NA)): + note = _pct_note_if_unstable(denom=prev.get("revenue"), typical=typical_revenue) + lines.append(f"- MoM revenue growth: **{latest['revenue_growth_pct']:.1%}**{note}") + + if not ar_monthly.empty and "dso" in ar_monthly.columns: + latest_ar = ar_monthly.iloc[-1] + dso = latest_ar.get("dso") + cr = latest_ar.get("collections_rate") + if pd.notna(dso): + note = "" + if float(dso) >= DSO_NEEDS_INVESTIGATION_DAYS: + note = " (flag: unusually high; needs investigation)" + elif float(dso) < 0: + note = " (flag: negative; check data/definitions)" + lines.append(f"- DSO (approx): **{float(dso):.1f} days**{note}") + if pd.notna(cr): + lines.append(f"- Collections rate: **{float(cr):.1%}**") + + # Tail risk from payment lag distribution + if not ar_days_stats.empty: + all_row = ar_days_stats.loc[ar_days_stats["customer"] == "ALL"] + if not all_row.empty: + r = all_row.iloc[0] + if pd.notna(r.get("median_days", pd.NA)): + lines.append(f"- Payment lag median: **{float(r['median_days']):.0f} days**") + if pd.notna(r.get("p90_days", pd.NA)): + lines.append(f"- Payment lag p90 (tail): **{float(r['p90_days']):.0f} days**") + + # Keep it to ~10 bullets for a one-page feel + bullet_lines = [ln for ln in lines if ln.startswith("-")] + if len(bullet_lines) > 10: + trimmed: list[str] = [] + kept = 0 + for ln in lines: + if ln.startswith("-"): + kept += 1 + if kept > 10: + continue + trimmed.append(ln) + lines = trimmed + + return 
"\n".join(lines).strip() + "\n" + + +def _month_ticks(ax: Any, months: list[str], step: int = 2) -> None: + """Readable month ticks for short monthly series.""" + if not months: + return + idx = np.arange(len(months)) + ticks = idx[::step] + labels = [months[i] for i in ticks] + ax.set_xticks(ticks) + ax.set_xticklabels(labels, rotation=0, ha="center") + + +def _plot_time_series( + df: pd.DataFrame, + x_col: str, + y_cols: list[str], + title: str, + x_label: str, + y_label: str, + show_zero_line: bool = False, +) -> plt.Figure: + months = df[x_col].astype(str).tolist() + x = np.arange(len(months)) + + fig, ax = plt.subplots(figsize=(10, 4)) + for col in y_cols: + ax.plot(x, df[col].astype(float).to_numpy(), marker="o", linewidth=2, label=col) + + if show_zero_line: + ax.axhline(0.0, linewidth=1, linestyle="--") + + ax.set_title(title) + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + _month_ticks(ax, months, step=2) + ax.grid(True, axis="y") + ax.legend(loc="best") + fig.tight_layout() + return fig + + +def analyze_ch09(datadir: Path, outdir: Path, seed: int = 123) -> Ch09Outputs: + """Run Chapter 9 and write contract + example figures.""" + contract_path, manifest_path, memo_path, summary_path, figures_dir = _ensure_outdir(Path(outdir)) + + # 1) Produce the style contract JSON (the “rules” later chapters will follow) + write_contract_json(contract_path) + + # 2) Use Chapter 8 to get consistent inputs (KPIs + A/R metrics + A/R day slices) + ch08 = analyze_ch08(datadir=Path(datadir), outdir=None, seed=seed) + kpi = ch08.gl_kpi_monthly.copy() + ar_month = ch08.ar_monthly_metrics.copy() + slices = ch08.ar_payment_slices.copy() + + ar_days_stats = ch08.ar_days_stats.copy() + + # 3) Build figures (each must comply with the contract) + manifest: list[FigureManifestRow] = [] + figure_paths: list[Path] = [] + + # KPI: Revenue + Net income + with style_context(): + fig = _plot_time_series( + kpi, + x_col="month", + y_cols=["revenue", "net_income"], + title="Monthly 
revenue and net income", + x_label="Month", + y_label="Amount (currency units)", + show_zero_line=True, + ) + path = figures_dir / "kpi_revenue_net_income_line.png" + save_figure(fig, path, FigureSpec(chart_type="line", title="Monthly revenue and net income")) + plt.close(fig) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="line", + title="Monthly revenue and net income", + x_label="Month", + y_label="Amount (currency units)", + guardrail_note="Line charts may use non-zero y-limits; include a 0 reference line when helpful.", + data_source="gl_kpi_monthly.csv", + ) + ) + figure_paths.append(path) + + # KPI: margins + with style_context(): + fig = _plot_time_series( + kpi, + x_col="month", + y_cols=["gross_margin_pct", "net_margin_pct"], + title="Gross and net margin (%)", + x_label="Month", + y_label="Margin (fraction)", + show_zero_line=True, + ) + path = figures_dir / "kpi_margins_line.png" + save_figure(fig, path, FigureSpec(chart_type="line", title="Gross and net margin (%)")) + plt.close(fig) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="line", + title="Gross and net margin (%)", + x_label="Month", + y_label="Margin (fraction)", + guardrail_note="Include a 0 reference line for ratio charts.", + data_source="gl_kpi_monthly.csv", + ) + ) + figure_paths.append(path) + + # Net income variance bridge (waterfall) + # Decompose change from prior month net income to latest month net income. 
+ if len(kpi) >= 2 and {"month", "revenue", "cogs", "operating_expenses", "net_income"}.issubset(set(kpi.columns)): + prev = kpi.iloc[-2] + latest = kpi.iloc[-1] + + start = float(prev["net_income"]) + end = float(latest["net_income"]) + + components: list[tuple[str, float]] = [ + ("Revenue", float(latest["revenue"]) - float(prev["revenue"])), + ("COGS", -(float(latest["cogs"]) - float(prev["cogs"]))), + ("Operating expenses", -(float(latest["operating_expenses"]) - float(prev["operating_expenses"]))), + ] + + residual = end - (start + sum(delta for _, delta in components)) + if abs(float(residual)) > 1e-6: + components.append(("Other / rounding", float(residual))) + + with style_context(): + fig = plot_waterfall_bridge( + start_label=f"{prev['month']} net income", + end_label=f"{latest['month']} net income", + start_value=start, + end_value=end, + components=components, + title=f"Net income bridge: {prev['month']} → {latest['month']}", + y_label="Net income (currency units)", + ) + path = figures_dir / "net_income_bridge.png" + save_figure(fig, path, FigureSpec(chart_type="waterfall_bridge", title="Net income bridge")) + plt.close(fig) + + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="waterfall_bridge", + title=f"Net income bridge: {prev['month']} → {latest['month']}", + x_label="Component", + y_label="Net income (currency units)", + guardrail_note="Bridge chart reconciles explicit start/end totals using additive components; use sign conventions consistently. 
'Other / rounding' closes any residual.", + data_source="gl_kpi_monthly.csv", + ) + ) + figure_paths.append(path) + + # A/R: DSO + + if not ar_month.empty and "dso_approx" in ar_month.columns: + with style_context(): + fig = _plot_time_series( + ar_month, + x_col="month", + y_cols=["dso_approx"], + title="A/R days sales outstanding (approx.)", + x_label="Month", + y_label="Days", + show_zero_line=False, + ) + path = figures_dir / "ar_dso_line.png" + save_figure(fig, path, FigureSpec(chart_type="line", title="A/R DSO (approx.)")) + plt.close(fig) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="line", + title="A/R days sales outstanding (approx.)", + x_label="Month", + y_label="Days", + guardrail_note="Do not truncate y-axis to exaggerate small changes.", + data_source="ar_monthly_metrics.csv", + ) + ) + figure_paths.append(path) + + # A/R distribution: days outstanding (hist + ECDF) + if not slices.empty and "days_outstanding" in slices.columns: + days = slices["days_outstanding"].astype(float).to_numpy() + weights = slices.get("amount_applied", pd.Series(np.ones(len(slices)))).astype(float).to_numpy() + + mean_days = float(np.average(days, weights=np.clip(weights, 0, None))) if days.size else float("nan") + med_days = float(np.nanmedian(days)) if days.size else float("nan") + p90 = float(np.nanpercentile(days, 90)) if days.size else float("nan") + + with style_context(): + fig = plot_histogram_with_markers( + values=days, + title="A/R payment lag (days) — histogram", + x_label="Days outstanding", + y_label="Count", + markers={"mean": mean_days, "median": med_days, "p90": p90}, + ) + path = figures_dir / "ar_days_hist.png" + save_figure(fig, path, FigureSpec(chart_type="histogram", title="A/R payment lag histogram")) + plt.close(fig) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="histogram", + title="A/R payment lag (days) — histogram", + x_label="Days outstanding", + y_label="Count", + 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for Chapter 9.

    Builds the shared parser via ``base_parser``, adds the required
    ``--datadir`` option, runs ``analyze_ch09`` and prints where the artifacts
    were written. The ``seed`` and ``outdir`` attributes read below are not
    defined here, so they must be supplied by ``base_parser``. Returns ``0``.
    """
    parser = base_parser("Business Ch09: plotting/reporting style contract")
    parser.add_argument("--datadir", type=str, required=True, help="Path to NSO v1 dataset folder")
    ns = parser.parse_args(argv)

    # Fall back to the chapter's default seed when none was given.
    if ns.seed is None:
        seed = 123
    else:
        seed = int(ns.seed)

    data_path = Path(ns.datadir)
    out_path = Path(ns.outdir)
    analyze_ch09(datadir=data_path, outdir=out_path, seed=seed)
    print(f"Wrote Chapter 9 artifacts -> {out_path}")
    return 0
+ +The goal is not perfect forecasting; it is honest, explainable risk communication.""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed, base_parser +from scripts._reporting_style import ( + FigureManifestRow, + FigureSpec, + plot_ecdf, + plot_histogram_with_markers, + save_figure, + style_context, +) +from scripts.business_ch08_descriptive_statistics_financial_performance import analyze_ch08 + + +# --- Guardrails / defaults (explicit + testable) --- + +DEFAULT_HORIZON_MONTHS = 12 +DEFAULT_N_SIMS = 5000 +DEFAULT_CONFIDENCE = 0.95 + +# A/R late threshold used as a *proxy* for default risk. +DEFAULT_SEVERE_LATE_DAYS = 90 + +# Loss-given-default (LGD) assumption used in this toy model. +DEFAULT_LGD = 1.0 + +# Rounding for "no false precision" communication. +_MONEY_ROUND_TO = 100.0 + + +@dataclass(frozen=True) +class Ch10Outputs: + manifest_path: Path + memo_path: Path + summary_path: Path + figures_dir: Path + figure_paths: list[Path] + + +def _read_csv_required(datadir: Path, fname: str) -> pd.DataFrame: + path = datadir / fname + if not path.exists(): + raise FileNotFoundError(f"Missing required dataset file: {path}") + return pd.read_csv(path) + + +def _latest_bs_value(bs: pd.DataFrame, line_name: str) -> float: + """Return latest month amount for a given balance sheet line.""" + + if bs.empty: + return float("nan") + + if {"month", "line", "amount"}.issubset(set(bs.columns)): + df = bs.loc[bs["line"].astype(str) == str(line_name)].copy() + if df.empty: + return float("nan") + df["month"] = df["month"].astype(str) + df = df.sort_values("month", kind="mergesort") + return float(df.iloc[-1]["amount"]) + + # Fallback: try a common alternative schema. 
+ if "line" in bs.columns and "amount" in bs.columns: + df = bs.loc[bs["line"].astype(str) == str(line_name)].copy() + if df.empty: + return float("nan") + return float(df["amount"].iloc[-1]) + + return float("nan") + + +def _monthly_net_cash_changes(bank: pd.DataFrame) -> pd.DataFrame: + """Aggregate bank statement transactions into monthly net cash change.""" + + if bank.empty: + return pd.DataFrame(columns=["month", "net_cash_change"]) + + if not {"month", "amount"}.issubset(set(bank.columns)): + raise ValueError("bank_statement.csv must contain columns: month, amount") + + df = bank.copy() + df["month"] = df["month"].astype(str) + df["amount"] = df["amount"].astype(float) + + out = ( + df.groupby("month", observed=True)["amount"] + .sum() + .reset_index() + .rename(columns={"amount": "net_cash_change"}) + .sort_values("month", kind="mergesort") + .reset_index(drop=True) + ) + return out + + +def _bootstrap_cash_buffer( + net_changes: np.ndarray, + current_cash: float, + horizon_months: int, + n_sims: int, + rng: np.random.Generator, +) -> dict[str, Any]: + """Bootstrap resampling model for cash shortfall risk. + + We resample historical monthly net cash changes with replacement. 
+ + Returns + ------- + Dict with: + - buffer_needed: array of required buffer per simulation + - p_shortfall: prob(cash dips below 0 at least once) without extra buffer + - buffer_p95: 95th percentile buffer + """ + + if net_changes.size == 0 or not np.isfinite(float(current_cash)): + return { + "buffer_needed": np.array([], dtype=float), + "p_shortfall": float("nan"), + "buffer_p95": float("nan"), + "buffer_mean": float("nan"), + } + + draws = rng.choice(net_changes.astype(float), size=(int(n_sims), int(horizon_months)), replace=True) + balances = float(current_cash) + np.cumsum(draws, axis=1) + min_bal = np.min(balances, axis=1) + + buffer_needed = np.maximum(0.0, -min_bal) + p_shortfall = float(np.mean(min_bal < 0.0)) + + buffer_p95 = float(np.nanpercentile(buffer_needed, 95)) + buffer_mean = float(np.nanmean(buffer_needed)) + + return { + "buffer_needed": buffer_needed, + "p_shortfall": p_shortfall, + "buffer_p95": buffer_p95, + "buffer_mean": buffer_mean, + } + + +def _severe_late_rates_from_slices(slices: pd.DataFrame, severe_late_days: int) -> np.ndarray: + """Compute monthly severe-late share as a proxy for bad-debt risk. + + Rate per month = (sum(amount_applied where days_outstanding >= threshold)) / (sum(amount_applied)). + + Note: This is not a true default model; it's a pedagogical proxy. + """ + + if slices.empty: + return np.array([], dtype=float) + + required = {"month_paid", "days_outstanding", "amount_applied"} + if not required.issubset(set(slices.columns)): + return np.array([], dtype=float) + + df = slices.copy() + df["month_paid"] = df["month_paid"].astype(str) + df["days_outstanding"] = df["days_outstanding"].astype(float) + df["amount_applied"] = df["amount_applied"].astype(float) + + df["is_severe_late"] = df["days_outstanding"] >= float(severe_late_days) + + # Compute severe-late share per month without groupby.apply (avoids pandas FutureWarning). 
+ monthly = ( + df.groupby(["month_paid", "is_severe_late"], observed=True)["amount_applied"] + .sum() + .unstack(fill_value=0.0) + ) + + severe = monthly.get(True, pd.Series(0.0, index=monthly.index)) + total = monthly.sum(axis=1).replace(0.0, np.nan) + + grouped = severe / total + + rates = grouped.to_numpy(dtype=float) + rates = rates[np.isfinite(rates)] + return rates + + +def _fmt_money(x: float, round_to: float = _MONEY_ROUND_TO) -> str: + """Format money without false precision (round first).""" + + if not np.isfinite(float(x)): + return "n/a" + + xr = float(np.round(float(x) / float(round_to)) * float(round_to)) + sign = "-" if xr < 0 else "" + xr = abs(xr) + + if xr >= 1_000_000: + return f"{sign}${xr/1_000_000:.1f}M" + if xr >= 1_000: + return f"{sign}${xr/1_000:.1f}k" + return f"{sign}${xr:.0f}" + + +def _prob_to_frequency(p: float) -> str: + """Translate probability into an intuitive "1 out of every N" statement.""" + + if not np.isfinite(float(p)): + return "n/a" + p = float(p) + if p <= 0: + return "rare/none observed" + if p >= 1: + return "every month" + + n = int(round(1.0 / p)) + n = max(1, n) + return f"about 1 out of every {n} months" + + +def _make_risk_memo( + *, + latest_month: str, + current_cash: float, + current_ar: float, + cash_buffer_p95: float, + p_shortfall: float, + expected_loss: float, + worst_case_p90_loss: float, + severe_late_days: int, + lgd: float, + horizon_months: int, +) -> str: + """Return a short, deterministic markdown memo (<= 10 bullets).""" + + bullets: list[str] = [] + + bullets.append( + f"Cash risk (as of {latest_month}): current cash is {_fmt_money(current_cash)}. " + f"Using a bootstrap of historical monthly net cash changes, the probability of cash dipping below $0 at least once in the next {horizon_months} months is {p_shortfall:.1%} " + f"({_prob_to_frequency(p_shortfall)})." + ) + + bullets.append( + f"Recommended emergency fund (95% confidence): {_fmt_money(cash_buffer_p95)}. 
" + "Interpretation: adding this buffer to the current cash balance keeps simulated paths >= $0 in ~95% of scenarios under the model assumptions." + ) + + bullets.append( + f"Bad debt risk (A/R): current A/R is {_fmt_money(current_ar)}. " + f"Using severe-late payments (≥ {severe_late_days} days) as a proxy for default risk and LGD={lgd:.0%}, the expected loss is {_fmt_money(expected_loss)}." + ) + + bullets.append( + f"Bad debt worst case (p90): {_fmt_money(worst_case_p90_loss)}. " + "Interpretation: in 1 out of 10 months like the high-risk tail of recent history, losses could be this large (or larger)." + ) + + bullets.append( + "Assumptions (explicit): (1) future cash-change months are resampled from recent history (no structural change), " + "(2) months are treated as independent (no seasonality model), " + "(3) severe-late share is a proxy for default (not a true write-off model), " + "(4) numbers are rounded to avoid false precision." + ) + + bullets.append( + "Unknown unknowns to monitor: customer concentration risk, one-time cash shocks (taxes, capex, fraud), " + "credit policy changes, supply disruptions, and accounting system/process changes that break comparability." 
+ ) + + # Keep <= 10 bullets (contract guardrail) + bullets = bullets[:10] + + return "\n".join([f"- {b}" for b in bullets]) + "\n" + + +def analyze_ch10( + *, + datadir: Path, + outdir: Path | None, + seed: int = 123, + horizon_months: int = DEFAULT_HORIZON_MONTHS, + n_sims: int = DEFAULT_N_SIMS, + severe_late_days: int = DEFAULT_SEVERE_LATE_DAYS, + lgd: float = DEFAULT_LGD, +) -> Ch10Outputs: + """Run Chapter 10 analysis and write deterministic artifacts.""" + + apply_seed(seed) + + if outdir is None: + outdir = Path("outputs/track_d") + + outdir = Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + manifest_path = outdir / "ch10_figures_manifest.csv" + memo_path = outdir / "ch10_risk_memo.md" + summary_path = outdir / "ch10_risk_summary.json" + + # --- Inputs --- + bs = _read_csv_required(datadir, "statements_bs_monthly.csv") + bank = _read_csv_required(datadir, "bank_statement.csv") + + # Latest month for labels + latest_month = str(bs["month"].astype(str).max()) if "month" in bs.columns and not bs.empty else "(unknown)" + + current_cash = _latest_bs_value(bs, "Cash") + current_ar = _latest_bs_value(bs, "Accounts Receivable") + + # Monthly net cash change distribution + bank_m = _monthly_net_cash_changes(bank) + net_changes = bank_m["net_cash_change"].astype(float).to_numpy() if not bank_m.empty else np.array([], dtype=float) + + rng = np.random.default_rng(int(seed) + 10_000) + cash_sim = _bootstrap_cash_buffer( + net_changes=net_changes, + current_cash=float(current_cash), + horizon_months=int(horizon_months), + n_sims=int(n_sims), + rng=rng, + ) + + buffer_needed = cash_sim["buffer_needed"] + p_shortfall = float(cash_sim["p_shortfall"]) + buffer_p95 = float(cash_sim["buffer_p95"]) + buffer_mean = float(cash_sim["buffer_mean"]) + + # Bad debt proxy from Chapter 8 A/R payment lag slices + ch08 = analyze_ch08(datadir=datadir, outdir=None, seed=seed) + rates = 
_severe_late_rates_from_slices(ch08.ar_payment_slices, severe_late_days=int(severe_late_days)) + + expected_rate = float(np.nanmean(rates)) if rates.size else float("nan") + p90_rate = float(np.nanpercentile(rates, 90)) if rates.size else float("nan") + + expected_loss = float(current_ar) * float(lgd) * expected_rate if np.isfinite(current_ar) else float("nan") + worst_case_p90_loss = float(current_ar) * float(lgd) * p90_rate if np.isfinite(current_ar) else float("nan") + + # --- Figures (style contract compliant) --- + manifest: list[FigureManifestRow] = [] + figure_paths: list[Path] = [] + + # 1) Histogram of monthly net cash changes + if net_changes.size: + markers = { + "mean": float(np.mean(net_changes)), + "median": float(np.median(net_changes)), + "p10": float(np.percentile(net_changes, 10)), + "p90": float(np.percentile(net_changes, 90)), + } + with style_context(): + fig = plot_histogram_with_markers( + values=net_changes, + title="Monthly net cash change — histogram", + x_label="Net cash change (currency units)", + y_label="Count", + markers=markers, + ) + path = figures_dir / "ch10_cash_flow_hist.png" + save_figure(fig, path, FigureSpec(chart_type="histogram", title="Monthly net cash change — histogram")) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="histogram", + title="Monthly net cash change — histogram", + x_label="Net cash change (currency units)", + y_label="Count", + guardrail_note="Histogram uses real units; include quantile markers to reveal tail risk.", + data_source="bank_statement.csv (aggregated by month)", + ) + ) + figure_paths.append(path) + + # 2) ECDF of buffer needed + if buffer_needed.size: + markers = {"p95": float(buffer_p95), "mean": float(buffer_mean)} + with style_context(): + fig = plot_ecdf( + values=buffer_needed, + title=f"Emergency fund needed to keep cash ≥ 0 (horizon={horizon_months} months)", + x_label="Buffer needed (currency units)", + y_label="Cumulative proportion", + markers=markers, + ) 
+ path = figures_dir / "ch10_cash_buffer_ecdf.png" + save_figure(fig, path, FigureSpec(chart_type="ecdf", title="Emergency fund buffer ECDF")) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="ecdf", + title=f"Emergency fund needed (horizon={horizon_months} months)", + x_label="Buffer needed (currency units)", + y_label="Cumulative proportion", + guardrail_note="ECDF communicates uncertainty; highlight p95 to avoid false precision.", + data_source="bank_statement.csv (bootstrap resample of monthly net changes)", + ) + ) + figure_paths.append(path) + + # 3) ECDF of bad-debt loss proxy (loss = AR * rate * LGD) + if rates.size and np.isfinite(float(current_ar)): + losses = float(current_ar) * float(lgd) * rates + markers = {"expected": float(expected_loss), "p90": float(worst_case_p90_loss)} + with style_context(): + fig = plot_ecdf( + values=losses, + title=f"Bad debt loss proxy (severe-late ≥ {severe_late_days} days)", + x_label="Loss (currency units)", + y_label="Cumulative proportion", + markers=markers, + ) + path = figures_dir / "ch10_bad_debt_loss_ecdf.png" + save_figure(fig, path, FigureSpec(chart_type="ecdf", title="Bad debt loss proxy ECDF")) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="ecdf", + title=f"Bad debt loss proxy (severe-late ≥ {severe_late_days} days)", + x_label="Loss (currency units)", + y_label="Cumulative proportion", + guardrail_note="Proxy model: severe-late share is not equal to true default; present as risk range.", + data_source="ar_payment_slices.csv (via Ch08)", + ) + ) + figure_paths.append(path) + + # --- Memo + summary --- + memo = _make_risk_memo( + latest_month=latest_month, + current_cash=float(current_cash), + current_ar=float(current_ar), + cash_buffer_p95=float(buffer_p95), + p_shortfall=float(p_shortfall), + expected_loss=float(expected_loss), + worst_case_p90_loss=float(worst_case_p90_loss), + severe_late_days=int(severe_late_days), + lgd=float(lgd), + 
horizon_months=int(horizon_months), + ) + memo_path.write_text(memo, encoding="utf-8") + + pd.DataFrame([m.__dict__ for m in manifest]).to_csv(manifest_path, index=False) + + summary: dict[str, Any] = { + "chapter": "business_ch10_probability_risk", + "seed": int(seed), + "inputs": { + "datadir": str(datadir), + "files": ["statements_bs_monthly.csv", "bank_statement.csv", "ar_events.csv (via Ch08)"] , + }, + "assumptions": { + "horizon_months": int(horizon_months), + "n_sims": int(n_sims), + "cash_model": "bootstrap monthly net cash change (bank statement aggregated by month)", + "bad_debt_proxy": f"severe-late share of collections (>= {int(severe_late_days)} days)", + "lgd": float(lgd), + "rounding": float(_MONEY_ROUND_TO), + }, + "cash_buffer": { + "confidence": float(DEFAULT_CONFIDENCE), + "horizon_months": int(horizon_months), + "current_cash": float(current_cash), + "p_shortfall": float(p_shortfall), + "buffer_p95": float(buffer_p95), + "buffer_mean": float(buffer_mean), + "n_months_history": int(net_changes.size), + }, + "cash": { + "latest_month": latest_month, + "current_cash": float(current_cash), + "p_shortfall": float(p_shortfall), + "recommended_buffer_p95": float(buffer_p95), + "buffer_mean": float(buffer_mean), + "n_months_history": int(net_changes.size), + }, + + "bad_debt": { + "current_ar": float(current_ar), + "expected_rate": float(expected_rate), + "p90_rate": float(p90_rate), + "expected_loss": float(expected_loss), + # Preferred key name (matches tests / public schema) + "worst_case_loss_p90": float(worst_case_p90_loss), + # Back-compat key (safe to keep for now) + "worst_case_p90_loss": float(worst_case_p90_loss), + "n_months_rates": int(rates.size), + }, + "paths": { + "manifest": str(manifest_path), + "memo": str(memo_path), + "summary": str(summary_path), + "figures_dir": str(figures_dir), + }, + "figures": [p.name for p in figure_paths], + } + summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8") + + return Ch10Outputs( 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for Chapter 10.

    Extends the shared parser from ``base_parser`` with ``--datadir`` plus the
    simulation knobs, forwards the parsed values to ``analyze_ch10`` and prints
    where the artifacts were written. The ``seed`` and ``outdir`` attributes
    read below are not defined here, so they must come from ``base_parser``.
    Returns ``0``.
    """
    parser = base_parser("Business Ch10: probability and risk")
    parser.add_argument("--datadir", type=Path, required=True, help="Path to NSO v1 dataset folder")

    # (flag, converter, default) for the optional numeric knobs, in declaration order.
    numeric_options = (
        ("--horizon-months", int, DEFAULT_HORIZON_MONTHS),
        ("--n-sims", int, DEFAULT_N_SIMS),
        ("--severe-late-days", int, DEFAULT_SEVERE_LATE_DAYS),
        ("--lgd", float, DEFAULT_LGD),
    )
    for flag, converter, fallback in numeric_options:
        parser.add_argument(flag, type=converter, default=fallback)

    ns = parser.parse_args(argv)

    # Fall back to the chapter's default seed when none was given.
    seed = 123 if ns.seed is None else int(ns.seed)

    analyze_ch10(
        datadir=Path(ns.datadir),
        outdir=Path(ns.outdir),
        seed=seed,
        horizon_months=int(ns.horizon_months),
        n_sims=int(ns.n_sims),
        severe_late_days=int(ns.severe_late_days),
        lgd=float(ns.lgd),
    )

    print(f"Wrote Chapter 10 artifacts -> {ns.outdir}")
    return 0
@dataclass(frozen=True)
class Ch11Outputs:
    """Locations of the artifacts produced by one Chapter 11 run.

    Instances are immutable (``frozen=True``); ``analyze_ch11`` returns one
    after all files have been written.
    """

    # JSON file containing the stratified sampling plan.
    sampling_plan_path: Path
    # Markdown audit memo.
    memo_path: Path
    # JSON summary of the run (sampling plan, CI, worked example, paths).
    summary_path: Path
    # CSV manifest with one row per generated figure.
    manifest_path: Path
    # Directory the figure files are saved into.
    figures_dir: Path
    # Individual figure files, in the order they were created.
    figure_paths: list[Path]
def proportion_ci_wilson(k: int, n: int, confidence: float = DEFAULT_CONFIDENCE) -> tuple[float, float]:
    """Wilson score interval for a binomial proportion (k/n).

    Parameters
    ----------
    k:
        Number of observed events (e.g. errors found in the sample).
    n:
        Number of trials (sample size).
    confidence:
        Confidence level; must be one accepted by ``_z_for_confidence``.

    Returns
    -------
    tuple[float, float]
        ``(low, high)`` clipped to [0, 1]. If ``n <= 0`` there is no sample to
        estimate from and ``(nan, nan)`` is returned.

    Raises
    ------
    ValueError
        If ``k`` is negative or greater than ``n``, or if ``confidence`` is not
        supported by ``_z_for_confidence``.
    """
    n = int(n)
    k = int(k)
    if n <= 0:
        return (float("nan"), float("nan"))

    # An impossible count makes phat * (1 - phat) negative; the square root
    # then becomes NaN and the max/min clipping below would silently turn it
    # into the meaningless interval (0.0, 1.0). Fail loudly instead.
    if k < 0 or k > n:
        raise ValueError(f"k must satisfy 0 <= k <= n (got k={k}, n={n})")

    z = _z_for_confidence(confidence)
    z_sq = z**2
    phat = k / n
    denom = 1.0 + z_sq / n
    center = (phat + z_sq / (2.0 * n)) / denom
    half = (z * np.sqrt((phat * (1.0 - phat) + z_sq / (4.0 * n)) / n)) / denom

    # Clip to the valid probability range (guards against float round-off at
    # k == 0 and k == n).
    low = max(0.0, float(center - half))
    high = min(1.0, float(center + half))
    return (low, high)
+ + Stratification: + - high_value (>= high_value_threshold): 100% selection (materiality) + - low_value (< low_value_threshold): random sample at low_value_rate + - mid_value (else): random sample at mid_value_rate + """ + required = {id_col, amount_col} + if not required.issubset(set(invoices.columns)): + raise ValueError(f"invoices must include columns: {sorted(required)}") + + df = invoices[[id_col, amount_col]].copy() + df[id_col] = df[id_col].astype(str) + df[amount_col] = df[amount_col].astype(float) + + df = df[df[amount_col].notna()].copy() + df = df[df[amount_col] > 0].copy() + + def _stratum(a: float) -> str: + if a >= float(high_value_threshold): + return "high_value" + if a < float(low_value_threshold): + return "low_value" + return "mid_value" + + df["stratum"] = df[amount_col].map(_stratum) + + # Determine selection counts (deterministic given seed + sort) + df = df.sort_values(["stratum", id_col], kind="mergesort").reset_index(drop=True) + + selected_ids: set[str] = set() + + # High value: include all + high = df[df["stratum"] == "high_value"] + selected_ids.update(high[id_col].tolist()) + + # Mid and low: random sample without replacement + for stratum, rate in [("mid_value", mid_value_rate), ("low_value", low_value_rate)]: + g = df[df["stratum"] == stratum] + n = int(g.shape[0]) + k = int(np.floor(n * float(rate))) + k = max(0, min(n, k)) + if k > 0: + choices = rng.choice(g[id_col].to_numpy(dtype=str), size=k, replace=False) + selected_ids.update([str(x) for x in choices]) + + df["selected"] = df[id_col].isin(selected_ids) + + plan = { + "population_n": int(df.shape[0]), + "sample_n": int(df["selected"].sum()), + "params": { + "high_value_threshold": float(high_value_threshold), + "low_value_threshold": float(low_value_threshold), + "low_value_rate": float(low_value_rate), + "mid_value_rate": float(mid_value_rate), + }, + "strata": { + "high_value": { + "population_n": int((df["stratum"] == "high_value").sum()), + "sample_n": int(((df["stratum"] 
def _make_audit_memo(
    *,
    population_n: int,
    sample_n: int,
    confidence: float,
    tolerance: float,
    k_errors: int,
    ci_low: float,
    ci_high: float,
    high_value_threshold: float,
    low_value_threshold: float,
    low_value_rate: float,
    mid_value_rate: float,
    worked_example: dict[str, Any],
) -> str:
    """Build the short audit memo (markdown) for the sampling results.

    The memo is six paragraphs separated by blank lines and ends with a single
    newline: sampling plan, materiality rule, observed error rate with its
    confidence interval, the PASS / FAIL decision against ``tolerance``, the
    worked example, and the assumptions.

    ``worked_example`` must provide the keys ``k_errors``, ``n``, ``phat``,
    ``confidence``, ``ci_low``, ``ci_high``, ``tolerance`` and ``decision``.
    """
    # Point estimate; undefined (NaN) when nothing was sampled.
    if sample_n > 0:
        observed_rate = k_errors / sample_n
    else:
        observed_rate = float("nan")

    # PASS only when the upper CI bound is a real number within tolerance.
    verdict = "FAIL / INVESTIGATE"
    if np.isfinite(ci_high) and ci_high <= float(tolerance):
        verdict = "PASS"

    example = worked_example

    plan_par = (
        "We designed a risk-based sampling plan for invoice review. "
        f"Population size: {population_n} invoices. Sample size: {sample_n} invoices."
    )
    materiality_par = (
        f"Materiality rule: reviewed 100% of items ≥ ${high_value_threshold:,.0f}. "
        f"For the long tail, we randomly sampled {mid_value_rate:.0%} of $[{low_value_threshold:,.0f}, {high_value_threshold:,.0f}) "
        f"and {low_value_rate:.0%} of items < ${low_value_threshold:,.0f}. "
        "This stratification concentrates effort where risk and impact are highest."
    )
    estimate_par = (
        f"Observed error rate in the sample: {observed_rate:.1%} ({k_errors} errors / {sample_n} reviewed). "
        f"Using a {confidence:.0%} Wilson confidence interval, the true population error rate is plausibly between {ci_low:.1%} and {ci_high:.1%}."
    )
    decision_par = (
        f"Control decision vs tolerance ({tolerance:.1%}): {verdict}. "
        "Controls lens: if the CI upper bound exceeds tolerance, we cannot claim the process meets the threshold at the stated confidence."
    )
    # Worked example (problem statement)
    example_par = (
        f"Worked example (inventory counts): {example['k_errors']} errors in {example['n']} counts -> {example['phat']:.1%} observed. "
        f"{example['confidence']:.0%} CI: [{example['ci_low']:.1%}, {example['ci_high']:.1%}] vs tolerance {example['tolerance']:.1%} -> {example['decision']}."
    )
    assumptions_par = (
        "Assumptions: random sampling within strata; items are treated as independent; the interval describes uncertainty from sampling, not all forms of risk (fraud, collusion, data loss). "
        "If the population changes materially (new vendors, new system), re-baseline the plan."
    )

    paragraphs = (
        plan_par,
        materiality_par,
        estimate_par,
        decision_par,
        example_par,
        assumptions_par,
    )
    return "\n\n".join(paragraphs) + "\n"
int(sample.shape[0]) + + ci_low, ci_high = proportion_ci_wilson(k=k_errors, n=n, confidence=float(confidence)) + + # Worked example from the vision document (n=50, k=2) + ex_n = 50 + ex_k = 2 + ex_low, ex_high = proportion_ci_wilson(k=ex_k, n=ex_n, confidence=float(confidence)) + ex_phat = ex_k / ex_n + ex_decision = "PASS" if (np.isfinite(ex_high) and ex_high <= float(tolerance)) else "FAIL / INVESTIGATE" + worked_example = { + "n": int(ex_n), + "k_errors": int(ex_k), + "phat": float(ex_phat), + "confidence": float(confidence), + "ci_low": float(ex_low), + "ci_high": float(ex_high), + "tolerance": float(tolerance), + "decision": ex_decision, + } + + # --- Figures + manifest (style contract) --- + manifest: list[FigureManifestRow] = [] + figure_paths: list[Path] = [] + + # Figure 1: population vs sample by stratum + strata = ["high_value", "mid_value", "low_value"] + pop_counts = [int((sampled_df["stratum"] == s).sum()) for s in strata] + + pop_df_plot = pd.DataFrame({"stratum": strata, "count": pop_counts}) + + with style_context(): + fig = plot_bar( + df=pop_df_plot, + x="stratum", + y="count", + title="Invoice population by stratum (for sampling)", + x_label="Stratum", + y_label="Count", + ) + path = figures_dir / "ch11_strata_sampling_bar.png" + save_figure(fig, path, FigureSpec(chart_type="bar", title="Invoice population by stratum")) + manifest.append( + FigureManifestRow( + filename=path.name, + chart_type="bar", + title="Invoice population by stratum (for sampling)", + x_label="Stratum", + y_label="Count", + guardrail_note="Bar chart starts at zero; strata are explicit to avoid cherry-picking.", + data_source="ap_events.csv (invoice rows)", + ) + ) + figure_paths.append(path) + + # Figure 2: error rate CI (simple bar + whisker) + import matplotlib.pyplot as plt + + phat = (k_errors / n) if n > 0 else float("nan") + with style_context(): + fig, ax = plt.subplots() + ax.bar([0], [phat]) + if np.isfinite(ci_low) and np.isfinite(ci_high): + ax.errorbar([0], 
[phat], yerr=[[phat - ci_low], [ci_high - phat]], fmt="none", capsize=6) + ax.set_xticks([0]) + ax.set_xticklabels(["observed"]) + ax.set_ylim(0, max(0.05, float(ci_high) * 1.2 if np.isfinite(ci_high) else 0.05)) + ax.set_title(f"Sample error rate with {confidence:.0%} CI (Wilson)") + ax.set_ylabel("Error rate") + ax.set_xlabel(" ") + ax.grid(True, axis="y", alpha=0.2) + fig.tight_layout() + path2 = figures_dir / "ch11_error_rate_ci.png" + save_figure(fig, path2, FigureSpec(chart_type="scatter", title="Sample error rate with CI")) + manifest.append( + FigureManifestRow( + filename=path2.name, + chart_type="scatter", + title=f"Sample error rate with {confidence:.0%} CI (Wilson)", + x_label="", + y_label="Error rate", + guardrail_note="CI communicates sampling uncertainty; do not interpret a point estimate as certainty.", + data_source="ap_events.csv (sample) + simulated error flags (teaching)", + ) + ) + figure_paths.append(path2) + + pd.DataFrame([m.__dict__ for m in manifest]).to_csv(manifest_path, index=False) + + memo = _make_audit_memo( + population_n=int(plan["population_n"]), + sample_n=int(plan["sample_n"]), + confidence=float(confidence), + tolerance=float(tolerance), + k_errors=int(k_errors), + ci_low=float(ci_low), + ci_high=float(ci_high), + high_value_threshold=float(high_value_threshold), + low_value_threshold=float(low_value_threshold), + low_value_rate=float(low_value_rate), + mid_value_rate=float(mid_value_rate), + worked_example=worked_example, + ) + memo_path.write_text(memo, encoding="utf-8") + + summary: dict[str, Any] = { + "chapter": "business_ch11_sampling_estimation_audit_controls", + "seed": int(seed), + "inputs": {"datadir": str(datadir), "files": ["ap_events.csv"]}, + "sampling": plan, + "ci": { + "confidence": float(confidence), + "method": "wilson", + "sample_n": int(n), + "k_errors": int(k_errors), + "phat": float(phat) if np.isfinite(phat) else float("nan"), + "ci_low": float(ci_low), + "ci_high": float(ci_high), + "tolerance": 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the Chapter 11 analysis.

    Adds the chapter-specific options to the parser returned by
    ``base_parser``, forwards the parsed values to ``analyze_ch11``, and
    prints the output location.

    ``--outdir`` and ``--seed`` are read from the parsed namespace but are not
    registered here; presumably ``base_parser`` supplies them (confirm in
    ``scripts._cli``). A missing seed falls back to 123.

    Returns 0 on success.
    """
    parser = base_parser("Business Ch11: sampling and estimation (audit/controls)")
    parser.add_argument("--datadir", type=Path, required=True, help="Path to NSO v1 dataset folder")
    parser.add_argument(
        "--confidence", type=float, default=DEFAULT_CONFIDENCE, help="CI confidence (0.90, 0.95, 0.99)"
    )
    parser.add_argument(
        "--tolerance", type=float, default=DEFAULT_TOLERANCE, help="Management tolerance for error rate"
    )
    # Stratification knobs: all floats, no help text. Registration order is
    # kept so the generated --help output is unchanged.
    for flag, fallback in (
        ("--high-value-threshold", DEFAULT_HIGH_VALUE_THRESHOLD),
        ("--low-value-threshold", DEFAULT_LOW_VALUE_THRESHOLD),
        ("--low-value-rate", DEFAULT_LOW_VALUE_RATE),
        ("--mid-value-rate", DEFAULT_MID_VALUE_RATE),
    ):
        parser.add_argument(flag, type=float, default=fallback)

    opts = parser.parse_args(argv)

    # Every analysis parameter besides datadir/outdir/seed is passed as float.
    float_fields = (
        "confidence",
        "tolerance",
        "high_value_threshold",
        "low_value_threshold",
        "low_value_rate",
        "mid_value_rate",
    )
    analyze_ch11(
        datadir=Path(opts.datadir),
        outdir=Path(opts.outdir),
        seed=123 if opts.seed is None else int(opts.seed),
        **{name: float(getattr(opts, name)) for name in float_fields},
    )
    print(f"Wrote Chapter 11 artifacts -> {opts.outdir}")
    return 0
__name__ == "__main__": + raise SystemExit(main()) diff --git a/workbooks/track_d_template/scripts/business_ch12_hypothesis_testing_decisions.py b/workbooks/track_d_template/scripts/business_ch12_hypothesis_testing_decisions.py new file mode 100644 index 0000000..5849f95 --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch12_hypothesis_testing_decisions.py @@ -0,0 +1,524 @@ +# SPDX-License-Identifier: MIT +"""Track D — Business Statistics for Accountants. + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch12_experiment_design.json +* ch12_hypothesis_testing_summary.json +* ch12_experiment_memo.md +* ch12_figures_manifest.csv + +Chapter 12: Hypothesis Testing for Decisions (Practical, Not Math-Heavy) + +This chapter reframes hypothesis testing as *business experimentation*. + +NSO v1 doesn't contain explicit "region" or "promotion" labels, so we generate two +*deterministic teaching cases* from NSO v1 input data: + +1) Promotion test (two groups): + - Build two "regions" by deterministically assigning customers to treatment/control. + - Choose a promo month with the most AR invoices. + - Apply a small, configurable uplift to treated invoices in the promo month. + - Use a permutation test on the mean to estimate a p-value. + - Report effect size + practical significance. + +2) Cycle time test (before/after): + - Compute AP "days-to-pay" per invoice from invoice and payment events. + - Choose a changeover date at the median invoice date. + - Apply a small, configurable reduction to the "after" group to represent a process + improvement. + - Use a permutation test on the mean. 
def _assign_customers_to_groups(
    customers: list[str], *, rng: np.random.Generator, treated_fraction: float = 0.5
) -> dict[str, str]:
    """Deterministically assign customers to "treatment" or "control".

    Customers are de-duplicated (after ``str`` conversion) and sorted, so the
    result depends only on the set of customers and the state of ``rng``.

    Parameters
    ----------
    customers:
        Customer identifiers; duplicates are allowed and collapsed.
    rng:
        Random generator used for a single in-place shuffle of the indices.
    treated_fraction:
        Share of distinct customers to put in the treatment group. The
        resulting count is rounded and clamped to ``[0, n]``, so values
        outside ``[0, 1]`` mean "nobody" / "everybody".

    Returns
    -------
    dict[str, str]
        Mapping ``customer -> "treatment" | "control"`` in sorted customer
        order; empty when there are no customers.
    """
    cust = sorted({str(c) for c in customers})
    n = len(cust)
    if n == 0:
        return {}

    n_treat = int(round(n * float(treated_fraction)))
    # Clamp: a negative count would make idx[:n_treat] slice from the end and
    # treat almost everyone instead of no one.
    n_treat = max(0, min(n, n_treat))

    # Deterministic: shuffle with RNG derived from seed.
    idx = np.arange(n)
    rng.shuffle(idx)
    treat_idx = set(idx[:n_treat].tolist())
    return {name: ("treatment" if i in treat_idx else "control") for i, name in enumerate(cust)}
def _ci_from_samples(samples: np.ndarray, confidence: float) -> tuple[float, float]:
    """Return the equal-tailed percentile interval of ``samples``.

    The bounds are the ``alpha/2`` and ``1 - alpha/2`` quantiles, where
    ``alpha = 1 - confidence``. An empty array yields ``(nan, nan)``.
    """
    if not samples.size:
        missing = float("nan")
        return missing, missing

    # Probability mass left in each tail.
    tail = (1.0 - float(confidence)) / 2.0
    lower, upper = (float(np.quantile(samples, q)) for q in (tail, 1.0 - tail))
    return lower, upper
+ p = (1.0 + float(np.sum(np.abs(diffs) >= abs(observed)))) / (float(n_perm) + 1.0) + return float(p), diffs + + +def analyze_ch12( + *, + datadir: Path, + outdir: Path | None, + seed: int = 123, + confidence: float = DEFAULT_CONFIDENCE, + n_perm: int = DEFAULT_N_PERM, + promo_uplift_pct: float = DEFAULT_PROMO_UPLIFT_PCT, + cycle_time_reduction_days: float = DEFAULT_CYCLE_TIME_REDUCTION_DAYS, +) -> Ch12Outputs: + """Run Chapter 12 analysis and write deterministic artifacts.""" + + apply_seed(seed) + + if outdir is None: + outdir = Path("outputs/track_d") + outdir = Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + design_path = outdir / "ch12_experiment_design.json" + summary_path = outdir / "ch12_hypothesis_testing_summary.json" + memo_path = outdir / "ch12_experiment_memo.md" + manifest_path = outdir / "ch12_figures_manifest.csv" + + rng = np.random.default_rng(int(seed) + 12_000) + + # -------------------- Case 1: Promotion test (AR invoices) -------------------- + ar = _read_csv_required(datadir, "ar_events.csv") + required_cols = {"event_type", "customer", "invoice_id", "amount", "date"} + if ar.empty or not required_cols.issubset(set(ar.columns)): + raise ValueError(f"ar_events.csv must contain columns: {sorted(required_cols)}") + + inv = ar.loc[ar["event_type"].astype(str) == "invoice", ["date", "customer", "invoice_id", "amount"]].copy() + inv["month"] = _as_month(inv["date"]) + + # Choose the month with the most invoices (ensures sample size). 
+ if inv.empty: + raise ValueError("No AR invoice events found to construct promotion case.") + promo_month = str(inv["month"].value_counts().idxmax()) + + cust_to_group = _assign_customers_to_groups(inv["customer"].astype(str).tolist(), rng=rng) + inv["group"] = inv["customer"].astype(str).map(cust_to_group).fillna("control") + + promo = inv.loc[inv["month"] == promo_month].copy() + promo["amount"] = pd.to_numeric(promo["amount"], errors="coerce") + promo = promo.dropna(subset=["amount"]).copy() + + # Teaching: apply uplift to treated amounts during the promo month. + promo["amount_adjusted"] = promo["amount"] + promo.loc[promo["group"] == "treatment", "amount_adjusted"] = promo.loc[ + promo["group"] == "treatment", "amount" + ] * (1.0 + float(promo_uplift_pct)) + + x = promo.loc[promo["group"] == "treatment", "amount_adjusted"].to_numpy(dtype=float) + y = promo.loc[promo["group"] == "control", "amount_adjusted"].to_numpy(dtype=float) + + promo_p_value, promo_perm_diffs = _permutation_test_mean_diff(x, y, rng=rng, n_perm=int(n_perm)) + promo_obs_diff = _mean_diff(x, y) + promo_cohen_d = _cohens_d(x, y) + + promo_boot_diffs = _bootstrap_mean_diff(x, y, rng=rng, n_boot=2_000) + promo_ci_low, promo_ci_high = _ci_from_samples(promo_boot_diffs, float(confidence)) + + promo_control_mean = float(np.mean(y)) if y.size else float("nan") + promo_rel_lift = float(promo_obs_diff / promo_control_mean) if np.isfinite(promo_control_mean) and promo_control_mean != 0 else float("nan") + + # -------------------- Case 2: Cycle time test (AP days-to-pay) -------------------- + ap = _read_csv_required(datadir, "ap_events.csv") + required_cols = {"event_type", "invoice_id", "amount", "date"} + if ap.empty or not required_cols.issubset(set(ap.columns)): + raise ValueError(f"ap_events.csv must contain columns: {sorted(required_cols)}") + + ap = ap[["event_type", "invoice_id", "amount", "date"]].copy() + ap["date"] = pd.to_datetime(ap["date"], errors="coerce") + + ap["month"] = 
ap["date"].dt.to_period("M").astype(str) + + inv_rows = ap.loc[ + ap["event_type"].astype(str) == "invoice", + ["invoice_id", "month", "date", "amount"], + ].rename(columns={"date": "invoice_date", "amount": "invoice_amount"}) + + # In NSO v1 the payment events are modeled at the month level (not per-invoice), + # so we map each invoice to the *latest* payment date in its invoice month. + pay_rows = ap.loc[ + ap["event_type"].astype(str) == "payment", + ["month", "date"], + ].rename(columns={"date": "payment_date"}) + + pay_by_month = pay_rows.groupby("month", as_index=False)["payment_date"].max() + + lags = inv_rows.merge(pay_by_month, on="month", how="left") + lags = lags.dropna(subset=["invoice_date", "payment_date"]).copy() + lags["days_to_pay"] = (lags["payment_date"] - lags["invoice_date"]).dt.days.astype(float) + lags["days_to_pay"] = np.maximum(0.0, lags["days_to_pay"].to_numpy(dtype=float)) + + if lags.empty: + raise ValueError("No AP invoices with a month-level payment date found to compute cycle time.") + + changeover_date = lags["invoice_date"].median() + lags["period"] = np.where(lags["invoice_date"] < changeover_date, "before", "after") + + # Teaching: apply a small reduction to the after-period cycle time. + lags["days_to_pay_adjusted"] = lags["days_to_pay"] + lags.loc[lags["period"] == "after", "days_to_pay_adjusted"] = np.maximum( + 0.0, + lags.loc[lags["period"] == "after", "days_to_pay"].to_numpy(dtype=float) - float(cycle_time_reduction_days), + ) + + xb = lags.loc[lags["period"] == "before", "days_to_pay_adjusted"].to_numpy(dtype=float) + ya = lags.loc[lags["period"] == "after", "days_to_pay_adjusted"].to_numpy(dtype=float) + + # We report (before - after) so a positive diff means improvement. 
+ cycle_p_value, cycle_perm_diffs = _permutation_test_mean_diff(xb, ya, rng=rng, n_perm=int(n_perm)) + cycle_obs_diff = _mean_diff(xb, ya) + cycle_cohen_d = _cohens_d(xb, ya) + + cycle_boot_diffs = _bootstrap_mean_diff(xb, ya, rng=rng, n_boot=2_000) + cycle_ci_low, cycle_ci_high = _ci_from_samples(cycle_boot_diffs, float(confidence)) + + # -------------------- Design (pre-commitment) -------------------- + design = { + "chapter": "business_ch12_hypothesis_testing_decisions", + "seed": int(seed), + "confidence": float(confidence), + "test": "two-sided permutation test on mean difference", + "n_perm": int(n_perm), + "precommitment": { + "alpha": float(1.0 - float(confidence)), + "stopping_rule": "Run exactly n_perm permutations; do not peek and stop early.", + "primary_metrics": { + "promotion": "Average invoice amount (AR) in promo month", + "cycle_time": "Days-to-pay (AP) per invoice", + }, + "decision_rule": { + "promotion": "Consider rollout if p < alpha AND effect size is practically meaningful.", + "cycle_time": "Consider full deployment if p < alpha AND average days saved justifies costs.", + }, + }, + "teaching_knobs": { + "promo_month": promo_month, + "promo_uplift_pct": float(promo_uplift_pct), + "cycle_time_reduction_days": float(cycle_time_reduction_days), + "note": "Knobs inject a small effect so examples are non-trivial; set to 0 for a pure 'measurement-only' run.", + }, + } + design_path.write_text(json.dumps(design, indent=2), encoding="utf-8") + + # -------------------- Figures + manifest -------------------- + manifest: list[FigureManifestRow] = [] + figure_paths: list[Path] = [] + + # Figure 1: Bar chart of mean cycle time before vs after + cycle_means = ( + lags.groupby("period", observed=True)["days_to_pay_adjusted"].mean().reindex(["before", "after"]).reset_index() + ) + cycle_means = cycle_means.rename(columns={"period": "group", "days_to_pay_adjusted": "mean_days"}) + + with style_context(): + fig1 = plot_bar( + df=cycle_means, + x="group", + 
y="mean_days", + title="AP cycle time (mean days-to-pay): before vs after change", + x_label="Period", + y_label="Mean days-to-pay", + ) + path1 = figures_dir / "ch12_cycle_time_means_bar.png" + save_figure(fig1, path1, FigureSpec(chart_type="bar", title="AP cycle time mean (before vs after)")) + figure_paths.append(path1) + manifest.append( + FigureManifestRow( + filename=path1.name, + chart_type="bar", + title="AP cycle time (mean days-to-pay): before vs after change", + x_label="Period", + y_label="Mean days-to-pay", + guardrail_note="Bar chart starts at zero; report effect size and CI in the memo.", + data_source="Computed from ap_events.csv invoice/payment pairs (with a small teaching adjustment).", + ) + ) + + # Figure 2: Histogram of bootstrap mean differences for promotion test + with style_context(): + fig2 = plot_histogram_with_markers( + values=pd.Series(promo_boot_diffs), + title=f"Promotion test: bootstrap distribution of mean AOV difference (promo month {promo_month})", + x_label="Mean difference (treatment - control)", + y_label="Frequency", + markers={"No effect": 0.0, "Observed": promo_obs_diff}, + ) + path2 = figures_dir / "ch12_promo_bootstrap_hist.png" + save_figure(fig2, path2, FigureSpec(chart_type="histogram", title="Promotion effect (bootstrap mean diff)")) + figure_paths.append(path2) + manifest.append( + FigureManifestRow( + filename=path2.name, + chart_type="histogram", + title=f"Promotion test: bootstrap distribution of mean AOV difference (promo month {promo_month})", + x_label="Mean difference (treatment - control)", + y_label="Frequency", + guardrail_note="Do not over-interpret the tail; pair p-value with effect size and a practical threshold.", + data_source="Derived from ar_events.csv invoices; treatment group gets a small uplift for teaching.", + ) + ) + + manifest_df = pd.DataFrame([m.__dict__ for m in manifest]) + manifest_df.to_csv(manifest_path, index=False) + + # -------------------- Summary JSON -------------------- + 
summary: dict[str, Any] = { + "chapter": "business_ch12_hypothesis_testing_decisions", + "seed": int(seed), + "confidence": float(confidence), + "promotion_test": { + "promo_month": promo_month, + "uplift_pct_assumed": float(promo_uplift_pct), # <-- ADD THIS LINE + "n_treatment": int(x.size), + "n_control": int(y.size), + "mean_treatment": float(np.mean(x)) if x.size else float("nan"), + "mean_control": float(np.mean(y)) if y.size else float("nan"), + "mean_diff": float(promo_obs_diff), + "relative_lift": float(promo_rel_lift), + "ci_low": float(promo_ci_low), + "ci_high": float(promo_ci_high), + "p_value": float(promo_p_value), + "cohens_d": float(promo_cohen_d), + "n_perm": int(n_perm), + "note": "Permutation p-value on mean difference (two-sided).", + }, + "cycle_time_test": { + "changeover_date": str(pd.to_datetime(changeover_date).date()), + "reduction_days_assumed": float(cycle_time_reduction_days), # <-- ADD THIS LINE + "n_before": int(xb.size), + "n_after": int(ya.size), + "mean_before": float(np.mean(xb)) if xb.size else float("nan"), + "mean_after": float(np.mean(ya)) if ya.size else float("nan"), + "mean_days_saved": float(cycle_obs_diff), + "ci_low": float(cycle_ci_low), + "ci_high": float(cycle_ci_high), + "p_value": float(cycle_p_value), + "cohens_d": float(cycle_cohen_d), + "n_perm": int(n_perm), + "note": "Permutation p-value on (before - after) mean days-to-pay (two-sided).", + }, + + "p_hacking_guardrails": { + "precommitment_file": design_path.name, + "rules": [ + "Fix your metric and alpha before you look at results.", + "Avoid stopping early because the p-value looks good.", + "Report effect size + CI; don't treat p < alpha as the only success criterion.", + "If you test many metrics, adjust or declare one primary metric.", + ], + }, + } + summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8") + + # -------------------- Audit-style memo (business language) -------------------- + alpha = 1.0 - float(confidence) + + def 
_fmt_pct(v: float) -> str: + if not np.isfinite(v): + return "NA" + return f"{100.0 * v:.1f}%" + + def _fmt_days(v: float) -> str: + if not np.isfinite(v): + return "NA" + return f"{v:.2f} days" + + def _fmt_money(v: float) -> str: + if not np.isfinite(v): + return "NA" + return f"${v:,.2f}" + + memo_lines = [ + "# Chapter 12 — Experiment Results Memo (Hypothesis Testing for Decisions)", + "", + "## Pre-commitment (anti p-hacking)", + f"- Confidence level: {confidence:.2f} (alpha = {alpha:.2f})", + f"- Test: two-sided permutation test on mean difference (n_perm = {int(n_perm)})", + "- Decision rule: require both (a) statistical evidence (p < alpha) and (b) practical significance.", + "", + "## Case 1 — Promotion Test (A/B style)", + f"- Promo month selected (most invoices): {promo_month}", + f"- Treatment vs control sample sizes: n_treat={int(x.size)}, n_control={int(y.size)}", + f"- Mean invoice amount: treat={_fmt_money(float(np.mean(x)) if x.size else float('nan'))}, control={_fmt_money(float(np.mean(y)) if y.size else float('nan'))}", + f"- Effect (mean diff): {_fmt_money(promo_obs_diff)} (lift: {_fmt_pct(promo_rel_lift)})", + f"- {confidence:.0%} bootstrap CI for mean diff: [{_fmt_money(promo_ci_low)}, {_fmt_money(promo_ci_high)}]", + f"- p-value: {promo_p_value:.4f} ; Cohen's d: {promo_cohen_d:.2f}", + "- Business read: if lift is small (or CI crosses 0), the promo may not be worth the operational cost.", + "", + "## Case 2 — AP Cycle Time (before/after change)", + f"- Changeover date (median invoice date): {str(pd.to_datetime(changeover_date).date())}", + f"- Sample sizes: n_before={int(xb.size)}, n_after={int(ya.size)}", + f"- Mean days-to-pay: before={_fmt_days(float(np.mean(xb)) if xb.size else float('nan'))}, after={_fmt_days(float(np.mean(ya)) if ya.size else float('nan'))}", + f"- Estimated days saved (before - after): {_fmt_days(cycle_obs_diff)}", + f"- {confidence:.0%} bootstrap CI for days saved: [{_fmt_days(cycle_ci_low)}, 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the Chapter 12 hypothesis-testing script.

    Parameters
    ----------
    argv:
        Argument vector to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        Always ``0`` (process exit status) once the artifacts are written.
    """
    parser = base_parser("Business Ch12: hypothesis testing for decisions (A/B + before/after)")
    parser.add_argument("--datadir", type=Path, required=True, help="Path to NSO v1 dataset folder")

    # Chapter-specific numeric knobs, registered in the same order as before so
    # the generated --help text is unchanged.
    tunables = (
        ("--confidence", float, DEFAULT_CONFIDENCE),
        ("--n-perm", int, DEFAULT_N_PERM),
        ("--promo-uplift-pct", float, DEFAULT_PROMO_UPLIFT_PCT),
        ("--cycle-time-reduction-days", float, DEFAULT_CYCLE_TIME_REDUCTION_DAYS),
    )
    for flag, caster, fallback in tunables:
        parser.add_argument(flag, type=caster, default=fallback)

    ns = parser.parse_args(argv)

    # ``outdir`` and ``seed`` are read from the namespace but not registered
    # here -- presumably base_parser supplies them (confirm in scripts/_cli).
    # A missing seed falls back to 123.
    seed = 123 if ns.seed is None else int(ns.seed)

    analyze_ch12(
        datadir=Path(ns.datadir),
        outdir=Path(ns.outdir),
        seed=seed,
        confidence=float(ns.confidence),
        n_perm=int(ns.n_perm),
        promo_uplift_pct=float(ns.promo_uplift_pct),
        cycle_time_reduction_days=float(ns.cycle_time_reduction_days),
    )

    print(f"Wrote Chapter 12 artifacts -> {ns.outdir}")
    return 0
b/workbooks/track_d_template/scripts/business_ch13_correlation_causation_controlled_comparisons.py new file mode 100644 index 0000000..e0eecd8 --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch13_correlation_causation_controlled_comparisons.py @@ -0,0 +1,332 @@ +""" +Track D — Chapter 13 +Correlation, Causation, and Controlled Comparisons (NSO running case) + +Goal: +- Show how "two lines move together" can be misleading. +- Demonstrate a controlled comparison via partial correlation: + corr(X,Y) vs corr(resid(X|Z), resid(Y|Z)) + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch13_controlled_comparisons_design.json +* ch13_correlation_summary.json +* ch13_correlation_memo.md +* ch13_figures_manifest.csv +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict + +import numpy as np +import pandas as pd + +from scripts._cli import base_parser +from scripts._reporting_style import FigureManifestRow, FigureSpec, save_figure, style_context + + +CHAPTER = "ch13" + + +@dataclass(frozen=True) +class Outputs: + figures_dir: Path + design_path: Path + summary_path: Path + memo_path: Path + manifest_path: Path + + +def _month_key(dt: pd.Series) -> pd.Series: + return pd.to_datetime(dt).dt.to_period("M").astype(str) + + +def _signed_amount(df: pd.DataFrame) -> pd.Series: + """GL signed amount convention: debit - credit (credits become negative).""" + return df["debit"].astype(float) - df["credit"].astype(float) + + +def _series_monthly(gl: pd.DataFrame, *, account_name: str) -> pd.Series: + """Return a monthly signed series for a given GL account_name.""" + df = gl.loc[gl["account_name"] == account_name, ["date", "debit", "credit"]].copy() + if df.empty: + raise ValueError(f"account_name not found in gl_journal: {account_name!r}") + df["month"] = _month_key(df["date"]) + df["amount"] = _signed_amount(df) + return df.groupby("month", 
sort=True)["amount"].sum() + + +def _revenue_monthly(gl: pd.DataFrame) -> pd.Series: + """Revenue is stored as credits (negative signed amounts); convert to positive sales.""" + df = gl.loc[gl["account_type"] == "Revenue", ["date", "debit", "credit"]].copy() + df["month"] = _month_key(df["date"]) + df["amount"] = _signed_amount(df) + return -df.groupby("month", sort=True)["amount"].sum() + + +def _residualize(x: np.ndarray, z: np.ndarray) -> np.ndarray: + """Return residuals of x after linear fit on z.""" + if len(x) != len(z): + raise ValueError("x and z must have same length") + a, b = np.polyfit(z, x, deg=1) + return x - (a * z + b) + + +def _corr(a: np.ndarray, b: np.ndarray) -> float: + if len(a) < 3: + return float("nan") + return float(np.corrcoef(a, b)[0, 1]) + + +def _partial_corr(x: np.ndarray, y: np.ndarray, z: np.ndarray) -> float: + return _corr(_residualize(x, z), _residualize(y, z)) + + +def analyze_ch13(*, datadir: Path, outdir: Path, seed: int) -> Outputs: + rng = np.random.default_rng(seed) + + gl_path = datadir / "gl_journal.csv" + if not gl_path.exists(): + raise FileNotFoundError(f"Missing required input: {gl_path}") + + gl = pd.read_csv(gl_path) + gl["date"] = pd.to_datetime(gl["date"]) + + # ---- Design (pre-commitment) ---- + x_name = "Revenue" + y_name = "Payroll Tax Expense" + z_name = "Payroll Expense" + + design: Dict[str, Any] = { + "chapter": CHAPTER, + "question": "Do payroll taxes move with revenue, and does that imply causation?", + "x": {"name": x_name, "definition": "Monthly sales revenue from GL revenue accounts."}, + "y": {"name": y_name, "definition": "Monthly payroll tax expense from GL."}, + "control_z": { + "name": z_name, + "definition": "Monthly payroll expense (proxy for headcount/activity).", + "why_control": "Payroll taxes are mechanically tied to payroll; revenue may co-move because activity drives both.", + }, + "claim_rules": [ + "Report correlation and partial correlation only (no causal claim).", + "Interpret 
effect sizes in business terms (direction + materiality).", + "If results are sensitive to one or two months, say so explicitly.", + ], + "seed": int(seed), + } + + # ---- Compute series ---- + revenue = _revenue_monthly(gl) + payroll_tax = _series_monthly(gl, account_name=y_name) + payroll = _series_monthly(gl, account_name=z_name) + + df = pd.concat({"revenue": revenue, "payroll_tax": payroll_tax, "payroll": payroll}, axis=1).dropna() + + # Deterministic jitter so points don't overlap perfectly in plots + df["revenue_j"] = df["revenue"] * (1.0 + rng.normal(loc=0.0, scale=1e-6, size=len(df))) + + x = df["revenue"].to_numpy() + y = df["payroll_tax"].to_numpy() + z = df["payroll"].to_numpy() + + naive_r = _corr(x, y) + partial_r = _partial_corr(x, y, z) + + loo_rs = [] + for i in range(len(df)): + mask = np.ones(len(df), dtype=bool) + mask[i] = False + loo_rs.append(_corr(x[mask], y[mask])) + loo_min, loo_max = float(np.nanmin(loo_rs)), float(np.nanmax(loo_rs)) + + summary: Dict[str, Any] = { + "chapter": CHAPTER, + "n_months": int(len(df)), + "variables": {"x": x_name, "y": y_name, "control_z": z_name}, + "correlations": { + "naive_pearson_r": naive_r, + "partial_pearson_r_control_z": partial_r, + "leave_one_out_naive_r_min": loo_min, + "leave_one_out_naive_r_max": loo_max, + }, + "interpretation": { + "naive": "Correlation is a 'look here' signal; it does not prove causality.", + "controlled": "Partial correlation estimates the relationship between X and Y after removing Z's linear effect.", + "note": "If partial correlation shrinks materially, the naive story is likely confounded by payroll.", + }, + } + + # ---- Write outputs ---- + outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + design_path = outdir / "ch13_controlled_comparisons_design.json" + summary_path = outdir / "ch13_correlation_summary.json" + memo_path = outdir / "ch13_correlation_memo.md" + manifest_path = outdir / 
"ch13_figures_manifest.csv" + + design_path.write_text(json.dumps(design, indent=2) + "\n", encoding="utf-8") + summary_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8") + + memo = f"""# Chapter 13 — Correlation, Causation, and Controlled Comparisons + +## Executive summary +- **Revenue vs Payroll Tax Expense** correlation (naïve): r = {naive_r:.3f} +- After controlling for **Payroll Expense** (proxy for headcount/activity): partial r = {partial_r:.3f} + +## What this means +Payroll taxes are mechanically tied to payroll. If revenue and payroll co-move (seasonality/activity), +payroll taxes will also appear to co-move with revenue. + +The controlled comparison (partial correlation) helps avoid a misleading story: “Revenue causes payroll taxes.” + +## What we can and cannot claim +✅ We can say: Revenue and payroll taxes move together in this dataset, and payroll explains much of that relationship. +❌ We cannot say: Increasing revenue causes payroll taxes to rise (causal claim). + +## Sensitivity check +Leave-one-out naive correlation range: {loo_min:.3f} to {loo_max:.3f} + +## Next steps (if you need causality) +- Define an intervention (policy) and a comparison group (or time-based control). +- Pre-commit the metric and the evaluation window (avoid p-hacking). 
+""" + memo_path.write_text(memo, encoding="utf-8") + + # ---- Figures ---- + import matplotlib.pyplot as plt + + manifest_rows: list[FigureManifestRow] = [] + + # Fig 1: naive scatter + fig1_id = "ch13_fig01_naive_scatter" + fig1_path = figures_dir / f"{fig1_id}.png" + with style_context(): + fig, ax = plt.subplots() + ax.scatter(df["revenue_j"], df["payroll_tax"]) + ax.set_title("Naïve correlation: Revenue vs Payroll Tax Expense") + ax.set_xlabel("Revenue (monthly, $)") + ax.set_ylabel("Payroll Tax Expense (monthly, $)") + a, b = np.polyfit(df["revenue"], df["payroll_tax"], deg=1) + xs = np.linspace(df["revenue"].min(), df["revenue"].max(), 100) + ax.plot(xs, a * xs + b) + spec = FigureSpec( + chart_type="scatter", + title="Revenue vs Payroll Tax Expense (naïve)", + x_label="Revenue (monthly, $)", + y_label="Payroll Tax Expense (monthly, $)", + data_source="NSO v1: gl_journal.csv", + notes="Scatter with OLS line. Correlation ≠ causation.", + ) + save_figure(fig, fig1_path, spec=spec) + manifest_rows.append( + FigureManifestRow( + filename=fig1_path.name, + chart_type="scatter", + title=spec.title, + x_label=spec.x_label, + y_label=spec.y_label, + guardrail_note="Naïve correlation only; do not claim causation.", + data_source=spec.data_source, + ) + ) + + # Fig 2: residual scatter (controlled) + fig2_id = "ch13_fig02_residual_scatter" + fig2_path = figures_dir / f"{fig2_id}.png" + rx = _residualize(x, z) + ry = _residualize(y, z) + with style_context(): + fig, ax = plt.subplots() + ax.scatter(rx, ry) + ax.set_title("Controlled comparison: residuals after controlling for Payroll") + ax.set_xlabel("Revenue residual (after controlling for Payroll)") + ax.set_ylabel("Payroll Tax residual (after controlling for Payroll)") + a, b = np.polyfit(rx, ry, deg=1) + xs = np.linspace(rx.min(), rx.max(), 100) + ax.plot(xs, a * xs + b) + spec2 = FigureSpec( + chart_type="scatter", + title="Residual correlation after controlling for Payroll", + x_label="Revenue residual", + 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the Chapter 13 correlation script.

    Parameters
    ----------
    argv:
        Argument vector to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        Always ``0`` (process exit status) once the artifacts are written.
    """
    p = _build_cli()
    args = p.parse_args(argv)

    # analyze_ch13 records ``int(seed)`` in its design JSON, so a missing seed
    # (None) would raise TypeError. Fall back to 123, the same default the
    # Chapter 12 and Chapter 14 entry points use.
    seed = int(args.seed) if args.seed is not None else 123

    analyze_ch13(datadir=args.datadir, outdir=args.outdir, seed=seed)
    print("Wrote Chapter 13 artifacts ->", args.outdir)
    return 0
"statements_is_monthly.csv") + + # ---- Units sold (monthly): inventory movement_type == sale_issue (qty is negative in sim) + inv_sale = inv[inv["movement_type"] == "sale_issue"].copy() + inv_sale["month"] = _month_key_from_date(inv_sale["date"]) + units = ( + inv_sale.groupby("month", as_index=False)["qty"] + .sum() + .rename(columns={"qty": "units_sold"}) + ) + units["units_sold"] = -units["units_sold"] # convert negative issues to positive units sold + + # ---- Invoice count (monthly): AR event_type == invoice + ar_inv = ar[ar["event_type"] == "invoice"].copy() + ar_inv["month"] = _month_key_from_date(ar_inv["date"]) + invoices = ( + ar_inv.groupby("month", as_index=False)["event_type"] + .count() + .rename(columns={"event_type": "invoice_count"}) + ) + + # ---- Income statement (monthly): extract Sales Revenue + COGS + # statements_is_monthly has columns: month, line, amount + is_keep = is_df[is_df["line"].isin(["Sales Revenue", "Cost of Goods Sold"])].copy() + is_pivot = ( + is_keep.pivot_table(index="month", columns="line", values="amount", aggfunc="sum") + .reset_index() + .rename( + columns={ + "Sales Revenue": "sales_revenue", + "Cost of Goods Sold": "cogs", + } + ) + ) + + # ---- Join drivers + P&L lines + df = is_pivot.merge(units, on="month", how="left").merge(invoices, on="month", how="left") + df["units_sold"] = df["units_sold"].fillna(0.0) + df["invoice_count"] = df["invoice_count"].fillna(0.0) + + # keep a real datetime month column for nicer tables/figures + df["month_dt"] = pd.to_datetime(df["month"] + "-01") + df = df.sort_values("month_dt").reset_index(drop=True) + + # reorder + cols = ["month", "month_dt", "units_sold", "invoice_count", "sales_revenue", "cogs"] + return df[cols] + + +def _fit_ols(y: pd.Series, X: pd.DataFrame) -> sm.regression.linear_model.RegressionResultsWrapper: + Xc = sm.add_constant(X, has_constant="add") + return sm.OLS(y, Xc).fit() + + +def analyze_ch14(datadir: Path, outdir: Path, seed: int = 123) -> Outputs: + 
outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + df = _build_driver_table(datadir) + + # ---- Models (keep explainable) + # COGS ~ units_sold (fixed + variable) + m1 = _fit_ols(df["cogs"], df[["units_sold"]]) + + # Revenue ~ units_sold (avg selling price lens) + m2 = _fit_ols(df["sales_revenue"], df[["units_sold"]]) + + # Revenue ~ units_sold + invoice_count (mix / activity check) + m3 = _fit_ols(df["sales_revenue"], df[["units_sold", "invoice_count"]]) + + # simple forecast example: next month drivers = avg last 3 months + last3 = df.tail(3) + next_units = float(last3["units_sold"].mean()) + next_invoices = float(last3["invoice_count"].mean()) + + pred_cogs = float( + m1.predict( + sm.add_constant(pd.DataFrame({"units_sold": [next_units]}), has_constant="add") + ).iloc[0] + ) + pred_rev_m2 = float( + m2.predict( + sm.add_constant(pd.DataFrame({"units_sold": [next_units]}), has_constant="add") + ).iloc[0] + ) + pred_rev_m3 = float( + m3.predict( + sm.add_constant( + pd.DataFrame({"units_sold": [next_units], "invoice_count": [next_invoices]}), + has_constant="add", + ) + ).iloc[0] + ) + + # ---- Write core artifacts + driver_csv = outdir / "ch14_driver_table.csv" + design_json = outdir / "ch14_regression_design.json" + summary_json = outdir / "ch14_regression_summary.json" + memo_md = outdir / "ch14_regression_memo.md" + manifest_csv = outdir / "ch14_figures_manifest.csv" + + df.to_csv(driver_csv, index=False) + + design = { + "chapter": CHAPTER, + "seed": seed, + "expected_inputs": ["inventory_movements.csv", "ar_events.csv", "statements_is_monthly.csv"], + "drivers": { + "units_sold": "Monthly sum of -qty for inventory_movements where movement_type == 'sale_issue'", + "invoice_count": "Monthly count of ar_events where event_type == 'invoice'", + }, + "models": [ + {"name": "m1_cogs_units", "formula": "cogs ~ 1 + units_sold", "interpretation": "fixed + variable_cost_per_unit"}, + 
{"name": "m2_rev_units", "formula": "sales_revenue ~ 1 + units_sold", "interpretation": "base + avg_price_per_unit"}, + {"name": "m3_rev_units_invoices", "formula": "sales_revenue ~ 1 + units_sold + invoice_count", "interpretation": "simple multi-driver check"}, + ], + "note": "Regression is a driver lens; it does not prove causation.", + } + + summary = { + "rows": int(df.shape[0]), + "date_range": {"min_month": str(df["month_dt"].min().date()), "max_month": str(df["month_dt"].max().date())}, + "m1_cogs_units": {"params": {"const": float(m1.params["const"]), "units_sold": float(m1.params["units_sold"])}, "r2": float(m1.rsquared)}, + "m2_rev_units": {"params": {"const": float(m2.params["const"]), "units_sold": float(m2.params["units_sold"])}, "r2": float(m2.rsquared)}, + "m3_rev_units_invoices": { + "params": { + "const": float(m3.params["const"]), + "units_sold": float(m3.params["units_sold"]), + "invoice_count": float(m3.params["invoice_count"]), + }, + "r2": float(m3.rsquared), + }, + "forecast_example": { + "assumption": "Next month drivers = average of last 3 months", + "next_units_sold": next_units, + "next_invoice_count": next_invoices, + "predicted_cogs_m1": pred_cogs, + "predicted_sales_rev_m2": pred_rev_m2, + "predicted_sales_rev_m3": pred_rev_m3, + }, + } + + design_json.write_text(json.dumps(design, indent=2), encoding="utf-8") + summary_json.write_text(json.dumps(summary, indent=2), encoding="utf-8") + + memo_lines = [ + "# Chapter 14 — Regression Driver Analysis (NSO)\n", + "## What we did\n", + "- Built a monthly driver table:\n", + " - **Units sold** from inventory movements (sale_issue rows)\n", + " - **Invoice count** from AR events (invoice rows)\n", + " - **Revenue & COGS** from the monthly income statement\n", + "- Fit simple, explainable regressions to estimate **rates** and **baseline** components.\n", + "\n", + "## Key results\n", + f"- **COGS ~ units_sold**: intercept ≈ **{m1.params['const']:.2f}**, slope ≈ 
**{m1.params['units_sold']:.2f}**, R² ≈ **{m1.rsquared:.3f}**\n", + f"- **Revenue ~ units_sold**: intercept ≈ **{m2.params['const']:.2f}**, slope ≈ **{m2.params['units_sold']:.2f}**, R² ≈ **{m2.rsquared:.3f}**\n", + "\n", + "## Forecast example (avg last 3 months)\n", + f"- Next units_sold: **{next_units:.1f}**\n", + f"- Predicted COGS (m1): **{pred_cogs:.2f}**\n", + f"- Predicted Revenue (m2): **{pred_rev_m2:.2f}**\n", + f"- Predicted Revenue (m3): **{pred_rev_m3:.2f}**\n", + "\n", + "## Notes / limitations\n", + "- Regression is a quantitative driver lens — **not proof of causation**.\n", + "- If pricing or product mix changes, re-fit and re-check residuals.\n", + ] + memo_md.write_text("".join(memo_lines), encoding="utf-8") + + # ---- Figures + manifest (match repo reporting style contract) + manifest_rows: list[FigureManifestRow] = [] + + def _add_row(fig_path: Path, spec: FigureSpec) -> None: + manifest_rows.append( + FigureManifestRow( + filename=fig_path.name, + chart_type=spec.chart_type, + title=spec.title, + x_label=spec.x_label, + y_label=spec.y_label, + data_source=spec.data_source, + guardrail_note="Driver lens only; correlation ≠ causation. 
Interpret slope as rate and intercept as baseline.", + ) + ) + + def _save_scatter_with_fit( + fig_id: str, + x: pd.Series, + y: pd.Series, + model: Any, + xlabel: str, + ylabel: str, + title: str, + ) -> None: + fig_path = figures_dir / f"{fig_id}.png" + xs = np.linspace(float(x.min()), float(x.max()), 120) + + # predict with model + if "invoice_count" in model.model.exog_names: + # only used for the m3 figure; caller should pass scalar invoice_count series + raise RuntimeError("Use the m3-specific helper for multi-driver predictions.") + xdf = pd.DataFrame({"units_sold": xs}) + yhat = model.predict(sm.add_constant(xdf, has_constant="add")) + + spec = FigureSpec( + chart_type="scatter", + title=title, + x_label=xlabel, + y_label=ylabel, + data_source="NSO v1 synthetic outputs", + notes="Scatter with OLS fit line.", + ) + + with style_context(): + fig, ax = plt.subplots() + ax.scatter(x.to_numpy(dtype=float), y.to_numpy(dtype=float)) + ax.plot(xs, yhat.to_numpy(dtype=float)) + ax.set_title(title) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + save_figure(fig, fig_path, spec=spec) + + _add_row(fig_path, spec) + + # Fig 1: COGS vs units + _save_scatter_with_fit( + fig_id="ch14_fig01_cogs_vs_units", + x=df["units_sold"], + y=df["cogs"], + model=m1, + xlabel="Units sold (monthly)", + ylabel="COGS", + title="COGS vs Units Sold (monthly)", + ) + + # Fig 2: Revenue vs units + _save_scatter_with_fit( + fig_id="ch14_fig02_revenue_vs_units", + x=df["units_sold"], + y=df["sales_revenue"], + model=m2, + xlabel="Units sold (monthly)", + ylabel="Sales Revenue", + title="Sales Revenue vs Units Sold (monthly)", + ) + + # Fig 3: Actual vs predicted revenue (m3) + fig3_id = "ch14_fig03_actual_vs_predicted_revenue_m3" + fig3_path = figures_dir / f"{fig3_id}.png" + exog3 = sm.add_constant(df[["units_sold", "invoice_count"]], has_constant="add") + yhat3 = m3.predict(exog3) + + spec3 = FigureSpec( + chart_type="scatter", + title="Actual vs Predicted Sales Revenue (m3)", + 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the Chapter 14 regression driver analysis.

    Parameters
    ----------
    argv:
        Argument vector to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        Always ``0`` (process exit status) once the artifacts are written.
    """
    ns = _build_cli().parse_args(argv)

    destination = ns.outdir
    # A missing seed falls back to 123.
    seed = 123 if ns.seed is None else ns.seed

    analyze_ch14(datadir=ns.datadir, outdir=destination, seed=seed)

    print(f"Wrote Chapter 14 artifacts -> {destination}")
    return 0
+ +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch15_series_monthly.csv +* ch15_backtest_predictions.csv +* ch15_backtest_metrics.csv +* ch15_forecast_next12.csv +* ch15_assumptions_log_template.csv +* ch15_forecast_design.json +* ch15_forecast_memo.md +* ch15_figures_manifest.csv + +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable + +import numpy as np +import pandas as pd + +from scripts._cli import base_parser +from scripts._reporting_style import ( + FigureManifestRow, + FigureSpec, + plot_bar, + plot_time_series, + save_figure, + style_context, +) + +CHAPTER = "Track D Chapter 15 — Forecasting Foundations and Forecast Hygiene (NSO)" + + +@dataclass(frozen=True) +class Outputs: + series_csv: Path + backtest_predictions_csv: Path + backtest_metrics_csv: Path + forecast_csv: Path + assumptions_template_csv: Path + design_json: Path + memo_md: Path + figures_manifest_csv: Path + + +def _read_csv(datadir: Path, name: str) -> pd.DataFrame: + p = datadir / name + if not p.exists(): + raise FileNotFoundError(f"Expected {name} in {datadir} but not found.") + return pd.read_csv(p) + + +def _month_sort_key(month_series: pd.Series) -> pd.Series: + # stable month ordering for strings like "YYYY-MM" + return pd.to_datetime(month_series.astype(str) + "-01") + + +def _snake(s: str) -> str: + return ( + s.strip() + .lower() + .replace("&", "and") + .replace("/", " ") + .replace("-", " ") + .replace("(", "") + .replace(")", "") + .replace(" ", " ") + .replace(" ", "_") + ) + + +def _build_monthly_series(datadir: Path) -> pd.DataFrame: + """ + Build a clean monthly series table from NSO v1 statements_is_monthly.csv. 
+ + Output columns: + - month (YYYY-MM) + - revenue, cogs, gross_profit, operating_expenses, net_income + """ + is_df = _read_csv(datadir, "statements_is_monthly.csv") + + wide = ( + is_df.pivot_table(index="month", columns="line", values="amount", aggfunc="sum") + .reset_index() + .copy() + ) + wide["month"] = wide["month"].astype(str) + wide = wide.sort_values("month", key=_month_sort_key).reset_index(drop=True) + + # normalize columns (keep only the lines we care about for Ch15) + col_map = {c: _snake(str(c)) for c in wide.columns} + wide = wide.rename(columns=col_map) + + # expected normalized line names from the simulator + expected = { + "sales_revenue": "revenue", + "cost_of_goods_sold": "cogs", + "gross_profit": "gross_profit", + "operating_expenses": "operating_expenses", + "net_income": "net_income", + } + missing = [src for src in expected if src not in wide.columns] + if missing: + raise ValueError( + "statements_is_monthly.csv is missing required income statement lines " + f"after pivot: {missing}. 
Available: {sorted(wide.columns)}" + ) + + keep = ["month"] + list(expected.keys()) + out = wide[keep].rename(columns=expected).copy() + return out + + +def _forecast_naive_last(train: np.ndarray, horizon: int) -> np.ndarray: + return np.repeat(float(train[-1]), horizon) + + +def _forecast_moving_average( + train: np.ndarray, horizon: int, window: int = 3 +) -> np.ndarray: + window = int(max(1, min(window, train.size))) + return np.repeat(float(np.mean(train[-window:])), horizon) + + +def _forecast_linear_trend(train: np.ndarray, horizon: int) -> np.ndarray: + # y = a + b*t, fit by least squares (polyfit degree 1) + t = np.arange(train.size, dtype=float) + b, a = np.polyfit(t, train.astype(float), deg=1) # [slope, intercept] + t_future = np.arange(train.size, train.size + horizon, dtype=float) + return a + b * t_future + + +def _mae(y_true: np.ndarray, y_pred: np.ndarray) -> float: + return float(np.mean(np.abs(y_pred - y_true))) + + +def _mape(y_true: np.ndarray, y_pred: np.ndarray) -> float: + y_true = y_true.astype(float) + denom = np.where(y_true == 0.0, np.nan, y_true) + ape = np.abs((y_pred - y_true) / denom) * 100.0 + return float(np.nanmean(ape)) + + +def analyze_ch15(datadir: Path, outdir: Path, seed: int = 123) -> Outputs: + """ + Chapter 15: Forecasting foundations (baseline methods + backtesting + hygiene). 
+ + Reads: + - statements_is_monthly.csv (NSO v1) + + Writes (into outdir): + - ch15_series_monthly.csv + - ch15_backtest_predictions.csv + - ch15_backtest_metrics.csv + - ch15_forecast_next12.csv + - ch15_assumptions_log_template.csv + - ch15_forecast_design.json + - ch15_forecast_memo.md + - ch15_figures_manifest.csv + - figures/*.png referenced by the manifest + """ + outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + series = _build_monthly_series(datadir) + + target_col = "revenue" + months = series["month"].tolist() + y = series[target_col].astype(float).to_numpy() + + if y.size < 18: + raise ValueError( + f"Need at least 18 months to do the Chapter 15 backtest; got {y.size}." + ) + + # Backtest: first 12 months train, last 12 months test + n_train = 12 + y_train, y_test = y[:n_train], y[n_train:] + months_train, months_test = months[:n_train], months[n_train:] + + methods: dict[str, Callable[[np.ndarray, int], np.ndarray]] = { + "naive_last": lambda tr, h: _forecast_naive_last(tr, h), + "moving_avg_3": lambda tr, h: _forecast_moving_average(tr, h, window=3), + "linear_trend": lambda tr, h: _forecast_linear_trend(tr, h), + } + + pred_rows: list[dict[str, Any]] = [] + metric_rows: list[dict[str, Any]] = [] + + for name, fn in methods.items(): + yhat = fn(y_train, y_test.size).astype(float) + err = yhat - y_test.astype(float) + abs_err = np.abs(err) + + ape = np.abs(err / y_test.astype(float)) * 100.0 + for m, a, p, e, ae, pe in zip( + months_test, y_test, yhat, err, abs_err, ape, strict=True + ): + pred_rows.append( + { + "method": name, + "month": m, + "actual": float(a), + "predicted": float(p), + "error": float(e), + "abs_error": float(ae), + "abs_pct_error": float(pe), + } + ) + + metric_rows.append( + { + "method": name, + "mae": _mae(y_test, yhat), + "mape": _mape(y_test, yhat), + } + ) + + metrics = ( + pd.DataFrame(metric_rows) + .sort_values(["mape", "mae"], 
ascending=[True, True]) + .reset_index(drop=True) + ) + + chosen_method = str(metrics.loc[0, "method"]) + chosen_mae = float(metrics.loc[metrics["method"] == chosen_method, "mae"].iloc[0]) + + # Refit on all data and forecast next 12 months + horizon = 12 + yhat_next = methods[chosen_method](y, horizon).astype(float) + + last_month = pd.Period(months[-1], freq="M") + next_months = [(last_month + i).strftime("%Y-%m") for i in range(1, horizon + 1)] + + lower = np.maximum(0.0, yhat_next - chosen_mae) + upper = yhat_next + chosen_mae + + forecast = pd.DataFrame( + { + "month": next_months, + "method": chosen_method, + "forecast": yhat_next, + "lower": lower, + "upper": upper, + } + ) + + # Write artifacts + series_csv = outdir / "ch15_series_monthly.csv" + backtest_predictions_csv = outdir / "ch15_backtest_predictions.csv" + backtest_metrics_csv = outdir / "ch15_backtest_metrics.csv" + forecast_csv = outdir / "ch15_forecast_next12.csv" + assumptions_template_csv = outdir / "ch15_assumptions_log_template.csv" + design_json = outdir / "ch15_forecast_design.json" + memo_md = outdir / "ch15_forecast_memo.md" + figures_manifest_csv = outdir / "ch15_figures_manifest.csv" + + series.to_csv(series_csv, index=False) + pd.DataFrame(pred_rows).to_csv(backtest_predictions_csv, index=False) + metrics.to_csv(backtest_metrics_csv, index=False) + forecast.to_csv(forecast_csv, index=False) + + tmpl = pd.DataFrame( + [ + { + "as_of_month": months[-1], + "series": target_col, + "horizon_months": horizon, + "assumption": "Example: New customer contract starts in 2027-03", + "direction": "up|down|mixed", + "estimated_impact": "e.g., +12000 revenue/month starting 2027-03", + "owner": "name/role", + "notes": "Delete this example row and use one row per major assumption.", + } + ] + ) + tmpl.to_csv(assumptions_template_csv, index=False) + + design = { + "chapter": CHAPTER, + "seed": seed, + "target_series": target_col, + "history_months": months, + "train_months": months_train, + 
"test_months": months_test, + "methods_compared": list(methods.keys()), + "selection_rule": "min MAPE on 12-month holdout; tie-break on MAE", + "chosen_method": chosen_method, + "chosen_method_backtest_mae": chosen_mae, + "forecast_horizon_months": horizon, + "forecast_months": next_months, + "forecast_interval_note": ( + "+/- backtest MAE (simple heuristic, not a probabilistic interval)" + ), + } + design_json.write_text(json.dumps(design, indent=2), encoding="utf-8") + + memo_lines = [ + "# Chapter 15 Forecast Memo (baseline + hygiene)\n\n", + f"**Target series:** {target_col}\n", + f"**History window:** {months[0]} to {months[-1]} ({len(months)} months)\n", + f"**Backtest:** train={months_train[0]}..{months_train[-1]} (12 months), " + f"test={months_test[0]}..{months_test[-1]} (12 months)\n\n", + "## Compared baseline methods\n", + "- naive_last: use the last observed month as the forecast\n", + "- moving_avg_3: average of the last 3 months\n", + "- linear_trend: least-squares line over time\n\n", + "## Backtest results (lower is better)\n\n", + metrics.to_markdown(index=False), + "\n\n", + f"**Selected method:** `{chosen_method}` (lowest MAPE; tie-break on MAE)\n\n", + "## Next 12-month forecast (with simple range)\n", + "The range shown is a simple heuristic: forecast ± backtest MAE.\n\n", + forecast.to_markdown(index=False), + "\n\n", + "## Forecast hygiene notes\n", + "- Document assumptions in the **assumptions log** (what changed, when, " + "and expected impact).\n", + "- Keep a versioned trail: dataset seed/inputs + method choice + metrics.\n", + "- Treat this as a baseline. 
Chapter 16+ introduces seasonality, " + "breaks, and better models.\n", + ] + memo_md.write_text("".join(memo_lines), encoding="utf-8") + + # Figures + manifest + manifest_rows: list[FigureManifestRow] = [] + + def _add_row(fig_path: Path, spec: FigureSpec) -> None: + manifest_rows.append( + FigureManifestRow( + filename=fig_path.name, + chart_type=spec.chart_type, + title=spec.title, + x_label=spec.x_label, + y_label=spec.y_label, + data_source="NSO v1 synthetic outputs", + guardrail_note=( + "Forecasts are estimates. Verify assumptions, ranges, " + "and error metrics before sharing." + ), + ) + ) + + with style_context(): + fig = plot_time_series( + series, + x="month", + series={"Revenue (actual)": "revenue"}, + title="Revenue history (monthly)", + x_label="Month", + y_label="Revenue", + ) + spec = FigureSpec( + chart_type="line", + title="Revenue history (monthly)", + x_label="Month", + y_label="Revenue", + data_source="statements_is_monthly.csv (Sales Revenue line)", + notes="Actual history used for baseline forecasting.", + ) + fig_path = figures_dir / "ch15_fig_revenue_history.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + bt_best = pd.read_csv(backtest_predictions_csv) + bt_best = bt_best[bt_best["method"] == chosen_method].copy() + + overlay = series[["month", "revenue"]].copy() + overlay = overlay.merge( + bt_best[["month", "predicted"]].rename(columns={"predicted": "predicted_best"}), + on="month", + how="left", + ) + + with style_context(): + fig = plot_time_series( + overlay, + x="month", + series={ + "Revenue (actual)": "revenue", + f"Backtest ({chosen_method})": "predicted_best", + }, + title="Backtest overlay (12-month holdout)", + x_label="Month", + y_label="Revenue", + ) + spec = FigureSpec( + chart_type="line", + title="Backtest overlay (12-month holdout)", + x_label="Month", + y_label="Revenue", + data_source="statements_is_monthly.csv + ch15_backtest_predictions.csv", + notes="Predicted values shown only for the 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the Chapter 15 analysis.

    Builds the parser via ``_build_cli``, parses ``argv`` and runs
    ``analyze_ch15`` with the parsed ``datadir``, ``outdir`` and ``seed``.

    Args:
        argv: Argument list handed to the parser's ``parse_args``. ``None`` is
            passed through unchanged (presumably the parser then falls back to
            ``sys.argv`` — confirm against ``scripts._cli.base_parser``).

    Returns:
        ``0`` on success; errors propagate as exceptions.
    """
    p = _build_cli()
    args = p.parse_args(argv)

    outdir = args.outdir
    # Fall back to 123 only when no seed was supplied. The former
    # ``args.seed or 123`` also discarded an explicit seed of 0.
    seed = 123 if args.seed is None else args.seed
    analyze_ch15(datadir=args.datadir, outdir=outdir, seed=seed)
    print("Wrote Chapter 15 artifacts ->", outdir)
    return 0
Seasonality and baseline forecasts (NSO). + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch16_series_monthly.csv +* ch16_seasonal_profile.csv +* ch16_backtest_predictions.csv +* ch16_backtest_metrics.csv +* ch16_forecast_next12.csv +* ch16_design.json +* ch16_memo.md +* ch16_figures_manifest.csv + +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable + +import numpy as np +import pandas as pd + +from scripts._cli import base_parser +from scripts._reporting_style import ( + FigureManifestRow, + FigureSpec, + plot_bar, + plot_time_series, + save_figure, + style_context, +) + +CHAPTER = "Track D Chapter 16 — Seasonality and Seasonal Baselines (NSO)" + + +@dataclass(frozen=True) +class Outputs: + series_csv: Path + seasonal_profile_csv: Path + backtest_predictions_csv: Path + backtest_metrics_csv: Path + forecast_csv: Path + design_json: Path + memo_md: Path + figures_manifest_csv: Path + + +def _read_csv(datadir: Path, name: str) -> pd.DataFrame: + p = datadir / name + if not p.exists(): + raise FileNotFoundError(f"Expected {name} in {datadir} but not found.") + return pd.read_csv(p) + + +def _month_sort_key(month_series: pd.Series) -> pd.Series: + return pd.to_datetime(month_series.astype(str) + "-01") + + +def _snake(s: str) -> str: + return ( + s.strip() + .lower() + .replace("&", "and") + .replace("/", " ") + .replace("-", " ") + .replace("(", "") + .replace(")", "") + .replace(" ", " ") + .replace(" ", "_") + ) + + +def _build_monthly_series(datadir: Path) -> pd.DataFrame: + """Build a clean monthly series from NSO v1 statements_is_monthly.csv. 
+ + Output columns (wide, normalized): + - month (YYYY-MM) + - revenue, cogs, gross_profit, operating_expenses, net_income + - month_num (1..N) + - month_of_year (1..12) + """ + is_df = _read_csv(datadir, "statements_is_monthly.csv") + + wide = ( + is_df.pivot_table(index="month", columns="line", values="amount", aggfunc="sum") + .reset_index() + .copy() + ) + wide["month"] = wide["month"].astype(str) + wide = wide.sort_values("month", key=_month_sort_key).reset_index(drop=True) + + col_map = {c: _snake(str(c)) for c in wide.columns} + wide = wide.rename(columns=col_map) + + expected = { + "sales_revenue": "revenue", + "cost_of_goods_sold": "cogs", + "gross_profit": "gross_profit", + "operating_expenses": "operating_expenses", + "net_income": "net_income", + } + missing = [src for src in expected if src not in wide.columns] + if missing: + raise ValueError( + "statements_is_monthly.csv is missing required income statement lines " + f"after pivot: {missing}. Available: {sorted(wide.columns)}" + ) + + out = wide[["month"] + list(expected.keys())].rename(columns=expected).copy() + + # Add deterministic helper keys used for seasonality calculations. 
+ out["month_num"] = np.arange(1, len(out) + 1, dtype=int) + out["month_of_year"] = pd.PeriodIndex(out["month"], freq="M").month.astype(int) + return out + + +def _mae(y_true: np.ndarray, y_pred: np.ndarray) -> float: + return float(np.mean(np.abs(y_pred - y_true))) + + +def _mape(y_true: np.ndarray, y_pred: np.ndarray) -> float: + y_true = y_true.astype(float) + denom = np.where(y_true == 0.0, np.nan, y_true) + ape = np.abs((y_pred - y_true) / denom) * 100.0 + return float(np.nanmean(ape)) + + +def _forecast_naive_last(train: np.ndarray, horizon: int) -> np.ndarray: + return np.repeat(float(train[-1]), horizon) + + +def _forecast_moving_average(train: np.ndarray, horizon: int, window: int = 3) -> np.ndarray: + window = int(max(1, min(window, train.size))) + return np.repeat(float(np.mean(train[-window:])), horizon) + + +def _forecast_linear_trend(train: np.ndarray, horizon: int) -> np.ndarray: + t = np.arange(train.size, dtype=float) + slope, intercept = np.polyfit(t, train.astype(float), deg=1) + t_future = np.arange(train.size, train.size + horizon, dtype=float) + return intercept + slope * t_future + + +def _forecast_seasonal_naive_12(train: np.ndarray, horizon: int) -> np.ndarray: + """Seasonal naive with period=12 months. + + For each month in the next year, reuse the value from the same month last year. + Requires train length >= 12 and horizon == 12 for this chapter. + """ + if train.size < 12: + raise ValueError("seasonal_naive_12 requires at least 12 months of training data.") + if horizon != 12: + # Keep the method explicit and simple for Track D. 
+ raise ValueError("seasonal_naive_12 is defined for a 12-month horizon in this chapter.") + return train[-12:].astype(float).copy() + + +def _forecast_seasonal_mean(train: np.ndarray, train_months: list[str], horizon_months: list[str]) -> np.ndarray: + """Seasonal mean baseline: forecast each month as the mean of that calendar month in training.""" + train_moy = pd.PeriodIndex(pd.Series(train_months), freq="M").month.astype(int) + df = pd.DataFrame({"moy": train_moy, "y": train.astype(float)}) + means = df.groupby("moy")["y"].mean().to_dict() + + future_moy = pd.PeriodIndex(pd.Series(horizon_months), freq="M").month.astype(int) + # Fall back to overall mean if a month_of_year is missing (should not happen with 12-month train). + overall = float(np.mean(train.astype(float))) + return np.array([float(means.get(int(m), overall)) for m in future_moy], dtype=float) + + +def analyze_ch16(datadir: Path, outdir: Path, seed: int = 123) -> Outputs: + """Chapter 16: seasonality + seasonal baselines (NSO running case). 
+ + Reads: + - statements_is_monthly.csv + + Writes (into outdir): + - ch16_series_monthly.csv + - ch16_seasonal_profile.csv + - ch16_backtest_predictions.csv + - ch16_backtest_metrics.csv + - ch16_forecast_next12.csv + - ch16_design.json + - ch16_memo.md + - ch16_figures_manifest.csv + - figures/*.png referenced by the manifest + """ + outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + series = _build_monthly_series(datadir) + months = series["month"].tolist() + + if len(months) < 24: + raise ValueError(f"Chapter 16 requires 24 months for a 12/12 backtest; got {len(months)}.") + + target_col = "revenue" + y = series[target_col].astype(float).to_numpy() + + # --- Seasonality profile (month-of-year mean + index) --- + seasonal = ( + series.groupby("month_of_year", as_index=False)[target_col] + .mean() + .rename(columns={target_col: "mean_revenue"}) + .sort_values("month_of_year") + .reset_index(drop=True) + ) + overall_mean = float(series[target_col].mean()) + seasonal["seasonal_index"] = seasonal["mean_revenue"] / overall_mean + + # --- Backtest: first 12 months train, last 12 months test --- + horizon = 12 + n_train = 12 + y_train, y_test = y[:n_train], y[n_train:] + months_train, months_test = months[:n_train], months[n_train:] + + methods_simple: dict[str, Callable[[np.ndarray, int], np.ndarray]] = { + "naive_last": _forecast_naive_last, + "moving_avg_3": lambda tr, h: _forecast_moving_average(tr, h, window=3), + "linear_trend": _forecast_linear_trend, + "seasonal_naive_12": _forecast_seasonal_naive_12, + } + + pred_rows: list[dict[str, Any]] = [] + metric_rows: list[dict[str, Any]] = [] + + # methods that depend on month labels + def _run_method(name: str) -> np.ndarray: + if name == "seasonal_mean": + return _forecast_seasonal_mean(y_train, months_train, months_test) + return methods_simple[name](y_train, horizon).astype(float) + + method_names = ["naive_last", "moving_avg_3", 
"linear_trend", "seasonal_naive_12", "seasonal_mean"] + for name in method_names: + yhat = _run_method(name).astype(float) + err = yhat - y_test.astype(float) + abs_err = np.abs(err) + ape = np.abs(err / y_test.astype(float)) * 100.0 + + for m, a, p, e, ae, pe in zip(months_test, y_test, yhat, err, abs_err, ape, strict=True): + pred_rows.append( + { + "method": name, + "month": m, + "actual": float(a), + "predicted": float(p), + "error": float(e), + "abs_error": float(ae), + "abs_pct_error": float(pe), + } + ) + + metric_rows.append({"method": name, "mae": _mae(y_test, yhat), "mape": _mape(y_test, yhat)}) + + metrics = ( + pd.DataFrame(metric_rows) + .sort_values(["mape", "mae"], ascending=[True, True]) + .reset_index(drop=True) + ) + chosen_method = str(metrics.loc[0, "method"]) + chosen_mae = float(metrics.loc[metrics["method"] == chosen_method, "mae"].iloc[0]) + + # --- Refit on all data and forecast next 12 months --- + last_month = pd.Period(months[-1], freq="M") + next_months = [(last_month + i).strftime("%Y-%m") for i in range(1, horizon + 1)] + + def _forecast_full(name: str) -> np.ndarray: + if name == "seasonal_mean": + # Use all history for means; forecast next months by month-of-year means. 
+ return _forecast_seasonal_mean(y, months, next_months) + if name == "seasonal_naive_12": + return _forecast_seasonal_naive_12(y, horizon) + return methods_simple[name](y, horizon).astype(float) + + yhat_next = _forecast_full(chosen_method).astype(float) + lower = np.maximum(0.0, yhat_next - chosen_mae) + upper = yhat_next + chosen_mae + + forecast = pd.DataFrame( + { + "month": next_months, + "method": chosen_method, + "forecast": yhat_next, + "lower": lower, + "upper": upper, + } + ) + + # --- Write artifacts --- + series_csv = outdir / "ch16_series_monthly.csv" + seasonal_profile_csv = outdir / "ch16_seasonal_profile.csv" + backtest_predictions_csv = outdir / "ch16_backtest_predictions.csv" + backtest_metrics_csv = outdir / "ch16_backtest_metrics.csv" + forecast_csv = outdir / "ch16_forecast_next12.csv" + design_json = outdir / "ch16_design.json" + memo_md = outdir / "ch16_memo.md" + figures_manifest_csv = outdir / "ch16_figures_manifest.csv" + + series.to_csv(series_csv, index=False) + seasonal.to_csv(seasonal_profile_csv, index=False) + pd.DataFrame(pred_rows).to_csv(backtest_predictions_csv, index=False) + metrics.to_csv(backtest_metrics_csv, index=False) + forecast.to_csv(forecast_csv, index=False) + + design = { + "chapter": CHAPTER, + "seed": seed, + "target_series": target_col, + "history_months": months, + "train_months": months_train, + "test_months": months_test, + "methods_compared": method_names, + "selection_rule": "min MAPE on 12-month holdout; tie-break on MAE", + "chosen_method": chosen_method, + "chosen_method_backtest_mae": chosen_mae, + "seasonality_profile_note": "month-of-year mean and seasonal index (mean / overall mean)", + "forecast_horizon_months": horizon, + "forecast_months": next_months, + "forecast_interval_note": "+/- backtest MAE (simple heuristic, not a probabilistic interval)", + } + design_json.write_text(json.dumps(design, indent=2), encoding="utf-8") + + top = seasonal.sort_values("seasonal_index", ascending=False).head(3) + 
bottom = seasonal.sort_values("seasonal_index", ascending=True).head(3) + + memo_lines = [ + "# Chapter 16 Memo — Seasonality and Seasonal Baselines\n\n", + f"**Target series:** {target_col}\n", + f"**History window:** {months[0]} to {months[-1]} ({len(months)} months)\n", + f"**Backtest:** train={months_train[0]}..{months_train[-1]} (12), " + f"test={months_test[0]}..{months_test[-1]} (12)\n\n", + "## What seasonality means (in business terms)\n", + "Seasonality is a repeating calendar pattern (month-of-year effects) that shows up even when\n", + "nothing is “wrong.” If revenue is typically higher in some months and lower in others, a\n", + "non-seasonal baseline (like naive_last) will systematically miss those cycles.\n\n", + "## Seasonality profile (month-of-year)\n\n", + seasonal.to_markdown(index=False), + "\n\n", + "Top seasonal months (highest index):\n\n", + top.to_markdown(index=False), + "\n\n", + "Bottom seasonal months (lowest index):\n\n", + bottom.to_markdown(index=False), + "\n\n", + "## Backtest results (lower is better)\n\n", + metrics.to_markdown(index=False), + "\n\n", + f"**Selected method:** `{chosen_method}` (lowest MAPE; tie-break on MAE)\n\n", + "## Next 12-month forecast (with simple range)\n", + "Range shown is forecast ± backtest MAE.\n\n", + forecast.to_markdown(index=False), + "\n\n", + "## Hygiene reminders\n", + "- Seasonal baselines are not magic; they are a disciplined way to respect calendar patterns.\n", + "- If a major business change occurred (pricing, channel shift, store closure), document it and\n", + " consider resetting the training window or splitting pre/post periods.\n", + "- Always sanity-check outliers: one unusual month can distort both seasonal means and trend.\n", + ] + memo_md.write_text("".join(memo_lines), encoding="utf-8") + + # --- Figures + manifest --- + manifest_rows: list[FigureManifestRow] = [] + + def _add_row(fig_path: Path, spec: FigureSpec) -> None: + manifest_rows.append( + FigureManifestRow( + 
filename=fig_path.name, + chart_type=spec.chart_type, + title=spec.title, + x_label=spec.x_label, + y_label=spec.y_label, + data_source="NSO v1 synthetic outputs", + guardrail_note=( + "Seasonality baselines assume repeating calendar patterns. " + "Confirm that the business context supports this assumption." + ), + ) + ) + + # Figure 1: seasonal index profile + seasonal_plot = seasonal.copy() + seasonal_plot["month_of_year"] = seasonal_plot["month_of_year"].astype(int) + + with style_context(): + fig = plot_bar( + seasonal_plot, + x="month_of_year", + y="seasonal_index", + title="Seasonal index by month-of-year (Revenue)", + x_label="Month of year (1=Jan ... 12=Dec)", + y_label="Seasonal index (mean / overall mean)", + ) + spec = FigureSpec( + chart_type="bar", + title="Seasonal index by month-of-year (Revenue)", + x_label="Month of year (1=Jan ... 12=Dec)", + y_label="Seasonal index", + data_source="statements_is_monthly.csv (Sales Revenue)", + notes="Index > 1 means above-average month; < 1 means below-average.", + ) + fig_path = figures_dir / "ch16_fig_seasonal_profile.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + # Figure 2: backtest overlay for chosen method + bt = pd.DataFrame(pred_rows) + bt_best = bt[bt["method"] == chosen_method].copy() + + overlay = series[["month", "revenue"]].copy() + overlay = overlay.merge( + bt_best[["month", "predicted"]].rename(columns={"predicted": "predicted_best"}), + on="month", + how="left", + ) + + with style_context(): + fig = plot_time_series( + overlay, + x="month", + series={"Revenue (actual)": "revenue", f"Backtest ({chosen_method})": "predicted_best"}, + title="Backtest overlay (seasonality-aware method selection)", + x_label="Month", + y_label="Revenue", + ) + spec = FigureSpec( + chart_type="line", + title="Backtest overlay (seasonality-aware method selection)", + x_label="Month", + y_label="Revenue", + data_source="statements_is_monthly.csv + ch16_backtest_predictions.csv", + 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the Chapter 16 analysis.

    Builds the parser via ``_build_cli``, parses ``argv`` and runs
    ``analyze_ch16`` with the parsed ``datadir``, ``outdir`` and ``seed``.

    Args:
        argv: Argument list handed to the parser's ``parse_args``. ``None`` is
            passed through unchanged (presumably the parser then falls back to
            ``sys.argv`` — confirm against ``scripts._cli.base_parser``).

    Returns:
        ``0`` on success; errors propagate as exceptions.
    """
    p = _build_cli()
    args = p.parse_args(argv)

    outdir = args.outdir
    # Fall back to 123 only when no seed was supplied. The former
    # ``args.seed or 123`` also discarded an explicit seed of 0.
    seed = 123 if args.seed is None else args.seed
    analyze_ch16(datadir=args.datadir, outdir=outdir, seed=seed)
    print("Wrote Chapter 16 artifacts ->", outdir)
    return 0
"__main__": + raise SystemExit(main()) \ No newline at end of file diff --git a/workbooks/track_d_template/scripts/business_ch17_revenue_forecasting_segmentation_drivers.py b/workbooks/track_d_template/scripts/business_ch17_revenue_forecasting_segmentation_drivers.py new file mode 100644 index 0000000..e3c1f6a --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch17_revenue_forecasting_segmentation_drivers.py @@ -0,0 +1,833 @@ +"""Track D — Chapter 17: Revenue forecasting via segmentation + drivers (NSO v1). + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch17_ar_revenue_segment_monthly.csv +* ch17_series_monthly.csv +* ch17_customer_segments.csv +* ch17_backtest_metrics.csv +* ch17_backtest_total_revenue.csv +* ch17_forecast_next12.csv +* ch17_memo.md +* ch17_design.json +* ch17_known_events_template.json +* ch17_figures_manifest.csv +* ch17_manifest.json +* ch17_forecast_next_12m.csv +* ch17_forecast_memo.md + +We forecast AR invoice revenue by combining two drivers: + +- Invoice count +- Average invoice value + +Revenue = invoice_count × avg_invoice_value. + +Customers are segmented into the top-K customers by invoice revenue plus an +"All other customers" bucket. + +The forecasting methods are intentionally simple baselines (last value, moving +averages, seasonal naive, month-of-year mean) selected by 12-month backtest. 
+ +Run: + python -m scripts.business_ch17_revenue_forecasting_segmentation_drivers \ + --datadir data/synthetic/nso_v1 --outdir outputs/track_d --seed 123""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +from scripts._reporting_style import style_context + + +def _save_fig(fig, path: Path) -> None: + """Save and close a matplotlib figure (create parent dirs).""" + path.parent.mkdir(parents=True, exist_ok=True) + fig.tight_layout() + fig.savefig(path, dpi=150) + plt.close(fig) +# ---------------------------- +# Constants +# ---------------------------- + +DEFAULT_TOP_K = 3 +BACKTEST_WINDOW_MONTHS = 12 +SEASONAL_PERIOD = 12 + + +# ---------------------------- +# Data structures +# ---------------------------- + + +@dataclass(frozen=True) +class Outputs: + outdir: Path + figures_dir: Path + + seg_monthly_csv: Path + series_monthly_csv: Path + customer_segments_csv: Path + + backtest_metrics_csv: Path + backtest_total_csv: Path + + forecast_csv: Path + memo_md: Path + + design_json: Path + known_events_template_json: Path + + fig_segment_history: Path + fig_backtest_total: Path + fig_forecast_total: Path + + figures_manifest_csv: Path + manifest_json: Path + + # Backwards-compat / convenience aliases + forecast_csv_alias: Path + memo_md_alias: Path + + +# ---------------------------- +# IO helpers +# ---------------------------- + + +def _read_csv(path: Path, schema: dict[str, str] | None = None) -> pd.DataFrame: + if not path.exists(): + raise FileNotFoundError(f"Missing required file: {path}") + if schema is None: + return pd.read_csv(path) + return pd.read_csv(path, dtype=schema) + + +def _ensure_dir(path: Path) -> None: + path.mkdir(parents=True, exist_ok=True) + + +def _safe_div(numer: float, denom: float) -> float: + if denom == 0: + return float("nan") + return 
def load_ar_invoices(datadir: Path) -> pd.DataFrame:
    """Read ``ar_events.csv`` from ``datadir`` and return its invoice rows.

    The ``month`` column is normalised to its monthly-period string form: it
    is taken from an existing ``month`` column when present, otherwise derived
    from ``date``. Rows are kept when ``event_type`` equals ``"invoice"``
    (case-insensitive). ``customer`` is cast to ``str``; ``amount`` is
    converted to numeric, with unparseable values replaced by ``0.0``.

    Returns:
        DataFrame with columns ``month``, ``customer``, ``amount``.

    Raises:
        ValueError: If ``date`` or ``event_type`` is missing, or the invoice
            rows lack ``customer`` or ``amount``.
    """
    events = _read_csv(datadir / "ar_events.csv")
    columns = events.columns

    # 'date' is mandatory even when a ready-made 'month' column exists.
    if "date" not in columns:
        raise ValueError("ar_events.csv must include a 'date' column")

    month_period = (
        pd.PeriodIndex(events["month"].astype(str), freq="M")
        if "month" in columns
        else _as_month_period(events["date"].astype(str))
    )

    events = events.copy()
    events["month"] = month_period.astype(str)

    if "event_type" not in events.columns:
        raise ValueError("ar_events.csv must include an 'event_type' column")

    is_invoice = events["event_type"].astype(str).str.lower() == "invoice"
    invoices = events.loc[is_invoice].copy()

    absent = {"month", "customer", "amount"} - set(invoices.columns)
    if absent:
        raise ValueError(f"ar_events.csv invoice rows missing columns: {sorted(absent)}")

    invoices["customer"] = invoices["customer"].astype(str)
    invoices["amount"] = pd.to_numeric(invoices["amount"], errors="coerce").fillna(0.0)

    return invoices[["month", "customer", "amount"]]
def build_segment_monthly(inv: pd.DataFrame, top_customers: list[str]) -> pd.DataFrame:
    """Build the monthly invoice-driver table with one row per (month, segment).

    Customers listed in ``top_customers`` form their own segment; every other
    customer is pooled into ``"All other customers"``.

    Args:
        inv: Invoice rows with columns ``month``, ``customer`` and ``amount``.
        top_customers: Customers that keep an individual segment.

    Returns:
        DataFrame with columns ``month``, ``moy`` (month of year),
        ``segment``, ``invoice_count``, ``invoice_amount`` and
        ``avg_invoice_value``, ordered by month then segment. Every observed
        month is paired with every observed segment; combinations without
        invoices get count and amount ``0.0`` and a NaN average.
    """
    segment = np.where(inv["customer"].isin(top_customers), inv["customer"], "All other customers")
    monthly = (
        inv.assign(segment=segment)
        .groupby(["month", "segment"], as_index=False)
        .agg(invoice_count=("amount", "size"), invoice_amount=("amount", "sum"))
    )

    # Complete month x segment grid over the months and segments seen in ``inv``.
    # NOTE(review): a calendar month with no invoices at all never enters the
    # grid, so gaps in the month sequence are not filled — confirm intended.
    months = sorted(monthly["month"].unique().tolist())
    segments = sorted(monthly["segment"].unique().tolist())
    grid = pd.MultiIndex.from_product([months, segments], names=["month", "segment"]).to_frame(index=False)

    merged = grid.merge(monthly, on=["month", "segment"], how="left")
    merged["invoice_count"] = merged["invoice_count"].fillna(0).astype(float)
    merged["invoice_amount"] = merged["invoice_amount"].fillna(0.0).astype(float)

    # Computed once, vectorized, after the grid fill; a zero count is masked
    # to NaN so the average is NaN rather than a division by zero.
    counts = merged["invoice_count"]
    merged["avg_invoice_value"] = merged["invoice_amount"] / counts.where(counts != 0)

    # Month-of-year helper (1..12)
    merged["moy"] = pd.PeriodIndex(merged["month"], freq="M").month

    return merged[["month", "moy", "segment", "invoice_count", "invoice_amount", "avg_invoice_value"]]
"avg_invoice_value"]] + + +# ---------------------------- +# Forecast methods +# ---------------------------- + + +def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float: + return float(np.mean(np.abs(y_true - y_pred))) + + +def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float: + y_true = np.asarray(y_true, dtype=float) + y_pred = np.asarray(y_pred, dtype=float) + mask = y_true != 0 + if mask.sum() == 0: + return float("nan") + return float(np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))) + + +def forecast_naive_last(train: np.ndarray, h: int) -> np.ndarray: + if len(train) == 0: + return np.zeros(h) + return np.full(h, train[-1]) + + +def forecast_ma(train: np.ndarray, h: int, window: int) -> np.ndarray: + if len(train) == 0: + return np.zeros(h) + w = min(window, len(train)) + return np.full(h, float(np.mean(train[-w:]))) + + +def forecast_seasonal_naive(train: np.ndarray, h: int, period: int = SEASONAL_PERIOD) -> np.ndarray: + if len(train) == 0: + return np.zeros(h) + if len(train) < period: + return forecast_naive_last(train, h) + last_season = train[-period:] + reps = int(np.ceil(h / period)) + return np.tile(last_season, reps)[:h] + + +def forecast_moy_mean(train_df: pd.DataFrame, value_col: str, h_months: list[str]) -> np.ndarray: + # train_df must have columns: month, moy, value_col + moy_means = train_df.groupby("moy")[value_col].mean() + h_moy = pd.PeriodIndex(h_months, freq="M").month + vals = [] + for m in h_moy: + if m in moy_means.index: + vals.append(float(moy_means.loc[m])) + else: + vals.append(float(train_df[value_col].mean())) + return np.asarray(vals, dtype=float) + + +# ---------------------------- +# Backtest + selection +# ---------------------------- + + +def _split_train_test_months(months: list[str], test_window: int) -> tuple[list[str], list[str]]: + if len(months) <= test_window: + raise ValueError("Not enough months to backtest") + train_months = months[:-test_window] + test_months = months[-test_window:] + return 
def _eval_methods_for_segment(
    seg_df: pd.DataFrame,
    value_col: str,
    methods: list[str],
    test_window: int,
) -> pd.DataFrame:
    """Backtest each method on one segment's series.

    The last ``test_window`` months of ``seg_df`` form the holdout; every method
    is fitted on the preceding months. Returns one row per method with columns
    ``series`` (= ``value_col``), ``method``, ``mae`` and ``mape``.

    Raises:
        ValueError: for an unknown method name.
    """
    months = seg_df["month"].tolist()
    train_months, test_months = _split_train_test_months(months, test_window)

    in_train = seg_df["month"].isin(train_months)
    train = seg_df.loc[in_train, value_col].to_numpy(dtype=float)
    test = seg_df.loc[seg_df["month"].isin(test_months), value_col].to_numpy(dtype=float)
    horizon = len(test)

    # Name -> zero-argument forecaster; "last" is an alias of "naive_last".
    forecasters = {
        "naive_last": lambda: forecast_naive_last(train, horizon),
        "last": lambda: forecast_naive_last(train, horizon),
        "ma3": lambda: forecast_ma(train, horizon, window=3),
        "ma6": lambda: forecast_ma(train, horizon, window=6),
        "seasonal_naive": lambda: forecast_seasonal_naive(train, horizon, period=SEASONAL_PERIOD),
        "moy_mean": lambda: forecast_moy_mean(
            seg_df.loc[in_train, ["month", "moy", value_col]].copy(), value_col, test_months
        ),
    }

    rows: list[dict[str, object]] = []
    for method in methods:
        if method not in forecasters:
            raise ValueError(f"Unknown method: {method}")
        pred = forecasters[method]()
        rows.append(
            {
                "series": value_col,
                "method": method,
                "mae": mae(test, pred),
                "mape": mape(test, pred),
            }
        )

    return pd.DataFrame(rows)


def select_best_method(metrics: pd.DataFrame) -> str:
    """Return the method with the lowest MAE; ties are broken by lowest MAPE.

    A NaN MAPE ranks worse than any numeric MAPE. Remaining ties keep the input
    row order.

    Raises:
        ValueError: if ``metrics`` has no rows.
    """
    if metrics.empty:
        raise ValueError("Cannot select a best method from an empty metrics table")
    ranked = metrics.assign(mape_rank=metrics["mape"].fillna(np.inf)).sort_values(
        ["mae", "mape_rank"], ascending=[True, True]
    )
    return str(ranked.iloc[0]["method"])
# ----------------------------
# Figures
# ----------------------------


def plot_segment_revenue_history(seg_monthly: pd.DataFrame, fig_path: Path) -> None:
    """Stacked bar chart of AR invoice revenue by customer segment over time."""
    piv = seg_monthly.pivot_table(
        index="month", columns="segment", values="invoice_amount", aggfunc="sum"
    ).fillna(0.0)

    with style_context():
        fig, ax = plt.subplots(figsize=(10, 5))
        piv.plot(kind="bar", stacked=True, ax=ax)
        ax.set_title("AR (invoice) revenue by customer segment (history)")
        ax.set_xlabel("Month")
        ax.set_ylabel("Revenue (invoice amount)")
        for lbl in ax.get_xticklabels():
            lbl.set_rotation(45)
            lbl.set_ha("right")
        _save_fig(fig, fig_path)


def plot_backtest_total(backtest_total: pd.DataFrame, fig_path: Path) -> None:
    """Line chart of actual vs predicted total revenue over the backtest window.

    ``backtest_total`` must have columns ``month``, ``actual`` and ``pred``.
    """
    with style_context():
        fig, ax = plt.subplots(figsize=(10, 4))
        x = pd.to_datetime(backtest_total["month"])
        ax.plot(x, backtest_total["actual"], marker="o", label="Actual")
        ax.plot(x, backtest_total["pred"], marker="o", label="Pred")
        # The backtest is a multi-step holdout, not a rolling 1-step forecast;
        # this title also matches the one recorded in the figures manifest.
        ax.set_title("Backtest: total revenue (12-month holdout)")
        ax.set_xlabel("Month")
        ax.set_ylabel("Revenue")
        fig.autofmt_xdate(rotation=45)
        ax.legend()
        _save_fig(fig, fig_path)


def plot_forecast_total(
    history_total: pd.DataFrame,
    forecast_total: pd.DataFrame,
    fig_path: Path,
) -> None:
    """Line chart of historical total revenue and the next-12-month forecast.

    A shaded band is drawn when ``forecast_total`` has ``forecast_lo`` and
    ``forecast_hi`` columns with at least one non-missing value each.
    """
    with style_context():
        fig, ax = plt.subplots(figsize=(10, 4))
        x_hist = pd.to_datetime(history_total["month"])
        x_fc = pd.to_datetime(forecast_total["month"])
        ax.plot(x_hist, history_total["invoice_amount"], marker="o", label="History")
        ax.plot(x_fc, forecast_total["forecast_revenue"], marker="o", label="Forecast")

        if "forecast_lo" in forecast_total.columns and "forecast_hi" in forecast_total.columns:
            lo = forecast_total["forecast_lo"]
            hi = forecast_total["forecast_hi"]
            if lo.notna().any() and hi.notna().any():
                ax.fill_between(x_fc, lo, hi, alpha=0.2, label="p10–p90 band")

        ax.set_title("Forecast: Total revenue (next 12 months)")
        ax.set_xlabel("Month")
        ax.set_ylabel("Revenue")
        fig.autofmt_xdate(rotation=45)
        ax.legend()
        _save_fig(fig, fig_path)
def write_markdown_memo(
    out_path: Path,
    design: dict,
    selected_models: dict,
    next12_total: pd.DataFrame,
) -> None:
    """Write the Chapter 17 memo (Markdown, UTF-8) to ``out_path``.

    Args:
        out_path: destination file.
        design: must provide ``segments`` (list of names) and ``top_k``.
        selected_models: segment -> ``{"count_method": ..., "value_method": ...}``.
        next12_total: TOTAL forecast rows. The uncertainty band is described
            only when ``forecast_lo`` / ``forecast_hi`` exist and hold at least
            one value each.
    """
    lines: list[str] = []
    lines.append("# Chapter 17 — Revenue forecasting (segmentation + drivers)\n")

    # Each heading is followed by its own content ("What we forecast" used to
    # be empty, with its text filed under "Top customers").
    lines.append("## What we forecast\n")
    lines.append("We forecast AR (invoice) revenue as:\n")
    lines.append("- **Revenue = invoice_count × avg_invoice_value**\n")
    lines.append("\n")

    lines.append("## Top customers\n")
    lines.append(
        "Top customers are treated as separate segments (top_k by total invoice amount in AR).\n\n"
    )

    segs = design["segments"]
    lines.append("## Customer segmentation\n")
    lines.append(f"Segments (top-{design['top_k']} customers + bucket): {', '.join(segs)}\n")
    lines.append("\n")

    lines.append("## Model selection (12-month backtest)\n")
    lines.append("Per segment, we select:\n")
    lines.append("- one method for **invoice_count** (naive_last / ma3 / seasonal_naive)\n")
    lines.append("- one method for **avg_invoice_value** (last / ma6 / moy_mean)\n")
    lines.append("Chosen by lowest MAE (tie-breaker MAPE).\n")
    lines.append("\n")

    lines.append("### Selected models\n")
    for seg, cfg in selected_models.items():
        lines.append(f"- **{seg}**: count={cfg['count_method']}, value={cfg['value_method']}\n")
    lines.append("\n")

    lines.append("## Next 12 months (TOTAL)\n")
    has_band_cols = {"forecast_lo", "forecast_hi"}.issubset(next12_total.columns)
    if len(next12_total) > 0 and has_band_cols:
        lo = next12_total["forecast_lo"].dropna().min()
        hi = next12_total["forecast_hi"].dropna().max()
        # Only claim a band when one was actually computed.
        if pd.notna(lo) and pd.notna(hi):
            lines.append(
                "Forecast includes a simple uncertainty band for TOTAL derived from backtest errors (p10–p90).\n"
            )
            lines.append(f"Band range over horizon: lo={lo:,.2f}, hi={hi:,.2f}\n")
    lines.append("\n")

    lines.append("## Files written\n")
    lines.append("Outputs are written under the Track D output folder:\n")
    lines.append("- CSV/JSON/MD in the chapter output directory\n")
    lines.append("- Figures under `figures/`\n")

    out_path.write_text("".join(lines), encoding="utf-8")
def write_figures_manifest(paths: Outputs) -> None:
    """Write the figures manifest CSV: one row per Chapter 17 figure."""
    columns = [
        "filename",
        "chart_type",
        "title",
        "x_label",
        "y_label",
        "guardrail_note",
        "data_source",
    ]
    records = [
        (
            paths.fig_segment_history.name,
            "bar",
            "AR (invoice) revenue by customer segment (history)",
            "Month",
            "Revenue (invoice amount)",
            "Stacked bar of segmented AR invoice revenue; verify totals align with ch17_ar_revenue_segment_monthly.csv.",
            "ar_events.csv (invoice rows), grouped by month and customer segment",
        ),
        (
            paths.fig_backtest_total.name,
            "line",
            "Backtest: total revenue (12-month holdout)",
            "Month",
            "Revenue",
            "Backtest compares predicted vs actual total revenue on a 12-month holdout window.",
            "ch17_ar_revenue_segment_monthly.csv (summed across segments)",
        ),
        (
            paths.fig_forecast_total.name,
            "line",
            "Forecast: total revenue (next 12 months)",
            "Month",
            "Revenue",
            "Forecast total revenue using selected driver methods and a simple p10–p90 band (TOTAL only).",
            "ch17_forecast_next12.csv",
        ),
    ]
    manifest = pd.DataFrame.from_records(records, columns=columns)
    manifest.to_csv(paths.figures_manifest_csv, index=False)


def write_manifest(paths: Outputs) -> None:
    """Write the chapter manifest JSON (artifact file names, aliases, UTC timestamp)."""
    # Order matters: it is the order of the "artifacts" list in the JSON.
    artifact_fields = (
        "seg_monthly_csv",
        "series_monthly_csv",
        "customer_segments_csv",
        "backtest_metrics_csv",
        "backtest_total_csv",
        "forecast_csv",
        "memo_md",
        "design_json",
        "known_events_template_json",
        "figures_manifest_csv",
        # figures (names only; stored under figures_dir)
        "fig_segment_history",
        "fig_backtest_total",
        "fig_forecast_total",
    )
    alias_fields = ("forecast_csv_alias", "memo_md_alias")

    stamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
    payload = {
        "chapter": 17,
        "created_utc": stamp,
        "outdir": str(paths.outdir.as_posix()),
        "figures_dir": str(paths.figures_dir.as_posix()),
        "artifacts": [getattr(paths, field).name for field in artifact_fields],
        "aliases": [getattr(paths, field).name for field in alias_fields],
    }
    paths.manifest_json.write_text(json.dumps(payload, indent=2), encoding="utf-8")


# ----------------------------
# Main analysis
# ----------------------------


def make_outpaths(outdir: Path) -> Outputs:
    """Derive every Chapter 17 output path from ``outdir`` (figures go in ``outdir/figures``)."""
    figures_dir = outdir / "figures"

    files_in_outdir = {
        "seg_monthly_csv": "ch17_ar_revenue_segment_monthly.csv",
        "series_monthly_csv": "ch17_series_monthly.csv",
        "customer_segments_csv": "ch17_customer_segments.csv",
        "backtest_metrics_csv": "ch17_backtest_metrics.csv",
        "backtest_total_csv": "ch17_backtest_total_revenue.csv",
        "forecast_csv": "ch17_forecast_next12.csv",
        "memo_md": "ch17_memo.md",
        "design_json": "ch17_design.json",
        "known_events_template_json": "ch17_known_events_template.json",
        "figures_manifest_csv": "ch17_figures_manifest.csv",
        "manifest_json": "ch17_manifest.json",
        "forecast_csv_alias": "ch17_forecast_next_12m.csv",
        "memo_md_alias": "ch17_forecast_memo.md",
    }
    files_in_figures = {
        "fig_segment_history": "ch17_fig_segment_revenue_history.png",
        "fig_backtest_total": "ch17_fig_backtest_total_revenue.png",
        "fig_forecast_total": "ch17_fig_forecast_total_revenue.png",
    }

    kwargs = {field: outdir / filename for field, filename in files_in_outdir.items()}
    kwargs.update({field: figures_dir / filename for field, filename in files_in_figures.items()})
    return Outputs(outdir=outdir, figures_dir=figures_dir, **kwargs)
def _forecast_count_by_method(method: str, history: np.ndarray, horizon: int) -> np.ndarray:
    """Forecast ``horizon`` invoice counts from ``history`` with the named count method."""
    if method == "naive_last":
        return forecast_naive_last(history, horizon)
    if method == "ma3":
        return forecast_ma(history, horizon, window=3)
    if method == "seasonal_naive":
        return forecast_seasonal_naive(history, horizon, period=SEASONAL_PERIOD)
    raise ValueError(f"Unknown count method: {method}")


def _forecast_value_by_method(method: str, history_df: pd.DataFrame, horizon_months: list[str]) -> np.ndarray:
    """Forecast avg invoice value for ``horizon_months`` with the named value method.

    ``history_df`` must have columns ``month``, ``moy`` and ``avg_invoice_value``.
    """
    history = history_df["avg_invoice_value"].to_numpy(dtype=float)
    horizon = len(horizon_months)
    if method == "last":
        return forecast_naive_last(history, horizon)
    if method == "ma6":
        return forecast_ma(history, horizon, window=6)
    if method == "moy_mean":
        return forecast_moy_mean(
            history_df[["month", "moy", "avg_invoice_value"]], "avg_invoice_value", horizon_months
        )
    raise ValueError(f"Unknown value method: {method}")


def analyze_ch17(datadir: Path, outdir: Path, seed: int = 123, top_k: int = DEFAULT_TOP_K) -> Outputs:
    """Run the Chapter 17 revenue-forecast pipeline and write all artifacts.

    Steps: load AR invoices, segment customers (top-k vs the rest), backtest
    the candidate methods per segment, pick the best count and value method for
    each, backtest TOTAL revenue on the holdout window, forecast the next 12
    months, then write CSV/JSON/Markdown/figure outputs under ``outdir``.

    Returns:
        The ``Outputs`` record holding every path that was written.
    """
    np.random.seed(seed)
    paths = make_outpaths(outdir)
    _ensure_dir(paths.outdir)
    _ensure_dir(paths.figures_dir)

    inv = load_ar_invoices(datadir)
    top_customers, customer_segments = build_customer_segments(inv, top_k=top_k)

    seg_monthly = build_segment_monthly(inv, top_customers=top_customers)
    series_monthly = build_series_monthly(seg_monthly)

    # Backtest + model selection per segment
    segments = sorted(seg_monthly["segment"].unique().tolist())
    months = sorted(seg_monthly["month"].unique().tolist())

    count_methods = ["naive_last", "ma3", "seasonal_naive"]
    value_methods = ["last", "ma6", "moy_mean"]

    metrics_rows: list[pd.DataFrame] = []
    selected: dict[str, dict[str, str]] = {}

    for seg in segments:
        seg_df = seg_monthly[seg_monthly["segment"].eq(seg)].sort_values("month")

        m_count = _eval_methods_for_segment(seg_df, "invoice_count", count_methods, BACKTEST_WINDOW_MONTHS)
        m_count["segment"] = seg
        metrics_rows.append(m_count)

        m_val = _eval_methods_for_segment(seg_df, "avg_invoice_value", value_methods, BACKTEST_WINDOW_MONTHS)
        m_val["segment"] = seg
        metrics_rows.append(m_val)

        selected[seg] = {
            "count_method": select_best_method(m_count),
            "value_method": select_best_method(m_val),
        }

    metrics = pd.concat(metrics_rows, ignore_index=True)
    metrics = metrics[["segment", "series", "method", "mae", "mape"]].sort_values(
        ["segment", "series", "mae"], ascending=[True, True, True]
    )

    # Backtest total revenue: build a holdout using the selected methods
    train_months, test_months = _split_train_test_months(months, BACKTEST_WINDOW_MONTHS)

    # Predict per segment in the test window (revenue = count x value)
    preds_by_seg: dict[str, np.ndarray] = {}
    for seg in segments:
        seg_df = seg_monthly[seg_monthly["segment"].eq(seg)].sort_values("month")
        train_df = seg_df[seg_df["month"].isin(train_months)].copy()

        pred_count = _forecast_count_by_method(
            selected[seg]["count_method"],
            train_df["invoice_count"].to_numpy(dtype=float),
            len(test_months),
        )
        pred_value = _forecast_value_by_method(selected[seg]["value_method"], train_df, test_months)
        preds_by_seg[seg] = pred_count * pred_value

    # Actual total revenue in the test window
    actual_total = (
        seg_monthly[seg_monthly["month"].isin(test_months)]
        .groupby("month", as_index=False)["invoice_amount"]
        .sum()
        .rename(columns={"invoice_amount": "actual"})
        .sort_values("month")
    )

    pred_total = pd.DataFrame({"month": test_months, "pred": np.zeros(len(test_months), dtype=float)})
    for seg in segments:
        pred_total["pred"] += preds_by_seg[seg]

    backtest_total = actual_total.merge(pred_total, on="month", how="left")
    backtest_total["error"] = backtest_total["actual"] - backtest_total["pred"]

    # Build simple TOTAL band using backtest percent error quantiles
    pct_err = backtest_total.apply(
        lambda r: _safe_div(float(r["error"]), float(r["actual"])) if float(r["actual"]) != 0 else float("nan"),
        axis=1,
    )
    pct_err = pct_err.replace([np.inf, -np.inf], np.nan).dropna()
    lo_pct = float(np.quantile(pct_err, 0.10)) if len(pct_err) else float("nan")
    hi_pct = float(np.quantile(pct_err, 0.90)) if len(pct_err) else float("nan")

    # Forecast next 12 months (per segment), fitted on the full history
    last_period = pd.Period(months[-1], freq="M")
    future_months = [(last_period + i).strftime("%Y-%m") for i in range(1, 13)]

    fc_rows: list[dict[str, object]] = []
    for seg in segments:
        seg_df = seg_monthly[seg_monthly["segment"].eq(seg)].sort_values("month")

        fc_count = _forecast_count_by_method(
            selected[seg]["count_method"],
            seg_df["invoice_count"].to_numpy(dtype=float),
            len(future_months),
        )
        fc_value = _forecast_value_by_method(selected[seg]["value_method"], seg_df, future_months)

        for m, c, v in zip(future_months, fc_count, fc_value, strict=True):
            fc_rows.append(
                {
                    "month": m,
                    "segment": seg,
                    "forecast_invoice_count": float(c),
                    "forecast_avg_invoice_value": float(v),
                    "forecast_revenue": float(c) * float(v),
                    "forecast_lo": np.nan,
                    "forecast_hi": np.nan,
                }
            )

    forecast = pd.DataFrame(fc_rows)

    # Add TOTAL segment row
    total = forecast.groupby("month", as_index=False).agg(
        forecast_invoice_count=("forecast_invoice_count", "sum"),
        forecast_avg_invoice_value=("forecast_avg_invoice_value", "mean"),
        forecast_revenue=("forecast_revenue", "sum"),
    )
    total["segment"] = "TOTAL"

    # The band exists for TOTAL only, and only when the backtest produced quantiles.
    if pd.notna(lo_pct) and pd.notna(hi_pct):
        total["forecast_lo"] = total["forecast_revenue"] * (1.0 + lo_pct)
        total["forecast_hi"] = total["forecast_revenue"] * (1.0 + hi_pct)

    forecast = pd.concat([forecast, total], ignore_index=True)
    forecast = forecast[
        [
            "month",
            "segment",
            "forecast_invoice_count",
            "forecast_avg_invoice_value",
            "forecast_revenue",
            "forecast_lo",
            "forecast_hi",
        ]
    ].sort_values(["month", "segment"], ascending=[True, True])

    # Known events template
    known_template = {
        "notes": "Optional: add known events / one-off adjustments that affect invoices or values.",
        "schema": {
            "month": "YYYY-MM",
            "segment": "One of design.segments (including 'All other customers')",
            "delta_invoice_count": "integer (additive)",
            "mult_avg_invoice_value": "float multiplier, e.g. 1.10",
            "comment": "string",
        },
        "events": [],
    }

    # Design json
    design = {
        "chapter": 17,
        "seed": seed,
        "datadir": str(datadir),
        "top_k": top_k,
        "segments": ["All other customers"] + top_customers,
        "segment_meta": {
            "top_k": top_k,
            "top_customers": top_customers,
            "segment_definition": "Top customers by total invoice amount (AR) vs All other customers",
            "n_invoices": int(len(inv)),
            "months": months,
        },
        "count_methods": count_methods,
        "value_methods": value_methods,
        "selected_models": selected,
        "backtest_window_months": BACKTEST_WINDOW_MONTHS,
        "total_band_pct_quantiles": {
            "lo": 0.10,
            "hi": 0.90,
            "lo_pct": lo_pct,
            "hi_pct": hi_pct,
        },
    }

    # Write outputs
    seg_monthly.to_csv(paths.seg_monthly_csv, index=False)
    series_monthly.to_csv(paths.series_monthly_csv, index=False)
    customer_segments.to_csv(paths.customer_segments_csv, index=False)

    metrics.to_csv(paths.backtest_metrics_csv, index=False)
    backtest_total.to_csv(paths.backtest_total_csv, index=False)

    forecast.to_csv(paths.forecast_csv, index=False)
    # aliases
    forecast.to_csv(paths.forecast_csv_alias, index=False)

    paths.design_json.write_text(json.dumps(design, indent=2), encoding="utf-8")
    paths.known_events_template_json.write_text(json.dumps(known_template, indent=2), encoding="utf-8")

    # Figures
    plot_segment_revenue_history(seg_monthly, paths.fig_segment_history)
    plot_backtest_total(backtest_total, paths.fig_backtest_total)

    hist_total = series_monthly[["month", "invoice_amount"]].copy()
    fc_total = forecast[forecast["segment"].eq("TOTAL")].copy().sort_values("month")
    plot_forecast_total(hist_total, fc_total, paths.fig_forecast_total)

    # Memo
    write_markdown_memo(paths.memo_md, design, selected, fc_total)
    # memo alias: byte-for-byte copy of the memo
    paths.memo_md_alias.write_text(paths.memo_md.read_text(encoding="utf-8"), encoding="utf-8")

    # Manifests
    write_figures_manifest(paths)
    write_manifest(paths)

    return paths
def build_parser() -> argparse.ArgumentParser:
    """Build the Chapter 17 command-line parser (--datadir, --outdir, --seed, --top-k)."""
    parser = argparse.ArgumentParser(
        description="Track D Chapter 17: revenue forecasting via segmentation + drivers"
    )
    # (flag, add_argument keyword options), registered in this order.
    options = [
        ("--datadir", {"type": Path, "required": True, "help": "Directory containing NSO v1 synthetic data"}),
        ("--outdir", {"type": Path, "required": True, "help": "Output root (e.g. outputs/track_d)"}),
        ("--seed", {"type": int, "default": 123, "help": "Random seed"}),
        ("--top-k", {"type": int, "default": DEFAULT_TOP_K, "help": "Number of top customers to segment"}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser


def main() -> None:
    """CLI entry point: parse arguments, run the analysis, print the output folder."""
    ns = build_parser().parse_args()
    result = analyze_ch17(ns.datadir, ns.outdir, seed=int(ns.seed), top_k=int(ns.top_k))
    print(f"Wrote Chapter 17 artifacts -> {result.outdir}")


if __name__ == "__main__":
    main()
@dataclass(frozen=True)
class Outputs:
    # Paths of the Chapter 18 artifacts (immutable record).
    expense_monthly_by_account_csv: Path
    expense_behavior_map_csv: Path
    payroll_monthly_csv: Path
    payroll_scenarios_forecast_csv: Path
    expense_forecast_detail_csv: Path
    expense_forecast_summary_csv: Path
    control_plan_template_csv: Path
    design_json: Path
    memo_md: Path
    figures_manifest_csv: Path


def _read_csv(datadir: Path, name: str) -> pd.DataFrame:
    """Read ``datadir / name`` as CSV; raise ``FileNotFoundError`` if the file is absent."""
    target = datadir / name
    if target.exists():
        return pd.read_csv(target)
    raise FileNotFoundError(f"Expected {name} at {target}, but it was not found.")


def _month_from_date(series: pd.Series) -> pd.Series:
    """Map date-like values to ``YYYY-MM`` strings; raise ``ValueError`` on any unparseable value."""
    parsed = pd.to_datetime(series.astype(str), errors="coerce")
    if parsed.isna().any():
        raise ValueError("Found invalid dates when building month keys.")
    months = parsed.dt.to_period("M")
    return months.astype(str)


def _next_months(last_month: str, n: int) -> list[str]:
    """Return the ``n`` month keys that follow ``last_month``."""
    anchor = pd.Period(str(last_month), freq="M")
    return [str(anchor + step) for step in range(1, n + 1)]


def _month_of_year(month_str: str) -> int:
    """Return the calendar month number of a month key."""
    return int(pd.Period(month_str, freq="M").month)


def _behavior_map() -> pd.DataFrame:
    """Return the static cost-behaviour map: one row per major expense account."""
    columns = [
        "account_id",
        "account_name",
        "behavior",
        "controllable",
        "primary_driver",
        "suggested_method",
        "monitoring_kpi",
    ]
    records = [
        (
            "6100",
            "Rent Expense",
            "fixed",
            "mostly",
            "lease/contract",
            "flat (recent average)",
            "rent as % of revenue",
        ),
        (
            "6200",
            "Utilities Expense",
            "variable",
            "some",
            "activity + season + rates",
            "seasonal mean by month-of-year",
            "utilities per store-day",
        ),
        (
            "6300",
            "Payroll Expense",
            "step",
            "yes",
            "headcount × wage × schedule",
            "scenario baseline (multipliers)",
            "payroll per revenue or payroll per transaction",
        ),
        (
            "6500",
            "Payroll Tax Expense",
            "variable",
            "limited",
            "payroll × employer tax rate",
            "rate × forecast payroll",
            "employer tax rate (should be stable)",
        ),
        (
            "6400",
            "Depreciation Expense",
            "fixed",
            "no (short-run)",
            "asset base + depreciation policy",
            "flat (schedule-driven)",
            "depreciation coverage in budget",
        ),
        (
            "6600",
            "Interest Expense",
            "fixed",
            "no (short-run)",
            "debt balance × rate",
            "flat (recent average)",
            "interest coverage ratio",
        ),
    ]
    return pd.DataFrame(records, columns=columns)
def _monthly_expenses_from_gl(datadir: Path) -> pd.DataFrame:
    """Build a wide monthly expense table from the GL.

    Note: NSO v1 gl_journal.csv is already enriched with chart-of-accounts metadata.

    Returns one row per month with columns ``month``, ``month_of_year``, one
    column per major expense account (zero-filled when the account has no
    postings) and ``operating_expenses_total``. COGS (account 5000) is excluded.

    Raises:
        ValueError: if account name/type columns cannot be found.
    """
    gl = _read_csv(datadir, "gl_journal.csv").copy()

    # month key (YYYY-MM)
    if "month" not in gl.columns:
        gl["month"] = _month_from_date(gl["date"])
    else:
        gl["month"] = gl["month"].astype(str)

    # harmonize metadata column names if they exist with merge suffixes
    for base in ("account_type", "account_name"):
        if base not in gl.columns:
            for candidate in (f"{base}_x", f"{base}_y"):
                if candidate in gl.columns:
                    gl[base] = gl[candidate]
                    break

    if "account_type" not in gl.columns or "account_name" not in gl.columns:
        raise ValueError(
            "Expected gl_journal.csv to contain account_name/account_type columns (NSO v1 contract)."
        )

    # read_csv infers numeric account ids as integers. The pivot below uses
    # account_id as column labels and the mapping keys are strings, so without
    # this cast no label would match and every expense column would be 0.0.
    gl["account_id"] = gl["account_id"].astype(str)

    # Expenses are debits (positive) for normal usage.
    gl["amount"] = gl["debit"].astype(float) - gl["credit"].astype(float)

    exp = gl.loc[gl["account_type"].astype(str) == "Expense"].copy()
    exp = exp.loc[exp["account_id"] != "5000"].copy()  # exclude COGS

    m = (
        exp.groupby(["month", "account_id", "account_name"], as_index=False)["amount"]
        .sum()
        .sort_values(["month", "account_id"])
        .reset_index(drop=True)
    )

    # Wide for plotting + easier scanning
    wide = (
        m.pivot_table(index="month", columns="account_id", values="amount", aggfunc="sum", fill_value=0.0)
        .reset_index()
        .sort_values("month")
        .reset_index(drop=True)
    )

    wide["month_of_year"] = wide["month"].map(_month_of_year)

    # Stable expense columns for the major NSO accounts (keep stable even if zeros)
    mapping = {
        "6100": "rent_expense",
        "6200": "utilities_expense",
        "6300": "payroll_expense",
        "6400": "depreciation_expense",
        "6500": "payroll_tax_expense",
        "6600": "interest_expense",
    }
    for acc in mapping:
        if acc not in wide.columns:
            wide[acc] = 0.0
    wide = wide.rename(columns=mapping)

    wide["operating_expenses_total"] = wide[list(mapping.values())].astype(float).sum(axis=1)

    cols = ["month", "month_of_year", *mapping.values(), "operating_expenses_total"]
    return wide[cols].copy()


def _payroll_monthly(datadir: Path) -> pd.DataFrame:
    """Aggregate payroll events to one row per month.

    Columns: ``month``, ``gross_wages``, ``employer_tax``,
    ``total_payroll_cost`` and ``employer_tax_rate`` (NaN when gross wages are
    not positive).
    """
    pe = _read_csv(datadir, "payroll_events.csv").copy()

    # Same month-key handling as the GL loader, so both tables share string keys.
    if "month" not in pe.columns:
        pe["month"] = _month_from_date(pe["date"])
    else:
        pe["month"] = pe["month"].astype(str)

    event_type = pe["event_type"].astype(str)
    accr = pe.loc[event_type == "payroll_accrual"]
    tax = pe.loc[event_type == "payroll_tax_accrual"]

    gross = accr.groupby("month", as_index=False)["gross_wages"].sum()
    emp_tax = tax.groupby("month", as_index=False)["employer_tax"].sum()

    # Outer merge keeps months that have only wages or only tax; gaps become 0.
    df = gross.merge(emp_tax, on="month", how="outer").fillna(0.0)
    df = df.sort_values("month").reset_index(drop=True)
    df["total_payroll_cost"] = df["gross_wages"].astype(float) + df["employer_tax"].astype(float)

    # rate is only meaningful when gross is non-zero
    df["employer_tax_rate"] = np.where(
        df["gross_wages"].astype(float) > 0,
        df["employer_tax"].astype(float) / df["gross_wages"].astype(float),
        np.nan,
    )

    return df


def _forecast_fixed(history: pd.Series, fallback: float = 0.0) -> float:
    """Flat forecast: mean of the last (up to) 12 finite observations.

    Returns ``fallback`` when no finite observation exists.
    """
    h = history.astype(float).replace([np.inf, -np.inf], np.nan).dropna()
    if len(h) == 0:
        return float(fallback)
    # recent average is more stable than a single last value
    return float(h.tail(min(12, len(h))).mean())


def _forecast_seasonal_mean(history_df: pd.DataFrame, value_col: str, months_forecast: list[str]) -> pd.Series:
    """Forecast each month with the historical mean for its month of year.

    ``history_df`` must have columns ``month``, ``month_of_year`` and
    ``value_col``. A month of year with no history falls back to the overall
    mean. Returns a float Series named ``value_col`` indexed by ``month``; it
    is empty when ``months_forecast`` is empty.
    """
    tmp = history_df[["month", "month_of_year", value_col]].copy()
    means = tmp.groupby("month_of_year", as_index=False)[value_col].mean()

    values: list[float] = []
    for m in months_forecast:
        hit = means.loc[means["month_of_year"] == _month_of_year(m)]
        values.append(float(hit[value_col].iloc[0]) if not hit.empty else float(tmp[value_col].mean()))

    # Built directly as a Series so an empty horizon yields an empty result
    # instead of a KeyError from set_index on a column-less frame.
    return pd.Series(values, index=pd.Index(months_forecast, name="month"), name=value_col, dtype=float)
float(tmp[value_col].mean()) + out_rows.append({"month": m, value_col: val}) + + return pd.DataFrame(out_rows).set_index("month")[value_col] + + +def analyze_ch18( + datadir: Path, + outdir: Path, + seed: int = 123, + wage_inflation_monthly: float = 0.002, +) -> Outputs: + """Run Chapter 18 analysis and write outputs into outdir.""" + + outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + expense_history = _monthly_expenses_from_gl(datadir) + payroll_history = _payroll_monthly(datadir) + + months = expense_history["month"].tolist() + if len(months) < 12: + raise ValueError(f"Chapter 18 requires at least 12 months of data; got {len(months)}.") + + last_month = str(months[-1]) + horizon = 12 + months_fc = _next_months(last_month, horizon) + + # --- Payroll scenario model (headcount/wage changes modeled as multipliers) --- + scenarios: dict[str, float] = { + "Lean": 0.90, + "Base": 1.00, + "Growth": 1.15, + } + + baseline_gross = float(payroll_history["gross_wages"].tail(12).mean()) + + rate_series = payroll_history["employer_tax_rate"].replace([np.inf, -np.inf], np.nan).dropna() + employer_tax_rate = float(rate_series.mean()) if len(rate_series) > 0 else 0.08 + + payroll_fc_rows: list[dict[str, object]] = [] + for scenario, mult in scenarios.items(): + for i, m in enumerate(months_fc): + infl = float((1.0 + wage_inflation_monthly) ** i) + gross = float(baseline_gross * mult * infl) + emp_tax = float(gross * employer_tax_rate) + payroll_fc_rows.append( + { + "month": m, + "scenario": scenario, + "payroll_multiplier": float(mult), + "wage_inflation_monthly": float(wage_inflation_monthly), + "forecast_gross_wages": gross, + "forecast_employer_tax": emp_tax, + "forecast_total_payroll_cost": float(gross + emp_tax), + } + ) + + payroll_fc = pd.DataFrame(payroll_fc_rows) + + # --- Other expense forecasts (simple explainable baselines) --- + rent_fixed = 
_forecast_fixed(expense_history["rent_expense"]) + dep_fixed = _forecast_fixed(expense_history["depreciation_expense"]) + int_fixed = _forecast_fixed(expense_history["interest_expense"]) + + util_fc = _forecast_seasonal_mean(expense_history, "utilities_expense", months_fc) + + # Build forecast detail + summary + behavior = _behavior_map() + + detail_rows: list[dict[str, object]] = [] + summary_rows: list[dict[str, object]] = [] + + for scenario in scenarios.keys(): + psub = payroll_fc.loc[payroll_fc["scenario"] == scenario].copy() + psub = psub.sort_values("month") + psub = psub.set_index("month") + + for m in months_fc: + payroll_amt = float(psub.loc[m, "forecast_gross_wages"]) + payroll_tax_amt = float(psub.loc[m, "forecast_employer_tax"]) + + row_fixed = { + "rent_expense": rent_fixed, + "utilities_expense": float(util_fc.loc[m]), + "payroll_expense": payroll_amt, + "payroll_tax_expense": payroll_tax_amt, + "depreciation_expense": dep_fixed, + "interest_expense": int_fixed, + } + + total = float(sum(row_fixed.values())) + controllable = float(row_fixed["rent_expense"] + row_fixed["utilities_expense"] + row_fixed["payroll_expense"] + row_fixed["payroll_tax_expense"]) + + summary_rows.append( + { + "month": m, + "scenario": scenario, + **row_fixed, + "operating_expenses_total": total, + "controllable_expenses_total": controllable, + } + ) + + # detail rows per account + for m in months_fc: + detail_rows.extend( + [ + { + "month": m, + "scenario": scenario, + "account_id": "6100", + "account_name": "Rent Expense", + "behavior": "fixed", + "forecast_method": "flat_recent_avg", + "forecast_amount": float(rent_fixed), + }, + { + "month": m, + "scenario": scenario, + "account_id": "6200", + "account_name": "Utilities Expense", + "behavior": "variable", + "forecast_method": "seasonal_mean_moy", + "forecast_amount": float(util_fc.loc[m]), + }, + { + "month": m, + "scenario": scenario, + "account_id": "6300", + "account_name": "Payroll Expense", + "behavior": "step", + 
"forecast_method": "scenario_multiplier_inflation", + "forecast_amount": float(psub.loc[m, "forecast_gross_wages"]), + }, + { + "month": m, + "scenario": scenario, + "account_id": "6500", + "account_name": "Payroll Tax Expense", + "behavior": "variable", + "forecast_method": "rate_x_payroll", + "forecast_amount": float(psub.loc[m, "forecast_employer_tax"]), + }, + { + "month": m, + "scenario": scenario, + "account_id": "6400", + "account_name": "Depreciation Expense", + "behavior": "fixed", + "forecast_method": "flat_recent_avg", + "forecast_amount": float(dep_fixed), + }, + { + "month": m, + "scenario": scenario, + "account_id": "6600", + "account_name": "Interest Expense", + "behavior": "fixed", + "forecast_method": "flat_recent_avg", + "forecast_amount": float(int_fixed), + }, + ] + ) + + forecast_detail = pd.DataFrame(detail_rows) + forecast_summary = pd.DataFrame(summary_rows) + + # Control plan template (filled-in placeholders students can edit) + control_rows = [ + { + "expense_group": "Payroll (gross + employer tax)", + "primary_driver": "headcount × wage × schedule", + "controllable": "yes", + "monitoring_kpi": "payroll per revenue; payroll per transaction", + "owner": "", + "review_cadence": "weekly", + "notes": "What staffing decisions change this line?", + }, + { + "expense_group": "Rent", + "primary_driver": "lease contract", + "controllable": "mostly", + "monitoring_kpi": "rent as % of revenue", + "owner": "", + "review_cadence": "monthly", + "notes": "Renewals, sublease options, or space optimization.", + }, + { + "expense_group": "Utilities", + "primary_driver": "activity + season + rates", + "controllable": "some", + "monitoring_kpi": "utilities per store-day", + "owner": "", + "review_cadence": "monthly", + "notes": "Watch rate changes and seasonal spikes.", + }, + { + "expense_group": "Depreciation", + "primary_driver": "asset base + policy", + "controllable": "no (short-run)", + "monitoring_kpi": "capex plan vs budget", + "owner": "", + 
"review_cadence": "quarterly", + "notes": "Forecast tied to depreciation schedule.", + }, + { + "expense_group": "Interest", + "primary_driver": "debt × rate", + "controllable": "no (short-run)", + "monitoring_kpi": "interest coverage ratio", + "owner": "", + "review_cadence": "monthly", + "notes": "Refinancing is a structural change; document assumptions.", + }, + ] + control_plan = pd.DataFrame(control_rows) + + # --- Write outputs --- + expense_monthly_by_account_csv = outdir / "ch18_expense_monthly_by_account.csv" + expense_behavior_map_csv = outdir / "ch18_expense_behavior_map.csv" + payroll_monthly_csv = outdir / "ch18_payroll_monthly.csv" + payroll_scenarios_forecast_csv = outdir / "ch18_payroll_scenarios_forecast.csv" + expense_forecast_detail_csv = outdir / "ch18_expense_forecast_next12_detail.csv" + expense_forecast_summary_csv = outdir / "ch18_expense_forecast_next12_summary.csv" + control_plan_template_csv = outdir / "ch18_control_plan_template.csv" + design_json = outdir / "ch18_design.json" + memo_md = outdir / "ch18_memo.md" + figures_manifest_csv = outdir / "ch18_figures_manifest.csv" + + expense_history.to_csv(expense_monthly_by_account_csv, index=False) + behavior.to_csv(expense_behavior_map_csv, index=False) + payroll_history.to_csv(payroll_monthly_csv, index=False) + payroll_fc.to_csv(payroll_scenarios_forecast_csv, index=False) + forecast_detail.to_csv(expense_forecast_detail_csv, index=False) + forecast_summary.to_csv(expense_forecast_summary_csv, index=False) + control_plan.to_csv(control_plan_template_csv, index=False) + + design = { + "chapter": CHAPTER, + "seed": seed, + "history_months": months, + "forecast_months": months_fc, + "horizon_months": horizon, + "forecast_horizon_months": horizon, + "scenario_multipliers": scenarios, + "payroll_scenarios": scenarios, + "baseline_payroll_gross_recent_avg_12m": baseline_gross, + "employer_tax_rate_estimate": employer_tax_rate, + "wage_inflation_monthly": wage_inflation_monthly, + "notes": [ + 
"Fixed costs forecasted as a recent average (simple baseline).", + "Utilities forecasted by month-of-year seasonal mean.", + "Payroll forecasted via scenario multipliers + wage inflation.", + ], + } + design_json.write_text(json.dumps(design, indent=2), encoding="utf-8") + + memo_lines = [ + "# Chapter 18 Expense Forecast Memo (fixed / variable / step)\n\n", + "This memo provides a **planning baseline** for key operating expenses.\n", + "It is not a causal model. Treat rates and multipliers as **rules of thumb**.\n\n", + "## Expense behavior map\n\n", + behavior.to_markdown(index=False), + "\n\n", + "## Payroll model (scenario-based)\n\n", + f"- Baseline gross wages: {baseline_gross:,.0f} per month (recent 12-month average)\n", + f"- Estimated employer tax rate: {employer_tax_rate:.3f}\n", + f"- Wage inflation assumption: {wage_inflation_monthly:.3%} per month\n\n", + "Scenarios are implemented as multipliers applied to baseline payroll.\n\n", + "## Next 12 months: expense forecast summary (by scenario)\n\n", + forecast_summary.to_markdown(index=False), + "\n\n", + "## Control plan template\n\n", + "Use this template during month-end close to connect expense monitoring to owners and cadence.\n\n", + control_plan.to_markdown(index=False), + "\n\n", + "## Guardrails\n\n", + "- If payroll changes (hiring/layoffs), update scenario multipliers and document the decision.\n", + "- If utilities are rate-driven (not activity-driven), treat it as a known event rather than noise.\n", + "- Keep a versioned assumptions log when sharing forecasts externally.\n", + ] + memo_md.write_text("".join(memo_lines), encoding="utf-8") + + # --- Figures + manifest --- + manifest_rows: list[FigureManifestRow] = [] + + def _add_row(fig_path: Path, spec: FigureSpec) -> None: + manifest_rows.append( + FigureManifestRow( + filename=fig_path.name, + chart_type=spec.chart_type, + title=spec.title, + x_label=spec.x_label, + y_label=spec.y_label, + data_source="NSO v1 synthetic outputs", + 
guardrail_note=( + "Forecasts are planning baselines. Confirm assumptions, " + "contracts, and staffing decisions before acting." + ), + ) + ) + + with style_context(): + fig = plot_time_series( + expense_history, + x="month", + series={ + "Rent": "rent_expense", + "Utilities": "utilities_expense", + "Payroll": "payroll_expense", + "Payroll tax": "payroll_tax_expense", + "Depreciation": "depreciation_expense", + "Interest": "interest_expense", + }, + title="Operating expense history by category", + x_label="Month", + y_label="Expense (debit amounts)", + ) + spec = FigureSpec( + chart_type="line", + title="Operating expense history by category", + x_label="Month", + y_label="Expense (debit amounts)", + data_source="gl_journal.csv + chart_of_accounts.csv", + notes="COGS excluded; shows major operating expense categories.", + ) + fig_path = figures_dir / "ch18_fig_expense_history_by_category.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + # Payroll scenarios (total payroll cost) + scen_wide = ( + payroll_fc.pivot_table( + index="month", + columns="scenario", + values="forecast_total_payroll_cost", + aggfunc="mean", + fill_value=0.0, + ) + .reset_index() + .sort_values("month") + .reset_index(drop=True) + ) + + series_map = {f"Payroll cost ({c})": c for c in ["Lean", "Base", "Growth"] if c in scen_wide.columns} + + with style_context(): + fig = plot_time_series( + scen_wide, + x="month", + series=series_map, + title="Payroll forecast scenarios (next 12 months)", + x_label="Month", + y_label="Total payroll cost", + ) + spec = FigureSpec( + chart_type="line", + title="Payroll forecast scenarios (next 12 months)", + x_label="Month", + y_label="Total payroll cost", + data_source="payroll_events.csv (historical) + scenario model", + notes="Scenario multipliers model step-cost behavior in staffing.", + ) + fig_path = figures_dir / "ch18_fig_payroll_scenarios_next12.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + 
pd.DataFrame([r.__dict__ for r in manifest_rows]).to_csv(figures_manifest_csv, index=False) + + return Outputs( + expense_monthly_by_account_csv=expense_monthly_by_account_csv, + expense_behavior_map_csv=expense_behavior_map_csv, + payroll_monthly_csv=payroll_monthly_csv, + payroll_scenarios_forecast_csv=payroll_scenarios_forecast_csv, + expense_forecast_detail_csv=expense_forecast_detail_csv, + expense_forecast_summary_csv=expense_forecast_summary_csv, + control_plan_template_csv=control_plan_template_csv, + design_json=design_json, + memo_md=memo_md, + figures_manifest_csv=figures_manifest_csv, + ) + + +def _build_cli() -> Any: + p = base_parser(description=CHAPTER) + p.add_argument("--datadir", type=Path, required=True) + p.add_argument( + "--wage-inflation-monthly", + type=float, + default=0.002, + help="Monthly wage inflation assumption for payroll scenarios (default: 0.002 = 0.2%).", + ) + return p + + +def main(argv: list[str] | None = None) -> int: + p = _build_cli() + args = p.parse_args(argv) + analyze_ch18( + datadir=args.datadir, + outdir=args.outdir, + seed=args.seed or 123, + wage_inflation_monthly=float(args.wage_inflation_monthly), + ) + print("Wrote Chapter 18 artifacts ->", args.outdir) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/workbooks/track_d_template/scripts/business_ch19_cash_flow_forecasting_direct_method_13_week.py b/workbooks/track_d_template/scripts/business_ch19_cash_flow_forecasting_direct_method_13_week.py new file mode 100644 index 0000000..d97e50f --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch19_cash_flow_forecasting_direct_method_13_week.py @@ -0,0 +1,625 @@ +# SPDX-License-Identifier: MIT +"""Track D — Chapter 19: Cash flow forecasting (direct method, 13-week). 
+ +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch19_cash_history_weekly.csv +* ch19_cash_forecast_13w_scenarios.csv +* ch19_cash_assumptions.csv +* ch19_cash_governance_template.csv +* ch19_design.json +* ch19_memo.md +* ch19_figures_manifest.csv + +This chapter builds a short-term cash forecast using a **direct method** view: + + cash receipts (inflows) - cash payments (outflows) = net cash flow + +The NSO v1 simulator provides enough structure to demonstrate a realistic, accountant-friendly +workflow: + +- Use the **bank statement feed** as the cash source of truth (timing matters). +- Use AR/AP/payroll/sales tax/debt events to explain (and stress test) working-capital timing. + +Outputs are deterministic and written under outputs/track_d/. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any +import json + +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed, base_parser +from scripts._reporting_style import FigureManifestRow, FigureSpec, plot_time_series, save_figure, style_context + +CHAPTER = "Track D — Chapter 19" + + +@dataclass(frozen=True) +class Outputs: + cash_history_weekly_csv: Path + cash_forecast_13w_scenarios_csv: Path + cash_assumptions_csv: Path + cash_governance_template_csv: Path + design_json: Path + memo_md: Path + figures_manifest_csv: Path + + +def _week_start_monday(dt: pd.Timestamp) -> pd.Timestamp: + # Monday = 0 + return (dt - pd.Timedelta(days=int(dt.weekday()))).normalize() + + +def _classify_bank_txn(description: str, amount: float) -> str: + d = str(description).lower() + + # Receipts + if "cash sale" in d: + return "cash_sales" + if "collect on accounts receivable" in d: + return "ar_collections" + if "owner contribution" in d: + return "owner_contribution" + if "borrow" in d or "loan origination" in d: + return "borrowings" + + # Payments + if "pay accounts payable" in d: + return 
"ap_payments" + if "inventory purchase (cash" in d or "inventory purchase (cash)" in d: + return "inventory_cash_purchase" + if "pay monthly rent" in d: + return "rent" + if "pay utilities" in d: + return "utilities" + if "pay prior-month net wages" in d: + return "payroll_net_wages" + if "remit prior-month payroll taxes" in d: + return "payroll_tax_remit" + if "remit prior-month sales tax" in d: + return "sales_tax_remit" + if "pay note payable" in d: + return "debt_payment" + if "owner draw" in d: + return "owner_draw" + if "acquire fixed asset" in d: + return "capex" + + return "other_receipts" if float(amount) > 0 else "other_payments" + + +def _build_history_weekly(bank_statement: pd.DataFrame) -> pd.DataFrame: + if bank_statement.empty: + return pd.DataFrame( + columns=[ + "week_start", + "cash_in_total", + "cash_out_total", + "net_cash_flow", + "ending_cash", + ] + ) + + df = bank_statement.copy() + df["posted_date"] = pd.to_datetime(df["posted_date"], errors="coerce") + df = df.dropna(subset=["posted_date"]).copy() + df["week_start"] = df["posted_date"].apply(_week_start_monday) + df["category"] = [ + _classify_bank_txn(desc, amt) for desc, amt in zip(df["description"].astype(str), df["amount"].astype(float)) + ] + + # Aggregate signed amounts per week/category + wk_cat = ( + df.groupby(["week_start", "category"], observed=True)["amount"].sum().reset_index().sort_values("week_start") + ) + + # Wide category view (signed amounts) + wide = wk_cat.pivot_table(index="week_start", columns="category", values="amount", aggfunc="sum", fill_value=0.0) + wide = wide.reset_index().sort_values("week_start").reset_index(drop=True) + + # Totals + amt_cols = [c for c in wide.columns if c != "week_start"] + wide["cash_in_total"] = wide[amt_cols].clip(lower=0.0).sum(axis=1) + wide["cash_out_total"] = (-wide[amt_cols].clip(upper=0.0)).sum(axis=1) + wide["net_cash_flow"] = wide[amt_cols].sum(axis=1) + + # Running cash balance (start at 0; bank statement includes initial 
capital) + wide["ending_cash"] = wide["net_cash_flow"].cumsum() + + # Friendly ISO date labels + wide["week_start"] = wide["week_start"].dt.strftime("%Y-%m-%d") + return wide + + +def _buffer_target_from_history(history: pd.DataFrame) -> float: + """Simple buffer policy: target covers a "bad" week. + + We use the 90th percentile of weekly cash outflows on weeks where net cash flow is negative. + If there are no negative weeks (rare), fall back to the 75th percentile of total outflow. + + """ + if history.empty: + return 0.0 + + net = history["net_cash_flow"].astype(float) + out = history["cash_out_total"].astype(float) + + bad = (-net.loc[net < 0]).astype(float) + if len(bad) >= 3: + return float(np.quantile(bad, 0.90)) + if len(out) >= 3: + return float(np.quantile(out, 0.75)) + return float(out.mean()) + + +def _seasonal_pattern_by_week_of_month(history: pd.DataFrame) -> pd.DataFrame: + """Compute mean signed amount by (category, week_of_month) from recent history.""" + if history.empty: + return pd.DataFrame(columns=["category", "week_of_month", "mean_amount"]) + + # Identify category columns (everything except these) + reserved = {"week_start", "cash_in_total", "cash_out_total", "net_cash_flow", "ending_cash"} + cat_cols = [c for c in history.columns if c not in reserved] + + # long form + long = history[["week_start", *cat_cols]].copy() + long["week_start"] = pd.to_datetime(long["week_start"]) + long["week_of_month"] = 1 + ((long["week_start"].dt.day - 1) // 7) + + melted = long.melt(id_vars=["week_start", "week_of_month"], var_name="category", value_name="amount") + + pat = ( + melted.groupby(["category", "week_of_month"], observed=True)["amount"] + .mean() + .reset_index() + .rename(columns={"amount": "mean_amount"}) + ) + return pat + + +def _baseline_amount(pattern: pd.DataFrame, overall_means: dict[str, float], category: str, week_of_month: int) -> float: + hit = pattern.loc[(pattern["category"] == category) & (pattern["week_of_month"] == week_of_month)] 
+ if not hit.empty: + return float(hit.iloc[0]["mean_amount"]) + return float(overall_means.get(category, 0.0)) + + +def analyze_ch19(*, datadir: Path, outdir: Path, seed: int = 123) -> Outputs: + """Run Chapter 19 analysis and write deterministic artifacts.""" + apply_seed(seed) + + outdir = Path(outdir) + outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + bank_statement_csv = Path(datadir) / "bank_statement.csv" + ar_events_csv = Path(datadir) / "ar_events.csv" + ap_events_csv = Path(datadir) / "ap_events.csv" + + bank = pd.read_csv(bank_statement_csv) if bank_statement_csv.exists() else pd.DataFrame() + ar = pd.read_csv(ar_events_csv) if ar_events_csv.exists() else pd.DataFrame() + ap = pd.read_csv(ap_events_csv) if ap_events_csv.exists() else pd.DataFrame() + + # ------------------------------- + # 1) Weekly cash history from bank feed + # ------------------------------- + hist = _build_history_weekly(bank) + + cash_history_weekly_csv = outdir / "ch19_cash_history_weekly.csv" + hist.to_csv(cash_history_weekly_csv, index=False) + + # ------------------------------- + # 2) Forecast scaffolding (13 weeks) + # ------------------------------- + scenarios = ["Base", "Stress_Delayed_Collections", "Stress_Supplier_Terms_Tighten"] + + # Pattern window: use up to last 52 weeks for seasonality by week-of-month + if not hist.empty: + hist_recent = hist.tail(min(len(hist), 52)).copy() + else: + hist_recent = hist.copy() + + pattern = _seasonal_pattern_by_week_of_month(hist_recent) + + reserved = {"week_start", "cash_in_total", "cash_out_total", "net_cash_flow", "ending_cash"} + categories = [c for c in hist.columns if c not in reserved] + overall_means = {c: float(hist_recent[c].astype(float).mean()) for c in categories} + + # Forecast weeks + if hist.empty: + last_week = pd.Timestamp("2025-01-06") + start_cash = 0.0 + else: + last_week = pd.to_datetime(hist["week_start"].iloc[-1]) + start_cash = 
float(hist["ending_cash"].iloc[-1]) + + forecast_weeks = [last_week + pd.Timedelta(days=7 * i) for i in range(1, 14)] + + buffer_target = _buffer_target_from_history(hist_recent) + + # Build baseline (signed amounts per category) by applying week-of-month pattern + base_rows: list[dict[str, Any]] = [] + for wk in forecast_weeks: + wom = int(1 + ((wk.day - 1) // 7)) + row: dict[str, Any] = {"week_start": wk.strftime("%Y-%m-%d"), "week_of_month": wom} + for cat in categories: + row[cat] = _baseline_amount(pattern, overall_means, cat, wom) + base_rows.append(row) + + base_weekly = pd.DataFrame(base_rows) + + # Scenario adjustments + def _apply_delayed_collections(df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + if "ar_collections" not in out.columns: + return out + shifted = out["ar_collections"].astype(float) * 0.20 + out["ar_collections"] = out["ar_collections"].astype(float) * 0.80 + # push 20% two weeks later (within horizon) + for i in range(len(out)): + j = i + 2 + if j < len(out): + out.loc[j, "ar_collections"] = float(out.loc[j, "ar_collections"] + shifted.iloc[i]) + return out + + def _apply_supplier_tighten(df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + for cat in ["ap_payments", "inventory_cash_purchase"]: + if cat in out.columns: + out[cat] = out[cat].astype(float) * 1.15 + return out + + scenario_frames: dict[str, pd.DataFrame] = { + "Base": base_weekly, + "Stress_Delayed_Collections": _apply_delayed_collections(base_weekly), + "Stress_Supplier_Terms_Tighten": _apply_supplier_tighten(base_weekly), + } + + # Assemble forecast table with running cash balances + forecast_rows: list[dict[str, Any]] = [] + for scen in scenarios: + scen_df = scenario_frames[scen].copy() + scen_df = scen_df.sort_values("week_start").reset_index(drop=True) + + begin_cash = start_cash + for _, r in scen_df.iterrows(): + # signed category amounts + signed_sum = 0.0 + cash_in = 0.0 + cash_out = 0.0 + for cat in categories: + amt = float(r.get(cat, 0.0)) + signed_sum += 
amt + if amt >= 0: + cash_in += amt + else: + cash_out += -amt + + end_cash = float(begin_cash + signed_sum) + trigger = bool(end_cash < buffer_target) + + row_out: dict[str, Any] = { + "week_start": str(r["week_start"]), + "scenario": scen, + "beginning_cash": float(begin_cash), + "cash_in_total": float(cash_in), + "cash_out_total": float(cash_out), + "net_cash_flow": float(signed_sum), + "ending_cash": float(end_cash), + "buffer_target": float(buffer_target), + "buffer_trigger": trigger, + } + # Optional: include the two key working-capital drivers as explicit columns when present + if "ar_collections" in categories: + row_out["cash_in_ar_collections"] = float(max(0.0, float(r.get("ar_collections", 0.0)))) + if "ap_payments" in categories: + ap_amt = float(r.get("ap_payments", 0.0)) + row_out["cash_out_ap_payments"] = float(-ap_amt if ap_amt < 0 else 0.0) + + forecast_rows.append(row_out) + begin_cash = end_cash + + fc = pd.DataFrame(forecast_rows) + + # Ensure stable ordering + fc["week_start"] = pd.to_datetime(fc["week_start"]) + fc = fc.sort_values(["scenario", "week_start"], kind="mergesort").reset_index(drop=True) + fc["week_start"] = fc["week_start"].dt.strftime("%Y-%m-%d") + + cash_forecast_13w_scenarios_csv = outdir / "ch19_cash_forecast_13w_scenarios.csv" + fc.to_csv(cash_forecast_13w_scenarios_csv, index=False) + + # ------------------------------- + # 3) Assumptions + governance templates + # ------------------------------- + # Simple AR/AP behavior summaries (for the story) + def _safe_ratio(n: float, d: float) -> float: + return float(n / d) if abs(d) > 1e-12 else 0.0 + + ar_collect_rate = 0.0 + if not ar.empty and {"event_type", "cash_received", "amount"}.issubset(set(ar.columns)): + inv_amt = float(ar.loc[ar["event_type"].astype(str) == "invoice", "amount"].sum()) + coll_amt = float(ar.loc[ar["event_type"].astype(str) == "collection", "cash_received"].sum()) + ar_collect_rate = _safe_ratio(coll_amt, inv_amt) + + ap_pay_rate = 0.0 + if not ap.empty 
and {"event_type", "cash_paid", "amount"}.issubset(set(ap.columns)): + inv_amt = float(ap.loc[ap["event_type"].astype(str) == "invoice", "amount"].sum()) + pay_amt = float(ap.loc[ap["event_type"].astype(str) == "payment", "cash_paid"].sum()) + ap_pay_rate = _safe_ratio(pay_amt, inv_amt) + + assumptions_rows: list[dict[str, Any]] = [] + for scen in scenarios: + assumptions_rows.extend( + [ + { + "scenario": scen, + "assumption_key": "pattern_window_weeks", + "assumption_value": int(min(len(hist), 52)) if not hist.empty else 0, + "unit": "weeks", + "note": "Uses recent history to preserve timing patterns by week-of-month.", + }, + { + "scenario": scen, + "assumption_key": "ar_collection_rate_history", + "assumption_value": float(ar_collect_rate), + "unit": "ratio", + "note": "History-based AR cash collections / invoices (teaching simplification).", + }, + { + "scenario": scen, + "assumption_key": "ap_payment_rate_history", + "assumption_value": float(ap_pay_rate), + "unit": "ratio", + "note": "History-based AP cash paid / credit invoices (teaching simplification).", + }, + { + "scenario": scen, + "assumption_key": "buffer_target", + "assumption_value": float(buffer_target), + "unit": "currency", + "note": "Target cash buffer based on recent distribution of weekly outflows.", + }, + ] + ) + + if scen == "Stress_Delayed_Collections": + assumptions_rows.append( + { + "scenario": scen, + "assumption_key": "delayed_collections_shift", + "assumption_value": "20% shifted by +2 weeks", + "unit": "text", + "note": "Stress: some customers pay later than expected.", + } + ) + if scen == "Stress_Supplier_Terms_Tighten": + assumptions_rows.append( + { + "scenario": scen, + "assumption_key": "supplier_cash_out_multiplier", + "assumption_value": 1.15, + "unit": "multiplier", + "note": "Stress: suppliers require more cash / faster payment behavior.", + } + ) + + cash_assumptions_csv = outdir / "ch19_cash_assumptions.csv" + pd.DataFrame(assumptions_rows).to_csv(cash_assumptions_csv, 
index=False) + + governance = pd.DataFrame( + [ + { + "item": "Update cadence", + "description": "Update the 13-week forecast weekly (after bank download).", + "owner_role": "Bookkeeper / Controller", + "cadence": "Weekly", + "artifact": "ch19_cash_forecast_13w_scenarios.csv", + "escalation_trigger": "If buffer_trigger is True for any scenario in next 4 weeks.", + }, + { + "item": "Collections assumptions", + "description": "Review AR collections behavior (aging, large invoices, disputes).", + "owner_role": "AR lead", + "cadence": "Weekly", + "artifact": "AR aging + collections notes", + "escalation_trigger": "If stressed scenario shows cash < buffer for 2+ consecutive weeks.", + }, + { + "item": "Payments discipline", + "description": "Confirm AP payment plan and supplier term changes.", + "owner_role": "AP lead", + "cadence": "Weekly", + "artifact": "AP aging + payment run", + "escalation_trigger": "If supplier terms tighten or a key vendor changes terms.", + }, + { + "item": "Cash governance", + "description": "Decide actions when triggers fire (delay discretionary spend, expedite collections, renegotiate terms).", + "owner_role": "CFO / Owner", + "cadence": "As-needed", + "artifact": "ch19_memo.md", + "escalation_trigger": "Projected ending cash < buffer_target (any scenario).", + }, + ] + ) + + cash_governance_template_csv = outdir / "ch19_cash_governance_template.csv" + governance.to_csv(cash_governance_template_csv, index=False) + + # ------------------------------- + # 4) Design JSON + memo + # ------------------------------- + design = { + "chapter": CHAPTER, + "horizon_weeks": 13, + "scenarios": scenarios, + "data_sources": { + "bank_statement": "bank_statement.csv", + "ar_events": "ar_events.csv", + "ap_events": "ap_events.csv", + "payroll_events": "payroll_events.csv", + "sales_tax_events": "sales_tax_events.csv", + "debt_schedule": "debt_schedule.csv", + }, + "direct_method": { + "definition": "Cash receipts and cash payments, forecast at weekly 
granularity.", + "note": "Use as a planning baseline; confirm large known events manually.", + }, + "buffer_policy": { + "buffer_target": float(buffer_target), + "method": "p90 of negative weekly net cash flows (recent history)", + }, + "stress_tests": { + "delayed_collections": "Shift 20% of AR collections two weeks later.", + "supplier_terms_tighten": "Increase key supplier cash outflows by 15%.", + }, + } + + design_json = outdir / "ch19_design.json" + design_json.write_text(json.dumps(design, indent=2, sort_keys=True), encoding="utf-8") + + # A short memo (CFO-style) + memo_lines: list[str] = [] + memo_lines.append(f"# {CHAPTER} — 13-week cash forecast (direct method)\n") + memo_lines.append("This forecast is a **planning baseline**. It is not a guarantee.") + memo_lines.append("\n## Key points\n") + memo_lines.append(f"- Starting cash (end of history): **{start_cash:,.0f}**") + memo_lines.append(f"- Buffer target: **{buffer_target:,.0f}** (trigger when projected ending cash falls below this)") + memo_lines.append("- Scenarios: Base, delayed collections, and tighter supplier terms") + + # Base scenario preview table (next 4 weeks) + base_preview = ( + fc.loc[fc["scenario"] == "Base", ["week_start", "cash_in_total", "cash_out_total", "ending_cash", "buffer_trigger"]] + .head(4) + .copy() + ) + memo_lines.append("\n## Base scenario: next 4 weeks\n") + memo_lines.append(base_preview.to_markdown(index=False)) + + memo_lines.append("\n## Guardrails\n") + memo_lines.append( + "- Treat timing assumptions as **editable** (collections delays, payment runs, tax remittances).\n" + "- Do not over-interpret patterns: this is a short horizon meant for **cash governance**.\n" + "- When triggers fire, document actions and owners (see governance template)." 
+ ) + + memo_md = outdir / "ch19_memo.md" + memo_md.write_text("\n".join(memo_lines) + "\n", encoding="utf-8") + + # ------------------------------- + # 5) Figures + manifest + # ------------------------------- + manifest_rows: list[FigureManifestRow] = [] + + def _add_row(fig_path: Path, spec: FigureSpec) -> None: + manifest_rows.append( + FigureManifestRow( + filename=fig_path.name, + chart_type=spec.chart_type, + title=spec.title, + x_label=spec.x_label, + y_label=spec.y_label, + data_source="NSO v1 synthetic outputs", + guardrail_note=( + "Short-term cash forecasts depend on timing assumptions. " + "Treat outputs as planning baselines; confirm large known events." + ), + ) + ) + + # Figure 1: history net cash flow (last 26 weeks) + if not hist.empty: + hist_tail = hist.tail(min(len(hist), 26)).copy() + with style_context(): + fig = plot_time_series( + hist_tail, + x="week_start", + series={"Net cash flow": "net_cash_flow"}, + title="Weekly net cash flow (recent history)", + x_label="Week start", + y_label="Net cash flow", + show_zero_line=True, + ) + spec = FigureSpec( + chart_type="line", + title="Weekly net cash flow (recent history)", + x_label="Week start", + y_label="Net cash flow", + data_source="bank_statement.csv", + notes="Direct method: receipts minus payments, aggregated weekly.", + ) + fig_path = figures_dir / "ch19_fig_weekly_net_cash_flow_history.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + # Figure 2: forecast ending cash by scenario + fc_wide = ( + fc.pivot_table(index="week_start", columns="scenario", values="ending_cash", aggfunc="mean", fill_value=0.0) + .reset_index() + .sort_values("week_start") + .reset_index(drop=True) + ) + + series_map = {f"Ending cash ({c})": c for c in scenarios if c in fc_wide.columns} + + with style_context(): + fig = plot_time_series( + fc_wide, + x="week_start", + series=series_map, + title="13-week cash balance forecast by scenario", + x_label="Week start", + y_label="Ending cash", 
+ ) + spec = FigureSpec( + chart_type="line", + title="13-week cash balance forecast by scenario", + x_label="Week start", + y_label="Ending cash", + data_source="bank_statement.csv + scenario adjustments", + notes="Use Base for baseline planning; stresses show downside risk.", + ) + fig_path = figures_dir / "ch19_fig_cash_balance_forecast_by_scenario.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + figures_manifest_csv = outdir / "ch19_figures_manifest.csv" + pd.DataFrame([r.__dict__ for r in manifest_rows]).to_csv(figures_manifest_csv, index=False) + + return Outputs( + cash_history_weekly_csv=cash_history_weekly_csv, + cash_forecast_13w_scenarios_csv=cash_forecast_13w_scenarios_csv, + cash_assumptions_csv=cash_assumptions_csv, + cash_governance_template_csv=cash_governance_template_csv, + design_json=design_json, + memo_md=memo_md, + figures_manifest_csv=figures_manifest_csv, + ) + + +def _build_cli() -> Any: + p = base_parser(description=CHAPTER) + p.add_argument("--datadir", type=Path, required=True) + return p + + +def main(argv: list[str] | None = None) -> int: + p = _build_cli() + args = p.parse_args(argv) + + analyze_ch19(datadir=args.datadir, outdir=args.outdir, seed=args.seed or 123) + print("Wrote Chapter 19 artifacts ->", args.outdir) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/workbooks/track_d_template/scripts/business_ch20_integrated_forecasting_three_statements.py b/workbooks/track_d_template/scripts/business_ch20_integrated_forecasting_three_statements.py new file mode 100644 index 0000000..9b2f4cd --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch20_integrated_forecasting_three_statements.py @@ -0,0 +1,575 @@ +# SPDX-License-Identifier: MIT +"""Track D — Chapter 20: Integrated forecasting (P&L + balance sheet + cash tie-out). 
+ +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch20_pnl_forecast_monthly.csv +* ch20_balance_sheet_forecast_monthly.csv +* ch20_cash_flow_forecast_monthly.csv +* ch20_assumptions.csv +* ch20_design.json +* ch20_memo.md +* ch20_figures_manifest.csv + +This chapter connects the three core statements into one **integrated** forecast: + +- Profit & loss (accrual): revenue, costs, and net income. +- Balance sheet (stocks): cash, working capital, debt, equity. +- Cash flow (flows): explain how we get from beginning cash to ending cash. + +Key idea (accountant-friendly): a forecast is not "done" until it reconciles. +If the model says profit is up but cash is down, it should be explainable through +working capital, capex, or financing. + +Data source: NSO v1 simulator outputs under a folder like ``data/synthetic/nso_v1``. + +Outputs are deterministic and written under ``outputs/track_d``. + +Guardrails +---------- +- These are planning baselines, not causal claims. +- Rates and "days" assumptions (DSO/DIO/DPO) are descriptive of recent history. 
+- Any reconciliation residual is surfaced explicitly.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any +import json + +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed, base_parser +from scripts._reporting_style import FigureManifestRow, FigureSpec, plot_time_series, save_figure, style_context + +CHAPTER = "Track D — Chapter 20" + + +@dataclass(frozen=True) +class Outputs: + pnl_forecast_monthly_csv: Path + balance_sheet_forecast_monthly_csv: Path + cash_flow_forecast_monthly_csv: Path + assumptions_csv: Path + design_json: Path + memo_md: Path + figures_manifest_csv: Path + + +IS_LINE_MAP: dict[str, str] = { + "Sales Revenue": "sales_revenue", + "Cost of Goods Sold": "cogs", + "Operating Expenses": "operating_expenses", + "Net Income": "net_income", +} + +BS_LINE_MAP: dict[str, str] = { + "Cash": "cash", + "Accounts Receivable": "accounts_receivable", + "Inventory": "inventory", + "PP&E (Cost)": "ppe_cost", + "Accumulated Depreciation": "accumulated_depreciation", + "Net PP&E": "net_ppe", + "Accounts Payable": "accounts_payable", + "Notes Payable": "notes_payable", + "Wages Payable": "wages_payable", + "Payroll Taxes Payable": "payroll_taxes_payable", + "Sales Tax Payable": "sales_tax_payable", + "Owner Capital": "owner_capital", + "Owner Draw": "owner_draw", + "Retained Earnings (Cumulative, derived)": "retained_earnings", +} + +CF_LINE_MAP: dict[str, str] = { + "Add back Depreciation": "add_back_depreciation", + "Capital Expenditures (cash)": "capex_cash", + "Owner Draw (cash)": "owner_draw_cash", +} + + +def _read_csv(datadir: Path, name: str) -> pd.DataFrame: + path = datadir / name + if not path.exists(): + raise FileNotFoundError(f"Expected {name} at {path}, but it was not found.") + return pd.read_csv(path) + + +def _pivot_statement(df_long: pd.DataFrame, line_map: dict[str, str]) -> pd.DataFrame: + """Pivot a month/line/amount statement into a wide 
dataframe with canonical column names.""" + if df_long.empty: + return pd.DataFrame(columns=["month", *sorted(set(line_map.values()))]) + + df = df_long.copy() + df["month"] = df["month"].astype(str) + df = df[df["line"].astype(str).isin(line_map.keys())].copy() + df["line"] = df["line"].astype(str).map(line_map) + + wide = df.pivot_table(index="month", columns="line", values="amount", aggfunc="sum", fill_value=0.0) + wide = wide.reset_index().sort_values("month").reset_index(drop=True) + + # Ensure all expected columns exist. + for col in sorted(set(line_map.values())): + if col not in wide.columns: + wide[col] = 0.0 + + return wide + + +def _next_months(last_month: str, n: int) -> list[str]: + p0 = pd.Period(last_month, freq="M") + return [(p0 + i).strftime("%Y-%m") for i in range(1, n + 1)] + + +def _safe_ratio(num: pd.Series, den: pd.Series) -> pd.Series: + den2 = den.replace(0.0, np.nan) + return (num.astype(float) / den2.astype(float)).replace([np.inf, -np.inf], np.nan) + + +def _median_clip(x: pd.Series, lo: float, hi: float, default: float) -> float: + v = pd.to_numeric(x, errors="coerce").dropna() + if len(v) == 0: + return float(default) + m = float(np.nanmedian(v.to_numpy(dtype=float))) + return float(np.clip(m, lo, hi)) + + +def _estimate_days(ar: pd.Series, inv: pd.Series, ap: pd.Series, rev: pd.Series, cogs: pd.Series) -> dict[str, float]: + # Simple “days” heuristics from recent history. 
+ dso = _median_clip(_safe_ratio(ar, rev) * 30.0, lo=0.0, hi=120.0, default=25.0) + dio = _median_clip(_safe_ratio(inv, cogs) * 30.0, lo=0.0, hi=180.0, default=45.0) + dpo = _median_clip(_safe_ratio(ap, cogs) * 30.0, lo=0.0, hi=180.0, default=30.0) + return {"dso": dso, "dio": dio, "dpo": dpo} + + +def _seasonal_naive_forecast(history: pd.Series, history_months: pd.Series, future_months: list[str]) -> pd.Series: + """Seasonal naive: for each future month-of-year, use the average of that month in history.""" + df = pd.DataFrame({"month": history_months.astype(str), "y": pd.to_numeric(history, errors="coerce")}).dropna() + if df.empty: + return pd.Series([0.0 for _ in future_months], index=future_months, dtype=float) + + df["moy"] = pd.to_datetime(df["month"] + "-01").dt.month + moy_mean = df.groupby("moy")["y"].mean().to_dict() + overall = float(df["y"].mean()) + + out = [] + for m in future_months: + moy = int(pd.to_datetime(m + "-01").month) + out.append(float(moy_mean.get(moy, overall))) + return pd.Series(out, index=future_months, dtype=float) + + +def analyze_ch20(datadir: Path, outdir: Path, seed: int = 123) -> Outputs: + apply_seed(seed) + outdir.mkdir(parents=True, exist_ok=True) + figures_dir = outdir / "figures" + figures_dir.mkdir(parents=True, exist_ok=True) + + is_long = _read_csv(datadir, "statements_is_monthly.csv") + bs_long = _read_csv(datadir, "statements_bs_monthly.csv") + cf_long = _read_csv(datadir, "statements_cf_monthly.csv") + debt = _read_csv(datadir, "debt_schedule.csv") if (datadir / "debt_schedule.csv").exists() else pd.DataFrame() + + is_w = _pivot_statement(is_long, IS_LINE_MAP) + bs_w = _pivot_statement(bs_long, BS_LINE_MAP) + cf_w = _pivot_statement(cf_long, CF_LINE_MAP) + + if is_w.empty or bs_w.empty: + raise ValueError("NSO statement tables are empty; cannot run Chapter 20.") + + # Align months on the intersection to avoid mismatched simulator slices. 
+ months = sorted(set(is_w["month"].tolist()) & set(bs_w["month"].tolist())) + is_w = is_w[is_w["month"].isin(months)].sort_values("month").reset_index(drop=True) + bs_w = bs_w[bs_w["month"].isin(months)].sort_values("month").reset_index(drop=True) + cf_w = cf_w[cf_w["month"].isin(months)].sort_values("month").reset_index(drop=True) + + last_month = str(months[-1]) + future_months = _next_months(last_month, n=12) + + # --- Estimate rates from the last 12 months (or all if fewer). + tail_n = min(12, len(is_w)) + hist_is = is_w.tail(tail_n).copy() + hist_bs = bs_w.tail(tail_n).copy() + hist_cf = cf_w.tail(tail_n).copy() + + rev = hist_is["sales_revenue"].astype(float) + cogs = hist_is["cogs"].astype(float) + opex = hist_is["operating_expenses"].astype(float) + + cogs_rate = _median_clip(_safe_ratio(cogs, rev), lo=0.0, hi=2.0, default=0.55) + opex_rate = _median_clip(_safe_ratio(opex, rev), lo=0.0, hi=2.0, default=0.35) + + days = _estimate_days( + ar=hist_bs["accounts_receivable"], + inv=hist_bs["inventory"], + ap=hist_bs["accounts_payable"], + rev=rev, + cogs=cogs, + ) + + dep_avg = _median_clip(hist_cf["add_back_depreciation"], lo=0.0, hi=1e9, default=0.0) + capex_cash_avg = _median_clip(hist_cf["capex_cash"], lo=-1e9, hi=0.0, default=0.0) + owner_draw_cash_avg = _median_clip(hist_cf["owner_draw_cash"], lo=-1e9, hi=0.0, default=0.0) + + # Principal repayment heuristic from the debt schedule (if present). + principal_pay = 0.0 + if not debt.empty and "principal" in debt.columns: + principal_pay = _median_clip(pd.to_numeric(debt["principal"], errors="coerce"), lo=0.0, hi=1e9, default=0.0) + + # Baseline revenue forecast is seasonal naive. + rev_fc = _seasonal_naive_forecast(is_w["sales_revenue"], is_w["month"], future_months) + + # Taxes/wages payables held constant (small for NSO v1, but keeps the structure). 
+ last_state = bs_w.loc[bs_w["month"] == last_month].iloc[0] + + state = { + "cash": float(last_state["cash"]), + "accounts_receivable": float(last_state["accounts_receivable"]), + "inventory": float(last_state["inventory"]), + "ppe_cost": float(last_state["ppe_cost"]), + "accumulated_depreciation": float(last_state["accumulated_depreciation"]), + "accounts_payable": float(last_state["accounts_payable"]), + "notes_payable": float(last_state["notes_payable"]), + "wages_payable": float(last_state["wages_payable"]), + "payroll_taxes_payable": float(last_state["payroll_taxes_payable"]), + "sales_tax_payable": float(last_state["sales_tax_payable"]), + "owner_capital": float(last_state["owner_capital"]), + "owner_draw": float(last_state["owner_draw"]), + "retained_earnings": float(last_state["retained_earnings"]), + } + + pnl_rows: list[dict[str, float | str]] = [] + bs_rows: list[dict[str, float | str]] = [] + cf_rows: list[dict[str, float | str]] = [] + + for m in future_months: + beginning = state.copy() + + sales_revenue = float(rev_fc.loc[m]) + cogs_m = float(sales_revenue * cogs_rate) + opex_m = float(sales_revenue * opex_rate) + net_income_m = float(sales_revenue - cogs_m - opex_m) + + # Working capital targets from “days” assumptions. + ar_end = float((sales_revenue / 30.0) * days["dso"] if sales_revenue != 0.0 else beginning["accounts_receivable"]) + inv_end = float((cogs_m / 30.0) * days["dio"] if cogs_m != 0.0 else beginning["inventory"]) + ap_end = float((cogs_m / 30.0) * days["dpo"] if cogs_m != 0.0 else beginning["accounts_payable"]) + + # PP&E and depreciation. + capex_cash = float(capex_cash_avg) + capex_increase = float(-capex_cash) # capex_cash is negative (cash out) + ppe_cost_end = float(beginning["ppe_cost"] + max(capex_increase, 0.0)) + accumulated_depreciation_end = float(beginning["accumulated_depreciation"] - dep_avg) + net_ppe_end = float(ppe_cost_end + accumulated_depreciation_end) + + # Debt and owner draws. 
+ notes_end = float(max(0.0, beginning["notes_payable"] - principal_pay)) + net_borrowings = float(notes_end - beginning["notes_payable"]) + owner_draw_cash = float(owner_draw_cash_avg) + owner_draw_end = float(beginning["owner_draw"] + owner_draw_cash) + + # Equity roll-forward. + retained_end = float(beginning["retained_earnings"] + net_income_m) + owner_cap_end = float(beginning["owner_capital"]) # no new contributions in baseline + + # Liabilities held constant except AP and debt. + wages_payable_end = float(beginning["wages_payable"]) + payroll_taxes_payable_end = float(beginning["payroll_taxes_payable"]) + sales_tax_payable_end = float(beginning["sales_tax_payable"]) + + total_liabilities = float( + ap_end + + notes_end + + wages_payable_end + + payroll_taxes_payable_end + + sales_tax_payable_end + ) + total_equity = float(owner_cap_end + retained_end + owner_draw_end) + total_l_e = float(total_liabilities + total_equity) + + # Balance sheet: cash is the plug that makes A = L + E. + cash_end = float(total_l_e - (ar_end + inv_end + net_ppe_end)) + total_assets = float(cash_end + ar_end + inv_end + net_ppe_end) + + # Cash flow bridge from components. 
+ delta_ar = float(ar_end - beginning["accounts_receivable"]) + delta_inv = float(inv_end - beginning["inventory"]) + delta_ap = float(ap_end - beginning["accounts_payable"]) + delta_wages = float(wages_payable_end - beginning["wages_payable"]) + delta_ptx = float(payroll_taxes_payable_end - beginning["payroll_taxes_payable"]) + delta_stx = float(sales_tax_payable_end - beginning["sales_tax_payable"]) + + cfo = float(net_income_m + dep_avg - delta_ar - delta_inv + delta_ap + delta_wages + delta_ptx + delta_stx) + cfi = float(capex_cash) + cff = float(net_borrowings + owner_draw_cash) + net_change_components = float(cfo + cfi + cff) + net_change_actual = float(cash_end - beginning["cash"]) + reconciliation_residual = float(net_change_actual - net_change_components) + net_change_in_cash = float(net_change_components + reconciliation_residual) + + ending_cash_bridge = float(beginning["cash"] + net_change_in_cash) + tieout_delta = float(ending_cash_bridge - cash_end) + + pnl_rows.append( + { + "month": m, + "sales_revenue": sales_revenue, + "cogs": cogs_m, + "operating_expenses": opex_m, + "net_income": net_income_m, + } + ) + + bs_rows.append( + { + "month": m, + "cash": cash_end, + "accounts_receivable": ar_end, + "inventory": inv_end, + "ppe_cost": ppe_cost_end, + "accumulated_depreciation": accumulated_depreciation_end, + "net_ppe": net_ppe_end, + "accounts_payable": ap_end, + "notes_payable": notes_end, + "wages_payable": wages_payable_end, + "payroll_taxes_payable": payroll_taxes_payable_end, + "sales_tax_payable": sales_tax_payable_end, + "owner_capital": owner_cap_end, + "retained_earnings": retained_end, + "owner_draw": owner_draw_end, + "total_assets": total_assets, + "total_liabilities": total_liabilities, + "total_equity": total_equity, + "total_liabilities_equity": total_l_e, + "balance_check": float(total_assets - total_l_e), + } + ) + + cf_rows.append( + { + "month": m, + "beginning_cash": float(beginning["cash"]), + "net_income": net_income_m, + 
"add_back_depreciation": dep_avg, + "delta_accounts_receivable": delta_ar, + "delta_inventory": delta_inv, + "delta_accounts_payable": delta_ap, + "delta_wages_payable": delta_wages, + "delta_payroll_taxes_payable": delta_ptx, + "delta_sales_tax_payable": delta_stx, + "net_cash_from_operations": cfo, + "capex_cash": capex_cash, + "net_cash_from_investing": cfi, + "net_borrowings": net_borrowings, + "owner_draw_cash": owner_draw_cash, + "net_cash_from_financing": cff, + "reconciliation_residual": reconciliation_residual, + "net_change_in_cash": net_change_in_cash, + "ending_cash_balance_sheet": cash_end, + "ending_cash_from_bridge": ending_cash_bridge, + "tieout_delta": tieout_delta, + } + ) + + # Update state for next month. + state.update( + { + "cash": cash_end, + "accounts_receivable": ar_end, + "inventory": inv_end, + "ppe_cost": ppe_cost_end, + "accumulated_depreciation": accumulated_depreciation_end, + "accounts_payable": ap_end, + "notes_payable": notes_end, + "wages_payable": wages_payable_end, + "payroll_taxes_payable": payroll_taxes_payable_end, + "sales_tax_payable": sales_tax_payable_end, + "owner_capital": owner_cap_end, + "owner_draw": owner_draw_end, + "retained_earnings": retained_end, + } + ) + + pnl_df = pd.DataFrame(pnl_rows) + bs_df = pd.DataFrame(bs_rows) + cf_df = pd.DataFrame(cf_rows) + + pnl_csv = outdir / "ch20_pnl_forecast_monthly.csv" + bs_csv = outdir / "ch20_balance_sheet_forecast_monthly.csv" + cf_csv = outdir / "ch20_cash_flow_forecast_monthly.csv" + assumptions_csv = outdir / "ch20_assumptions.csv" + design_json = outdir / "ch20_design.json" + memo_md = outdir / "ch20_memo.md" + figures_manifest_csv = outdir / "ch20_figures_manifest.csv" + + pnl_df.to_csv(pnl_csv, index=False) + bs_df.to_csv(bs_csv, index=False) + cf_df.to_csv(cf_csv, index=False) + + assumptions = pd.DataFrame( + [ + {"key": "last_actual_month", "value": last_month}, + {"key": "forecast_horizon_months", "value": 12}, + {"key": "revenue_method", "value": 
"seasonal_naive_month_of_year_mean"}, + {"key": "cogs_rate_median", "value": cogs_rate}, + {"key": "operating_expenses_rate_median", "value": opex_rate}, + {"key": "dso_days_median", "value": days["dso"]}, + {"key": "dio_days_median", "value": days["dio"]}, + {"key": "dpo_days_median", "value": days["dpo"]}, + {"key": "depreciation_monthly_median", "value": dep_avg}, + {"key": "capex_cash_monthly_median", "value": capex_cash_avg}, + {"key": "owner_draw_cash_monthly_median", "value": owner_draw_cash_avg}, + {"key": "principal_payment_monthly_median", "value": principal_pay}, + ] + ) + assumptions.to_csv(assumptions_csv, index=False) + + design = { + "chapter": CHAPTER, + "dataset": "NSO v1 (synthetic)", + "last_actual_month": last_month, + "horizon_months": 12, + "forecast_months": future_months, + "methods": { + "revenue": "seasonal naive (month-of-year mean)", + "cogs": "median cogs/revenue rate", + "operating_expenses": "median opex/revenue rate", + "working_capital": "AR/AP/Inventory via DSO/DPO/DIO medians", + "ppe": "capex cash median, depreciation median", + "debt": "principal payment median (no new borrowing baseline)", + "equity": "retained earnings roll-forward via net income; owner draw from history", + }, + "outputs": { + "pnl": pnl_csv.name, + "balance_sheet": bs_csv.name, + "cash_flow": cf_csv.name, + "assumptions": assumptions_csv.name, + "memo": memo_md.name, + "figures_manifest": figures_manifest_csv.name, + }, + "style_contract": "scripts/_reporting_style.py (Track D style contract)", + } + design_json.write_text(json.dumps(design, indent=2), encoding="utf-8") + + # --- Figures + manifest + manifest_rows: list[FigureManifestRow] = [] + + def _add_row(fig_path: Path, spec: FigureSpec) -> None: + manifest_rows.append( + FigureManifestRow( + filename=fig_path.name, + chart_type=spec.chart_type, + title=spec.title, + x_label=spec.x_label, + y_label=spec.y_label, + data_source="NSO v1 synthetic outputs", + guardrail_note=( + "Integrated forecasts must 
reconcile across statements. " + "Treat rates/days as baselines and stress test key assumptions." + ), + ) + ) + + # Figure 1: Ending cash (balance sheet) forecast + with style_context(): + fig = plot_time_series( + cf_df, + x="month", + series={"Ending cash": "ending_cash_balance_sheet"}, + title="Integrated forecast: ending cash by month", + x_label="Month", + y_label="Cash", + ) + spec = FigureSpec( + chart_type="line", + title="Integrated forecast: ending cash by month", + x_label="Month", + y_label="Cash", + data_source="statements_is_monthly.csv + statements_bs_monthly.csv", + notes="Cash is the reconciled outcome of profit, working capital, capex, and financing.", + ) + fig_path = figures_dir / "ch20_fig_ending_cash_forecast.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + # Figure 2: Net income forecast + with style_context(): + fig = plot_time_series( + pnl_df, + x="month", + series={"Net income": "net_income"}, + title="Integrated forecast: net income by month", + x_label="Month", + y_label="Net income", + show_zero_line=True, + ) + spec = FigureSpec( + chart_type="line", + title="Integrated forecast: net income by month", + x_label="Month", + y_label="Net income", + data_source="statements_is_monthly.csv", + notes="Net income is accrual; cash can differ due to working capital and capex.", + ) + fig_path = figures_dir / "ch20_fig_net_income_forecast.png" + save_figure(fig, fig_path, spec=spec) + _add_row(fig_path, spec) + + pd.DataFrame([r.__dict__ for r in manifest_rows]).to_csv(figures_manifest_csv, index=False) + + # Memo: short, decision-focused. 
def main(argv: list[str] | None = None) -> int:
    """Run the Chapter 20 analysis from the command line.

    Args:
        argv: Argument list handed to the parser's ``parse_args``; ``None`` is
            passed through unchanged.

    Returns:
        ``0`` once the artifacts have been written.
    """
    parser = _build_cli()
    args = parser.parse_args(argv)

    # ``args.seed or 123`` would silently turn an explicit ``--seed 0`` into 123
    # because 0 is falsy; fall back to the default only when no seed was given.
    # ``--outdir`` / ``--seed`` are expected to come from ``base_parser``
    # (only ``--datadir`` is added in ``_build_cli``).
    seed = 123 if args.seed is None else args.seed

    analyze_ch20(datadir=args.datadir, outdir=args.outdir, seed=seed)
    print("Wrote Chapter 20 artifacts ->", args.outdir)
    return 0
@dataclass(frozen=True)
class Outputs:
    """Immutable bundle of artifact paths; the declared return type of ``analyze_ch21``.

    NOTE(review): the field-to-filename mapping is assigned inside
    ``analyze_ch21``; the filenames mentioned below are taken from the module
    docstring's artifact list and should be confirmed against that function.
    """

    # Monthly scenario pack (presumably ``ch21_scenario_pack_monthly.csv``).
    scenario_pack_monthly_csv: Path
    # Sensitivity results (presumably ``ch21_sensitivity_cash_shortfall.csv`` —
    # note the field name says "summary"; verify the two stay in sync).
    sensitivity_summary_csv: Path
    # Assumptions table (presumably ``ch21_assumptions.csv``).
    assumptions_csv: Path
    # Governance template (presumably ``ch21_governance_template.csv``).
    governance_template_csv: Path
    # Design/metadata JSON (presumably ``ch21_design.json``).
    design_json: Path
    # Narrative memo (presumably ``ch21_memo.md``).
    memo_md: Path
    # Figure manifest (presumably ``ch21_figures_manifest.csv``).
    figures_manifest_csv: Path
"wages_payable", + "Payroll Taxes Payable": "payroll_taxes_payable", + "Sales Tax Payable": "sales_tax_payable", + "Owner Capital": "owner_capital", + "Owner Draw": "owner_draw", + "Retained Earnings (Cumulative, derived)": "retained_earnings", +} + +CF_LINE_MAP: dict[str, str] = { + "Add back Depreciation": "add_back_depreciation", + "Capital Expenditures (cash)": "capex_cash", + "Owner Draw (cash)": "owner_draw_cash", +} + + +def _read_csv(datadir: Path, name: str) -> pd.DataFrame: + path = datadir / name + if not path.exists(): + raise FileNotFoundError(f"Expected {name} at {path}, but it was not found.") + return pd.read_csv(path) + + +def _pivot_statement(df_long: pd.DataFrame, line_map: dict[str, str]) -> pd.DataFrame: + if df_long.empty: + return pd.DataFrame(columns=["month", *sorted(set(line_map.values()))]) + + df = df_long.copy() + df["month"] = df["month"].astype(str) + df = df[df["line"].astype(str).isin(line_map.keys())].copy() + df["line"] = df["line"].astype(str).map(line_map) + + wide = df.pivot_table(index="month", columns="line", values="amount", aggfunc="sum", fill_value=0.0) + wide = wide.reset_index().sort_values("month").reset_index(drop=True) + + for col in sorted(set(line_map.values())): + if col not in wide.columns: + wide[col] = 0.0 + return wide + + +def _median_clip(s: pd.Series, lo: float, hi: float, default: float) -> float: + x = pd.to_numeric(s, errors="coerce").dropna().astype(float) + if x.empty: + return float(default) + v = float(np.median(x)) + return float(min(max(v, lo), hi)) + + +def _seasonal_naive_revenue(hist_is: pd.DataFrame, horizon_months: int) -> pd.Series: + """Seasonal naive: future month revenue = avg revenue of that calendar month in history.""" + if hist_is.empty: + return pd.Series(dtype=float) + + df = hist_is[["month", "sales_revenue"]].copy() + df["month"] = df["month"].astype(str) + df["cal_m"] = df["month"].str.slice(5, 7) + by_cal = df.groupby("cal_m", observed=True)["sales_revenue"].mean().to_dict() + + 
last_month = df["month"].max() + start = pd.Period(last_month, freq="M") + 1 + future = [str((start + i).strftime("%Y-%m")) for i in range(horizon_months)] + + vals = [] + for m in future: + cal = m[5:7] + vals.append(float(by_cal.get(cal, float(df["sales_revenue"].mean())))) + + return pd.Series(vals, index=future, dtype=float) + + +def _cash_history_weekly_from_bank(bank: pd.DataFrame) -> pd.DataFrame: + if bank.empty: + return pd.DataFrame(columns=["week_start", "cash_in_total", "cash_out_total", "net_cash_flow", "ending_cash"]) + + df = bank.copy() + df["posted_date"] = pd.to_datetime(df["posted_date"]) + df["amount"] = pd.to_numeric(df["amount"], errors="coerce").fillna(0.0).astype(float) + + df["week_start"] = df["posted_date"].dt.to_period("W-SUN").apply(lambda p: p.start_time) + weekly = df.groupby("week_start", observed=True)["amount"].sum().reset_index() + weekly = weekly.sort_values("week_start").reset_index(drop=True) + + weekly["cash_in_total"] = weekly["amount"].clip(lower=0.0) + weekly["cash_out_total"] = (-weekly["amount"].clip(upper=0.0)).astype(float) + weekly["net_cash_flow"] = weekly["amount"].astype(float) + weekly["ending_cash"] = weekly["net_cash_flow"].cumsum() + weekly["week_start"] = weekly["week_start"].dt.strftime("%Y-%m-%d") + + return weekly[["week_start", "cash_in_total", "cash_out_total", "net_cash_flow", "ending_cash"]] + + +def _buffer_target_weekly(history_weekly: pd.DataFrame) -> float: + if history_weekly.empty: + return 0.0 + net = pd.to_numeric(history_weekly["net_cash_flow"], errors="coerce").fillna(0.0).astype(float) + out = pd.to_numeric(history_weekly["cash_out_total"], errors="coerce").fillna(0.0).astype(float) + + bad = (-net.loc[net < 0]).astype(float) + if len(bad) >= 3: + return float(np.quantile(bad, 0.90)) + if len(out) >= 3: + return float(np.quantile(out, 0.75)) + return float(out.mean()) + + +def _safe_rate(x: float) -> float: + # Keep rates in a sane range for teaching. 
+ return float(min(max(x, 0.01), 0.99)) + + +def _safe_days(x: float) -> float: + return float(min(max(x, 0.0), 120.0)) + + +def _scenario_table() -> pd.DataFrame: + """Scenario contract used for both outputs + documentation.""" + return pd.DataFrame( + [ + { + "scenario": "Base", + "revenue_multiplier": 1.00, + "cogs_rate_delta": 0.00, + "opex_rate_delta": 0.00, + "dso_days_delta": 0.0, + "dio_days_delta": 0.0, + "dpo_days_delta": 0.0, + "capex_multiplier": 1.00, + "owner_draw_multiplier": 1.00, + "stress_revenue_shock_months": "", + "stress_revenue_shock_multiplier": 1.00, + }, + { + "scenario": "Best", + "revenue_multiplier": 1.05, + "cogs_rate_delta": -0.01, + "opex_rate_delta": -0.01, + "dso_days_delta": -7.0, + "dio_days_delta": -7.0, + "dpo_days_delta": 7.0, + "capex_multiplier": 1.05, + "owner_draw_multiplier": 0.90, + "stress_revenue_shock_months": "", + "stress_revenue_shock_multiplier": 1.00, + }, + { + "scenario": "Worst", + "revenue_multiplier": 0.92, + "cogs_rate_delta": 0.02, + "opex_rate_delta": 0.02, + "dso_days_delta": 14.0, + "dio_days_delta": 14.0, + "dpo_days_delta": -7.0, + "capex_multiplier": 1.10, + "owner_draw_multiplier": 1.10, + "stress_revenue_shock_months": "", + "stress_revenue_shock_multiplier": 1.00, + }, + { + "scenario": "Stress_Revenue_Drop", + "revenue_multiplier": 1.00, + "cogs_rate_delta": 0.01, + "opex_rate_delta": 0.01, + "dso_days_delta": 7.0, + "dio_days_delta": 0.0, + "dpo_days_delta": 0.0, + "capex_multiplier": 1.00, + "owner_draw_multiplier": 1.00, + "stress_revenue_shock_months": "1,2", + "stress_revenue_shock_multiplier": 0.85, + }, + ] + ) + + +def _run_one_scenario( + *, + scenario_row: dict[str, Any], + future_months: list[str], + rev_fc: pd.Series, + base_rates: dict[str, float], + base_days: dict[str, float], + starting_state: dict[str, float], + dep_monthly: float, + capex_cash_monthly: float, + owner_draw_cash_monthly: float, + principal_payment_monthly: float, + buffer_target_monthly: float, +) -> pd.DataFrame: 
+ scen = str(scenario_row["scenario"]) + + cogs_rate = _safe_rate(float(base_rates["cogs_rate"]) + float(scenario_row["cogs_rate_delta"])) + opex_rate = _safe_rate(float(base_rates["opex_rate"]) + float(scenario_row["opex_rate_delta"])) + rev_mult = float(scenario_row["revenue_multiplier"]) + + days = { + "dso": _safe_days(float(base_days["dso"]) + float(scenario_row["dso_days_delta"])), + "dio": _safe_days(float(base_days["dio"]) + float(scenario_row["dio_days_delta"])), + "dpo": _safe_days(float(base_days["dpo"]) + float(scenario_row["dpo_days_delta"])), + } + + capex = float(capex_cash_monthly) * float(scenario_row["capex_multiplier"]) + owner_draw = float(owner_draw_cash_monthly) * float(scenario_row["owner_draw_multiplier"]) + + shock_months: set[int] = set() + if str(scenario_row.get("stress_revenue_shock_months", "")).strip(): + shock_months = {int(x.strip()) for x in str(scenario_row["stress_revenue_shock_months"]).split(",") if x.strip()} + shock_mult = float(scenario_row.get("stress_revenue_shock_multiplier", 1.0)) + + state = starting_state.copy() + rows: list[dict[str, Any]] = [] + + for idx, m in enumerate(future_months, start=1): + beginning = state.copy() + + revenue = float(rev_fc.loc[m]) * rev_mult + if idx in shock_months: + revenue *= shock_mult + + cogs = float(revenue * cogs_rate) + opex = float(revenue * opex_rate) + net_income = float(revenue - cogs - opex) + + # Working capital targets (days). Use teaching month length (28 days). + ar_end = float((revenue / TEACHING_MONTH_DAYS) * days["dso"] if revenue != 0.0 else beginning["accounts_receivable"]) + inv_end = float((cogs / TEACHING_MONTH_DAYS) * days["dio"] if cogs != 0.0 else beginning["inventory"]) + ap_end = float(((cogs + opex) / TEACHING_MONTH_DAYS) * days["dpo"] if (cogs + opex) != 0.0 else beginning["accounts_payable"]) + + # PP&E and depreciation. 
+ capex_increase = float(-capex) # capex is negative cash out + ppe_cost_end = float(beginning["ppe_cost"] + max(capex_increase, 0.0)) + accumulated_depreciation_end = float(beginning["accumulated_depreciation"] - dep_monthly) + net_ppe_end = float(ppe_cost_end + accumulated_depreciation_end) + + # Debt and owner draws. + notes_end = float(max(0.0, beginning["notes_payable"] - principal_payment_monthly)) + net_borrowings = float(notes_end - beginning["notes_payable"]) + owner_draw_end = float(beginning["owner_draw"] + owner_draw) + + retained_end = float(beginning["retained_earnings"] + net_income) + owner_cap_end = float(beginning["owner_capital"]) + + # Keep non-working-capital payables constant for Chapter 21 focus. + wages_payable_end = float(beginning["wages_payable"]) + payroll_taxes_payable_end = float(beginning["payroll_taxes_payable"]) + sales_tax_payable_end = float(beginning["sales_tax_payable"]) + + total_liabilities = float( + ap_end + + notes_end + + wages_payable_end + + payroll_taxes_payable_end + + sales_tax_payable_end + ) + total_equity = float(owner_cap_end + retained_end + owner_draw_end) + total_l_e = float(total_liabilities + total_equity) + + # Cash is the plug to enforce A = L + E. + cash_end = float(total_l_e - (ar_end + inv_end + net_ppe_end)) + + # Cash bridge components. 
+ delta_ar = float(ar_end - beginning["accounts_receivable"]) + delta_inv = float(inv_end - beginning["inventory"]) + delta_ap = float(ap_end - beginning["accounts_payable"]) + delta_wages = float(wages_payable_end - beginning["wages_payable"]) + delta_ptx = float(payroll_taxes_payable_end - beginning["payroll_taxes_payable"]) + delta_stx = float(sales_tax_payable_end - beginning["sales_tax_payable"]) + + cfo = float(net_income + dep_monthly - delta_ar - delta_inv + delta_ap + delta_wages + delta_ptx + delta_stx) + cfi = float(capex) + cff = float(net_borrowings + owner_draw) + net_change_components = float(cfo + cfi + cff) + net_change_actual = float(cash_end - beginning["cash"]) + reconciliation_residual = float(net_change_actual - net_change_components) + + buffer_trigger = bool(cash_end < buffer_target_monthly) + + rows.append( + { + "month": m, + "scenario": scen, + "sales_revenue": revenue, + "net_income": net_income, + "ending_cash": cash_end, + "buffer_target_monthly": float(buffer_target_monthly), + "buffer_trigger": buffer_trigger, + "dso_days": float(days["dso"]), + "dio_days": float(days["dio"]), + "dpo_days": float(days["dpo"]), + "reconciliation_residual": reconciliation_residual, + "capex_cash": float(capex), + "owner_draw_cash": float(owner_draw), + } + ) + + # update state for next month + state.update( + { + "cash": cash_end, + "accounts_receivable": ar_end, + "inventory": inv_end, + "ppe_cost": ppe_cost_end, + "accumulated_depreciation": accumulated_depreciation_end, + "net_ppe": net_ppe_end, + "accounts_payable": ap_end, + "notes_payable": notes_end, + "owner_draw": owner_draw_end, + "retained_earnings": retained_end, + "owner_capital": owner_cap_end, + "wages_payable": wages_payable_end, + "payroll_taxes_payable": payroll_taxes_payable_end, + "sales_tax_payable": sales_tax_payable_end, + } + ) + + return pd.DataFrame(rows) + + +def _sensitivity_grid() -> list[dict[str, Any]]: + """One-at-a-time shocks for sensitivity analysis.""" + return [ + 
{"lever": "revenue_multiplier", "shock": -0.05}, + {"lever": "revenue_multiplier", "shock": 0.05}, + {"lever": "cogs_rate_delta", "shock": 0.01}, + {"lever": "opex_rate_delta", "shock": 0.01}, + {"lever": "dso_days_delta", "shock": 7.0}, + {"lever": "dpo_days_delta", "shock": -7.0}, + {"lever": "capex_multiplier", "shock": 0.20}, + {"lever": "owner_draw_multiplier", "shock": 0.20}, + ] + + +def analyze_ch21(*, datadir: Path, outdir: Path, seed: int = 123, horizon_months: int = 12) -> Outputs: + apply_seed(seed) + outdir.mkdir(parents=True, exist_ok=True) + figs_dir = outdir / "figures" + figs_dir.mkdir(parents=True, exist_ok=True) + + is_long = _read_csv(datadir, "statements_is_monthly.csv") + bs_long = _read_csv(datadir, "statements_bs_monthly.csv") + cf_long = _read_csv(datadir, "statements_cf_monthly.csv") + debt = _read_csv(datadir, "debt_schedule.csv") + bank = _read_csv(datadir, "bank_statement.csv") + + hist_is = _pivot_statement(is_long, IS_LINE_MAP) + hist_bs = _pivot_statement(bs_long, BS_LINE_MAP) + hist_cf = _pivot_statement(cf_long, CF_LINE_MAP) + + if hist_is.empty or hist_bs.empty: + raise ValueError("NSO v1 statements are empty; cannot run Chapter 21.") + + # Baselines from recent history (last 6 months). + hist_is_num = hist_is.copy() + for c in ["sales_revenue", "cogs", "operating_expenses", "net_income"]: + hist_is_num[c] = pd.to_numeric(hist_is_num[c], errors="coerce").fillna(0.0).astype(float) + recent_is = hist_is_num.tail(6) + cogs_rate = _median_clip(recent_is["cogs"] / recent_is["sales_revenue"].replace(0.0, np.nan), 0.01, 0.99, 0.55) + opex_rate = _median_clip( + recent_is["operating_expenses"] / recent_is["sales_revenue"].replace(0.0, np.nan), 0.01, 0.99, 0.25 + ) + + hist_bs_num = hist_bs.copy() + for c in BS_LINE_MAP.values(): + if c != "month": + hist_bs_num[c] = pd.to_numeric(hist_bs_num[c], errors="coerce").fillna(0.0).astype(float) + recent_bs = hist_bs_num.tail(6) + + # Working capital "days" baselines. 
+ dso = _median_clip( + (recent_bs["accounts_receivable"] / recent_is["sales_revenue"].replace(0.0, np.nan)) * TEACHING_MONTH_DAYS, + 0.0, + 120.0, + 28.0, + ) + dio = _median_clip( + (recent_bs["inventory"] / recent_is["cogs"].replace(0.0, np.nan)) * TEACHING_MONTH_DAYS, + 0.0, + 120.0, + 28.0, + ) + dpo = _median_clip( + (recent_bs["accounts_payable"] / (recent_is["cogs"] + recent_is["operating_expenses"]).replace(0.0, np.nan)) + * TEACHING_MONTH_DAYS, + 0.0, + 120.0, + 21.0, + ) + + dep_monthly = _median_clip(hist_cf["add_back_depreciation"], 0.0, 1e9, 0.0) + capex_cash_monthly = _median_clip(hist_cf["capex_cash"], -1e9, 0.0, 0.0) + owner_draw_cash_monthly = _median_clip(hist_cf["owner_draw_cash"], -1e9, 0.0, 0.0) + principal_payment_monthly = 0.0 + if not debt.empty and "principal" in debt.columns: + principal_payment_monthly = _median_clip(debt["principal"], 0.0, 1e9, 0.0) + + # Revenue forecast baseline. + rev_fc = _seasonal_naive_revenue(hist_is, horizon_months=horizon_months) + future_months = list(rev_fc.index.astype(str)) + + # Buffer policy based on weekly bank history. + hist_weekly = _cash_history_weekly_from_bank(bank) + buffer_weekly = _buffer_target_weekly(hist_weekly) + buffer_target_monthly = float(buffer_weekly * 4.0) + + # Starting state from last observed balance sheet. 
+ last_bs = hist_bs_num.iloc[-1].to_dict() + starting_state = { + "cash": float(last_bs.get("cash", 0.0)), + "accounts_receivable": float(last_bs.get("accounts_receivable", 0.0)), + "inventory": float(last_bs.get("inventory", 0.0)), + "ppe_cost": float(last_bs.get("ppe_cost", 0.0)), + "accumulated_depreciation": float(last_bs.get("accumulated_depreciation", 0.0)), + "net_ppe": float(last_bs.get("net_ppe", 0.0)), + "accounts_payable": float(last_bs.get("accounts_payable", 0.0)), + "notes_payable": float(last_bs.get("notes_payable", 0.0)), + "wages_payable": float(last_bs.get("wages_payable", 0.0)), + "payroll_taxes_payable": float(last_bs.get("payroll_taxes_payable", 0.0)), + "sales_tax_payable": float(last_bs.get("sales_tax_payable", 0.0)), + "owner_capital": float(last_bs.get("owner_capital", 0.0)), + "owner_draw": float(last_bs.get("owner_draw", 0.0)), + "retained_earnings": float(last_bs.get("retained_earnings", 0.0)), + } + + base_rates = {"cogs_rate": float(cogs_rate), "opex_rate": float(opex_rate)} + base_days = {"dso": float(dso), "dio": float(dio), "dpo": float(dpo)} + + scenario_def = _scenario_table() + scenario_rows = scenario_def.to_dict(orient="records") + + scen_frames: list[pd.DataFrame] = [] + for row in scenario_rows: + scen_frames.append( + _run_one_scenario( + scenario_row=row, + future_months=future_months, + rev_fc=rev_fc, + base_rates=base_rates, + base_days=base_days, + starting_state=starting_state, + dep_monthly=dep_monthly, + capex_cash_monthly=capex_cash_monthly, + owner_draw_cash_monthly=owner_draw_cash_monthly, + principal_payment_monthly=principal_payment_monthly, + buffer_target_monthly=buffer_target_monthly, + ) + ) + + scenario_pack = pd.concat(scen_frames, ignore_index=True) + + scenario_pack_csv = outdir / "ch21_scenario_pack_monthly.csv" + scenario_pack.to_csv(scenario_pack_csv, index=False) + + # Sensitivity analysis (one-at-a-time shocks around the Base scenario row). 
+ base_row = scenario_def.loc[scenario_def["scenario"] == "Base"].iloc[0].to_dict() + base_df = _run_one_scenario( + scenario_row=base_row, + future_months=future_months, + rev_fc=rev_fc, + base_rates=base_rates, + base_days=base_days, + starting_state=starting_state, + dep_monthly=dep_monthly, + capex_cash_monthly=capex_cash_monthly, + owner_draw_cash_monthly=owner_draw_cash_monthly, + principal_payment_monthly=principal_payment_monthly, + buffer_target_monthly=buffer_target_monthly, + ) + base_min_cash = float(pd.to_numeric(base_df["ending_cash"], errors="coerce").min()) + + sens_rows: list[dict[str, Any]] = [] + for g in _sensitivity_grid(): + row = base_row.copy() + lever = str(g["lever"]) + shock = float(g["shock"]) + if lever in {"revenue_multiplier", "capex_multiplier", "owner_draw_multiplier"}: + row[lever] = float(row[lever]) * (1.0 + shock) + else: + row[lever] = float(row[lever]) + shock + + df = _run_one_scenario( + scenario_row=row, + future_months=future_months, + rev_fc=rev_fc, + base_rates=base_rates, + base_days=base_days, + starting_state=starting_state, + dep_monthly=dep_monthly, + capex_cash_monthly=capex_cash_monthly, + owner_draw_cash_monthly=owner_draw_cash_monthly, + principal_payment_monthly=principal_payment_monthly, + buffer_target_monthly=buffer_target_monthly, + ) + + end_cash = pd.to_numeric(df["ending_cash"], errors="coerce").astype(float) + min_cash = float(end_cash.min()) + worst_idx = int(end_cash.idxmin()) + worst_month = str(df.loc[worst_idx, "month"]) + below = int((end_cash < buffer_target_monthly).sum()) + + sens_rows.append( + { + "lever": lever, + "shock": shock, + "min_ending_cash": min_cash, + "delta_min_cash_vs_base": float(min_cash - base_min_cash), + "months_below_buffer": below, + "worst_month": worst_month, + } + ) + + sens = pd.DataFrame(sens_rows) + sensitivity_csv = outdir / "ch21_sensitivity_cash_shortfall.csv" + sens.to_csv(sensitivity_csv, index=False) + + # Assumptions table + assumptions_rows: list[dict[str, 
Any]] = [] + assumptions_rows.append({"scenario": "BASELINES", "key": "cogs_rate_median", "value": float(cogs_rate), "note": "Median COGS/revenue (recent months)."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "opex_rate_median", "value": float(opex_rate), "note": "Median opex/revenue (recent months)."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "dso_days_median", "value": float(dso), "note": "AR days baseline (recent months)."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "dio_days_median", "value": float(dio), "note": "Inventory days baseline (recent months)."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "dpo_days_median", "value": float(dpo), "note": "AP days baseline (recent months)."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "buffer_target_monthly", "value": float(buffer_target_monthly), "note": "Cash buffer target = 4× weekly buffer from bank history."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "depreciation_monthly_median", "value": float(dep_monthly), "note": "From CF: Add back Depreciation."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "capex_cash_monthly_median", "value": float(capex_cash_monthly), "note": "From CF: Capital Expenditures (cash)."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "owner_draw_cash_monthly_median", "value": float(owner_draw_cash_monthly), "note": "From CF: Owner Draw (cash)."}) + assumptions_rows.append({"scenario": "BASELINES", "key": "principal_payment_monthly_median", "value": float(principal_payment_monthly), "note": "From debt schedule: principal median."}) + + for r in scenario_rows: + for k in [ + "revenue_multiplier", + "cogs_rate_delta", + "opex_rate_delta", + "dso_days_delta", + "dio_days_delta", + "dpo_days_delta", + "capex_multiplier", + "owner_draw_multiplier", + "stress_revenue_shock_months", + "stress_revenue_shock_multiplier", + ]: + assumptions_rows.append( + { + "scenario": 
str(r["scenario"]), + "key": k, + "value": r[k], + "note": "Scenario lever", + } + ) + + assumptions_csv = outdir / "ch21_assumptions.csv" + pd.DataFrame(assumptions_rows).to_csv(assumptions_csv, index=False) + + governance_rows = [ + { + "cadence": "Weekly", + "owner": "Controller / Finance", + "update_inputs": "AR collections, AP payment plan, payroll schedule", + "decision_trigger": "Ending cash < buffer target (any scenario)", + "actions": "Freeze discretionary spend, renegotiate terms, accelerate collections", + }, + { + "cadence": "Monthly", + "owner": "FP&A / CFO", + "update_inputs": "Revenue outlook, margin assumptions, working capital days", + "decision_trigger": "2+ months below buffer in Worst/Stress", + "actions": "Reset forecast, update scenario levers, communicate plan to leadership", + }, + ] + governance_csv = outdir / "ch21_governance_template.csv" + pd.DataFrame(governance_rows).to_csv(governance_csv, index=False) + + # Figures + fig_rows: list[dict[str, str]] = [] + + # Cash by scenario + cash_fig = figs_dir / "ch21_fig_cash_by_scenario.png" + plt.figure(figsize=(10, 5)) + for scen in scenario_def["scenario"].astype(str).tolist(): + s = scenario_pack.loc[scenario_pack["scenario"].astype(str) == scen].copy() + plt.plot(s["month"].astype(str), s["ending_cash"].astype(float), label=scen) + plt.axhline(buffer_target_monthly, linestyle="--") + plt.title("Ending cash by scenario (12-month horizon)") + plt.xlabel("Month") + plt.ylabel("Ending cash") + plt.xticks(rotation=45, ha="right") + plt.legend() + plt.tight_layout() + plt.savefig(cash_fig) + plt.close() + fig_rows.append({"filename": cash_fig.name, "title": "Ending cash by scenario", "kind": "time_series"}) + + # Sensitivity bar: delta in min cash vs base + sens_worst = sens.sort_values("delta_min_cash_vs_base").copy() + sens_fig = figs_dir / "ch21_fig_sensitivity_min_cash_delta.png" + sens_plot = sens_worst.assign( + label=[ + f"{a} ({b:+g})" + for a, b in zip( + sens_worst["lever"].astype(str), 
+ sens_worst["shock"].astype(float), + ) + ] + ) + fig = plot_bar( + sens_plot, + x="label", + y="delta_min_cash_vs_base", + title="Sensitivity: impact on minimum ending cash (vs Base)", + x_label="Lever (shock)", + y_label="Delta min cash", + ) + fig.savefig(sens_fig, dpi=144, bbox_inches="tight") + plt.close(fig) + fig_rows.append({"filename": sens_fig.name, "title": "Sensitivity (delta min cash)", "kind": "bar"}) + + figures_manifest_csv = outdir / "ch21_figures_manifest.csv" + pd.DataFrame(fig_rows).to_csv(figures_manifest_csv, index=False) + + # Design + memo + design = { + "chapter": CHAPTER, + "dataset": "NSO_v1", + "horizon_months": int(horizon_months), + "scenarios": scenario_def["scenario"].astype(str).tolist(), + "scenario_names": scenario_def["scenario"].astype(str).tolist(), + "baseline_method": "Seasonal naive revenue; median rates for costs and working-capital days", + "buffer_policy": { + "weekly_target": float(buffer_weekly), + "monthly_target": float(buffer_target_monthly), + "note": "Monthly target = 4× weekly buffer (90th pct of bad weeks).", + }, + "levers": { + "revenue_multiplier": "Revenue level shift", + "cogs_rate_delta": "COGS rate change (percentage points)", + "opex_rate_delta": "Opex rate change (percentage points)", + "dso_days_delta": "AR collection speed", + "dio_days_delta": "Inventory turns", + "dpo_days_delta": "Supplier payment timing", + "capex_multiplier": "Capex intensity", + "owner_draw_multiplier": "Owner draws (cash)", + }, + } + design_json = outdir / "ch21_design.json" + design_json.write_text(json.dumps(design, indent=2), encoding="utf-8") + + # Memo: highlight triggers + top levers. 
+ base_cash = scenario_pack.loc[scenario_pack["scenario"] == "Base", "ending_cash"].astype(float) + worst_cash = scenario_pack.loc[scenario_pack["scenario"] == "Worst", "ending_cash"].astype(float) + stress_cash = scenario_pack.loc[scenario_pack["scenario"] == "Stress_Revenue_Drop", "ending_cash"].astype(float) + + top_levers = ( + sens.sort_values("delta_min_cash_vs_base").head(3)[["lever", "shock", "delta_min_cash_vs_base"]].copy() + ) + lines = [ + f"# {CHAPTER}", + "", + "## What this pack does", + "- Produces a best/base/worst + stress scenario forecast that ties profit, working capital, and cash.", + "- Applies a simple cash buffer policy and flags months below the buffer.", + "- Runs one-at-a-time sensitivity shocks to identify the biggest cash shortfall drivers.", + "", + "## Key results (quick read)", + f"- Buffer target (monthly): **{buffer_target_monthly:,.0f}**", + f"- Base scenario min ending cash: **{float(base_cash.min()):,.0f}**", + f"- Worst scenario min ending cash: **{float(worst_cash.min()):,.0f}**", + f"- Stress scenario min ending cash: **{float(stress_cash.min()):,.0f}**", + "", + "## Top sensitivity levers (largest downside to min cash)", + ] + for _, r in top_levers.iterrows(): + lines.append(f"- {r['lever']} shock {r['shock']:+g} → Δmin cash {float(r['delta_min_cash_vs_base']):,.0f}") + + lines += [ + "", + "## Guardrails", + "- These are **descriptive baselines** (rates and days), not causal claims.", + "- If residuals are large, treat it as a modeling red flag: missing flows or inconsistent assumptions.", + "", + "## Suggested stress-test narrative", + "If Stress_Revenue_Drop falls below the buffer, write a plan with:", + "- Collection actions (DSO)", + "- Supplier term negotiations (DPO)", + "- Discretionary spend + capex deferrals", + "- Governance: who updates, how often, and escalation triggers", + ] + memo_md = outdir / "ch21_memo.md" + memo_md.write_text("\n".join(lines), encoding="utf-8") + + return Outputs( + 
scenario_pack_monthly_csv=scenario_pack_csv, + sensitivity_summary_csv=sensitivity_csv, + assumptions_csv=assumptions_csv, + governance_template_csv=governance_csv, + design_json=design_json, + memo_md=memo_md, + figures_manifest_csv=figures_manifest_csv, + ) + + +def _build_cli() -> Any: + p = base_parser(description=CHAPTER) + p.add_argument("--datadir", type=Path, required=True) + p.add_argument("--horizon-months", type=int, default=12) + return p + + +def main(argv: list[str] | None = None) -> int: + p = _build_cli() + args = p.parse_args(argv) + + analyze_ch21(datadir=args.datadir, outdir=args.outdir, seed=args.seed or 123, horizon_months=int(args.horizon_months)) + print("Wrote Chapter 21 artifacts ->", args.outdir) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/workbooks/track_d_template/scripts/business_ch22_financial_statement_analysis_toolkit.py b/workbooks/track_d_template/scripts/business_ch22_financial_statement_analysis_toolkit.py new file mode 100644 index 0000000..c13acd4 --- /dev/null +++ b/workbooks/track_d_template/scripts/business_ch22_financial_statement_analysis_toolkit.py @@ -0,0 +1,588 @@ +# SPDX-License-Identifier: MIT +"""Track D — Chapter 22: Financial statement analysis toolkit. + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch22_ratios_monthly.csv +* ch22_common_size_is.csv +* ch22_common_size_bs.csv +* ch22_variance_bridge_latest.csv +* ch22_assumptions.csv +* ch22_figures_manifest.csv +* ch22_memo.md +* ch22_design.json + +This chapter provides an accountant-friendly toolkit to move from *close* to *explain*: + +* **Ratios** (liquidity, leverage, profitability, efficiency) +* **Trends** (levels, percent changes, rolling summaries) +* **Common-size** statements (normalize to revenue / total assets) +* **Variance** bridges (what drove the change month-over-month) + +The NSO v1 simulator provides monthly income statement, balance sheet, and cash flow tables. 
+We treat these as the inputs, and produce deterministic, analysis-ready outputs under +``outputs/track_d``. + +Guardrails: + +* Ratios are **descriptive**, not causal. +* Always sanity-check denominators (small revenue months can create wild percentages). +* Use working-capital context (AR / Inventory / AP) before declaring a "real" improvement. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any +import json + +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed, base_parser +from scripts._reporting_style import ( + FigureManifestRow, + FigureSpec, + plot_time_series, + plot_waterfall_bridge, + save_figure, + style_context, +) + +CHAPTER = "Track D — Chapter 22" +DAYS_PER_MONTH = 28.0 # teaching-friendly constant (matches earlier Track D chapters) + + +@dataclass(frozen=True) +class Outputs: + ratios_monthly_csv: Path + common_size_is_csv: Path + common_size_bs_csv: Path + variance_bridge_latest_csv: Path + assumptions_csv: Path + design_json: Path + memo_md: Path + figures_manifest_csv: Path + + +def _read_statement(datadir: Path, filename: str) -> pd.DataFrame: + p = datadir / filename + if not p.exists(): + raise FileNotFoundError(f"Missing required NSO table: {p}") + df = pd.read_csv(p) + if not {"month", "line", "amount"}.issubset(df.columns): + raise ValueError(f"{filename} must have columns: month, line, amount") + df = df.copy() + df["month"] = df["month"].astype(str) + df["line"] = df["line"].astype(str) + df["amount"] = df["amount"].astype(float) + return df + + +def _wide_by_line(df: pd.DataFrame, *, lines: list[str]) -> pd.DataFrame: + """Return a month-indexed wide table for the requested statement lines.""" + sub = df.loc[df["line"].isin(lines)].copy() + wide = sub.pivot_table(index="month", columns="line", values="amount", aggfunc="sum") + # Ensure missing lines become explicit NaN columns (deterministic schema) + for ln in lines: + if ln not in 
wide.columns: + wide[ln] = np.nan + wide = wide[lines].reset_index() + wide = wide.sort_values("month").reset_index(drop=True) + return wide + + +def _safe_div(n: pd.Series, d: pd.Series) -> pd.Series: + d = d.replace(0.0, np.nan) + return n / d + + +def _build_ratios(is_wide: pd.DataFrame, bs_wide: pd.DataFrame) -> pd.DataFrame: + # Income statement + revenue = is_wide["Sales Revenue"].astype(float) + cogs = is_wide["Cost of Goods Sold"].astype(float) + opex = is_wide["Operating Expenses"].astype(float) + net_income = is_wide["Net Income"].astype(float) + + # Balance sheet + cash = bs_wide["Cash"].astype(float) + ar = bs_wide["Accounts Receivable"].astype(float) + inv = bs_wide["Inventory"].astype(float) + ap = bs_wide["Accounts Payable"].astype(float) + sales_tax = bs_wide["Sales Tax Payable"].astype(float) + wages_payable = bs_wide["Wages Payable"].astype(float) + payroll_taxes = bs_wide["Payroll Taxes Payable"].astype(float) + notes_payable = bs_wide["Notes Payable"].astype(float) + total_assets = bs_wide["Total Assets"].astype(float) + total_equity = bs_wide["Total Equity"].astype(float) + + current_assets = cash + ar + inv + current_liabilities = ap + sales_tax + wages_payable + payroll_taxes + + gross_profit = revenue - cogs + + # Core ratios + ratios = pd.DataFrame( + { + "month": is_wide["month"].astype(str), + "sales_revenue": revenue, + "cogs": cogs, + "operating_expenses": opex, + "net_income": net_income, + "gross_margin": _safe_div(gross_profit, revenue), + "operating_margin": _safe_div(net_income, revenue), + "opex_ratio": _safe_div(opex, revenue), + "current_ratio": _safe_div(current_assets, current_liabilities), + "quick_ratio": _safe_div(cash + ar, current_liabilities), + "debt_to_equity": _safe_div(notes_payable, total_equity), + "dso_days": _safe_div(ar, _safe_div(revenue, pd.Series(DAYS_PER_MONTH, index=revenue.index))), + "dio_days": _safe_div(inv, _safe_div(cogs, pd.Series(DAYS_PER_MONTH, index=cogs.index))), + "dpo_days": _safe_div(ap, 
_safe_div(cogs, pd.Series(DAYS_PER_MONTH, index=cogs.index))), + "cash_conversion_cycle_days": np.nan, + "total_assets": total_assets, + "total_equity": total_equity, + "notes_payable": notes_payable, + "current_assets": current_assets, + "current_liabilities": current_liabilities, + } + ) + + # Alias for tests/docs: use a generic "revenue" column name + if "revenue" not in ratios.columns: + ratios["revenue"] = ratios["sales_revenue"] + + ratios["cash_conversion_cycle_days"] = ratios["dso_days"] + ratios["dio_days"] - ratios["dpo_days"] + + # Add simple MoM deltas for a few KPIs + for col in ["sales_revenue", "net_income", "current_ratio", "debt_to_equity", "cash_conversion_cycle_days"]: + ratios[f"delta_{col}"] = ratios[col].diff() + + # Trailing 12-month rollups (annualized / averages) for a few stable measures + # These are useful for statements analysis where single months can be noisy. + ratios["ttm_revenue"] = ratios["sales_revenue"].rolling(12, min_periods=6).sum() + ratios["avg_total_assets_12m"] = ratios["total_assets"].rolling(12, min_periods=6).mean() + ratios["asset_turnover_annual"] = _safe_div(ratios["ttm_revenue"], ratios["avg_total_assets_12m"]) + ratios["roa_annual"] = _safe_div(ratios["net_income"].rolling(12, min_periods=6).sum(), ratios["avg_total_assets_12m"]) + + return ratios + + +def _common_size_is(is_df: pd.DataFrame) -> pd.DataFrame: + wide = is_df.pivot_table(index="month", columns="line", values="amount", aggfunc="sum") + revenue = wide.get("Sales Revenue") + if revenue is None: + raise ValueError("Income statement missing 'Sales Revenue'") + + out_rows: list[dict[str, Any]] = [] + for month, row in wide.sort_index().iterrows(): + rev = float(row.get("Sales Revenue", np.nan)) + for line, amt in row.items(): + amt_f = float(amt) if pd.notna(amt) else np.nan + pct = (amt_f / rev) if (pd.notna(amt_f) and rev not in (0.0, np.nan)) else np.nan + out_rows.append( + { + "month": str(month), + "line": str(line), + "amount": amt_f, + 
"pct_of_revenue": pct, + } + ) + + out = pd.DataFrame(out_rows) + out = out.sort_values(["month", "line"]).reset_index(drop=True) + return out + + +def _common_size_bs(bs_df: pd.DataFrame) -> pd.DataFrame: + wide = bs_df.pivot_table(index="month", columns="line", values="amount", aggfunc="sum") + total_assets = wide.get("Total Assets") + if total_assets is None: + raise ValueError("Balance sheet missing 'Total Assets'") + + out_rows: list[dict[str, Any]] = [] + for month, row in wide.sort_index().iterrows(): + ta = float(row.get("Total Assets", np.nan)) + for line, amt in row.items(): + amt_f = float(amt) if pd.notna(amt) else np.nan + pct = (amt_f / ta) if (pd.notna(amt_f) and ta not in (0.0, np.nan)) else np.nan + out_rows.append( + { + "month": str(month), + "line": str(line), + "amount": amt_f, + "pct_of_total_assets": pct, + } + ) + + out = pd.DataFrame(out_rows) + out = out.sort_values(["month", "line"]).reset_index(drop=True) + return out + + +def _variance_bridge_latest(is_wide: pd.DataFrame) -> pd.DataFrame: + if len(is_wide) < 2: + return pd.DataFrame(columns=["component", "amount"]) + + cur = is_wide.iloc[-1] + prev = is_wide.iloc[-2] + + # Components explained in a way that matches the income statement identity. 
def analyze_ch22(*, datadir: Path, outdir: Path, seed: int | None = None) -> Outputs:
    """Run the Chapter 22 statement-analysis pipeline and write every artifact.

    Reads the monthly income statement, balance sheet and cash-flow CSVs from
    ``datadir``, then writes into ``outdir``: the monthly ratios table, the two
    common-size tables, the latest-month variance bridge, the assumptions
    table, the figures (under ``outdir / "figures"``) with their manifest, a
    markdown memo and a design JSON.

    Args:
        datadir: Folder holding ``statements_is_monthly.csv``,
            ``statements_bs_monthly.csv`` and ``statements_cf_monthly.csv``.
        outdir: Destination folder; created (with parents) if missing.
        seed: Passed to ``apply_seed`` and recorded in the design JSON.

    Returns:
        ``Outputs`` holding the paths of the written CSV, JSON and memo files.
    """
    apply_seed(seed)

    outdir.mkdir(parents=True, exist_ok=True)
    figdir = outdir / "figures"
    figdir.mkdir(parents=True, exist_ok=True)

    is_df = _read_statement(datadir, "statements_is_monthly.csv")
    bs_df = _read_statement(datadir, "statements_bs_monthly.csv")
    cf_df = _read_statement(datadir, "statements_cf_monthly.csv")

    # Statement lines pulled into wide form for the ratio builder.
    is_lines = ["Sales Revenue", "Cost of Goods Sold", "Operating Expenses", "Net Income"]
    bs_lines = [
        "Cash",
        "Accounts Receivable",
        "Inventory",
        "Accounts Payable",
        "Sales Tax Payable",
        "Wages Payable",
        "Payroll Taxes Payable",
        "Notes Payable",
        "Total Assets",
        "Total Equity",
    ]

    is_wide = _wide_by_line(is_df, lines=is_lines)
    bs_wide = _wide_by_line(bs_df, lines=bs_lines)

    # Ensure month alignment (outer join, deterministic ordering)
    wide = is_wide.merge(bs_wide, on="month", how="outer", suffixes=("_is", "_bs"))
    wide = wide.sort_values("month").reset_index(drop=True)

    # Re-split wide for ratio builder (expects original columns)
    is_wide_aligned = wide[["month"] + is_lines].copy()
    bs_wide_aligned = wide[["month"] + bs_lines].copy()

    ratios = _build_ratios(is_wide_aligned, bs_wide_aligned)
    ratios_monthly_csv = outdir / "ch22_ratios_monthly.csv"
    ratios.to_csv(ratios_monthly_csv, index=False)

    # Common-size tables are built from the long-format statements, not the aligned wide frames.
    cs_is = _common_size_is(is_df)
    common_size_is_csv = outdir / "ch22_common_size_is.csv"
    cs_is.to_csv(common_size_is_csv, index=False)

    cs_bs = _common_size_bs(bs_df)
    common_size_bs_csv = outdir / "ch22_common_size_bs.csv"
    cs_bs.to_csv(common_size_bs_csv, index=False)

    bridge = _variance_bridge_latest(is_wide_aligned)
    variance_bridge_latest_csv = outdir / "ch22_variance_bridge_latest.csv"
    bridge.to_csv(variance_bridge_latest_csv, index=False)

    assumptions = _assumptions_table()
    assumptions_csv = outdir / "ch22_assumptions.csv"
    assumptions.to_csv(assumptions_csv, index=False)

    # Small "cosmetic improvement" flag for the last month
    cf_wide = _wide_by_line(cf_df, lines=["Net Change in Cash"])
    cf_wide = cf_wide.sort_values("month").reset_index(drop=True)
    # NaN stands in for "no cash-flow rows"; the memo skips the bullet in that case.
    net_change_cash_last = float(cf_wide.iloc[-1]["Net Change in Cash"]) if len(cf_wide) else np.nan

    # -------------------------------
    # Figures + manifest
    # -------------------------------
    manifest_rows: list[FigureManifestRow] = []

    def _add_row(fig_path: Path, spec: FigureSpec) -> None:
        # Record one manifest entry per saved figure; the same guardrail note is attached to every row.
        manifest_rows.append(
            FigureManifestRow(
                filename=fig_path.name,
                chart_type=spec.chart_type,
                title=spec.title,
                x_label=spec.x_label,
                y_label=spec.y_label,
                data_source=spec.data_source,
                guardrail_note=(
                    "Ratios are signals, not proofs. Avoid causal overclaiming; "
                    "validate with reconciliations and operational detail."
                ),
            )
        )

    with style_context():
        # Profitability (margins)
        fig1 = plot_time_series(
            ratios,
            x="month",
            series={
                "Gross margin": "gross_margin",
                "Operating margin": "operating_margin",
            },
            title="Chapter 22 — Profitability (gross vs operating margin)",
            x_label="Month",
            y_label="Ratio",
            show_zero_line=True,
        )
        spec1 = FigureSpec(
            chart_type="line",
            title="Chapter 22 — Profitability (gross vs operating margin)",
            x_label="Month",
            y_label="Ratio",
            data_source="statements_is_monthly.csv",
            notes="Margins summarize price/cost vs overhead; treat as signals.",
        )
        fig1_path = figdir / "ch22_fig_profitability_margins.png"
        save_figure(fig1, fig1_path, spec=spec1)
        _add_row(fig1_path, spec1)

        # Liquidity
        fig2 = plot_time_series(
            ratios,
            x="month",
            series={
                "Current ratio": "current_ratio",
                "Quick ratio": "quick_ratio",
            },
            title="Chapter 22 — Liquidity (current vs quick ratio)",
            x_label="Month",
            y_label="Ratio",
            show_zero_line=True,
        )
        spec2 = FigureSpec(
            chart_type="line",
            title="Chapter 22 — Liquidity (current vs quick ratio)",
            x_label="Month",
            y_label="Ratio",
            data_source="statements_bs_monthly.csv",
            notes="Liquidity ratios depend on current assets and current liabilities.",
        )
        fig2_path = figdir / "ch22_fig_liquidity_ratios.png"
        save_figure(fig2, fig2_path, spec=spec2)
        _add_row(fig2_path, spec2)

        # Leverage
        fig3 = plot_time_series(
            ratios,
            x="month",
            series={"Debt to equity": "debt_to_equity"},
            title="Chapter 22 — Leverage (debt-to-equity)",
            x_label="Month",
            y_label="Ratio",
            show_zero_line=True,
        )
        spec3 = FigureSpec(
            chart_type="line",
            title="Chapter 22 — Leverage (debt-to-equity)",
            x_label="Month",
            y_label="Ratio",
            data_source="statements_bs_monthly.csv",
            notes="Debt-to-equity uses Notes Payable and Total Equity.",
        )
        fig3_path = figdir / "ch22_fig_debt_to_equity.png"
        save_figure(fig3, fig3_path, spec=spec3)
        _add_row(fig3_path, spec3)

        # Efficiency (cash conversion cycle)
        fig4 = plot_time_series(
            ratios,
            x="month",
            series={
                "DSO": "dso_days",
                "DIO": "dio_days",
                "DPO": "dpo_days",
                "CCC": "cash_conversion_cycle_days",
            },
            title="Chapter 22 — Working capital efficiency (days)",
            x_label="Month",
            y_label="Days",
            show_zero_line=True,
        )
        spec4 = FigureSpec(
            chart_type="line",
            title="Chapter 22 — Working capital efficiency (days)",
            x_label="Month",
            y_label="Days",
            data_source="statements_is_monthly.csv + statements_bs_monthly.csv",
            notes="Days metrics use a 28-day month teaching constant (Track D convention).",
        )
        fig4_path = figdir / "ch22_fig_working_capital_days.png"
        save_figure(fig4, fig4_path, spec=spec4)
        _add_row(fig4_path, spec4)

        # Net income variance bridge (latest month vs prior)
        if not bridge.empty:
            # First and last bridge rows are the start/end totals; the rows between are the drivers.
            start_value = float(bridge.iloc[0]["amount"])
            end_value = float(bridge.iloc[-1]["amount"])
            components = [
                (str(r["component"]), float(r["amount"]))
                for _, r in bridge.iloc[1:-1].iterrows()
            ]
            fig5 = plot_waterfall_bridge(
                start_label="Prior net income",
                end_label="Current net income",
                start_value=start_value,
                end_value=end_value,
                components=components,
                title="Chapter 22 — Net income variance bridge (latest vs prior month)",
                y_label="Net income",
                x_label="Component",
            )
            spec5 = FigureSpec(
                chart_type="waterfall_bridge",
                title="Chapter 22 — Net income variance bridge (latest vs prior month)",
                x_label="Component",
                y_label="Net income",
                data_source="statements_is_monthly.csv",
                notes="Bridge decomposes month-over-month net income change into key drivers.",
            )
            fig5_path = figdir / "ch22_fig_net_income_bridge_latest.png"
            save_figure(fig5, fig5_path, spec=spec5)
            _add_row(fig5_path, spec5)

    figures_manifest_csv = outdir / "ch22_figures_manifest.csv"
    # Each manifest row is serialised through its attribute dict (one CSV column per field).
    pd.DataFrame([r.__dict__ for r in manifest_rows]).to_csv(
        figures_manifest_csv, index=False
    )

    # -------------------------------
    # Design + memo
    # -------------------------------
    latest_month = str(ratios["month"].iloc[-1]) if len(ratios) else ""
    memo_lines = [
        f"# {CHAPTER} — Financial statement analysis toolkit\n",
        f"Most recent month in dataset: **{latest_month}**\n",
        "\n",
        "## What to look at first\n",
        "1. **Margins**: gross vs operating margin (price/cost vs overhead).\n",
        "2. **Liquidity**: current and quick ratios (can we pay near-term bills?).\n",
        "3. **Working capital days**: DSO/DIO/DPO (cash timing drivers).\n",
        "4. **Variance bridge**: what changed month-over-month (revenue vs costs).\n",
        "\n",
        "## Guardrails\n",
        "* Ratios do not *prove* causes. Treat them as signals to investigate.\n",
        "* If revenue is small in a month, percent metrics can swing wildly.\n",
        "* A profit improvement can be **cosmetic** if AR/inventory grow faster than sales.\n",
        "\n",
        "## Quick flags (latest month)\n",
    ]

    # Month-over-month flags need at least two rows of ratios.
    if len(ratios) >= 2:
        ni_delta = float(ratios["net_income"].iloc[-1] - ratios["net_income"].iloc[-2])
        ccc = float(ratios["cash_conversion_cycle_days"].iloc[-1])
        memo_lines.append(f"* Net income change vs prior month: {ni_delta:,.0f}\n")
        memo_lines.append(f"* Cash conversion cycle (days): {ccc:,.1f}\n")

    if pd.notna(net_change_cash_last):
        memo_lines.append(f"* Net change in cash (cash flow statement): {net_change_cash_last:,.0f}\n")

    memo_lines.append("\n")

    memo_md = outdir / "ch22_memo.md"
    memo_md.write_text("".join(memo_lines), encoding="utf-8")

    # Machine-readable record of inputs, outputs and the definitions used by this run.
    design = {
        "chapter": "22",
        "chapter_name": "Financial statement analysis toolkit",
        "inputs": {
            "statements_is_monthly": "statements_is_monthly.csv",
            "statements_bs_monthly": "statements_bs_monthly.csv",
            "statements_cf_monthly": "statements_cf_monthly.csv",
        },
        "outputs": {
            "ratios_monthly_csv": str(ratios_monthly_csv),
            "common_size_is_csv": str(common_size_is_csv),
            "common_size_bs_csv": str(common_size_bs_csv),
            "variance_bridge_latest_csv": str(variance_bridge_latest_csv),
            "assumptions_csv": str(assumptions_csv),
            "figures_manifest_csv": str(figures_manifest_csv),
            "memo_md": str(memo_md),
        },
        "definitions": {
            "days_per_month": DAYS_PER_MONTH,
            "current_liabilities": "AP + Sales Tax Payable + Wages Payable + Payroll Taxes Payable",
            "current_assets": "Cash + Accounts Receivable + Inventory",
        },
        "seed": seed,
        "deterministic": True,
    }

    design_json = outdir / "ch22_design.json"
    design_json.write_text(json.dumps(design, indent=2), encoding="utf-8")

    return Outputs(
        ratios_monthly_csv=ratios_monthly_csv,
        common_size_is_csv=common_size_is_csv,
        common_size_bs_csv=common_size_bs_csv,
        variance_bridge_latest_csv=variance_bridge_latest_csv,
        assumptions_csv=assumptions_csv,
        design_json=design_json,
        memo_md=memo_md,
        figures_manifest_csv=figures_manifest_csv,
    )
def _wide_by_line(df_long: pd.DataFrame, *, lines: list[str]) -> pd.DataFrame:
    """Pivot a long statement into one row per month with a column per requested line.

    Rows whose ``line`` is not in ``lines`` are dropped, repeated (month, line)
    amounts are summed, and any requested line with no data is added as a
    column of zeros. An empty input gives an empty frame with ``month`` plus
    the requested columns.
    """
    if df_long.empty:
        return pd.DataFrame(columns=["month", *lines])

    # Work on string-typed keys without touching the caller's frame.
    keyed = df_long.assign(
        month=df_long["month"].astype(str),
        line=df_long["line"].astype(str),
    )
    wanted = keyed.loc[keyed["line"].isin(lines)]

    pivoted = wanted.pivot_table(
        index="month",
        columns="line",
        values="amount",
        aggfunc="sum",
        fill_value=0.0,
    )
    pivoted = pivoted.reset_index().sort_values("month").reset_index(drop=True)

    # Requested lines that never appeared still get a (zero) column.
    for absent in (name for name in lines if name not in pivoted.columns):
        pivoted[absent] = 0.0
    return pivoted
def _snapshot_metrics(*, is_wide: pd.DataFrame, bs_wide: pd.DataFrame) -> dict[str, Any]:
    """Compute a small "latest month" snapshot for pre-filling templates.

    Args:
        is_wide: Income statement, one row per month, one column per line.
        bs_wide: Balance sheet in the same wide layout.

    Returns:
        A dict with ``month`` plus revenue, net income, margins, cash, the
        days metrics (``dso_days``, ``dio_days``, ``dpo_days``, ``ccc_days``)
        and the liquidity ratios for the last month after aligning both
        statements. Every numeric entry is NaN when either input is empty,
        and a ratio is NaN whenever its denominator is zero.
    """
    if len(is_wide) == 0 or len(bs_wide) == 0:
        return {
            "month": "(unknown)",
            "revenue": float("nan"),
            "net_income": float("nan"),
            "gross_margin": float("nan"),
            "operating_margin": float("nan"),
            "cash": float("nan"),
            "dso_days": float("nan"),
            "dio_days": float("nan"),
            "dpo_days": float("nan"),
            "ccc_days": float("nan"),
            "current_ratio": float("nan"),
            "quick_ratio": float("nan"),
        }

    # Align months (outer join, deterministic ordering)
    wide = is_wide.merge(bs_wide, on="month", how="outer")
    wide = wide.sort_values("month").reset_index(drop=True)
    last = wide.iloc[-1]

    rev = float(last.get("Sales Revenue", 0.0))
    cogs = float(last.get("Cost of Goods Sold", 0.0))
    net = float(last.get("Net Income", 0.0))

    cash = float(last.get("Cash", 0.0))
    ar = float(last.get("Accounts Receivable", 0.0))
    inv = float(last.get("Inventory", 0.0))
    ap = float(last.get("Accounts Payable", 0.0))
    stp = float(last.get("Sales Tax Payable", 0.0))
    wp = float(last.get("Wages Payable", 0.0))
    ptp = float(last.get("Payroll Taxes Payable", 0.0))

    gross_margin = _safe_div(rev - cogs, rev)
    operating_margin = _safe_div(net, rev)

    # Days approximations (month-level): balance / (monthly flow / 30).
    # The monthly flow is converted to a daily rate exactly once; scaling the
    # quotient by 30 again would overstate every days metric thirty-fold.
    # _safe_div already returns NaN for a zero denominator.
    days_in_month = 30.0
    dso = _safe_div(ar, rev / days_in_month)
    dio = _safe_div(inv, cogs / days_in_month)
    dpo = _safe_div(ap, cogs / days_in_month)
    ccc = dso + dio - dpo if np.isfinite(dso) and np.isfinite(dio) and np.isfinite(dpo) else float("nan")

    current_assets = cash + ar + inv
    current_liabilities = ap + stp + wp + ptp
    current_ratio = _safe_div(current_assets, current_liabilities)
    quick_ratio = _safe_div(cash + ar, current_liabilities)

    return {
        "month": str(last.get("month", "(unknown)")),
        "revenue": rev,
        "net_income": net,
        "gross_margin": gross_margin,
        "operating_margin": operating_margin,
        "cash": cash,
        "dso_days": dso,
        "dio_days": dio,
        "dpo_days": dpo,
        "ccc_days": ccc,
        "current_ratio": current_ratio,
        "quick_ratio": quick_ratio,
    }
+ cols = [ + "kpi_name", + "definition", + "formula", + "source_table", + "source_columns", + "owner_role", + "update_cadence", + "threshold_green", + "threshold_yellow", + "threshold_red", + "notes", + ] + + rows: list[dict[str, Any]] = [ + { + "kpi_name": "Revenue (monthly)", + "definition": "Sales revenue for the month (accrual).", + "formula": "Sales Revenue", + "source_table": "statements_is_monthly.csv", + "source_columns": "month,line_item,value (Sales Revenue)", + "owner_role": "FP&A / Controller", + "update_cadence": "Monthly close", + "threshold_green": "(set by plan)", + "threshold_yellow": "(set by plan)", + "threshold_red": "(set by plan)", + "notes": "Confirm revenue recognition policy; document one-offs.", + }, + { + "kpi_name": "Gross margin %", + "definition": "Gross profit as a percent of revenue.", + "formula": "(Revenue - COGS) / Revenue", + "source_table": "statements_is_monthly.csv", + "source_columns": "month,line_item,value (Sales Revenue, Cost of Goods Sold)", + "owner_role": "FP&A / Controller", + "update_cadence": "Monthly close", + "threshold_green": ">= target", + "threshold_yellow": "near target", + "threshold_red": "below target", + "notes": "Track price/volume/mix; validate inventory/COGS timing.", + }, + { + "kpi_name": "Operating margin %", + "definition": "Net income as a percent of revenue (simplified).", + "formula": "Net Income / Revenue", + "source_table": "statements_is_monthly.csv", + "source_columns": "month,line_item,value (Net Income, Sales Revenue)", + "owner_role": "FP&A / Controller", + "update_cadence": "Monthly close", + "threshold_green": ">= target", + "threshold_yellow": "near target", + "threshold_red": "below target", + "notes": "Separate one-offs; avoid causal over-claims.", + }, + { + "kpi_name": "Net income (monthly)", + "definition": "Bottom-line profit for the month.", + "formula": "Net Income", + "source_table": "statements_is_monthly.csv", + "source_columns": "month,line_item,value (Net Income)", + 
"owner_role": "Controller", + "update_cadence": "Monthly close", + "threshold_green": "(set by plan)", + "threshold_yellow": "(set by plan)", + "threshold_red": "(set by plan)", + "notes": "Review unusual items; link to variance explanations.", + }, + { + "kpi_name": "Cash balance", + "definition": "Cash on hand at period end.", + "formula": "Cash", + "source_table": "statements_bs_monthly.csv", + "source_columns": "month,line_item,value (Cash)", + "owner_role": "Treasury / Controller", + "update_cadence": "Weekly (rolling) + Month-end", + "threshold_green": "> buffer", + "threshold_yellow": "near buffer", + "threshold_red": "below buffer", + "notes": "Tie to bank reconciliation; define trigger thresholds.", + }, + { + "kpi_name": "Current ratio", + "definition": "Short-term liquidity (current assets / current liabilities).", + "formula": "(Cash + AR + Inventory) / (AP + sales tax payable + wages payable + payroll taxes payable)", + "source_table": "statements_bs_monthly.csv", + "source_columns": "month,line_item,value (Cash, Accounts Receivable, Inventory, Accounts Payable, Sales Tax Payable, Wages Payable, Payroll Taxes Payable)", + "owner_role": "Controller", + "update_cadence": "Monthly close", + "threshold_green": ">= policy", + "threshold_yellow": "watch", + "threshold_red": "below policy", + "notes": "Inventory quality matters; adjust for slow/obsolete stock if needed.", + }, + { + "kpi_name": "Quick ratio", + "definition": "Liquidity excluding inventory (cash + AR) / current liabilities.", + "formula": "(Cash + AR) / (AP + sales tax payable + wages payable + payroll taxes payable)", + "source_table": "statements_bs_monthly.csv", + "source_columns": "month,line_item,value (Cash, Accounts Receivable, Accounts Payable, Sales Tax Payable, Wages Payable, Payroll Taxes Payable)", + "owner_role": "Controller", + "update_cadence": "Monthly close", + "threshold_green": ">= policy", + "threshold_yellow": "watch", + "threshold_red": "below policy", + "notes": "Good 
def _dashboard_spec() -> pd.DataFrame:
    """Return the starter dashboard specification, one row per dashboard tile.

    Each row names the panel, the metric shown, the chart type, the time
    grain, the owner, the decision the tile supports and its guardrail.
    """
    fields = ("panel", "metric", "chart", "grain", "owner", "decision", "guardrail")
    tiles = (
        (
            "Performance",
            "Revenue",
            "line",
            "monthly",
            "FP&A",
            "Are we on plan?",
            "Use consistent time windows; annotate one-offs.",
        ),
        (
            "Margins",
            "Gross margin %",
            "line",
            "monthly",
            "FP&A",
            "Is pricing/COGS behaving?",
            "Validate inventory/COGS; do not hide reclasses.",
        ),
        (
            "Cash",
            "Cash balance",
            "line",
            "weekly/13-week",
            "Controller",
            "Do we need a cash action now?",
            "Tie to bank; make buffer trigger explicit.",
        ),
        (
            "Working capital",
            "Cash conversion cycle (days)",
            "line",
            "monthly",
            "Controller + Ops",
            "Where is cash trapped?",
            "Directional only; validate AR/AP aging.",
        ),
    )
    # Pair every tile with the field names so the frame keeps the same column order.
    return pd.DataFrame([dict(zip(fields, tile)) for tile in tiles])
def analyze_ch23(*, datadir: Path, outdir: Path, seed: int | None = None) -> Outputs:
    """Write the Chapter 23 communication and governance templates.

    Reads the monthly income statement and balance sheet from ``datadir``,
    computes a latest-month snapshot, and writes the memo template, KPI
    governance table, dashboard spec, red-team checklist and a design JSON
    into ``outdir``.

    Args:
        datadir: Folder holding ``statements_is_monthly.csv`` and
            ``statements_bs_monthly.csv``.
        outdir: Destination folder; created (with parents) if missing.
        seed: Passed to ``apply_seed`` and recorded in the design JSON.

    Returns:
        ``Outputs`` holding the paths of the five written files.
    """
    apply_seed(seed)

    outdir.mkdir(parents=True, exist_ok=True)

    is_df = _read_statement(datadir, "statements_is_monthly.csv")
    bs_df = _read_statement(datadir, "statements_bs_monthly.csv")

    is_lines = ["Sales Revenue", "Cost of Goods Sold", "Operating Expenses", "Net Income"]
    bs_lines = [
        "Cash",
        "Accounts Receivable",
        "Inventory",
        "Accounts Payable",
        "Sales Tax Payable",
        "Wages Payable",
        "Payroll Taxes Payable",
    ]

    is_wide = _wide_by_line(is_df, lines=is_lines)
    bs_wide = _wide_by_line(bs_df, lines=bs_lines)

    snapshot = _snapshot_metrics(is_wide=is_wide, bs_wide=bs_wide)

    memo_template_md = outdir / "ch23_memo_template.md"
    memo_template_md.write_text(_memo_template(snapshot=snapshot), encoding="utf-8")

    kpi_governance_template_csv = outdir / "ch23_kpi_governance_template.csv"
    _kpi_governance_table().to_csv(kpi_governance_template_csv, index=False)

    dashboard_spec_template_csv = outdir / "ch23_dashboard_spec_template.csv"
    _dashboard_spec().to_csv(dashboard_spec_template_csv, index=False)

    red_team_checklist_md = outdir / "ch23_red_team_checklist.md"
    red_team_checklist_md.write_text(_red_team_checklist(), encoding="utf-8")

    def _json_number(key: str) -> float | None:
        # NaN/Infinity are not valid JSON tokens; emit null so strict parsers can read the file.
        value = float(snapshot.get(key, float("nan")))
        return value if np.isfinite(value) else None

    design_json = outdir / "ch23_design.json"
    design: dict[str, Any] = {
        "chapter": CHAPTER,
        "seed": seed,
        # as_posix() yields forward slashes on every platform, keeping the
        # recorded paths identical between Windows and POSIX runs.
        "datadir": datadir.as_posix(),
        "outdir": outdir.as_posix(),
        "artifacts": [
            memo_template_md.name,
            kpi_governance_template_csv.name,
            dashboard_spec_template_csv.name,
            red_team_checklist_md.name,
        ],
        "snapshot": {
            "month": snapshot.get("month"),
            "revenue": _json_number("revenue"),
            "net_income": _json_number("net_income"),
            "cash": _json_number("cash"),
            "gross_margin": _json_number("gross_margin"),
            "operating_margin": _json_number("operating_margin"),
        },
        "guardrails": [
            "Do not overclaim causality; prefer 'associated with'.",
            "Document reclasses and one-offs in an assumptions log.",
            "Tie cash numbers to a bank reconciliation.",
        ],
    }
    design_json.write_text(json.dumps(design, indent=2, sort_keys=True) + "\n", encoding="utf-8")

    return Outputs(
        memo_template_md=memo_template_md,
        kpi_governance_template_csv=kpi_governance_template_csv,
        dashboard_spec_template_csv=dashboard_spec_template_csv,
        red_team_checklist_md=red_team_checklist_md,
        design_json=design_json,
    )
def check_bs_equation(bs: pd.DataFrame) -> dict[str, Any]:
    """Check that Total Assets equals Total Liabilities + Equity in every month.

    Args:
        bs: Long-format balance sheet with ``month``, ``line`` and ``amount``.

    Returns:
        A dict with ``balance_sheet_equation_balances`` (bool),
        ``balance_sheet_max_abs_diff`` (float) and ``n_months_checked`` (int).
        All three are ``None`` when the check cannot be performed: the frame
        is empty, a required column is missing, or no month reports either
        total line. A month reporting only one of the two totals is compared
        against 0.0 for the missing side, so it is flagged unless the
        reported total is itself zero.
    """
    unknown: dict[str, Any] = {
        "balance_sheet_equation_balances": None,
        "balance_sheet_max_abs_diff": None,
        "n_months_checked": None,
    }
    if bs.empty or not {"month", "line", "amount"}.issubset(bs.columns):
        return unknown

    assets_line = "Total Assets"
    lpe_line = "Total Liabilities + Equity"

    # Single pass: remember the first reported amount per (month, total line)
    # instead of re-filtering the whole frame for every month.
    totals: dict[tuple[str, str], float] = {}
    for month, line, amount in zip(bs["month"].astype(str), bs["line"].astype(str), bs["amount"]):
        if line not in (assets_line, lpe_line):
            continue
        key = (month, line)
        if key not in totals:
            totals[key] = float(amount)

    # A month with neither total is not evidence of a balanced sheet; scoring
    # it as 0 - 0 would report success without having checked anything.
    months = sorted({month for month, _ in totals})
    if not months:
        return unknown

    diffs = [
        abs(totals.get((month, assets_line), 0.0) - totals.get((month, lpe_line), 0.0))
        for month in months
    ]
    max_diff = float(np.max(diffs))
    return {
        "balance_sheet_equation_balances": bool(max_diff <= 1e-6),
        "balance_sheet_max_abs_diff": max_diff,
        "n_months_checked": int(len(diffs)),
    }
"n_months_checked": None, + } + ) + + payload = {"schema": schema_report, "checks": checks, "metrics": metrics} + outpath = outdir / "business_validate_summary.json" + outpath.write_text(json.dumps(payload, indent=2), encoding="utf-8") + + return ValidateSummary(checks=checks, metrics=metrics, schema=schema_report) + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser(description="Track D: validate dataset schema + basic accounting checks.") + p.add_argument("--datadir", type=Path, required=True) + p.add_argument("--outdir", type=Path, required=True) + p.add_argument("--dataset", type=str, default=DATASET_NSO_V1) + p.add_argument("--seed", type=int, default=None) + return p.parse_args(argv) + + +def main(argv: list[str] | None = None) -> None: + args = parse_args(argv) + summary = validate_dataset(args.datadir, args.outdir, dataset=args.dataset, seed=args.seed) + + print("\nSchema OK:", summary.checks.get("schema_ok")) + missing = summary.checks.get("schema_missing_tables", []) + if missing: + print("Missing tables:", ", ".join(map(str, missing))) + + print("Transactions balanced:", summary.checks.get("transactions_balanced")) + print("BS equation balances:", summary.checks.get("balance_sheet_equation_balances")) + print(f"\nWrote -> {args.outdir / 'business_validate_summary.json'}") + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d00_peek_data.py b/workbooks/track_d_template/scripts/d00_peek_data.py new file mode 100644 index 0000000..2d442d8 --- /dev/null +++ b/workbooks/track_d_template/scripts/d00_peek_data.py @@ -0,0 +1,125 @@ +"""Track D workbook helper: peek at the (canonical) datasets. 
+ +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* d00_peek_data_summary.md + +This script is meant to be run inside a Track D workbook folder created by: + + pystatsv1 workbook init --track d + +It looks for the two Track D synthetic datasets under: + + data/synthetic/ledgerlab_ch01/ + data/synthetic/nso_v1/ + +For the Track D student experience, these datasets are intended to be stable and +repeatable (seed=123). + +What it does: +- lists the available CSV tables +- prints shapes + column names +- prints a small preview of each table +- writes a summary report under outputs/track_d/""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +import pandas as pd + + +def _preview_csv(path: Path, n: int = 5) -> str: + df = pd.read_csv(path) + head = df.head(n) + return ( + f"{path.name}: rows={len(df)} cols={len(df.columns)}\n" + f"columns: {', '.join(map(str, df.columns))}\n" + f"preview:\n{head.to_string(index=False)}\n" + ) + + +def _peek_dataset(name: str, folder: Path, preview_rows: int) -> tuple[str, list[str]]: + if not folder.exists(): + msg = ( + f"⚠️ Missing dataset folder: {folder}\n" + "If you just created this workbook, you may be on an older PyStatsV1 version.\n" + "Update, then re-run workbook init:\n\n" + " python -m pip install -U pystatsv1\n" + " pystatsv1 workbook init --track d --dest pystatsv1_track_d --force\n" + ) + return msg, [msg] + + csvs = sorted(folder.glob("*.csv")) + if not csvs: + msg = ( + f"⚠️ No CSV files found in: {folder}\n" + "This workbook expects canonical datasets to exist under data/synthetic/.\n" + ) + return msg, [msg] + + lines: list[str] = [] + print(f"\n== {name} ==") + lines.append(f"## {name}\n") + lines.append(f"Folder: {folder}\n") + + for csv in csvs: + block = _preview_csv(csv, n=preview_rows) + print(block) + lines.append(f"### {csv.name}\n") + lines.append("```\n") + lines.append(block.rstrip()) + lines.append("\n```\n") + + return "OK", lines + + +def 
main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description="Peek at Track D datasets (seed=123).") + p.add_argument( + "--root", + default="data/synthetic", + help="Dataset root (default: data/synthetic).", + ) + p.add_argument( + "--outdir", + default="outputs/track_d", + help="Where to write the summary report (default: outputs/track_d).", + ) + p.add_argument( + "--preview-rows", + type=int, + default=5, + help="Number of rows to preview per table (default: 5).", + ) + + args = p.parse_args(argv) + + root = Path(args.root) + outdir = Path(args.outdir) + outdir.mkdir(parents=True, exist_ok=True) + + sections: list[str] = [] + sections.append("# Track D dataset peek (seed=123)\n") + + _status, lines = _peek_dataset( + "LedgerLab (Ch01)", root / "ledgerlab_ch01", preview_rows=args.preview_rows + ) + sections.extend(lines) + + _status, lines = _peek_dataset( + "NSO v1 running case", root / "nso_v1", preview_rows=args.preview_rows + ) + sections.extend(lines) + + report = outdir / "d00_peek_data_summary.md" + report.write_text("\n".join(sections).rstrip() + "\n", encoding="utf-8") + + print(f"\n✅ Wrote summary: {report}") + print("Tip: If you edited data/synthetic, run: pystatsv1 workbook run d00_setup_data --force") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/workbooks/track_d_template/scripts/d00_setup_data.py b/workbooks/track_d_template/scripts/d00_setup_data.py new file mode 100644 index 0000000..3dbc7b4 --- /dev/null +++ b/workbooks/track_d_template/scripts/d00_setup_data.py @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: MIT +"""Track D workbook helper: (re)generate the synthetic datasets. 
+ +Artifacts written to ``--outdir``: + +* data/synthetic/ledgerlab_ch01/ (LedgerLab seed=123 tables) +* data/synthetic/nso_v1/ (NSO v1 seed=123 tables) +* outputs/track_d/d00_setup_data_validate/ (optional validation artifacts) + +Normally you do **not** need this because Track D canonical datasets (seed=123) +are shipped and extracted during: + + pystatsv1 workbook init --track d + +Use this script if you deleted/modified files under data/synthetic and want to +reset them, or if you want to confirm determinism.""" + +from __future__ import annotations + +import argparse +import shutil +import subprocess +import sys +from pathlib import Path + + +def _rm_tree(path: Path) -> None: + if path.exists(): + shutil.rmtree(path) + + +def _run(script_path: Path, args: list[str]) -> None: + subprocess.run([sys.executable, str(script_path), *args], check=True) + + +def main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description="(Re)generate Track D datasets (deterministic).") + p.add_argument( + "--seed", + type=int, + default=123, + help="Random seed (default: 123). 
Keep 123 to match the canonical datasets.", + ) + p.add_argument( + "--root", + default="data/synthetic", + help="Dataset root folder (default: data/synthetic).", + ) + p.add_argument( + "--force", + action="store_true", + help="Delete existing dataset folders before regenerating.", + ) + p.add_argument( + "--no-validate", + action="store_true", + help="Skip the NSO dataset validation step.", + ) + + args = p.parse_args(argv) + + root = Path(args.root) + ledger_dir = root / "ledgerlab_ch01" + nso_dir = root / "nso_v1" + + if args.force: + _rm_tree(ledger_dir) + _rm_tree(nso_dir) + + ledger_dir.mkdir(parents=True, exist_ok=True) + nso_dir.mkdir(parents=True, exist_ok=True) + + scripts_dir = Path(__file__).resolve().parent + _run( + scripts_dir / "sim_business_ledgerlab.py", + ["--outdir", str(ledger_dir), "--seed", str(args.seed)], + ) + _run( + scripts_dir / "sim_business_nso_v1.py", + ["--outdir", str(nso_dir), "--seed", str(args.seed)], + ) + + if not args.no_validate: + outdir = Path("outputs/track_d") / "d00_setup_data_validate" + outdir.mkdir(parents=True, exist_ok=True) + _run( + scripts_dir / "business_validate_dataset.py", + [ + "--datadir", + str(nso_dir), + "--outdir", + str(outdir), + "--dataset", + "nso_v1", + "--seed", + str(args.seed), + ], + ) + + print("\n✅ Datasets ready under:", root) + print(" -", ledger_dir) + print(" -", nso_dir) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/workbooks/track_d_template/scripts/d01.py b/workbooks/track_d_template/scripts/d01.py new file mode 100644 index 0000000..03ab054 --- /dev/null +++ b/workbooks/track_d_template/scripts/d01.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch01_accounting_measurement.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch01_cash_balance.png +* business_ch01_balance_sheet_bar.png +* business_ch01_summary.json + + +Use either: + pystatsv1 workbook run d01 +or: + pystatsv1 workbook run business_ch01_accounting_measurement +""" + +from __future__ import annotations + +from scripts.business_ch01_accounting_measurement import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d02.py b/workbooks/track_d_template/scripts/d02.py new file mode 100644 index 0000000..1742e85 --- /dev/null +++ b/workbooks/track_d_template/scripts/d02.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch02_double_entry_and_gl.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch02_gl_tidy.csv +* business_ch02_trial_balance.csv +* business_ch02_account_rollup.csv +* business_ch02_tb_by_account.png +* business_ch02_summary.json + + +Use either: + pystatsv1 workbook run d02 +or: + pystatsv1 workbook run business_ch02_double_entry_and_gl +""" + +from __future__ import annotations + +from scripts.business_ch02_double_entry_and_gl import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d03.py b/workbooks/track_d_template/scripts/d03.py new file mode 100644 index 0000000..e3f59eb --- /dev/null +++ b/workbooks/track_d_template/scripts/d03.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch03_statements_as_summaries.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch03_summary.json +* business_ch03_statement_bridge.csv +* business_ch03_trial_balance.csv +* business_ch03_net_income_vs_cash_change.png + + +Use either: + pystatsv1 workbook run d03 +or: + pystatsv1 workbook run business_ch03_statements_as_summaries +""" + +from __future__ import annotations + +from scripts.business_ch03_statements_as_summaries import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d04.py b/workbooks/track_d_template/scripts/d04.py new file mode 100644 index 0000000..df11cb8 --- /dev/null +++ b/workbooks/track_d_template/scripts/d04.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch04_assets_inventory_fixed_assets.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch04_inventory_rollforward.csv +* business_ch04_margin_bridge.csv +* business_ch04_depreciation_rollforward.csv +* business_ch04_summary.json +* business_ch04_gross_margin_over_time.png +* business_ch04_depreciation_over_time.png + + +Use either: + pystatsv1 workbook run d04 +or: + pystatsv1 workbook run business_ch04_assets_inventory_fixed_assets +""" + +from __future__ import annotations + +from scripts.business_ch04_assets_inventory_fixed_assets import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d05.py b/workbooks/track_d_template/scripts/d05.py new file mode 100644 index 0000000..5ecdc9f --- /dev/null +++ b/workbooks/track_d_template/scripts/d05.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch05_liabilities_payroll_taxes_equity.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* business_ch05_summary.json +* business_ch05_wages_payable_rollforward.csv +* business_ch05_payroll_taxes_payable_rollforward.csv +* business_ch05_sales_tax_payable_rollforward.csv +* business_ch05_notes_payable_rollforward.csv +* business_ch05_accounts_payable_rollforward.csv +* business_ch05_liabilities_over_time.png + + +Use either: + pystatsv1 workbook run d05 +or: + pystatsv1 workbook run business_ch05_liabilities_payroll_taxes_equity +""" + +from __future__ import annotations + +from scripts.business_ch05_liabilities_payroll_taxes_equity import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d06.py b/workbooks/track_d_template/scripts/d06.py new file mode 100644 index 0000000..27a1d3c --- /dev/null +++ b/workbooks/track_d_template/scripts/d06.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch06_reconciliations_quality_control.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* ar_rollforward.csv +* bank_recon_matches.csv +* bank_recon_exceptions.csv +* ch06_summary.json + + +Use either: + pystatsv1 workbook run d06 +or: + pystatsv1 workbook run business_ch06_reconciliations_quality_control +""" + +from __future__ import annotations + +from scripts.business_ch06_reconciliations_quality_control import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d07.py b/workbooks/track_d_template/scripts/d07.py new file mode 100644 index 0000000..cf2f726 --- /dev/null +++ b/workbooks/track_d_template/scripts/d07.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch07_preparing_accounting_data_for_analysis.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* gl_tidy.csv +* gl_monthly_summary.csv +* ch07_summary.json + + +Use either: + pystatsv1 workbook run d07 +or: + pystatsv1 workbook run business_ch07_preparing_accounting_data_for_analysis +""" + +from __future__ import annotations + +from scripts.business_ch07_preparing_accounting_data_for_analysis import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d08.py b/workbooks/track_d_template/scripts/d08.py new file mode 100644 index 0000000..689cd9f --- /dev/null +++ b/workbooks/track_d_template/scripts/d08.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch08_descriptive_statistics_financial_performance.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* gl_kpi_monthly.csv +* ar_monthly_metrics.csv +* ar_payment_slices.csv +* ar_days_stats.csv +* ch08_summary.json + + +Use either: + pystatsv1 workbook run d08 +or: + pystatsv1 workbook run business_ch08_descriptive_statistics_financial_performance +""" + +from __future__ import annotations + +from scripts.business_ch08_descriptive_statistics_financial_performance import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d09.py b/workbooks/track_d_template/scripts/d09.py new file mode 100644 index 0000000..4ead50e --- /dev/null +++ b/workbooks/track_d_template/scripts/d09.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch09_reporting_style_contract.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch09_style_contract.json +* ch09_figures_manifest.csv +* ch09_executive_memo.md +* ch09_summary.json + + +Use either: + pystatsv1 workbook run d09 +or: + pystatsv1 workbook run business_ch09_reporting_style_contract +""" + +from __future__ import annotations + +from scripts.business_ch09_reporting_style_contract import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d10.py b/workbooks/track_d_template/scripts/d10.py new file mode 100644 index 0000000..eb67399 --- /dev/null +++ b/workbooks/track_d_template/scripts/d10.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch10_probability_risk.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch10_figures_manifest.csv +* ch10_risk_memo.md +* ch10_risk_summary.json + + +Use either: + pystatsv1 workbook run d10 +or: + pystatsv1 workbook run business_ch10_probability_risk +""" + +from __future__ import annotations + +from scripts.business_ch10_probability_risk import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d11.py b/workbooks/track_d_template/scripts/d11.py new file mode 100644 index 0000000..a6cea97 --- /dev/null +++ b/workbooks/track_d_template/scripts/d11.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch11_sampling_estimation_audit_controls.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch11_sampling_plan.json +* ch11_sampling_summary.json +* ch11_audit_memo.md +* ch11_figures_manifest.csv + + +Use either: + pystatsv1 workbook run d11 +or: + pystatsv1 workbook run business_ch11_sampling_estimation_audit_controls +""" + +from __future__ import annotations + +from scripts.business_ch11_sampling_estimation_audit_controls import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d12.py b/workbooks/track_d_template/scripts/d12.py new file mode 100644 index 0000000..01a0799 --- /dev/null +++ b/workbooks/track_d_template/scripts/d12.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch12_hypothesis_testing_decisions.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch12_experiment_design.json +* ch12_hypothesis_testing_summary.json +* ch12_experiment_memo.md +* ch12_figures_manifest.csv + + +Use either: + pystatsv1 workbook run d12 +or: + pystatsv1 workbook run business_ch12_hypothesis_testing_decisions +""" + +from __future__ import annotations + +from scripts.business_ch12_hypothesis_testing_decisions import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d13.py b/workbooks/track_d_template/scripts/d13.py new file mode 100644 index 0000000..8236553 --- /dev/null +++ b/workbooks/track_d_template/scripts/d13.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch13_correlation_causation_controlled_comparisons.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch13_controlled_comparisons_design.json +* ch13_correlation_summary.json +* ch13_correlation_memo.md +* ch13_figures_manifest.csv + + +Use either: + pystatsv1 workbook run d13 +or: + pystatsv1 workbook run business_ch13_correlation_causation_controlled_comparisons +""" + +from __future__ import annotations + +from scripts.business_ch13_correlation_causation_controlled_comparisons import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d14.py b/workbooks/track_d_template/scripts/d14.py new file mode 100644 index 0000000..20e5401 --- /dev/null +++ b/workbooks/track_d_template/scripts/d14.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch14_regression_driver_analysis.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + + + +Use either: + pystatsv1 workbook run d14 +or: + pystatsv1 workbook run business_ch14_regression_driver_analysis +""" + +from __future__ import annotations + +from scripts.business_ch14_regression_driver_analysis import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d15.py b/workbooks/track_d_template/scripts/d15.py new file mode 100644 index 0000000..26dc22e --- /dev/null +++ b/workbooks/track_d_template/scripts/d15.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch15_forecasting_foundations.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + + + +Use either: + pystatsv1 workbook run d15 +or: + pystatsv1 workbook run business_ch15_forecasting_foundations +""" + +from __future__ import annotations + +from scripts.business_ch15_forecasting_foundations import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d16.py b/workbooks/track_d_template/scripts/d16.py new file mode 100644 index 0000000..eb816c6 --- /dev/null +++ b/workbooks/track_d_template/scripts/d16.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch16_seasonality_baselines.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + + + +Use either: + pystatsv1 workbook run d16 +or: + pystatsv1 workbook run business_ch16_seasonality_baselines +""" + +from __future__ import annotations + +from scripts.business_ch16_seasonality_baselines import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d17.py b/workbooks/track_d_template/scripts/d17.py new file mode 100644 index 0000000..8d882e5 --- /dev/null +++ b/workbooks/track_d_template/scripts/d17.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch17_revenue_forecasting_segmentation_drivers.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch17_ar_revenue_segment_monthly.csv +* ch17_series_monthly.csv +* ch17_customer_segments.csv +* ch17_backtest_metrics.csv +* ch17_backtest_total_revenue.csv +* ch17_forecast_next12.csv +* ch17_memo.md +* ch17_design.json +* ch17_known_events_template.json +* ch17_figures_manifest.csv +* ch17_manifest.json +* ch17_forecast_next_12m.csv +* ch17_forecast_memo.md + + +Use either: + pystatsv1 workbook run d17 +or: + pystatsv1 workbook run business_ch17_revenue_forecasting_segmentation_drivers +""" + +from __future__ import annotations + +from scripts.business_ch17_revenue_forecasting_segmentation_drivers import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d18.py b/workbooks/track_d_template/scripts/d18.py new file mode 100644 index 0000000..cfdfb14 --- /dev/null +++ b/workbooks/track_d_template/scripts/d18.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch18_expense_forecasting_fixed_variable_step_payroll.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch18_expense_monthly_by_account.csv +* ch18_expense_behavior_map.csv +* ch18_payroll_monthly.csv +* ch18_payroll_scenarios_forecast.csv +* ch18_expense_forecast_next12_detail.csv +* ch18_expense_forecast_next12_summary.csv +* ch18_control_plan_template.csv +* ch18_design.json +* ch18_memo.md +* ch18_figures_manifest.csv + + +Use either: + pystatsv1 workbook run d18 +or: + pystatsv1 workbook run business_ch18_expense_forecasting_fixed_variable_step_payroll +""" + +from __future__ import annotations + +from scripts.business_ch18_expense_forecasting_fixed_variable_step_payroll import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d19.py b/workbooks/track_d_template/scripts/d19.py new file mode 100644 index 0000000..0e9784a --- /dev/null +++ b/workbooks/track_d_template/scripts/d19.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch19_cash_flow_forecasting_direct_method_13_week.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch19_cash_history_weekly.csv +* ch19_cash_forecast_13w_scenarios.csv +* ch19_cash_assumptions.csv +* ch19_cash_governance_template.csv +* ch19_design.json +* ch19_memo.md +* ch19_figures_manifest.csv + + +Use either: + pystatsv1 workbook run d19 +or: + pystatsv1 workbook run business_ch19_cash_flow_forecasting_direct_method_13_week +""" + +from __future__ import annotations + +from scripts.business_ch19_cash_flow_forecasting_direct_method_13_week import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d20.py b/workbooks/track_d_template/scripts/d20.py new file mode 100644 index 0000000..14a4475 --- /dev/null +++ b/workbooks/track_d_template/scripts/d20.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch20_integrated_forecasting_three_statements.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch20_pnl_forecast_monthly.csv +* ch20_balance_sheet_forecast_monthly.csv +* ch20_cash_flow_forecast_monthly.csv +* ch20_assumptions.csv +* ch20_design.json +* ch20_memo.md +* ch20_figures_manifest.csv + + +Use either: + pystatsv1 workbook run d20 +or: + pystatsv1 workbook run business_ch20_integrated_forecasting_three_statements +""" + +from __future__ import annotations + +from scripts.business_ch20_integrated_forecasting_three_statements import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d21.py b/workbooks/track_d_template/scripts/d21.py new file mode 100644 index 0000000..a8d603c --- /dev/null +++ b/workbooks/track_d_template/scripts/d21.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch21_scenario_planning_sensitivity_stress.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch21_scenario_pack_monthly.csv +* ch21_sensitivity_cash_shortfall.csv +* ch21_assumptions.csv +* ch21_governance_template.csv +* ch21_figures_manifest.csv +* ch21_design.json +* ch21_memo.md + + +Use either: + pystatsv1 workbook run d21 +or: + pystatsv1 workbook run business_ch21_scenario_planning_sensitivity_stress +""" + +from __future__ import annotations + +from scripts.business_ch21_scenario_planning_sensitivity_stress import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d22.py b/workbooks/track_d_template/scripts/d22.py new file mode 100644 index 0000000..5655579 --- /dev/null +++ b/workbooks/track_d_template/scripts/d22.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. + +Runs: business_ch22_financial_statement_analysis_toolkit.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* figures/ +* ch22_ratios_monthly.csv +* ch22_common_size_is.csv +* ch22_common_size_bs.csv +* ch22_variance_bridge_latest.csv +* ch22_assumptions.csv +* ch22_figures_manifest.csv +* ch22_memo.md +* ch22_design.json + + +Use either: + pystatsv1 workbook run d22 +or: + pystatsv1 workbook run business_ch22_financial_statement_analysis_toolkit +""" + +from __future__ import annotations + +from scripts.business_ch22_financial_statement_analysis_toolkit import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/d23.py b/workbooks/track_d_template/scripts/d23.py new file mode 100644 index 0000000..d1d9985 --- /dev/null +++ b/workbooks/track_d_template/scripts/d23.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: MIT +"""Track D convenience wrapper. 
+ +Runs: business_ch23_communicating_results_governance.py + +Artifacts written to ``--outdir`` (default: ``outputs/track_d``): + +* ch23_memo_template.md +* ch23_kpi_governance_template.csv +* ch23_dashboard_spec_template.csv +* ch23_red_team_checklist.md +* ch23_design.json + + +Use either: + pystatsv1 workbook run d23 +or: + pystatsv1 workbook run business_ch23_communicating_results_governance +""" + +from __future__ import annotations + +from scripts.business_ch23_communicating_results_governance import main + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/sim_business_ledgerlab.py b/workbooks/track_d_template/scripts/sim_business_ledgerlab.py new file mode 100644 index 0000000..1b79fc8 --- /dev/null +++ b/workbooks/track_d_template/scripts/sim_business_ledgerlab.py @@ -0,0 +1,516 @@ +# SPDX-License-Identifier: MIT +"""Track D simulator: LedgerLab (accounting-shaped synthetic dataset). + +This simulator generates a tiny month of bookkeeping activity with: + +- Chart of accounts (COA) +- Journal / general ledger detail (debit/credit lines) +- Derived month-level statements (IS/BS/CF) + +Design goals (aligned with PyStatsV1): +- Deterministic output via ``--seed`` +- Human-readable CSV artifacts +- Simple but realistic accounting story (sales, COGS, inventory, expenses) + +Chapter usage: +- Track D Ch01 uses the core tables produced here +- Track D Ch02/Ch03 use GL/TB/Statements for analysis and validation +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from datetime import date +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed, base_parser + + +@dataclass(frozen=True) +class LedgerLabOutputs: + chart_of_accounts: pd.DataFrame + gl_journal: pd.DataFrame + trial_balance_monthly: pd.DataFrame + statements_is_monthly: pd.DataFrame + statements_bs_monthly: pd.DataFrame + statements_cf_monthly: 
pd.DataFrame + meta: dict[str, Any] + + +def _month_bounds(month: str) -> tuple[date, date]: + """Return (start_date, end_date) for a YYYY-MM month. + + Teaching simplification: all months are treated as 28 days. + """ + year_s, mon_s = month.split("-") + y = int(year_s) + m = int(mon_s) + start = date(y, m, 1) + end = date(y, m, 28) + return start, end + + +def build_chart_of_accounts() -> pd.DataFrame: + """Minimal chart of accounts suitable for early Track D chapters.""" + rows = [ + ("1000", "Cash", "Asset", "Debit"), + ("1100", "Accounts Receivable", "Asset", "Debit"), + ("1200", "Inventory", "Asset", "Debit"), + ("2000", "Accounts Payable", "Liability", "Credit"), + ("3000", "Owner Capital", "Equity", "Credit"), + ("3100", "Retained Earnings (Current Period)", "Equity", "Credit"), + ("4000", "Sales Revenue", "Revenue", "Credit"), + ("5000", "Cost of Goods Sold", "Expense", "Debit"), + ("6100", "Rent Expense", "Expense", "Debit"), + ("6200", "Utilities Expense", "Expense", "Debit"), + ("6300", "Payroll Expense", "Expense", "Debit"), + ] + return pd.DataFrame( + rows, columns=["account_id", "account_name", "account_type", "normal_side"] + ) + + +def _add_txn( + *, + lines: list[dict[str, Any]], + txn_id: int, + txn_date: date, + description: str, + doc_id: str, + entries: list[tuple[str, float, float]], +) -> None: + """Append journal lines for one transaction.""" + for account_id, debit, credit in entries: + lines.append( + { + "txn_id": int(txn_id), + "date": txn_date.isoformat(), + "doc_id": str(doc_id), + "description": str(description), + "account_id": str(account_id), + "debit": float(debit), + "credit": float(credit), + } + ) + + +def simulate_ledgerlab_month( + *, + month: str, + n_sales: int = 18, + mean_sale: float = 220.0, + sale_sd: float = 60.0, + pct_on_account: float = 0.45, + cogs_rate: float = 0.55, + initial_cash: float = 5000.0, + pay_rent: float = 1400.0, + payroll_runs: int = 2, + mean_utilities: float = 180.0, + random_state: int | None = 
None, +) -> LedgerLabOutputs: + """Simulate a small month of ledger activity and derive IS/BS/CF statements.""" + apply_seed(random_state) + rng = np.random.default_rng(random_state) + start, end = _month_bounds(month) + + coa = build_chart_of_accounts() + + lines: list[dict[str, Any]] = [] + txn_id = 0 + + def rand_day() -> date: + d = int(rng.integers(1, 29)) # 1..28 inclusive + return date(start.year, start.month, d) + + # 1) Owner invests cash (startup month) + txn_id += 1 + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=start, + description="Owner contribution (startup capital)", + doc_id="CAP0001", + entries=[("1000", initial_cash, 0.0), ("3000", 0.0, initial_cash)], + ) + + # 2) Sales amounts and whether on account (AR) vs cash + sale_amounts = np.maximum(rng.normal(mean_sale, sale_sd, size=n_sales), 20.0) + is_on_account = rng.random(size=n_sales) < pct_on_account + + total_sales = float(np.sum(sale_amounts)) + total_cogs = float(total_sales * cogs_rate) + + # 3) Inventory purchases (ensure inventory covers COGS) + purchase_total = total_cogs * 1.25 + n_purchases = 3 + purchase_splits = rng.dirichlet(np.ones(n_purchases)) * purchase_total + purchase_on_credit = [True, True, False] # deterministic pattern + + for i, amt in enumerate(purchase_splits, start=1): + txn_id += 1 + amt_f = float(amt) + if purchase_on_credit[i - 1]: + desc = "Inventory purchase on credit" + entries = [("1200", amt_f, 0.0), ("2000", 0.0, amt_f)] + else: + desc = "Inventory purchase (cash)" + entries = [("1200", amt_f, 0.0), ("1000", 0.0, amt_f)] + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=rand_day(), + description=desc, + doc_id=f"PO{i:04d}", + entries=entries, + ) + + # 4) Record sales + COGS (two txns per sale for clarity) + for i, amt in enumerate(sale_amounts, start=1): + sale_amt = float(amt) + cogs_amt = float(sale_amt * cogs_rate) + txn_date = rand_day() + doc_id = f"SALE{i:04d}" + + # a) Revenue side + txn_id += 1 + if bool(is_on_account[i - 1]): + debit_acct 
= "1100" # AR + desc = "Sale on account" + else: + debit_acct = "1000" # Cash + desc = "Cash sale" + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=txn_date, + description=desc, + doc_id=doc_id, + entries=[(debit_acct, sale_amt, 0.0), ("4000", 0.0, sale_amt)], + ) + + # b) COGS side + txn_id += 1 + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=txn_date, + description="Record cost of goods sold", + doc_id=doc_id, + entries=[("5000", cogs_amt, 0.0), ("1200", 0.0, cogs_amt)], + ) + + # 5) Collect some AR (assume 60% collected in-month) + ar_sales_total = float(np.sum(sale_amounts[is_on_account])) + collect_amt = 0.60 * ar_sales_total + if collect_amt > 0.0: + txn_id += 1 + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=end, + description="Collect on accounts receivable", + doc_id="ARCOLL01", + entries=[("1000", float(collect_amt), 0.0), ("1100", 0.0, float(collect_amt))], + ) + + # 6) Pay some AP (assume 50% of AP purchases paid in-month) + ap_purchases_total = float(np.sum(purchase_splits[:2])) + pay_ap_amt = 0.50 * ap_purchases_total + if pay_ap_amt > 0.0: + txn_id += 1 + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=end, + description="Pay accounts payable", + doc_id="APPAY01", + entries=[("2000", float(pay_ap_amt), 0.0), ("1000", 0.0, float(pay_ap_amt))], + ) + + # 7) Operating expenses (rent once; utilities once; payroll N times) + txn_id += 1 + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=start, + description="Pay monthly rent", + doc_id="RENT0001", + entries=[("6100", float(pay_rent), 0.0), ("1000", 0.0, float(pay_rent))], + ) + + util_amt = float(max(rng.normal(mean_utilities, 40.0), 40.0)) + txn_id += 1 + _add_txn( + lines=lines, + txn_id=txn_id, + txn_date=rand_day(), + description="Pay utilities", + doc_id="UTIL0001", + entries=[("6200", util_amt, 0.0), ("1000", 0.0, util_amt)], + ) + + for p in range(1, payroll_runs + 1): + pay_amt = float(max(rng.normal(1200.0, 120.0), 600.0)) + txn_id += 1 + _add_txn( + lines=lines, + 
txn_id=txn_id, + txn_date=rand_day(), + description="Run payroll (simplified: expense paid in cash)", + doc_id=f"PAY{p:04d}", + entries=[("6300", pay_amt, 0.0), ("1000", 0.0, pay_amt)], + ) + + gl = pd.DataFrame(lines) + gl = ( + gl.sort_values(["date", "txn_id", "account_id"], kind="mergesort") + .reset_index(drop=True) + ) + + # Join account metadata for convenience + gl = gl.merge(coa, on="account_id", how="left") + + # Trial balance (sum debits/credits per account) + tb = ( + gl.groupby(["account_id", "account_name", "account_type", "normal_side"], observed=True)[ + ["debit", "credit"] + ] + .sum() + .reset_index() + ) + tb["net"] = tb["debit"] - tb["credit"] + tb["ending_side"] = np.where(tb["net"] >= 0, "Debit", "Credit") + tb["ending_balance"] = tb["net"].abs() + tb = tb.drop(columns=["net"]) + tb.insert(0, "month", month) + + # Income statement (standard breakdown: Revenue, COGS, Operating Expenses, Net Income) + revenue = float( + tb.loc[tb["account_type"] == "Revenue", "credit"].sum() + - tb.loc[tb["account_type"] == "Revenue", "debit"].sum() + ) + + cogs = float( + tb.loc[tb["account_id"] == "5000", "debit"].sum() + - tb.loc[tb["account_id"] == "5000", "credit"].sum() + ) + + op_expenses = float( + tb.loc[ + (tb["account_type"] == "Expense") & (tb["account_id"] != "5000"), + "debit", + ].sum() + - tb.loc[ + (tb["account_type"] == "Expense") & (tb["account_id"] != "5000"), + "credit", + ].sum() + ) + + gross_profit = revenue - cogs + net_income = gross_profit - op_expenses + + is_df = pd.DataFrame( + [ + {"month": month, "line": "Sales Revenue", "amount": revenue}, + {"month": month, "line": "Cost of Goods Sold", "amount": cogs}, + {"month": month, "line": "Gross Profit", "amount": gross_profit}, + {"month": month, "line": "Operating Expenses", "amount": op_expenses}, + {"month": month, "line": "Net Income", "amount": net_income}, + ] + ) + + + # Balance sheet (assets/liabilities from TB; equity = owner capital + current-period earnings) + def 
_ending_balance(acct_id: str) -> float: + """Return balance in its normal direction (positive if normal-side).""" + row = tb.loc[tb["account_id"] == acct_id] + if row.empty: + return 0.0 + normal = str(row.iloc[0]["normal_side"]) + ending_side = str(row.iloc[0]["ending_side"]) + bal = float(row.iloc[0]["ending_balance"]) + return bal if ending_side == normal else -bal + + cash = _ending_balance("1000") + ar = _ending_balance("1100") + inv = _ending_balance("1200") + ap = _ending_balance("2000") + owner_cap = _ending_balance("3000") + retained = net_income # simplified: current-period NI sits in equity + + total_assets = float(cash + ar + inv) + total_liab = float(ap) + total_equity = float(owner_cap + retained) + + bs_df = pd.DataFrame( + [ + {"month": month, "line": "Cash", "amount": cash}, + {"month": month, "line": "Accounts Receivable", "amount": ar}, + {"month": month, "line": "Inventory", "amount": inv}, + {"month": month, "line": "Total Assets", "amount": total_assets}, + {"month": month, "line": "Accounts Payable", "amount": total_liab}, + {"month": month, "line": "Total Liabilities", "amount": total_liab}, + {"month": month, "line": "Owner Capital", "amount": owner_cap}, + {"month": month, "line": "Retained Earnings (Current Period)", "amount": retained}, + {"month": month, "line": "Total Equity", "amount": total_equity}, + {"month": month, "line": "Total Liabilities + Equity", "amount": total_liab + total_equity}, + ] + ) + + # Cash flow statement (simple bridge; startup-month friendly) + net_income_stmt = float(is_df.loc[is_df["line"] == "Net Income", "amount"].iloc[0]) + cash_end = float(bs_df.loc[bs_df["line"] == "Cash", "amount"].iloc[0]) + ar_end = float(bs_df.loc[bs_df["line"] == "Accounts Receivable", "amount"].iloc[0]) + inv_end = float(bs_df.loc[bs_df["line"] == "Inventory", "amount"].iloc[0]) + ap_end = float(bs_df.loc[bs_df["line"] == "Accounts Payable", "amount"].iloc[0]) + + # Beginning balances assumed 0 for teaching “month 1” + cash_begin = 
def write_ledgerlab(outputs: LedgerLabOutputs, outdir: Path) -> None:
    """Persist every LedgerLab table as CSV plus the metadata as JSON.

    The output directory (including missing parents) is created on demand.
    Each DataFrame attribute of *outputs* is written without its index to a
    CSV named after the table; ``outputs.meta`` is dumped as indented JSON to
    ``ledgerlab_meta.json`` using UTF-8.
    """
    outdir.mkdir(parents=True, exist_ok=True)

    # (attribute on ``outputs``, file name) in the order the files are written.
    csv_exports = (
        ("chart_of_accounts", "chart_of_accounts.csv"),
        ("gl_journal", "gl_journal.csv"),
        ("trial_balance_monthly", "trial_balance_monthly.csv"),
        ("statements_is_monthly", "statements_is_monthly.csv"),
        ("statements_bs_monthly", "statements_bs_monthly.csv"),
        ("statements_cf_monthly", "statements_cf_monthly.csv"),
    )
    for attr_name, file_name in csv_exports:
        table = getattr(outputs, attr_name)
        table.to_csv(outdir / file_name, index=False)

    meta_path = outdir / "ledgerlab_meta.json"
    meta_text = json.dumps(outputs.meta, indent=2)
    meta_path.write_text(meta_text, encoding="utf-8")
cogs_rate=args.cogs_rate, + initial_cash=args.initial_cash, + pay_rent=args.pay_rent, + payroll_runs=args.payroll_runs, + mean_utilities=args.mean_utilities, + random_state=args.seed, + ) + write_ledgerlab(outs, args.outdir) + + print(f"Wrote LedgerLab core tables -> {args.outdir}") + + +if __name__ == "__main__": + main() diff --git a/workbooks/track_d_template/scripts/sim_business_nso_v1.py b/workbooks/track_d_template/scripts/sim_business_nso_v1.py new file mode 100644 index 0000000..7abb108 --- /dev/null +++ b/workbooks/track_d_template/scripts/sim_business_nso_v1.py @@ -0,0 +1,1281 @@ +# SPDX-License-Identifier: MIT +"""Track D simulator: North Shore Outfitters (NSO) running case, v1. + +Goal (v1): multi-month dataset that can support Chapters 4+ without breaking the +clean structure that made LedgerLab work in Chapters 1–3. + +Outputs to: data/synthetic/nso_v1/ + +Core tables: +- chart_of_accounts.csv +- gl_journal.csv +- trial_balance_monthly.csv +- statements_is_monthly.csv +- statements_bs_monthly.csv +- statements_cf_monthly.csv (simple CFO/CFI/CFF bridge) + +Ch04 subledgers: +- inventory_movements.csv +- fixed_assets.csv +- depreciation_schedule.csv + +Ch05 subledgers (added for liabilities/payroll/taxes/equity): +- payroll_events.csv +- sales_tax_events.csv +- debt_schedule.csv +- equity_events.csv +- ap_events.csv + +Ch06 subledgers (added for reconciliations/quality control): +- ar_events.csv +- bank_statement.csv + +Design constraints: +- deterministic via --seed / seed= +- small + readable +- tie-out friendly (subledgers have txn_id links into GL) +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from datetime import date +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd + +from scripts._cli import apply_seed, base_parser + + +@dataclass(frozen=True) +class NSOV1Outputs: + chart_of_accounts: pd.DataFrame + gl_journal: pd.DataFrame + trial_balance_monthly: 
def _parse_month(month: str) -> tuple[int, int]:
    """Split a ``YYYY-MM`` string into ``(year, month_number)`` integers.

    Raises:
        ValueError: if *month* does not have exactly two ``-``-separated
            integer parts, or the month number is outside 1..12. Previously
            an out-of-range month such as ``"2025-13"`` was accepted and
            silently rolled into the next year by ``_add_months``.
    """
    parts = month.split("-")
    if len(parts) != 2:
        raise ValueError(f"month must look like 'YYYY-MM', got {month!r}")
    try:
        year, month_no = int(parts[0]), int(parts[1])
    except ValueError:
        raise ValueError(f"month must look like 'YYYY-MM', got {month!r}") from None
    if not 1 <= month_no <= 12:
        raise ValueError(f"month number must be in 1..12, got {month!r}")
    return year, month_no


def _fmt_month(y: int, m: int) -> str:
    """Format a year and month number as a zero-padded ``YYYY-MM`` string."""
    return f"{y:04d}-{m:02d}"


def _add_months(month: str, delta: int) -> str:
    """Return the ``YYYY-MM`` string *delta* months after (or before) *month*."""
    y, m = _parse_month(month)
    # Work on a zero-based month count so floor division handles negatives.
    y2, m0 = divmod(y * 12 + (m - 1) + delta, 12)
    return _fmt_month(y2, m0 + 1)


def _month_bounds(month: str) -> tuple[date, date]:
    """Teaching simplification: all months are treated as 28 days."""
    y, m = _parse_month(month)
    return date(y, m, 1), date(y, m, 28)


def build_chart_of_accounts() -> pd.DataFrame:
    """Return the chart of accounts as a DataFrame.

    Columns are ``account_id``, ``account_name``, ``account_type`` and
    ``normal_side``; account ids are strings.
    """
    rows = [
        ("1000", "Cash", "Asset", "Debit"),
        ("1100", "Accounts Receivable", "Asset", "Debit"),
        ("1200", "Inventory", "Asset", "Debit"),
        ("1300", "Property, Plant & Equipment (Cost)", "Asset", "Debit"),
        ("1350", "Accumulated Depreciation", "Contra Asset", "Credit"),
        ("2000", "Accounts Payable", "Liability", "Credit"),
        ("2100", "Sales Tax Payable", "Liability", "Credit"),
        ("2110", "Wages Payable", "Liability", "Credit"),
        ("2120", "Payroll Taxes Payable", "Liability", "Credit"),
        ("2200", "Notes Payable", "Liability", "Credit"),
        ("3000", "Owner Capital", "Equity", "Credit"),
        ("3100", "Retained Earnings (Cumulative, derived)", "Equity", "Credit"),
        ("3200", "Owner Draw", "Equity", "Debit"),
        ("4000", "Sales Revenue", "Revenue", "Credit"),
        ("5000", "Cost of Goods Sold", "Expense", "Debit"),
        ("6100", "Rent Expense", "Expense", "Debit"),
        ("6200", "Utilities Expense", "Expense", "Debit"),
        ("6300", "Payroll Expense", "Expense", "Debit"),
        ("6400", "Depreciation Expense", "Expense", "Debit"),
        ("6500", "Payroll Tax Expense", "Expense", "Debit"),
        ("6600", "Interest Expense", "Expense", "Debit"),
    ]
    return pd.DataFrame(rows, columns=["account_id", "account_name", "account_type", "normal_side"])


def _add_txn(
    *,
    lines: list[dict[str, Any]],
    txn_id: int,
    txn_date: date,
    doc_id: str,
    description: str,
    entries: list[tuple[str, float, float]],
) -> None:
    """Append one journal line per ``(account_id, debit, credit)`` entry.

    *lines* is mutated in place. Every appended dict shares the transaction
    header (``txn_id``, ISO-formatted ``date``, ``doc_id``, ``description``).
    No check is made that debits equal credits.
    """
    lines.extend(
        {
            "txn_id": int(txn_id),
            "date": txn_date.isoformat(),
            "doc_id": str(doc_id),
            "description": str(description),
            "account_id": str(account_id),
            "debit": float(debit),
            "credit": float(credit),
        }
        for account_id, debit, credit in entries
    )


def _ending_balance_from_tb(tb: pd.DataFrame, account_id: str) -> float:
    """Return balance in its normal direction (positive if normal-side).

    Returns 0.0 when the account has no row in *tb*. Only the first matching
    row is used.
    """
    hit = tb.loc[tb["account_id"].astype(str) == str(account_id)]
    if hit.empty:
        return 0.0
    first = hit.iloc[0]
    bal = float(first["ending_balance"])
    return bal if str(first["ending_side"]) == str(first["normal_side"]) else -bal


def _attach_account_metadata(df: pd.DataFrame, coa: pd.DataFrame) -> pd.DataFrame:
    """Left-join chart-of-accounts metadata onto ledger lines.

    Metadata columns already present on *df* are dropped first so the join
    never produces ``_x``/``_y`` suffixes. ``account_id`` is compared as a
    string on both sides, so a ledger whose ids were parsed as integers still
    joins instead of failing on mismatched key dtypes.
    """
    meta_cols = ["account_name", "account_type", "normal_side"]
    out = df.drop(columns=[c for c in meta_cols if c in df.columns])
    out = out.assign(account_id=out["account_id"].astype(str))
    coa_str = coa.assign(account_id=coa["account_id"].astype(str))
    return out.merge(coa_str, on="account_id", how="left")


def _compute_tb_for_cutoff(gl: pd.DataFrame, coa: pd.DataFrame, month: str) -> pd.DataFrame:
    """Build a cumulative trial balance of all GL lines dated up to month end.

    Returns one row per account with summed ``debit``/``credit``, the
    ``ending_side`` ("Debit" when debits >= credits, else "Credit"), the
    absolute ``ending_balance``, and a leading ``month`` column.
    """
    _, end = _month_bounds(month)
    cutoff = pd.to_datetime(end.isoformat())
    posted = gl.loc[pd.to_datetime(gl["date"]) <= cutoff].copy()
    posted = _attach_account_metadata(posted, coa)

    tb = (
        posted.groupby(["account_id", "account_name", "account_type", "normal_side"], observed=True)[
            ["debit", "credit"]
        ]
        .sum()
        .reset_index()
    )
    net = tb["debit"] - tb["credit"]
    tb["ending_side"] = np.where(net >= 0, "Debit", "Credit")
    tb["ending_balance"] = net.abs()
    tb.insert(0, "month", month)
    return tb


def _compute_is_for_month(gl: pd.DataFrame, coa: pd.DataFrame, month: str) -> tuple[pd.DataFrame, float]:
    """Build the income statement for GL lines dated within *month*.

    Returns ``(statement, net_income)`` where *statement* has the lines Sales
    Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses and Net
    Income. Account ``5000`` is COGS; every other Expense account counts as
    an operating expense.
    """
    start, end = _month_bounds(month)
    dts = pd.to_datetime(gl["date"])
    in_month = (dts >= pd.to_datetime(start)) & (dts <= pd.to_datetime(end))
    df = _attach_account_metadata(gl.loc[in_month].copy(), coa)

    # Each mask is built once; account_id is already a string after the join.
    is_revenue = df["account_type"] == "Revenue"
    is_cogs = df["account_id"] == "5000"
    is_opex = (df["account_type"] == "Expense") & ~is_cogs

    revenue = float(df.loc[is_revenue, "credit"].sum() - df.loc[is_revenue, "debit"].sum())
    cogs = float(df.loc[is_cogs, "debit"].sum() - df.loc[is_cogs, "credit"].sum())
    # all expenses except COGS are "operating expenses" for now
    op_exp = float(df.loc[is_opex, "debit"].sum() - df.loc[is_opex, "credit"].sum())

    gross_profit = revenue - cogs
    net_income = gross_profit - op_exp

    is_df = pd.DataFrame(
        [
            {"month": month, "line": "Sales Revenue", "amount": revenue},
            {"month": month, "line": "Cost of Goods Sold", "amount": cogs},
            {"month": month, "line": "Gross Profit", "amount": gross_profit},
            {"month": month, "line": "Operating Expenses", "amount": op_exp},
            {"month": month, "line": "Net Income", "amount": net_income},
        ]
    )
    return is_df, float(net_income)
wages_payable = _ending_balance_from_tb(tb, "2110") + payroll_taxes_payable = _ending_balance_from_tb(tb, "2120") + notes_payable = _ending_balance_from_tb(tb, "2200") + + owner_cap = _ending_balance_from_tb(tb, "3000") + owner_draw_bal = _ending_balance_from_tb(tb, "3200") # normal debit, positive in debit direction + + # Present accum dep as a negative line for reporting clarity + accum_dep_line = -float(accum_dep) + net_ppe = float(ppe_cost + accum_dep_line) + + total_assets = float(cash + ar + inv + ppe_cost + accum_dep_line) + + total_liab = float(ap + sales_tax_payable + wages_payable + payroll_taxes_payable + notes_payable) + owner_draw_line = -float(owner_draw_bal) # show draws as negative equity + total_equity = float(owner_cap + owner_draw_line + retained_cum) + + bs_df = pd.DataFrame( + [ + {"month": month, "line": "Cash", "amount": cash}, + {"month": month, "line": "Accounts Receivable", "amount": ar}, + {"month": month, "line": "Inventory", "amount": inv}, + {"month": month, "line": "PP&E (Cost)", "amount": ppe_cost}, + {"month": month, "line": "Accumulated Depreciation", "amount": accum_dep_line}, + {"month": month, "line": "Net PP&E", "amount": net_ppe}, + {"month": month, "line": "Total Assets", "amount": total_assets}, + {"month": month, "line": "Accounts Payable", "amount": ap}, + {"month": month, "line": "Sales Tax Payable", "amount": sales_tax_payable}, + {"month": month, "line": "Wages Payable", "amount": wages_payable}, + {"month": month, "line": "Payroll Taxes Payable", "amount": payroll_taxes_payable}, + {"month": month, "line": "Notes Payable", "amount": notes_payable}, + {"month": month, "line": "Total Liabilities", "amount": total_liab}, + {"month": month, "line": "Owner Capital", "amount": owner_cap}, + {"month": month, "line": "Owner Draw", "amount": owner_draw_line}, + {"month": month, "line": "Retained Earnings (Cumulative, derived)", "amount": retained_cum}, + {"month": month, "line": "Total Equity", "amount": total_equity}, + 
def _compute_cf_for_month(
    month: str,
    cash_begin: float,
    cash_end: float,
    ar_begin: float,
    ar_end: float,
    inv_begin: float,
    inv_end: float,
    ap_begin: float,
    ap_end: float,
    sales_tax_begin: float,
    sales_tax_end: float,
    wages_pay_begin: float,
    wages_pay_end: float,
    payroll_taxes_begin: float,
    payroll_taxes_end: float,
    notes_pay_begin: float,
    notes_pay_end: float,
    net_income: float,
    dep_expense: float,
    capex_cash: float,
    owner_contrib: float,
    owner_draw_cash: float,
) -> pd.DataFrame:
    """Build the indirect-method cash-flow bridge for one month.

    Operating cash starts from net income, adds back depreciation, subtracts
    increases in AR and inventory, and adds increases in AP, sales tax
    payable, wages payable and payroll taxes payable. Investing cash is the
    negated cash capex. Financing cash is owner contributions minus owner
    draws plus the change in notes payable.

    Returns a DataFrame with columns ``month``, ``line``, ``amount``. The
    last two rows hold ending cash from the bridge and the supplied
    balance-sheet ``cash_end``, so callers can compare them.
    """
    # Period movement (ending minus beginning) of each working-capital account.
    ar_change = float(ar_end - ar_begin)
    inv_change = float(inv_end - inv_begin)
    ap_change = float(ap_end - ap_begin)
    sales_tax_change = float(sales_tax_end - sales_tax_begin)
    wages_change = float(wages_pay_end - wages_pay_begin)
    payroll_tax_change = float(payroll_taxes_end - payroll_taxes_begin)

    operating = float(
        net_income
        + dep_expense
        - ar_change
        - inv_change
        + ap_change
        + sales_tax_change
        + wages_change
        + payroll_tax_change
    )
    investing = float(-capex_cash)

    # Net borrowings are inferred from the movement in notes payable.
    borrowings = float(notes_pay_end - notes_pay_begin)
    financing = float(owner_contrib - owner_draw_cash + borrowings)

    cash_movement = float(operating + investing + financing)
    bridged_cash = float(cash_begin + cash_movement)

    # (statement line, amount) in presentation order.
    bridge = [
        ("Net Income", net_income),
        ("Add back Depreciation", dep_expense),
        ("Change in Accounts Receivable", -ar_change),
        ("Change in Inventory", -inv_change),
        ("Change in Accounts Payable", ap_change),
        ("Change in Sales Tax Payable", sales_tax_change),
        ("Change in Wages Payable", wages_change),
        ("Change in Payroll Taxes Payable", payroll_tax_change),
        ("Net Cash from Operations", operating),
        ("Capital Expenditures (cash)", -capex_cash),
        ("Net Cash from Investing", investing),
        ("Owner Contribution", owner_contrib),
        ("Owner Draw (cash)", -owner_draw_cash),
        ("Net Borrowings (Δ Notes Payable)", borrowings),
        ("Net Cash from Financing", financing),
        ("Net Change in Cash", cash_movement),
        ("Beginning Cash", cash_begin),
        ("Ending Cash (from bridge)", bridged_cash),
        ("Ending Cash (balance sheet)", cash_end),
    ]
    return pd.DataFrame(
        [(month, label, amount) for label, amount in bridge],
        columns=["month", "line", "amount"],
    )
"in_service_month": _add_months(start_month, 1), + "cost": 12000.0, + "useful_life_months": 60, + "salvage_value": 2000.0, + "method": "SL", + }, + { + "asset_id": "FA002", + "asset_name": "Point-of-Sale Terminal", + "in_service_month": _add_months(start_month, 3), + "cost": 1800.0, + "useful_life_months": 36, + "salvage_value": 0.0, + "method": "SL", + }, + ] + ) + + # depreciation schedule (built for the requested horizon) + dep_rows: list[dict[str, Any]] = [] + for _, a in fixed_assets.iterrows(): + cost = float(a["cost"]) + salvage = float(a["salvage_value"]) + life = int(a["useful_life_months"]) + in_service = str(a["in_service_month"]) + monthly = (cost - salvage) / float(life) + + accum = 0.0 + for m in months: + if m < in_service: + dep = 0.0 + else: + dep = float(monthly) + # stop at full depreciation + if accum + dep > (cost - salvage) + 1e-9: + dep = max(0.0, (cost - salvage) - accum) + + accum = float(accum + dep) + nbv = float(cost - accum) + dep_rows.append( + { + "month": m, + "asset_id": str(a["asset_id"]), + "dep_expense": dep, + "accum_dep": accum, + "net_book_value": nbv, + } + ) + depreciation_schedule = pd.DataFrame(dep_rows) + + # helper: random day within teaching month + def rand_day(month: str) -> date: + y, m = _parse_month(month) + d = int(rng.integers(1, 29)) # 1..28 + return date(y, m, d) + + # Equity tracking for CF + owner_contrib_by_month = {m: 0.0 for m in months} + owner_draw_by_month = {m: 0.0 for m in months} + + # 0) Owner contribution on day 1 of first month + first_start, _ = _month_bounds(months[0]) + txn_id += 1 + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=first_start, + doc_id="CAP0001", + description="Owner contribution (startup capital)", + entries=[("1000", 25000.0, 0.0), ("3000", 0.0, 25000.0)], + ) + owner_contrib_by_month[months[0]] = 25000.0 + equity_events.append( + { + "month": months[0], + "txn_id": txn_id, + "date": first_start.isoformat(), + "event_type": "contribution", + "amount": 25000.0, + } + ) 
+ + # Simple loan: originate in month 2, amortize with fixed principal payments + loan_id = "LN001" + loan_principal = 20000.0 + loan_rate_m = 0.01 # 1% per teaching month + principal_payment = 5000.0 + loan_balance = 0.0 + + # Payroll lags (pay + remit next month) + prior_net_wages = 0.0 + prior_payroll_taxes = 0.0 + prior_sales_tax = 0.0 + + # run-month simulation + for mi, month in enumerate(months): + # 1) Inventory purchases (2 per month) + total_credit_purchases = 0.0 + + for p in range(1, 3): + sku = str(rng.choice(sku_list)) + unit_cost = float(max(rng.normal(unit_cost_mean, unit_cost_sd), 8.0)) + qty = int(max(1, round(rng.normal(mean_sale_qty * n_sales_per_month / 6.0, 2.0)))) + amount = float(qty * unit_cost) + + txn_id += 1 + on_credit = bool(rng.random() < 0.65) + if on_credit: + entries = [("1200", amount, 0.0), ("2000", 0.0, amount)] + total_credit_purchases += amount + desc = "Inventory purchase on credit" + else: + entries = [("1200", amount, 0.0), ("1000", 0.0, amount)] + desc = "Inventory purchase (cash)" + + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=rand_day(month), + doc_id=f"{month}-PO{p:02d}", + description=desc, + entries=entries, + ) + + inv_moves.append( + { + "month": month, + "txn_id": int(txn_id), + "date": rand_day(month).isoformat(), + "sku": sku, + "movement_type": "purchase", + "qty": float(qty), + "unit_cost": unit_cost, + "amount": float(amount), # + increases inventory + } + ) + + if on_credit: + ap_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": rand_day(month).isoformat(), + "vendor": "Various", + "invoice_id": f"{month}-PO{p:02d}", + "event_type": "invoice", + "amount": float(amount), + "ap_delta": float(amount), + "cash_paid": 0.0, + } + ) + + # 2) Sales + COGS (two txns per sale) + sales tax embedded in the sale + ar_sales = 0.0 + tax_total_this_month = 0.0 + + for s in range(1, n_sales_per_month + 1): + sku = str(rng.choice(sku_list)) + qty = int(max(1, round(rng.normal(mean_sale_qty, 
0.8)))) + unit_cost = float(max(rng.normal(unit_cost_mean, unit_cost_sd), 8.0)) + unit_price = float(max(rng.normal(unit_price_mean, unit_price_sd), unit_cost + 4.0)) + + sale_amt = float(qty * unit_price) + tax_amt = float(sale_amt * sales_tax_rate) + total_receipt = float(sale_amt + tax_amt) + cogs_amt = float(qty * unit_cost) + tax_total_this_month += tax_amt + + d = rand_day(month) + doc = f"{month}-SALE{s:03d}" + + # revenue + tax side + txn_id += 1 + on_account = bool(rng.random() < pct_on_account) + if on_account: + debit_acct = "1100" + ar_sales += total_receipt + desc = "Sale on account (incl. sales tax)" + else: + debit_acct = "1000" + desc = "Cash sale (incl. sales tax)" + + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=d, + doc_id=doc, + description=desc, + entries=[(debit_acct, total_receipt, 0.0), ("4000", 0.0, sale_amt), ("2100", 0.0, tax_amt)], + ) + + # cogs side (link this txn_id into inventory movements) + txn_id += 1 + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=d, + doc_id=doc, + description="Record cost of goods sold", + entries=[("5000", cogs_amt, 0.0), ("1200", 0.0, cogs_amt)], + ) + + inv_moves.append( + { + "month": month, + "txn_id": int(txn_id), + "date": d.isoformat(), + "sku": sku, + "movement_type": "sale_issue", + "qty": float(-qty), # - reduces inventory + "unit_cost": unit_cost, + "amount": float(-cogs_amt), # - reduces inventory + } + ) + + # sales tax collection summary event (ties to GL via 2100 credits in the sale txns) + sales_tax_events.append( + { + "month": month, + "txn_id": None, + "date": None, + "event_type": "collection", + "taxable_sales": float(tax_total_this_month / sales_tax_rate) if sales_tax_rate else 0.0, + "tax_amount": float(tax_total_this_month), + "cash_paid": 0.0, + "sales_tax_payable_delta": float(tax_total_this_month), + } + ) + + # 3) Count adjustment every 3 months (inventory shrink/overage) + if (mi + 1) % 3 == 0: + sku = str(rng.choice(sku_list)) + unit_cost = 
float(max(rng.normal(unit_cost_mean, unit_cost_sd), 8.0)) + adj_qty = int(rng.integers(-4, 3)) # small shrink/overage + if adj_qty != 0: + adj_amt = float(adj_qty * unit_cost) # + increases inv, - decreases inv + txn_id += 1 + d = date(_parse_month(month)[0], _parse_month(month)[1], 28) + if adj_amt < 0: + # shrinkage: debit COGS, credit Inventory + entries = [("5000", float(-adj_amt), 0.0), ("1200", 0.0, float(-adj_amt))] + desc = "Inventory count adjustment (shrinkage)" + else: + # overage: debit Inventory, credit COGS (reduces expense) + entries = [("1200", adj_amt, 0.0), ("5000", 0.0, adj_amt)] + desc = "Inventory count adjustment (overage)" + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=d, + doc_id=f"{month}-COUNT", + description=desc, + entries=entries, + ) + inv_moves.append( + { + "month": month, + "txn_id": int(txn_id), + "date": d.isoformat(), + "sku": sku, + "movement_type": "count_adjustment", + "qty": float(adj_qty), + "unit_cost": unit_cost, + "amount": float(adj_amt), + } + ) + + # 4) Collect some AR at month end (60% of this month's AR sales) + if ar_sales > 0: + collect = float(0.60 * ar_sales) + txn_id += 1 + _, end = _month_bounds(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=end, + doc_id=f"{month}-ARCOLL", + description="Collect on accounts receivable (partial)", + entries=[("1000", collect, 0.0), ("1100", 0.0, collect)], + ) + + # 5) Pay some AP at month end (50% of credit purchases this month) + ap_pay_amt = 0.0 + if total_credit_purchases > 0: + ap_pay_amt = float(0.50 * total_credit_purchases) + txn_id += 1 + _, end = _month_bounds(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=end, + doc_id=f"{month}-APPAY", + description="Pay accounts payable (partial)", + entries=[("2000", ap_pay_amt, 0.0), ("1000", 0.0, ap_pay_amt)], + ) + ap_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": end.isoformat(), + "vendor": "Various", + "invoice_id": f"{month}-APPAY", + "event_type": 
"payment", + "amount": float(ap_pay_amt), + "ap_delta": float(-ap_pay_amt), + "cash_paid": float(ap_pay_amt), + } + ) + + # 6) Operating expenses (rent/utilities) + txn_id += 1 + start, _ = _month_bounds(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=start, + doc_id=f"{month}-RENT", + description="Pay monthly rent", + entries=[("6100", float(rent), 0.0), ("1000", 0.0, float(rent))], + ) + + util = float(max(rng.normal(utilities_mean, 60.0), 60.0)) + txn_id += 1 + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=rand_day(month), + doc_id=f"{month}-UTIL", + description="Pay utilities", + entries=[("6200", util, 0.0), ("1000", 0.0, util)], + ) + + # 7) Payroll: accrue in this month, pay/remit in the next month (lag = 1) + gross_wages = float(max(rng.normal(payroll_mean, 180.0), 900.0)) + employee_withholding = float(0.10 * gross_wages) + net_wages = float(gross_wages - employee_withholding) + employer_tax = float(0.08 * gross_wages) + payroll_tax_total = float(employee_withholding + employer_tax) + + # Accrue wages + employee withholding + txn_id += 1 + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=date(_parse_month(month)[0], _parse_month(month)[1], 25), + doc_id=f"{month}-PAYACCR", + description="Accrue payroll (gross wages; create payables)", + entries=[("6300", gross_wages, 0.0), ("2110", 0.0, net_wages), ("2120", 0.0, employee_withholding)], + ) + payroll_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": date(_parse_month(month)[0], _parse_month(month)[1], 25).isoformat(), + "event_type": "payroll_accrual", + "gross_wages": gross_wages, + "employee_withholding": employee_withholding, + "employer_tax": 0.0, + "cash_paid": 0.0, + "wages_payable_delta": float(net_wages), + "payroll_taxes_payable_delta": float(employee_withholding), + } + ) + + # Accrue employer payroll tax + txn_id += 1 + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=date(_parse_month(month)[0], _parse_month(month)[1], 25), + 
doc_id=f"{month}-PAYTAXACCR", + description="Accrue employer payroll taxes", + entries=[("6500", employer_tax, 0.0), ("2120", 0.0, employer_tax)], + ) + payroll_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": date(_parse_month(month)[0], _parse_month(month)[1], 25).isoformat(), + "event_type": "payroll_tax_accrual", + "gross_wages": 0.0, + "employee_withholding": 0.0, + "employer_tax": employer_tax, + "cash_paid": 0.0, + "wages_payable_delta": 0.0, + "payroll_taxes_payable_delta": float(employer_tax), + } + ) + + # Pay prior month net wages (if any) + if mi > 0 and prior_net_wages > 0: + txn_id += 1 + _, end = _month_bounds(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=end, + doc_id=f"{month}-PAYNET", + description="Pay prior-month net wages", + entries=[("2110", prior_net_wages, 0.0), ("1000", 0.0, prior_net_wages)], + ) + payroll_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": end.isoformat(), + "event_type": "wage_payment", + "gross_wages": 0.0, + "employee_withholding": 0.0, + "employer_tax": 0.0, + "cash_paid": float(prior_net_wages), + "wages_payable_delta": float(-prior_net_wages), + "payroll_taxes_payable_delta": 0.0, + } + ) + + # Remit prior month payroll taxes (if any) + if mi > 0 and prior_payroll_taxes > 0: + txn_id += 1 + _, end = _month_bounds(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=end, + doc_id=f"{month}-PAYTAXREM", + description="Remit prior-month payroll taxes", + entries=[("2120", prior_payroll_taxes, 0.0), ("1000", 0.0, prior_payroll_taxes)], + ) + payroll_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": end.isoformat(), + "event_type": "tax_remittance", + "gross_wages": 0.0, + "employee_withholding": 0.0, + "employer_tax": 0.0, + "cash_paid": float(prior_payroll_taxes), + "wages_payable_delta": 0.0, + "payroll_taxes_payable_delta": float(-prior_payroll_taxes), + } + ) + + # Remit prior month sales tax (if any) + if mi > 0 and 
prior_sales_tax > 0: + txn_id += 1 + _, end = _month_bounds(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=end, + doc_id=f"{month}-SALSTAXREM", + description="Remit prior-month sales tax", + entries=[("2100", prior_sales_tax, 0.0), ("1000", 0.0, prior_sales_tax)], + ) + sales_tax_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": end.isoformat(), + "event_type": "remittance", + "taxable_sales": 0.0, + "tax_amount": 0.0, + "cash_paid": float(prior_sales_tax), + "sales_tax_payable_delta": float(-prior_sales_tax), + } + ) + + # Update lagged items for next month + prior_net_wages = float(net_wages) + prior_payroll_taxes = float(payroll_tax_total) + prior_sales_tax = float(tax_total_this_month) + + # 8) Notes payable: originate in month 2, then pay down starting month 3 + if mi == 1: + # Origination + txn_id += 1 + d = date(_parse_month(month)[0], _parse_month(month)[1], 3) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=d, + doc_id=f"{month}-LOAN", + description="Borrow on note payable (loan origination)", + entries=[("1000", loan_principal, 0.0), ("2200", 0.0, loan_principal)], + ) + loan_balance = float(loan_principal) + debt_schedule.append( + { + "month": month, + "loan_id": loan_id, + "txn_id": int(txn_id), + "beginning_balance": 0.0, + "payment": 0.0, + "interest": 0.0, + "principal": 0.0, + "ending_balance": float(loan_balance), + } + ) + + if mi >= 2 and loan_balance > 0: + beg_bal = float(loan_balance) + interest = float(beg_bal * loan_rate_m) + principal = float(min(principal_payment, beg_bal)) + payment = float(principal + interest) + txn_id += 1 + _, end = _month_bounds(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=end, + doc_id=f"{month}-DEBTPAY", + description="Pay note payable (split principal and interest)", + entries=[("6600", interest, 0.0), ("2200", principal, 0.0), ("1000", 0.0, payment)], + ) + loan_balance = float(beg_bal - principal) + debt_schedule.append( + { + "month": month, 
+ "loan_id": loan_id, + "txn_id": int(txn_id), + "beginning_balance": beg_bal, + "payment": payment, + "interest": interest, + "principal": principal, + "ending_balance": float(loan_balance), + } + ) + + # 9) Equity activity: owner draws every 6 months + if (mi + 1) % 6 == 0: + eq_amt = 1000.0 + txn_id += 1 + d = rand_day(month) + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=d, + doc_id=f"{month}-DRAW", + description="Owner draw (cash withdrawal)", + entries=[("3200", eq_amt, 0.0), ("1000", 0.0, eq_amt)], + ) + owner_draw_by_month[month] = float(owner_draw_by_month[month] + eq_amt) + equity_events.append( + { + "month": month, + "txn_id": int(txn_id), + "date": d.isoformat(), + "event_type": "draw", + "amount": float(eq_amt), + } + ) + + # 10) Capex purchases on their in-service months (cash) + for _, a in fixed_assets.iterrows(): + if str(a["in_service_month"]) != month: + continue + cost = float(a["cost"]) + txn_id += 1 + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=date(_parse_month(month)[0], _parse_month(month)[1], 2), + doc_id=f"{month}-{a['asset_id']}", + description=f"Acquire fixed asset: {a['asset_name']}", + entries=[("1300", cost, 0.0), ("1000", 0.0, cost)], + ) + + # 11) Depreciation entry for this month (sum across assets) + dep_this_month = float( + depreciation_schedule.loc[depreciation_schedule["month"].astype(str) == month, "dep_expense"].sum() + ) + if dep_this_month > 0: + txn_id += 1 + _add_txn( + lines=gl_lines, + txn_id=txn_id, + txn_date=date(_parse_month(month)[0], _parse_month(month)[1], 28), + doc_id=f"{month}-DEP", + description="Record depreciation expense", + entries=[("6400", dep_this_month, 0.0), ("1350", 0.0, dep_this_month)], + ) + + # Build GL dataframe, join COA metadata + gl = pd.DataFrame(gl_lines).sort_values(["date", "txn_id", "account_id"], kind="mergesort").reset_index(drop=True) + gl = gl.merge(coa, on="account_id", how="left") + + # Build monthly TB + statements + tb_all: list[pd.DataFrame] = [] + 
is_all: list[pd.DataFrame] = [] + bs_all: list[pd.DataFrame] = [] + cf_all: list[pd.DataFrame] = [] + + retained_cum = 0.0 + + # beginning balances for CF bridge + cash_begin = 0.0 + ar_begin = 0.0 + inv_begin = 0.0 + ap_begin = 0.0 + sales_tax_begin = 0.0 + wages_pay_begin = 0.0 + payroll_taxes_begin = 0.0 + notes_pay_begin = 0.0 + + for month in months: + tb_m = _compute_tb_for_cutoff(gl, coa, month) + tb_all.append(tb_m) + + is_m, ni_m = _compute_is_for_month(gl, coa, month) + is_all.append(is_m) + retained_cum = float(retained_cum + ni_m) + + bs_m = _compute_bs_for_month(tb_m, month, retained_cum=retained_cum) + bs_all.append(bs_m) + + cash_end = float(bs_m.loc[bs_m["line"] == "Cash", "amount"].iloc[0]) + ar_end = float(bs_m.loc[bs_m["line"] == "Accounts Receivable", "amount"].iloc[0]) + inv_end = float(bs_m.loc[bs_m["line"] == "Inventory", "amount"].iloc[0]) + ap_end = float(bs_m.loc[bs_m["line"] == "Accounts Payable", "amount"].iloc[0]) + sales_tax_end = float(bs_m.loc[bs_m["line"] == "Sales Tax Payable", "amount"].iloc[0]) + wages_pay_end = float(bs_m.loc[bs_m["line"] == "Wages Payable", "amount"].iloc[0]) + payroll_taxes_end = float(bs_m.loc[bs_m["line"] == "Payroll Taxes Payable", "amount"].iloc[0]) + notes_pay_end = float(bs_m.loc[bs_m["line"] == "Notes Payable", "amount"].iloc[0]) + + dep_exp = float( + depreciation_schedule.loc[depreciation_schedule["month"].astype(str) == month, "dep_expense"].sum() + ) + capex_cash = float(fixed_assets.loc[fixed_assets["in_service_month"].astype(str) == month, "cost"].sum()) + owner_contrib = float(owner_contrib_by_month.get(month, 0.0)) + owner_draw_cash = float(owner_draw_by_month.get(month, 0.0)) + + cf_m = _compute_cf_for_month( + month=month, + cash_begin=cash_begin, + cash_end=cash_end, + ar_begin=ar_begin, + ar_end=ar_end, + inv_begin=inv_begin, + inv_end=inv_end, + ap_begin=ap_begin, + ap_end=ap_end, + sales_tax_begin=sales_tax_begin, + sales_tax_end=sales_tax_end, + wages_pay_begin=wages_pay_begin, + 
wages_pay_end=wages_pay_end, + payroll_taxes_begin=payroll_taxes_begin, + payroll_taxes_end=payroll_taxes_end, + notes_pay_begin=notes_pay_begin, + notes_pay_end=notes_pay_end, + net_income=float(ni_m), + dep_expense=dep_exp, + capex_cash=capex_cash, + owner_contrib=owner_contrib, + owner_draw_cash=owner_draw_cash, + ) + cf_all.append(cf_m) + + cash_begin = cash_end + ar_begin = ar_end + inv_begin = inv_end + ap_begin = ap_end + sales_tax_begin = sales_tax_end + wages_pay_begin = wages_pay_end + payroll_taxes_begin = payroll_taxes_end + notes_pay_begin = notes_pay_end + + tb_df = pd.concat(tb_all, ignore_index=True) + is_df = pd.concat(is_all, ignore_index=True) + bs_df = pd.concat(bs_all, ignore_index=True) + cf_df = pd.concat(cf_all, ignore_index=True) + + inv_df = pd.DataFrame(inv_moves).sort_values(["date", "txn_id"], kind="mergesort").reset_index(drop=True) + + # ------------------------------------------------------------------ + # Chapter 6 additions: AR events + bank statement feed + # ------------------------------------------------------------------ + ar_gl = gl.loc[ + gl["account_id"].astype(str) == "1100", + ["txn_id", "date", "doc_id", "description", "debit", "credit"], + ].copy() + + if ar_gl.empty: + ar_events_df = pd.DataFrame( + columns=[ + "month", + "txn_id", + "date", + "customer", + "invoice_id", + "event_type", + "amount", + "ar_delta", + "cash_received", + ] + ) + else: + def _stable_bucket(x: str, k: int) -> int: + return sum(ord(ch) for ch in str(x)) % k + + customers = ["AquaSports", "Mariner", "Summit", "Northwind"] + ar_gl["month"] = ar_gl["date"].astype(str).str.slice(0, 7) + ar_gl["customer"] = ar_gl["doc_id"].astype(str).apply( + lambda d: customers[_stable_bucket(d, len(customers))] + ) + ar_gl["invoice_id"] = ar_gl["doc_id"].astype(str) + ar_gl["event_type"] = np.where(ar_gl["debit"] > 0, "invoice", "collection") + ar_gl["amount"] = np.where(ar_gl["debit"] > 0, ar_gl["debit"], ar_gl["credit"]) + ar_gl["ar_delta"] = ar_gl["debit"] - 
ar_gl["credit"] + ar_gl["cash_received"] = np.where(ar_gl["event_type"] == "collection", ar_gl["credit"], 0.0) + ar_events_df = ( + ar_gl[ + [ + "month", + "txn_id", + "date", + "customer", + "invoice_id", + "event_type", + "amount", + "ar_delta", + "cash_received", + ] + ] + .sort_values(["date", "txn_id"], kind="mergesort") + .reset_index(drop=True) + ) + + # Bank statement feed (external truth): derive from Cash 1000 activity + cash_gl = gl.loc[ + gl["account_id"].astype(str) == "1000", + ["txn_id", "date", "description", "debit", "credit"], + ].copy() + + if cash_gl.empty: + bank_statement_df = pd.DataFrame( + columns=["month", "bank_txn_id", "posted_date", "description", "amount", "gl_txn_id"] + ) + else: + cash_gl["cash_net"] = cash_gl["debit"] - cash_gl["credit"] + cash_txn = ( + cash_gl.groupby("txn_id", observed=True) + .agg(date=("date", "min"), description=("description", "first"), amount=("cash_net", "sum")) + .reset_index() + ) + cash_txn = cash_txn.loc[cash_txn["amount"].abs() > 1e-9].copy() + cash_txn = cash_txn.sort_values(["date", "txn_id"], kind="mergesort").reset_index(drop=True) + + seed_for_bank = int(0 if random_state is None else random_state) + 8675309 + rng_bank = np.random.default_rng(seed_for_bank) + lags = rng_bank.choice([0, 1, 3, 5, 10], size=len(cash_txn), replace=True) + posted_dt = pd.to_datetime(cash_txn["date"]) + pd.to_timedelta(lags, unit="D") + + bank_statement_df = pd.DataFrame( + { + "month": posted_dt.dt.strftime("%Y-%m"), + "bank_txn_id": cash_txn["txn_id"].apply(lambda x: f"B{int(x):06d}"), + "posted_date": posted_dt.dt.strftime("%Y-%m-%d"), + "description": cash_txn["description"].astype(str), + "amount": cash_txn["amount"].astype(float), + "gl_txn_id": cash_txn["txn_id"].astype(int), + } + ) + + # Inject anomalies for deterministic exception testing + if len(months) > 0: + fee_row = { + "month": months[-1], + "bank_txn_id": "B-FEE-0001", + "posted_date": f"{months[-1]}-15", + "description": "Bank fee (no GL link)", + 
"amount": -25.0, + "gl_txn_id": np.nan, + } + bank_statement_df = pd.concat([bank_statement_df, pd.DataFrame([fee_row])], ignore_index=True) + + if not bank_statement_df.empty: + dup = bank_statement_df.iloc[[0]].copy() + dup["posted_date"] = (pd.to_datetime(dup["posted_date"]) + pd.Timedelta(days=1)).dt.strftime("%Y-%m-%d") + dup["month"] = pd.to_datetime(dup["posted_date"]).dt.strftime("%Y-%m") + dup["description"] = "Duplicate bank txn id (injected)" + bank_statement_df = pd.concat([bank_statement_df, dup], ignore_index=True) + + bank_statement_df = bank_statement_df.sort_values( + ["posted_date", "bank_txn_id"], kind="mergesort" + ).reset_index(drop=True) + + + meta: dict[str, Any] = { + "dataset": "NSO_v1", + "start_month": start_month, + "n_months": int(n_months), + "seed": random_state, + "assumptions": { + "teaching_month_days": 28, + "pct_on_account": float(pct_on_account), + "sales_tax_rate": float(sales_tax_rate), + "inventory_system": "perpetual", + "inventory_count_adjustment_every_months": 3, + "depreciation_method": "straight-line", + "payroll_lag_months": 1, + "sales_tax_remit_lag_months": 1, + "loan_origination_month_index": 1, # month 2 + }, + "notes": [ + "Retained earnings is derived (cumulative net income) for teaching clarity; no closing entries are posted.", + "Cash flow uses an indirect method bridge that includes working-capital changes for current payables.", + "Subledgers are designed for tie-outs: payroll/tax/debt/equity/AP events link to GL via txn_id where applicable.", + ], + } + + return NSOV1Outputs( + chart_of_accounts=coa, + gl_journal=gl, + trial_balance_monthly=tb_df, + statements_is_monthly=is_df, + statements_bs_monthly=bs_df, + statements_cf_monthly=cf_df, + inventory_movements=inv_df, + fixed_assets=fixed_assets, + depreciation_schedule=depreciation_schedule, + payroll_events=pd.DataFrame(payroll_events), + sales_tax_events=pd.DataFrame(sales_tax_events), + debt_schedule=pd.DataFrame(debt_schedule), + 
def write_nso_v1(outputs: NSOV1Outputs, outdir: Path) -> None:
    """Write every NSO v1 table to ``outdir`` as CSV, plus the metadata JSON.

    Each table listed below is read from the attribute of the same name on
    ``outputs`` and saved as ``<name>.csv`` without the DataFrame index.
    ``outputs.meta`` is saved as ``nso_v1_meta.json`` (UTF-8, 2-space indent).

    Args:
        outputs: Simulation results. Every table attribute must provide
            ``to_csv``; ``meta`` must be JSON-serializable.
        outdir: Destination directory; created together with any missing
            parents. A plain string path is tolerated as well.
    """
    # Coerce first: a str handed over by a caller (or by an argparse option
    # declared without type=Path) would otherwise fail on .mkdir() and "/".
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # Attribute name == CSV file stem. Order matches the original write order.
    table_names = (
        "chart_of_accounts",
        "gl_journal",
        "trial_balance_monthly",
        "statements_is_monthly",
        "statements_bs_monthly",
        "statements_cf_monthly",
        "inventory_movements",
        "fixed_assets",
        "depreciation_schedule",
        # --- Chapter 5 additions ---
        "payroll_events",
        "sales_tax_events",
        "debt_schedule",
        "equity_events",
        "ap_events",
        # --- Chapter 6 additions ---
        "ar_events",
        "bank_statement",
    )
    for name in table_names:
        getattr(outputs, name).to_csv(outdir / f"{name}.csv", index=False)

    (outdir / "nso_v1_meta.json").write_text(json.dumps(outputs.meta, indent=2), encoding="utf-8")


def main() -> None:
    """Command-line entry point: parse options, simulate, and write the dataset.

    Builds the argument parser via ``base_parser``, adds the NSO-specific
    options, runs ``simulate_nso_v1`` with the parsed values and hands the
    result to ``write_nso_v1``.
    """
    p = base_parser("Track D Simulator: North Shore Outfitters (NSO) v1 (multi-month running case)")
    p.set_defaults(outdir=Path("data/synthetic/nso_v1"))
    p.add_argument("--start-month", type=str, default="2025-01", help="Start month (YYYY-MM)")
    p.add_argument("--n-months", type=int, default=24, help="Number of months to generate")
    p.add_argument("--n-sales-per-month", type=int, default=12)
    p.add_argument("--pct-on-account", type=float, default=0.35)
    p.add_argument("--sales-tax-rate", type=float, default=0.07)
    args = p.parse_args()

    # NOTE(review): args.seed and args.outdir are not declared here; presumably
    # base_parser defines both options -- confirm against its definition.
    outs = simulate_nso_v1(
        start_month=args.start_month,
        n_months=args.n_months,
        n_sales_per_month=args.n_sales_per_month,
        pct_on_account=args.pct_on_account,
        sales_tax_rate=args.sales_tax_rate,
        random_state=args.seed,
    )
    write_nso_v1(outs, args.outdir)
    print(f"Wrote NSO v1 dataset -> {args.outdir}")


if __name__ == "__main__":
    main()
MODULES = [
    # A few representative Track D scripts (imports should be safe).
    "scripts.d00_peek_data",
    "scripts.business_ch01_accounting_measurement",
    "scripts.business_ch02_double_entry_and_gl",
    "scripts.business_ch14_regression_driver_analysis",
    "scripts.business_ch23_communicating_results_governance",
    # Simulators
    "scripts.sim_business_ledgerlab",
    "scripts.sim_business_nso_v1",
]

# Modules that are additionally checked for a callable ``main`` attribute.
_MODULES_WITH_MAIN = [
    "scripts.business_ch01_accounting_measurement",
    "scripts.business_ch02_double_entry_and_gl",
    "scripts.business_ch14_regression_driver_analysis",
    "scripts.business_ch23_communicating_results_governance",
    "scripts.sim_business_ledgerlab",
    "scripts.sim_business_nso_v1",
]


@pytest.mark.parametrize("mod_name", MODULES)
def test_track_d_modules_import(mod_name: str) -> None:
    """Smoke test: the named module imports and yields a module object."""
    imported = importlib.import_module(mod_name)
    assert imported is not None


@pytest.mark.parametrize("mod_name", _MODULES_WITH_MAIN)
def test_track_d_modules_define_main(mod_name: str) -> None:
    """Smoke test: the named module exposes a callable ``main``."""
    imported = importlib.import_module(mod_name)
    assert hasattr(imported, "main"), f"{mod_name} has no main()"
    entry_point = getattr(imported, "main")
    assert callable(entry_point)