Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,23 @@ You should now be able to run:
pytest
make lint
```

without errors.

#### Track D workbook template editing

Track D ships a workbook template ZIP used by `pystatsv1 workbook init --track d`.

**Please do not edit the ZIP by hand.** Instead:

1. Edit the source template under `workbooks/track_d_template/`.
2. Rebuild the committed ZIP:

```bash
python tools/build_workbook_zip.py
```

3. Re-run checks (`pytest -q` is enough to confirm the ZIP drift guard).

### 3. Create a Branch

Use a short, descriptive branch name:
Expand Down
58 changes: 58 additions & 0 deletions tests/test_workbook_track_d_zip_is_current.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from __future__ import annotations

import hashlib
import subprocess
import sys
import tempfile
from pathlib import Path
from zipfile import ZipFile


def _zip_payload_hashes(zip_path: Path) -> dict[str, str]:
"""Return sha256 hashes of *decompressed* member bytes.

This intentionally ignores ZIP metadata (timestamps, compression levels, etc.).
"""
out: dict[str, str] = {}
with ZipFile(zip_path, "r") as zf:
for name in sorted(n for n in zf.namelist() if not n.endswith("/")):
data = zf.read(name)
out[name] = hashlib.sha256(data).hexdigest()
return out


def test_workbook_track_d_zip_is_current() -> None:
    """Guardrail: committed Track D workbook ZIP must match the template source."""
    repo_root = Path(__file__).resolve().parents[1]
    template_dir = repo_root / "workbooks" / "track_d_template"
    committed_zip = repo_root / "src" / "pystatsv1" / "assets" / "workbook_track_d.zip"
    builder = repo_root / "tools" / "build_workbook_zip.py"

    # Fail fast with actionable messages if the layout itself is broken.
    assert template_dir.exists(), (
        f"Missing template source directory: {template_dir}. "
        "Did you forget to apply the Track D template source-of-truth patch?"
    )
    assert committed_zip.exists(), f"Missing committed ZIP: {committed_zip}"
    assert builder.exists(), f"Missing ZIP builder script: {builder}"

    with tempfile.TemporaryDirectory() as scratch:
        rebuilt_zip = Path(scratch) / "workbook_track_d.zip"
        cmd = [
            sys.executable,
            str(builder),
            "--src",
            str(template_dir),
            "--dest",
            str(rebuilt_zip),
        ]
        # Rebuild from the template source with the same interpreter pytest uses.
        subprocess.run(cmd, check=True)

        # Compare decompressed payloads only, so metadata drift is ignored.
        assert _zip_payload_hashes(rebuilt_zip) == _zip_payload_hashes(committed_zip), (
            "Committed Track D workbook ZIP is stale or mismatched vs template source-of-truth.\n\n"
            "Fix: run `python tools/build_workbook_zip.py` and commit the updated ZIP."
        )
113 changes: 113 additions & 0 deletions tools/build_workbook_zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Build committed workbook template ZIPs from source-of-truth directories.

Why this exists
--------------
Historically, workbook templates were edited by manually opening a ZIP.
That inevitably causes drift and makes refactors scary.

For Track D, the source template lives at:
workbooks/track_d_template/

This script (re)builds:
src/pystatsv1/assets/workbook_track_d.zip

Usage
-----
python tools/build_workbook_zip.py

Or explicitly:
python tools/build_workbook_zip.py --src workbooks/track_d_template --dest src/pystatsv1/assets/workbook_track_d.zip
"""

from __future__ import annotations

import argparse
import hashlib
import sys
from pathlib import Path
from typing import Iterable
from zipfile import ZIP_DEFLATED, ZipInfo, ZipFile


_FIXED_ZIP_DT = (2020, 1, 1, 0, 0, 0)
_SKIP_BASENAMES = {".DS_Store", "Thumbs.db"}


def _iter_files(src_dir: Path) -> Iterable[tuple[Path, str]]:
"""Yield (path, zip_relpath) for all files under src_dir (sorted)."""
candidates = [p for p in src_dir.rglob("*") if p.is_file()]
for p in sorted(candidates):
if p.name in _SKIP_BASENAMES:
continue
if "__pycache__" in p.parts:
continue
rel = p.relative_to(src_dir).as_posix()
yield p, rel


def build_zip(src_dir: Path, dest_zip: Path) -> list[str]:
    """Build a deterministic ZIP of src_dir's files at dest_zip.

    Determinism measures:
      - members are written in sorted order (see ``_iter_files``);
      - every entry carries the fixed timestamp ``_FIXED_ZIP_DT``;
      - the DEFLATE compression level is pinned (the zlib default level can
        differ between environments, which would change the archive bytes);
      - unix permission bits are normalized to 0o755/0o644 based only on the
        owner-executable bit, so a contributor's umask cannot change the
        archive bytes.

    Parameters
    ----------
    src_dir:
        Template source directory; must exist, be a directory, and be non-empty.
    dest_zip:
        Output ZIP path; parent directories are created as needed.

    Returns
    -------
    list[str]
        Archived file paths (POSIX style) in the order they were written.

    Raises
    ------
    FileNotFoundError
        If ``src_dir`` does not exist.
    NotADirectoryError
        If ``src_dir`` is not a directory.
    ValueError
        If ``src_dir`` contains no archivable files.
    """
    if not src_dir.exists():
        raise FileNotFoundError(f"Template directory not found: {src_dir}")
    if not src_dir.is_dir():
        raise NotADirectoryError(f"Template path is not a directory: {src_dir}")

    files = list(_iter_files(src_dir))
    if not files:
        raise ValueError(f"Template directory is empty: {src_dir}")

    dest_zip.parent.mkdir(parents=True, exist_ok=True)
    archived: list[str] = []
    with ZipFile(dest_zip, "w") as zf:
        for fs_path, arc_path in files:
            info = ZipInfo(arc_path)
            info.date_time = _FIXED_ZIP_DT
            info.compress_type = ZIP_DEFLATED
            # Keep only "is it owner-executable?" from the filesystem mode;
            # raw mode bits vary with the checkout umask and would make the
            # ZIP non-reproducible across machines.
            mode = 0o755 if fs_path.stat().st_mode & 0o100 else 0o644
            info.external_attr = mode << 16

            data = fs_path.read_bytes()
            # Pin the compression level explicitly for reproducible bytes.
            zf.writestr(info, data, compress_type=ZIP_DEFLATED, compresslevel=9)
            archived.append(arc_path)

    return archived


def _sha256(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(1024 * 1024), b""):
h.update(chunk)
return h.hexdigest()


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, build the ZIP, report what was written.

    Returns 0 on success (errors from build_zip propagate as exceptions).
    """
    root = Path(__file__).resolve().parents[1]
    default_src = root / "workbooks" / "track_d_template"
    default_dest = root / "src" / "pystatsv1" / "assets" / "workbook_track_d.zip"

    parser = argparse.ArgumentParser(
        description="Build workbook ZIP templates from source directories."
    )
    parser.add_argument(
        "--src",
        type=Path,
        default=default_src,
        help=f"Template directory (default: {default_src})",
    )
    parser.add_argument(
        "--dest",
        type=Path,
        default=default_dest,
        help=f"Destination ZIP path (default: {default_dest})",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="Print archived file list after building.",
    )
    args = parser.parse_args(argv)

    archived = build_zip(args.src, args.dest)
    print(f"Wrote: {args.dest} ({len(archived)} files, sha256={_sha256(args.dest)[:12]}…)")
    if args.list:
        for arc_name in archived:
            print(arc_name)

    return 0


if __name__ == "__main__":
    # Propagate main()'s exit code to the shell.
    raise SystemExit(main(sys.argv[1:]))
27 changes: 27 additions & 0 deletions workbooks/track_d_template/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
PYTHON ?= python

# Track D chapter wrapper targets: `make d14` runs scripts/d14.py.
CHAPTERS := d01 d02 d03 d04 d05 d06 d07 d08 d09 d10 d11 d12 \
            d13 d14 d15 d16 d17 d18 d19 d20 d21 d22 d23

.PHONY: help
help:
	@echo "Targets:"
	@echo "  setup - (re)generate deterministic datasets under data/synthetic (seed=123)"
	@echo "  peek - print a quick preview of Track D datasets"
	@echo "  test - run workbook smoke tests"
	@echo "  d01..d23 - run a Track D chapter via wrapper (example: make d14)"

.PHONY: setup
setup:
	$(PYTHON) scripts/d00_setup_data.py

.PHONY: peek
peek:
	$(PYTHON) scripts/d00_peek_data.py

.PHONY: test
test:
	pytest -q

# BUGFIX: GNU make skips implicit-rule search for .PHONY targets, so the
# previous implicit pattern rule (`d%:`) never fired for the phony d01..d23
# names. A *static* pattern rule is matched explicitly and works with .PHONY.
.PHONY: $(CHAPTERS)
$(CHAPTERS): d%:
	$(PYTHON) scripts/$@.py
88 changes: 88 additions & 0 deletions workbooks/track_d_template/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# PyStatsV1 Track D Workbook Starter (Business Statistics)

This folder is a **Track D-only workbook** built around a running accounting case study.
It is designed so that students can:

- run a chapter script
- inspect outputs under `outputs/track_d/`
- repeat with confidence (datasets are deterministic with seed=123)

It works on Linux, macOS, and Windows, and it does **not** require `make`.

## 0) Setup

Create and activate a virtual environment, then install PyStatsV1:

```bash
python -m pip install -U pip
python -m pip install "pystatsv1[workbook]"

# pytest is included via the [workbook] extra
```

## 1) Create this workbook

If you already have this folder, you can skip this.

```bash
pystatsv1 workbook init --track d --dest pystatsv1_track_d
cd pystatsv1_track_d
```

## 2) Peek at the data (recommended)

```bash
pystatsv1 workbook run d00_peek_data
```

That script looks for the two Track D datasets under:

- `data/synthetic/ledgerlab_ch01/`
- `data/synthetic/nso_v1/`

and writes a friendly summary to:

- `outputs/track_d/d00_peek_data_summary.md`

## 3) Reset the datasets (optional)

If you ever delete/edit files under `data/synthetic/`, you can regenerate them.
This keeps the default **seed=123** (same values as the canonical datasets).

```bash
pystatsv1 workbook run d00_setup_data
# or (clean reset)
pystatsv1 workbook run d00_setup_data --force
```

## 4) Run a Track D chapter

First, see the available Track D chapters:

```bash
pystatsv1 workbook list --track d
```

Then run a chapter using the short wrapper names `d01` ... `d23`.
For example:

```bash
pystatsv1 workbook run d01
pystatsv1 workbook run d14
pystatsv1 workbook run d23
```

You can also run the full script names directly (same result):

```bash
pystatsv1 workbook run business_ch01_accounting_measurement
pystatsv1 workbook run business_ch14_regression_driver_analysis
```

## 5) Check your environment (smoke test)

```bash
pystatsv1 workbook check business_smoke
```

If you ever get stuck, see the PyStatsV1 docs on ReadTheDocs.
Empty file.
Empty file.
Empty file.
14 changes: 14 additions & 0 deletions workbooks/track_d_template/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Track D workbook dependencies (optional)
#
# If you installed PyStatsV1 via:
# pip install pystatsv1[workbook]
# you already have everything you need.

numpy>=1.24
pandas>=2.0
scipy>=1.10
statsmodels>=0.14
matplotlib>=3.8
pingouin>=0.5
scikit-learn>=1.3
pytest>=8.2
Loading
Loading