From dd82dedf1e4b773c14a80404c1065d9b312208d4 Mon Sep 17 00:00:00 2001 From: Nicholas Karlson Date: Mon, 10 Nov 2025 23:09:18 -0800 Subject: [PATCH] feat(cli): standardize --outdir/--seed; add smoke tests; Makefile & CI updates --- .github/workflows/ci.yml | 40 +++++--- Makefile | 54 +++++++--- scripts/__init__.py | 0 scripts/_cli.py | 31 ++++++ scripts/ch13_fitness_mixed.py | 128 ++++++++++++++++-------- scripts/ch13_stroop_within.py | 115 +++++++++++++++------ scripts/sim_fitness_2x2.py | 153 ++++++++++++++++++---------- scripts/sim_stroop.py | 183 ++++++++++++++++++++-------------- tests/test_cli_smoke.py | 15 +++ 9 files changed, 489 insertions(+), 230 deletions(-) create mode 100644 scripts/__init__.py create mode 100644 scripts/_cli.py create mode 100644 tests/test_cli_smoke.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1836341..2189145 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,5 @@ name: ci + on: workflow_dispatch: push: @@ -7,24 +8,33 @@ on: branches: [ main ] jobs: - test: - runs-on: windows-latest # matches your local environment + windows-py310: + runs-on: windows-latest + timeout-minutes: 20 env: PYTHONIOENCODING: utf-8 + defaults: + run: + shell: pwsh + steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - name: Set up Python + uses: actions/setup-python@v5 with: - python-version: "3.10" - cache: "pip" + python-version: '3.10' + cache: pip + + - name: Install make (Chocolatey) + run: choco install make -y - - name: Install deps - shell: bash + - name: Install dependencies run: | python -m pip install --upgrade pip - if [ -f requirements.txt ]; then pip install -r requirements.txt; else pip install numpy pandas statsmodels matplotlib scipy; fi - if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi + if (Test-Path requirements.txt) { pip install -r requirements.txt } else { pip install numpy pandas statsmodels matplotlib scipy } + if (Test-Path requirements-dev.txt) { pip install -r requirements-dev.txt } else { pip install ruff pytest } - name: Lint run: make lint @@ -35,10 +45,12 @@ jobs: - name: Tiny Chapter 13 smoke run: make ch13-ci - - name: Upload plots (artifact) + - name: Upload artifacts (plots & data) if: always() uses: actions/upload-artifact@v4 with: - name: ch13-plots - path: outputs/ - if-no-files-found: ignore \ No newline at end of file + name: ch13-artifacts + if-no-files-found: ignore + path: | + data/synthetic/** + outputs/** diff --git a/Makefile b/Makefile index 3f191b4..3db22a5 100644 --- a/Makefile +++ b/Makefile @@ -1,29 +1,53 @@ +# Default target .DEFAULT_GOAL := help +# Config +PYTHON := python +SEED ?= 123 +OUT_SYN := data/synthetic +OUT_CH13 := outputs/ch13 + .PHONY: help help: @echo "Available targets:" - @echo " ch13 - full Chapter 13 run (plots saved)" - @echo " ch13-ci - tiny smoke (fast) for CI" + @echo " ch13 - full Chapter 13 run (sim + analysis + plots)" + @echo " ch13-ci - tiny smoke run for CI (fast)" + @echo " lint - run ruff checks" + @echo " lint-fix - auto-fix with ruff" + @echo " test - run pytest" + @echo " clean - remove generated outputs" +# ---- Fast CI smoke (small n, deterministic) ---- .PHONY: ch13-ci ch13-ci: - python scripts/sim_stroop.py --n-subjects 6 --n-trials 10 --seed 2025 - python scripts/ch13_stroop_within.py --save-plots - python scripts/sim_fitness_2x2.py --n-per-group 40 --seed 2025 - python scripts/ch13_fitness_mixed.py --save-plots + $(PYTHON) -m scripts.sim_stroop --n-subjects 6 --n-trials 10 --seed $(SEED) --outdir $(OUT_SYN) + $(PYTHON) -m scripts.ch13_stroop_within --data $(OUT_SYN)/psych_stroop_trials.csv --outdir $(OUT_CH13) --save-plots --seed $(SEED) + $(PYTHON) -m scripts.sim_fitness_2x2 --n-per-group 10 --seed $(SEED) --outdir $(OUT_SYN) + $(PYTHON) -m scripts.ch13_fitness_mixed --data $(OUT_SYN)/fitness_long.csv --outdir $(OUT_CH13) --save-plots --seed $(SEED) +# ---- Full Chapter 13 demo (default sizes) ---- .PHONY: ch13 ch13: - python scripts/sim_stroop.py - python scripts/ch13_stroop_within.py --save-plots - python scripts/sim_fitness_2x2.py - python scripts/ch13_fitness_mixed.py --save-plots + $(PYTHON) -m scripts.sim_stroop --seed $(SEED) --outdir $(OUT_SYN) + $(PYTHON) -m scripts.ch13_stroop_within --data $(OUT_SYN)/psych_stroop_trials.csv --outdir $(OUT_CH13) --save-plots --seed $(SEED) + $(PYTHON) -m scripts.sim_fitness_2x2 --seed $(SEED) --outdir $(OUT_SYN) + $(PYTHON) -m scripts.ch13_fitness_mixed --data $(OUT_SYN)/fitness_long.csv --outdir $(OUT_CH13) --save-plots --seed $(SEED) -.PHONY: lint format test +# ---- Quality gates ---- +.PHONY: lint lint: - ruff check tests/ -format: - black . + ruff check . + +.PHONY: lint-fix +lint-fix: + ruff check . --fix + +.PHONY: test test: - pytest -q || true \ No newline at end of file + pytest -q + +# ---- Utilities ---- +.PHONY: clean +clean: + @echo "Removing generated outputs in $(OUT_SYN) and $(OUT_CH13)" + -@rm -rf $(OUT_SYN) $(OUT_CH13) diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/_cli.py b/scripts/_cli.py new file mode 100644 index 0000000..524b1d4 --- /dev/null +++ b/scripts/_cli.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import argparse +import pathlib +import random + +import numpy as np + + +def base_parser(description: str) -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description=description) + p.add_argument( + "--outdir", + type=pathlib.Path, + default=pathlib.Path("outputs"), + help="Where to write outputs (plots, csv). Default: ./outputs", + ) + p.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility") + return p + + +def apply_seed(seed: int | None) -> None: + if seed is None: + return + np.random.seed(seed) + random.seed(seed) + try: # optional: seed torch if present + import torch # pragma: no cover + torch.manual_seed(seed) + except Exception: + pass diff --git a/scripts/ch13_fitness_mixed.py b/scripts/ch13_fitness_mixed.py index 5127cf3..bcfd4ce 100644 --- a/scripts/ch13_fitness_mixed.py +++ b/scripts/ch13_fitness_mixed.py @@ -1,86 +1,130 @@ # SPDX-License-Identifier: MIT -import argparse, os, math -import numpy as np, pandas as pd -import statsmodels.formula.api as smf -import matplotlib; matplotlib.use("Agg") +""" +Chapter 13 — Mixed fitness study analysis: +- LMM with random intercepts +- Within-group paired tests +- Between-group Welch t + Hedges g + +Usage: + python -m scripts.ch13_fitness_mixed \ + --data data/synthetic/fitness_long.csv \ + --outdir outputs/ch13 --save-plots --seed 123 +""" + +import argparse +from pathlib import Path + +import matplotlib +matplotlib.use("Agg") # safe for CI/headless import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import statsmodels.api as sm +from scipy import stats + # --- make Windows console UTF-8 friendly --- import sys -if os.name == "nt": +if sys.platform.startswith("win"): try: sys.stdout.reconfigure(encoding="utf-8") except Exception: pass -def hedges_g(a, b): + +def hedges_g(a: pd.Series, b: pd.Series) -> float: na, nb = len(a), len(b) sa, sb = a.var(ddof=1), b.var(ddof=1) - s_p = np.sqrt(((na-1)*sa + (nb-1)*sb)/(na+nb-2)) - d = (a.mean()-b.mean())/s_p - J = 1 - 3/(4*(na+nb)-9) - return d*J - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--data", default="data/synthetic/fitness_long.csv") - ap.add_argument("--save-plots", action="store_true") + s_p = np.sqrt(((na - 1) * sa + (nb - 1) * sb) / (na + nb - 2)) + d = (a.mean() - b.mean()) / s_p + J = 1 - 3 / (4 * (na + nb) - 9) + return float(d * J) + + +def main() -> None: + ap = argparse.ArgumentParser(description="Analyze mixed-design fitness data") + ap.add_argument( + "--data", + type=Path, + default=Path("data/synthetic/fitness_long.csv"), + help="Path to long-format CSV.", + ) + ap.add_argument("--save-plots", action="store_true", help="Save summary plot") + ap.add_argument( + "--outdir", + type=Path, + default=Path("outputs"), + help="Where to write plots (if --save-plots).", + ) + ap.add_argument("--seed", type=int, default=None, help="Optional RNG seed") args = ap.parse_args() + if args.seed is not None: + np.random.seed(args.seed) # kept for CLI consistency; not required by current code + df = pd.read_csv(args.data) - df["time"] = pd.Categorical(df["time"], categories=["pre","post"], ordered=True) + df["time"] = pd.Categorical(df["time"], categories=["pre", "post"], ordered=True) df["group"] = pd.Categorical(df["group"]) # Mixed model with random intercept per subject - import statsmodels.api as sm - md = sm.MixedLM.from_formula("strength ~ time*group + age + sex + bmi", - groups="id", re_formula="1", data=df) + md = sm.MixedLM.from_formula( + "strength ~ time*group + age + sex + bmi", + groups="id", + re_formula="1", + data=df, + ) m = md.fit(method="lbfgs") print("=== LMM: strength ~ time*group + age + sex + bmi + (1|id) ===") print(m.summary()) # Within-group pre→post paired tests - from scipy import stats print("\n=== Within-group pre→post (paired) ===") - for g in df["group"].cat.categories: - wide = (df[df.group==g] - .pivot_table(index="id", columns="time", values="strength", observed=False)) + for gname in df["group"].cat.categories: + wide = ( + df[df.group == gname] + .pivot_table(index="id", columns="time", values="strength", observed=False) + ) pre, post = wide["pre"], wide["post"] diff = (post - pre).dropna() - mean = diff.mean(); sd = diff.std(ddof=1); n = diff.shape[0] - t = mean / (sd/np.sqrt(n)) - p = 2*stats.t.sf(abs(t), df=n-1) + mean = diff.mean() + sd = diff.std(ddof=1) + n = diff.shape[0] + t = mean / (sd / np.sqrt(n)) + p = 2 * stats.t.sf(abs(t), df=n - 1) d_paired = mean / sd - print(f"{g:8s} n={n:2d} Δ={mean:6.2f} t={t:6.2f} p={p:.3g} d_paired={d_paired:.2f}") + print(f"{gname:8s} n={n:2d} Δ={mean:6.2f} t={t:6.2f} p={p:.3g} d_paired={d_paired:.2f}") # Between-group at post (Welch t) + Hedges g - post = df[df.time=="post"] + post = df[df.time == "post"] cats = post.group.cat.categories - gA = post[post.group==cats[0]]["strength"] - gB = post[post.group==cats[1]]["strength"] + gA = post[post.group == cats[0]]["strength"] + gB = post[post.group == cats[1]]["strength"] t2, p2 = stats.ttest_ind(gA, gB, equal_var=False) g = hedges_g(gA, gB) print("\n=== Between groups at post (Welch t) ===") print(f"t={t2:.2f} p={p2:.3g} Hedges g={g:.2f}") if args.save_plots: - os.makedirs("outputs", exist_ok=True) - fig, ax = plt.subplots(figsize=(6,4)) - # spaghetti - for (grp, sid), sub in df.sort_values("time").groupby(["group","id"], observed=False): - ax.plot([0,1], sub["strength"].values, alpha=0.2) + args.outdir.mkdir(parents=True, exist_ok=True) + fig, ax = plt.subplots(figsize=(6, 4)) + # spaghetti per subject + for (_, sid), sub in df.sort_values("time").groupby(["group", "id"], observed=False): + ax.plot([0, 1], sub["strength"].values, alpha=0.2) # thick means per group - mean_pre = df[df.time=="pre" ].groupby("group", observed=False)["strength"].mean() - mean_post = df[df.time=="post"].groupby("group", observed=False)["strength"].mean() + mean_pre = df[df.time == "pre"].groupby("group", observed=False)["strength"].mean() + mean_post = df[df.time == "post"].groupby("group", observed=False)["strength"].mean() for gname in df["group"].cat.categories: - ax.plot([0,1], [mean_pre[gname], mean_post[gname]], linewidth=3, label=gname) - ax.set_xticks([0,1]); ax.set_xticklabels(["pre","post"]) + ax.plot([0, 1], [mean_pre[gname], mean_post[gname]], linewidth=3, label=gname) + ax.set_xticks([0, 1]) + ax.set_xticklabels(["pre", "post"]) ax.set_ylabel("Strength") ax.legend() fig.tight_layout() - fp = "outputs/ch13_fitness_spaghetti.png"; fig.savefig(fp, dpi=150) - print("Saved plot ->", fp) + out_path = args.outdir / "ch13_fitness_spaghetti.png" + fig.savefig(out_path, dpi=150) + print(f"Saved plot -> {out_path}") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/ch13_stroop_within.py b/scripts/ch13_stroop_within.py index 0826a24..f155de4 100644 --- a/scripts/ch13_stroop_within.py +++ b/scripts/ch13_stroop_within.py @@ -1,59 +1,103 @@ # SPDX-License-Identifier: MIT """ Chapter 13 — Within-Subjects (Stroop): paired t-test vs. mixed model + Usage: - python scripts/ch13_stroop_within.py --save-plots + python -m scripts.ch13_stroop_within --save-plots \ + --data data/synthetic/psych_stroop_trials.csv \ + --outdir outputs/ch13 --seed 123 + Options: - --data data/synthetic/psych_stroop_trials.csv + --data Path to trial-level CSV (default: data/synthetic/psych_stroop_trials.csv) + --outdir Where to write plots (default: outputs) + --seed RNG seed (not required; included for consistency across scripts) --min-rt 200 --max-rt 2000 # trial-level trims """ -import argparse, os, math, numpy as np, pandas as pd -import statsmodels.api as sm -import statsmodels.formula.api as smf -from scipy import stats + +import argparse +import pathlib +from pathlib import Path + import matplotlib -matplotlib.use("Agg") +matplotlib.use("Agg") # Safe for CI/headless environments import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import statsmodels.api as sm # noqa: F401 (imported for users who inspect models) +import statsmodels.formula.api as smf +from scipy import stats + -def load_trials(path): +def load_trials(path: Path) -> pd.DataFrame: df = pd.read_csv(path) df["condition"] = df["condition"].astype("category") return df -def cohen_dz(a, b): - # within-subject effect size + +def cohen_dz(a: pd.Series, b: pd.Series) -> float: + """Within-subject effect size (dz) for paired data.""" diff = a - b - return diff.mean()/diff.std(ddof=1) - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--data", default="data/synthetic/psych_stroop_trials.csv") - ap.add_argument("--min-rt", type=float, default=200.0) - ap.add_argument("--max-rt", type=float, default=2000.0) - ap.add_argument("--save-plots", action="store_true") + return float(diff.mean() / diff.std(ddof=1)) + + +def main() -> None: + ap = argparse.ArgumentParser(description="Paired t-test vs. mixed model on Stroop data") + ap.add_argument( + "--data", + type=pathlib.Path, + default=Path("data/synthetic/psych_stroop_trials.csv"), + help="Path to trial-level CSV.", + ) + ap.add_argument("--min-rt", type=float, default=200.0, help="Min RT (ms) to keep") + ap.add_argument("--max-rt", type=float, default=2000.0, help="Max RT (ms) to keep") + ap.add_argument("--save-plots", action="store_true", help="Save summary plots") + ap.add_argument( + "--outdir", + type=pathlib.Path, + default=Path("outputs"), + help="Where to write plots (if --save-plots).", + ) + # Included for CLI consistency; not used by the current analyses but harmless. + ap.add_argument("--seed", type=int, default=None, help="RNG seed (optional)") args = ap.parse_args() - os.makedirs("outputs", exist_ok=True) + # Optional seeding keeps behavior consistent with other scripts + if args.seed is not None: + np.random.seed(args.seed) + + # Ensure output directory exists when saving plots + if args.save_plots: + args.outdir.mkdir(parents=True, exist_ok=True) + df = load_trials(args.data) # Keep correct trials and trim RTs before = len(df) - df = df[(df["correct"] == 1) & (df["rt_ms"].between(args.min_rt, args.max_rt))] - print(f"Filtered: kept {len(df)}/{before} trials (correct & {args.min_rt}-{args.max_rt} ms)") + df = df[(df["correct"] == 1) & (df["rt_ms"].between(args.min_rt, args.max_rt))].copy() + print( + f"Filtered: kept {len(df)}/{before} trials " + f"(correct & {args.min_rt}-{args.max_rt} ms)" + ) # Log-transform RT (typical for skew) df["log_rt"] = np.log(df["rt_ms"]) # Subject means per condition (paired t-test) - subj_means = df.groupby(["subject", "condition"], as_index=False, observed=False)["log_rt"].mean() + subj_means = ( + df.groupby(["subject", "condition"], as_index=False, observed=False)["log_rt"] + .mean() + ) wide = subj_means.pivot(index="subject", columns="condition", values="log_rt") + # paired t-test (incongruent > congruent expected) - tstat, pval = stats.ttest_rel(wide["incongruent"], wide["congruent"], nan_policy="omit") + tstat, pval = stats.ttest_rel( + wide["incongruent"], wide["congruent"], nan_policy="omit" + ) dz = cohen_dz(wide["incongruent"], wide["congruent"]) print("\n=== Paired t-test on subject means (log RT) ===") print(f"t = {tstat:.3f}, p = {pval:.3g}, Cohen's dz = {dz:.3f}") - delta_ms = (np.exp(wide["incongruent"].mean()) - np.exp(wide["congruent"].mean())) + delta_ms = np.exp(wide["incongruent"].mean()) - np.exp(wide["congruent"].mean()) print(f"Back-translated mean RT difference ≈ {delta_ms:.1f} ms (incongruent - congruent)") # Mixed model on trial-level log RT @@ -72,24 +116,29 @@ def main(): # Plots if args.save_plots: # per-subject spaghetti of means - plt.figure(figsize=(6,4)) + plt.figure(figsize=(6, 4)) for _, row in wide.iterrows(): - plt.plot([0,1], [row["congruent"], row["incongruent"]], alpha=0.3) - plt.xticks([0,1], ["congruent","incongruent"]) + plt.plot([0, 1], [row["congruent"], row["incongruent"]], alpha=0.3) + plt.xticks([0, 1], ["congruent", "incongruent"]) plt.ylabel("Mean log RT") plt.title("Within-subject means (log RT)") - plt.tight_layout(); plt.savefig("outputs/ch13_stroop_subject_means.png", dpi=150) + plt.tight_layout() + p1 = args.outdir / "ch13_stroop_subject_means.png" + plt.savefig(p1, dpi=150) # violin/box of trial-level log RT by condition - plt.figure(figsize=(6,4)) - data_c = [df.loc[df["condition"]==c,"log_rt"].values for c in ["congruent","incongruent"]] + plt.figure(figsize=(6, 4)) + data_c = [df.loc[df["condition"] == c, "log_rt"].values for c in ["congruent", "incongruent"]] plt.violinplot(data_c, showmeans=True) - plt.xticks([1,2], ["congruent","incongruent"]) + plt.xticks([1, 2], ["congruent", "incongruent"]) plt.ylabel("Trial log RT") plt.title("Trial-level distribution by condition") - plt.tight_layout(); plt.savefig("outputs/ch13_stroop_violins.png", dpi=150) + plt.tight_layout() + p2 = args.outdir / "ch13_stroop_violins.png" + plt.savefig(p2, dpi=150) + + print(f"Saved plots -> {p1}, {p2}") - print("Saved plots -> outputs/ch13_stroop_subject_means.png, ch13_stroop_violins.png") if __name__ == "__main__": main() diff --git a/scripts/sim_fitness_2x2.py b/scripts/sim_fitness_2x2.py index c21aa4f..18010ec 100644 --- a/scripts/sim_fitness_2x2.py +++ b/scripts/sim_fitness_2x2.py @@ -1,74 +1,123 @@ # SPDX-License-Identifier: MIT -import os, json, time, argparse -import numpy as np, pandas as pd +""" +Simulate a 2x2 mixed design (Group between × Time within) fitness study. +Creates in --outdir (default: data/synthetic): + fitness_subjects.csv + fitness_long.csv + fitness_meta.json +""" -# globals configured in main() -rng = np.random.default_rng() -N = 80 -GROUPS = ["ProgramA","ProgramB"] +import argparse +import json +import time +import pathlib +from pathlib import Path + +import numpy as np +import pandas as pd + +# Defaults DEFAULT_SEED = 7 +GROUPS = ["ProgramA", "ProgramB"] +N_PER_GROUP_DEFAULT = 40 # total N = n_per_group * 2 -def simulate(n_per_group=None, seed=DEFAULT_SEED): - global rng, N - # Allow overriding globals via args, primarily for testing - if n_per_group is not None: - N = n_per_group * len(GROUPS) # Adjust N based on n_per_group - + +def simulate( + n_per_group: int = N_PER_GROUP_DEFAULT, + seed: int = DEFAULT_SEED, + outdir: Path = Path("data/synthetic"), +): + """Simulates a simple strength-training experiment and writes CSV/JSON to outdir.""" rng = np.random.default_rng(seed) + outdir.mkdir(parents=True, exist_ok=True) + + n_total = n_per_group * len(GROUPS) + + subjects = pd.DataFrame( + { + "id": np.arange(1, n_total + 1), + "group": rng.choice(GROUPS, size=n_total, replace=True), + "age": rng.integers(18, 45, size=n_total), # 18..44 + "sex": rng.choice(["F", "M"], size=n_total, p=[0.5, 0.5]), + "bmi": rng.normal(24, 3.5, size=n_total).clip(17, 38), + } + ) + + # Baseline strength with small covariate effects + base = ( + 80 + + 0.8 * (subjects["age"] - 30) + + 1.2 * (subjects["bmi"] - 24) + + rng.normal(0, 8, size=n_total) + ) - os.makedirs("data/synthetic", exist_ok=True) - subjects = pd.DataFrame({ - "id": np.arange(1, N+1), - "group": rng.choice(GROUPS, size=N, replace=True), - "age": rng.integers(18, 45, size=N), - "sex": rng.choice(["F","M"], size=N, p=[0.5,0.5]), - "bmi": rng.normal(24, 3.5, size=N).clip(17, 38), - }) - - # baseline strength with small covariate effects - base = (80 - + 0.8*(subjects["age"]-30) - + 1.2*(subjects["bmi"]-24) - + rng.normal(0, 8, size=N)) - - # program gains - gain_A = rng.normal(8, 4, size=N) - gain_B = rng.normal(12, 4, size=N) - gain = np.where(subjects["group"].to_numpy()=="ProgramA", gain_A, gain_B) - - long = pd.concat([ - pd.DataFrame({"id": subjects["id"], "time":"pre", - "strength": base + rng.normal(0, 3, size=N)}), - pd.DataFrame({"id": subjects["id"], "time":"post", - "strength": base + gain + rng.normal(0, 3, size=N)}) - ], ignore_index=True).merge(subjects, on="id") - - subjects.to_csv("data/synthetic/fitness_subjects.csv", index=False) - long.to_csv("data/synthetic/fitness_long.csv", index=False) + # Program gains (B > A on average) + gain_A = rng.normal(8, 4, size=n_total) + gain_B = rng.normal(12, 4, size=n_total) + gain = np.where(subjects["group"].to_numpy() == "ProgramA", gain_A, gain_B) + + long = pd.concat( + [ + pd.DataFrame( + { + "id": subjects["id"], + "time": "pre", + "strength": base + rng.normal(0, 3, size=n_total), + } + ), + pd.DataFrame( + { + "id": subjects["id"], + "time": "post", + "strength": base + gain + rng.normal(0, 3, size=n_total), + } + ), + ], + ignore_index=True, + ).merge(subjects, on="id") + + # Write artifacts + subjects_path = outdir / "fitness_subjects.csv" + long_path = outdir / "fitness_long.csv" + subjects.to_csv(subjects_path, index=False) + long.to_csv(long_path, index=False) meta = dict( seed=seed, - n=int(N), + n=int(n_total), + n_per_group=int(n_per_group), design="2x2 mixed (Group between × Time within)", programs=GROUPS, dv="strength_1RM_like", - generated_utc=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + generated_utc=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), ) - with open("data/synthetic/fitness_meta.json","w") as f: + meta_path = outdir / "fitness_meta.json" + with meta_path.open("w") as f: json.dump(meta, f, indent=2) - print("Wrote fitness_subjects.csv, fitness_long.csv, fitness_meta.json") - # Return for testing convenience + print(f"Wrote {subjects_path}, {long_path}, {meta_path}") return subjects, long + def main(): - ap = argparse.ArgumentParser() + ap = argparse.ArgumentParser(description="Simulate mixed-design fitness study data") ap.add_argument("--seed", type=int, default=DEFAULT_SEED, help="RNG seed") - ap.add_argument("--n-per-group", type=int, default=40, help="Number of subjects per group") + ap.add_argument( + "--n-per-group", + type=int, + default=N_PER_GROUP_DEFAULT, + help="Number of subjects per program", + ) + ap.add_argument( + "--outdir", + type=pathlib.Path, + default=Path("data/synthetic"), + help="Where to write outputs (CSV/JSON).", + ) args = ap.parse_args() - - # Pass CLI args to simulate - simulate(n_per_group=args.n_per_group, seed=args.seed) + + simulate(n_per_group=args.n_per_group, seed=args.seed, outdir=args.outdir) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/sim_stroop.py b/scripts/sim_stroop.py index 607b69e..35a22ce 100644 --- a/scripts/sim_stroop.py +++ b/scripts/sim_stroop.py @@ -1,34 +1,40 @@ -import argparse # SPDX-License-Identifier: MIT """ Simulate trial-level Stroop data (plausible undergrad study). -Creates: - data/synthetic/psych_stroop_trials.csv - data/synthetic/psych_stroop_subjects.csv - data/synthetic/psych_stroop_meta.json +Creates in --outdir (default: data/synthetic): + psych_stroop_trials.csv + psych_stroop_subjects.csv + psych_stroop_meta.json """ -import os, time, json + +import argparse +import json +import time +import pathlib +from pathlib import Path + import numpy as np import pandas as pd -RNG_SEED = 42 -rng = np.random.default_rng(RNG_SEED) # Will be reset by simulate() - -N_SUBJ = 60 -N_TRIALS_PER_COND = 100 +# Defaults (kept explicit so meta stays in sync) +RNG_SEED_DEFAULT = 42 +N_SUBJ_DEFAULT = 60 +N_TRIALS_PER_COND_DEFAULT = 100 CONDITIONS = ["congruent", "incongruent"] -# Parameters (kept explicit so meta stays in sync) -BASE_RT_CONGRUENT = 550 # ms -BASE_RT_INCONGRUENT = 650 # ms (=> 100 ms penalty) -BETWEEN_PERSON_RT_SD = 80 # ms subject intercept variation -TRIAL_NOISE_SD = 120 # ms trial-to-trial noise -BASELINE_SPEED_EFFECT = 25 # ms per +1 SD baseline_speed (faster -> shave time) +# Parameters +BASE_RT_CONGRUENT = 550 # ms +BASE_RT_INCONGRUENT = 650 # ms (=> 100 ms penalty) +BETWEEN_PERSON_RT_SD = 80 # ms subject intercept variation +TRIAL_NOISE_SD = 120 # ms trial-to-trial noise +BASELINE_SPEED_EFFECT = 25 # ms per +1 SD baseline_speed (faster -> shave time) + ACC_BASE_CONGRUENT = 0.97 -ACC_BASE_INCONGRUENT = 0.90 # (=> 0.07 penalty) -ACC_BSL_SPEED_PENALTY = 0.03 # subtract per +1 SD if positive +ACC_BASE_INCONGRUENT = 0.90 # (=> 0.07 penalty) +ACC_BSL_SPEED_PENALTY = 0.03 # subtract per +1 SD if positive ACC_ERR_BIAS_SD = 0.05 ACC_CLIP = (0.75, 0.99) + OUTLIER_RATE_ANTICIP = 0.03 OUTLIER_RT_ANTICIP_MU = 150 OUTLIER_RT_ANTICIP_SD = 30 @@ -37,35 +43,37 @@ OUTLIER_RT_LAPSE_SD = 200 RT_FLOOR = 80 -def simulate(n_subjects=N_SUBJ, n_trials=N_TRIALS_PER_COND, seed=RNG_SEED): - global rng, N_SUBJ, N_TRIALS_PER_COND, RNG_SEED - - # Set globals based on args, primarily for testing - N_SUBJ = n_subjects - N_TRIALS_PER_COND = n_trials - RNG_SEED = seed - rng = np.random.default_rng(RNG_SEED) - - os.makedirs("data/synthetic", exist_ok=True) + +def simulate( + n_subjects: int = N_SUBJ_DEFAULT, + n_trials: int = N_TRIALS_PER_COND_DEFAULT, + seed: int = RNG_SEED_DEFAULT, + outdir: Path = Path("data/synthetic"), +): + """Simulates Stroop data and writes CSVs to outdir.""" + rng = np.random.default_rng(seed) + outdir.mkdir(parents=True, exist_ok=True) # subject-level covariates - subjects = pd.DataFrame({ - "subject": np.arange(1, N_SUBJ + 1), - "gender": rng.choice(["F", "M"], size=N_SUBJ, p=[0.6, 0.4]), - "handedness": rng.choice(["R", "L"], size=N_SUBJ, p=[0.9, 0.1]), - # baseline processing speed (lower = faster RT offset) - "baseline_speed": rng.normal(loc=0.0, scale=1.0, size=N_SUBJ) - }) + subjects = pd.DataFrame( + { + "subject": np.arange(1, n_subjects + 1), + "gender": rng.choice(["F", "M"], size=n_subjects, p=[0.6, 0.4]), + "handedness": rng.choice(["R", "L"], size=n_subjects, p=[0.9, 0.1]), + # baseline processing speed (lower = faster RT offset) + "baseline_speed": rng.normal(loc=0.0, scale=1.0, size=n_subjects), + } + ) rows = [] for _, s in subjects.iterrows(): # random subject intercept (ms) - subj_offset = rng.normal(0, BETWEEN_PERSON_RT_SD) # between-person variability + subj_offset = rng.normal(0, BETWEEN_PERSON_RT_SD) # accuracy tendency (fast-but-error-prone archetype) err_bias = rng.normal(0, ACC_ERR_BIAS_SD) for cond in CONDITIONS: - for t in range(N_TRIALS_PER_COND): + for t in range(n_trials): # base RTs base = BASE_RT_CONGRUENT if cond == "congruent" else BASE_RT_INCONGRUENT # trial noise @@ -81,37 +89,54 @@ def simulate(n_subjects=N_SUBJ, n_trials=N_TRIALS_PER_COND, seed=RNG_SEED): # sprinkle outliers (anticipations & lapses) if rng.random() < OUTLIER_RATE_ANTICIP: - rt = rng.normal(OUTLIER_RT_ANTICIP_MU, OUTLIER_RT_ANTICIP_SD) # anticipatory + rt = rng.normal(OUTLIER_RT_ANTICIP_MU, OUTLIER_RT_ANTICIP_SD) # anticipatory if rng.random() < OUTLIER_RATE_LAPSE: - rt = rng.normal(OUTLIER_RT_LAPSE_MU, OUTLIER_RT_LAPSE_SD) # lapse + rt = rng.normal(OUTLIER_RT_LAPSE_MU, OUTLIER_RT_LAPSE_SD) # lapse rt = max(RT_FLOOR, float(rt)) # keep sane - rows.append({ - "subject": int(s["subject"]), - "condition": cond, - "trial": int(t + 1), - "rt_ms": rt, - "correct": int(correct), - }) + rows.append( + { + "subject": int(s["subject"]), + "condition": cond, + "trial": int(t + 1), + "rt_ms": rt, + "correct": int(correct), + } + ) trials = pd.DataFrame(rows) # write CSVs - subjects.to_csv("data/synthetic/psych_stroop_subjects.csv", index=False) - trials.to_csv("data/synthetic/psych_stroop_trials.csv", index=False) - print("Wrote data/synthetic/psych_stroop_subjects.csv and psych_stroop_trials.csv") - + subjects_path = outdir / "psych_stroop_subjects.csv" + trials_path = outdir / "psych_stroop_trials.csv" + subjects.to_csv(subjects_path, index=False) + trials.to_csv(trials_path, index=False) + print(f"Wrote {subjects_path} and {trials_path}") + # Write meta *after* simulation is done - write_meta(subjects, trials) - + write_meta( + subjects=subjects, + trials=trials, + outdir=outdir, + seed=seed, + n_trials_per_cond=n_trials, + ) + return subjects, trials -def write_meta(subjects: pd.DataFrame, trials: pd.DataFrame): + +def write_meta( + subjects: pd.DataFrame, + trials: pd.DataFrame, + outdir: Path, + seed: int, + n_trials_per_cond: int, +): meta = dict( - seed=RNG_SEED, + seed=seed, n_subjects=int(len(subjects)), - n_trials_per_cond=int(N_TRIALS_PER_COND), + n_trials_per_cond=int(n_trials_per_cond), conditions=CONDITIONS, rt_ms=dict( base_congruent=BASE_RT_CONGRUENT, @@ -121,10 +146,12 @@ def write_meta(subjects: pd.DataFrame, trials: pd.DataFrame): trial_noise_sd=TRIAL_NOISE_SD, baseline_speed_effect_ms_per_sd=BASELINE_SPEED_EFFECT, outliers=dict( - anticipations=dict(rate=OUTLIER_RATE_ANTICIP, - mu=OUTLIER_RT_ANTICIP_MU, sd=OUTLIER_RT_ANTICIP_SD), - lapses=dict(rate=OUTLIER_RATE_LAPSE, - mu=OUTLIER_RT_LAPSE_MU, sd=OUTLIER_RT_LAPSE_SD), + anticipations=dict( + rate=OUTLIER_RATE_ANTICIP, mu=OUTLIER_RT_ANTICIP_MU, sd=OUTLIER_RT_ANTICIP_SD + ), + lapses=dict( + rate=OUTLIER_RATE_LAPSE, mu=OUTLIER_RT_LAPSE_MU, sd=OUTLIER_RT_LAPSE_SD + ), ), floor_ms=RT_FLOOR, ), @@ -137,26 +164,34 @@ def write_meta(subjects: pd.DataFrame, trials: pd.DataFrame): clip=list(ACC_CLIP), ), generated_utc=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), - file_rows=dict( - subjects=int(len(subjects)), - trials=int(len(trials)), - ), + file_rows=dict(subjects=int(len(subjects)), trials=int(len(trials))), ) - with open("data/synthetic/psych_stroop_meta.json", "w") as f: + meta_path = outdir / "psych_stroop_meta.json" + with meta_path.open("w") as f: json.dump(meta, f, indent=2) - print("Wrote data/synthetic/psych_stroop_meta.json") + print(f"Wrote {meta_path}") def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--seed", type=int, default=RNG_SEED) - ap.add_argument("--n-subjects", type=int, default=N_SUBJ) - ap.add_argument("--n-trials", type=int, default=N_TRIALS_PER_COND) + ap = argparse.ArgumentParser(description="Simulate trial-level Stroop data") + ap.add_argument("--seed", type=int, default=RNG_SEED_DEFAULT) + ap.add_argument("--n-subjects", type=int, default=N_SUBJ_DEFAULT) + ap.add_argument("--n-trials", type=int, default=N_TRIALS_PER_COND_DEFAULT) + ap.add_argument( + "--outdir", + type=pathlib.Path, + default=Path("data/synthetic"), + help="Where to write outputs (CSV/JSON).", + ) args = ap.parse_args() - - # Pass CLI args to simulate - simulate(n_subjects=args.n_subjects, n_trials=args.n_trials, seed=args.seed) + + simulate( + n_subjects=args.n_subjects, + n_trials=args.n_trials, + seed=args.seed, + outdir=args.outdir, + ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/test_cli_smoke.py b/tests/test_cli_smoke.py new file mode 100644 index 0000000..687a208 --- /dev/null +++ b/tests/test_cli_smoke.py @@ -0,0 +1,15 @@ +from __future__ import annotations +import pathlib, subprocess, sys, tempfile + +SCRIPTS = ["ch13_stroop_within","ch13_fitness_mixed","sim_stroop","sim_fitness_2x2"] + +def run_module(mod: str) -> None: + root = pathlib.Path(__file__).resolve().parents[1] + with tempfile.TemporaryDirectory() as tmpd: + cmd = [sys.executable, "-m", f"scripts.{mod}", "--outdir", tmpd, "--seed", "42"] + res = subprocess.run(cmd, cwd=root, capture_output=True, text=True) + assert res.returncode == 0, res.stderr or res.stdout + +def test_scripts_run_with_cli(): + for m in SCRIPTS: + run_module(m) \ No newline at end of file