scripts/sim_stroop.py (116 changes: 93 additions & 23 deletions)
@@ -4,66 +4,136 @@
 Creates:
   data/synthetic/psych_stroop_trials.csv
   data/synthetic/psych_stroop_subjects.csv
+  data/synthetic/psych_stroop_meta.json
 """
-import os, numpy as np, pandas as pd
-rng = np.random.default_rng(42)
+import os, time, json
+import numpy as np
+import pandas as pd
 
+RNG_SEED = 42
+rng = np.random.default_rng(RNG_SEED)
+
 N_SUBJ = 60
 N_TRIALS_PER_COND = 100
 CONDITIONS = ["congruent", "incongruent"]
 
+# Parameters (kept explicit so meta stays in sync)
+BASE_RT_CONGRUENT = 550  # ms
+BASE_RT_INCONGRUENT = 650  # ms (=> 100 ms penalty)
+BETWEEN_PERSON_RT_SD = 80  # ms, subject intercept variation
+TRIAL_NOISE_SD = 120  # ms, trial-to-trial noise
+BASELINE_SPEED_EFFECT = 25  # ms per +1 SD baseline_speed (faster -> shave time)
+ACC_BASE_CONGRUENT = 0.97
+ACC_BASE_INCONGRUENT = 0.90  # (=> 0.07 penalty)
+ACC_BSL_SPEED_PENALTY = 0.03  # subtracted per +1 SD when baseline_speed is positive
+ACC_ERR_BIAS_SD = 0.05
+ACC_CLIP = (0.75, 0.99)
+OUTLIER_RATE_ANTICIP = 0.03
+OUTLIER_RT_ANTICIP_MU = 150
+OUTLIER_RT_ANTICIP_SD = 30
+OUTLIER_RATE_LAPSE = 0.04
+OUTLIER_RT_LAPSE_MU = 2300
+OUTLIER_RT_LAPSE_SD = 200
+RT_FLOOR = 80
+
 def simulate():
     os.makedirs("data/synthetic", exist_ok=True)
 
     # subject-level covariates
     subjects = pd.DataFrame({
-        "subject": np.arange(1, N_SUBJ+1),
-        "gender": rng.choice(["F","M"], size=N_SUBJ, p=[0.6,0.4]),
-        "handedness": rng.choice(["R","L"], size=N_SUBJ, p=[0.9,0.1]),
+        "subject": np.arange(1, N_SUBJ + 1),
+        "gender": rng.choice(["F", "M"], size=N_SUBJ, p=[0.6, 0.4]),
+        "handedness": rng.choice(["R", "L"], size=N_SUBJ, p=[0.9, 0.1]),
         # baseline processing speed (higher = faster, i.e. lower RT)
         "baseline_speed": rng.normal(loc=0.0, scale=1.0, size=N_SUBJ)
     })
 
     rows = []
     for _, s in subjects.iterrows():
         # random subject intercept (ms)
-        subj_offset = rng.normal(0, 80)  # between-person variability
+        subj_offset = rng.normal(0, BETWEEN_PERSON_RT_SD)  # between-person variability
         # accuracy tendency (fast-but-error-prone archetype)
-        err_bias = rng.normal(0, 0.05)
+        err_bias = rng.normal(0, ACC_ERR_BIAS_SD)
 
         for cond in CONDITIONS:
             for t in range(N_TRIALS_PER_COND):
-                # base RTs: congruent faster than incongruent
-                base = 550 if cond == "congruent" else 650
+                # base RTs
+                base = BASE_RT_CONGRUENT if cond == "congruent" else BASE_RT_INCONGRUENT
                 # trial noise
-                noise = rng.normal(0, 120)
+                noise = rng.normal(0, TRIAL_NOISE_SD)
                 # baseline_speed: faster people shave time, slower add time
-                rt = base + subj_offset - 25*s["baseline_speed"] + noise
+                rt = base + subj_offset - BASELINE_SPEED_EFFECT * s["baseline_speed"] + noise
 
                 # accuracy: a bit worse for incongruent
-                p_correct = 0.97 if cond == "congruent" else 0.90
-                p_correct += -0.03*max(0, s["baseline_speed"]) + err_bias
-                p_correct = np.clip(p_correct, 0.75, 0.99)
-                correct = rng.random() < p_correct
+                p_correct = ACC_BASE_CONGRUENT if cond == "congruent" else ACC_BASE_INCONGRUENT
+                p_correct += -ACC_BSL_SPEED_PENALTY * max(0, s["baseline_speed"]) + err_bias
+                p_correct = float(np.clip(p_correct, *ACC_CLIP))
+                correct = bool(rng.random() < p_correct)
 
                 # sprinkle outliers (anticipations & lapses)
-                if rng.random() < 0.03:
-                    rt = rng.normal(150, 30)  # anticipatory
-                if rng.random() < 0.04:
-                    rt = rng.normal(2300, 200)  # lapse
+                if rng.random() < OUTLIER_RATE_ANTICIP:
+                    rt = rng.normal(OUTLIER_RT_ANTICIP_MU, OUTLIER_RT_ANTICIP_SD)  # anticipatory
+                if rng.random() < OUTLIER_RATE_LAPSE:
+                    rt = rng.normal(OUTLIER_RT_LAPSE_MU, OUTLIER_RT_LAPSE_SD)  # lapse
 
-                rt = max(80, rt)  # keep sane
+                rt = max(RT_FLOOR, float(rt))  # keep sane
 
                 rows.append({
                     "subject": int(s["subject"]),
                     "condition": cond,
-                    "trial": t+1,
-                    "rt_ms": float(rt),
+                    "trial": int(t + 1),
+                    "rt_ms": rt,
                     "correct": int(correct),
                 })
 
     trials = pd.DataFrame(rows)
 
     # write CSVs
     subjects.to_csv("data/synthetic/psych_stroop_subjects.csv", index=False)
     trials.to_csv("data/synthetic/psych_stroop_trials.csv", index=False)
     print("Wrote data/synthetic/psych_stroop_subjects.csv and psych_stroop_trials.csv")
+
+    return subjects, trials
+
+def write_meta(subjects: pd.DataFrame, trials: pd.DataFrame):
+    meta = dict(
+        seed=RNG_SEED,
+        n_subjects=int(len(subjects)),
+        n_trials_per_cond=int(N_TRIALS_PER_COND),
+        conditions=CONDITIONS,
+        rt_ms=dict(
+            base_congruent=BASE_RT_CONGRUENT,
+            base_incongruent=BASE_RT_INCONGRUENT,
+            incongruent_penalty=BASE_RT_INCONGRUENT - BASE_RT_CONGRUENT,
+            between_person_sd=BETWEEN_PERSON_RT_SD,
+            trial_noise_sd=TRIAL_NOISE_SD,
+            baseline_speed_effect_ms_per_sd=BASELINE_SPEED_EFFECT,
+            outliers=dict(
+                anticipations=dict(rate=OUTLIER_RATE_ANTICIP,
+                                   mu=OUTLIER_RT_ANTICIP_MU, sd=OUTLIER_RT_ANTICIP_SD),
+                lapses=dict(rate=OUTLIER_RATE_LAPSE,
+                            mu=OUTLIER_RT_LAPSE_MU, sd=OUTLIER_RT_LAPSE_SD),
+            ),
+            floor_ms=RT_FLOOR,
+        ),
+        acc=dict(
+            base_congruent=ACC_BASE_CONGRUENT,
+            base_incongruent=ACC_BASE_INCONGRUENT,
+            incongruent_penalty=ACC_BASE_CONGRUENT - ACC_BASE_INCONGRUENT,
+            baseline_speed_penalty_per_sd=ACC_BSL_SPEED_PENALTY,
+            error_bias_sd=ACC_ERR_BIAS_SD,
+            clip=list(ACC_CLIP),
+        ),
+        generated_utc=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        file_rows=dict(
+            subjects=int(len(subjects)),
+            trials=int(len(trials)),
+        ),
+    )
+    with open("data/synthetic/psych_stroop_meta.json", "w") as f:
+        json.dump(meta, f, indent=2)
+    print("Wrote data/synthetic/psych_stroop_meta.json")
 
 if __name__ == "__main__":
-    simulate()
+    subjects_df, trials_df = simulate()
+    write_meta(subjects_df, trials_df)
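
Reviewer note: the sketch below is not part of the PR, just a minimal smoke test assuming the script has been run once from the repository root (python scripts/sim_stroop.py) so the three files under data/synthetic/ exist. It checks that row counts agree with the new meta file and that the simulated effects land near their configured values: a roughly 100 ms RT penalty and a roughly 0.07 accuracy drop for incongruent trials, both blurred somewhat by trial noise, clipping, and outliers.

import json
import pandas as pd

trials = pd.read_csv("data/synthetic/psych_stroop_trials.csv")
subjects = pd.read_csv("data/synthetic/psych_stroop_subjects.csv")
with open("data/synthetic/psych_stroop_meta.json") as f:
    meta = json.load(f)

# Row counts should match what write_meta() recorded.
assert len(trials) == meta["file_rows"]["trials"]
assert len(subjects) == meta["file_rows"]["subjects"]

# Incongruent minus congruent mean RT should sit near the 100 ms penalty.
mean_rt = trials.groupby("condition")["rt_ms"].mean()
print("Stroop effect (ms):", mean_rt["incongruent"] - mean_rt["congruent"])

# Accuracy gap should sit near the configured 0.07 penalty.
mean_acc = trials.groupby("condition")["correct"].mean()
print("Accuracy gap:", mean_acc["congruent"] - mean_acc["incongruent"])

With the fixed seed (42), reruns of the generator reproduce the same CSVs, so these checks are deterministic on a given numpy version.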