# synthetic_repro.py
import numpy as np
import pandas as pd
import duckdb
from duckreg.estimators import DuckRegression
import statsmodels.formula.api as smf
np.random.seed(123)
n_clusters = 50
cluster_size = 10
N = n_clusters * cluster_size
cluster_ids = np.repeat(np.arange(n_clusters), cluster_size)
class_size = np.random.normal(30, 5, size=N)
# cluster random intercept
u = np.repeat(np.random.normal(0, 2, size=n_clusters), cluster_size)
eps = np.random.normal(0, 5, size=N)
# outcome
id_score = 50 + 0.5 * class_size + u + eps
data = pd.DataFrame({
"id_score": id_score,
"class_size": class_size,
"class_id": cluster_ids
})
# statsmodels
model = smf.ols("id_score ~ class_size", data=data).fit()
ms = model.get_robustcov_results(cov_type='cluster', groups=data['class_id'])
print("statsmodels cluster SE:\n", ms.summary())
# duckreg
conn = duckdb.connect("database.db")
conn.execute("DROP TABLE IF EXISTS test_data")
conn.execute("CREATE TABLE test_data AS SELECT * FROM data")
m = DuckRegression(
db_name="database.db",
table_name="test_data",
formula="id_score ~ class_size",
cluster_col="class_id",
n_bootstraps=200,
seed=21,
)
m.fit()
print("duckreg summary:")
display(m.summary())
pd.DataFrame([[ms.params[1], ms.bse[1]], [m.summary()['point_estimate'][1], m.summary()['standard_error'][1]]], columns=['Point estimate', 'Standard error'], index=['Statsmodels', 'DuckRegression'])
When running OLS with clustered data, the standard errors from
DuckRegressionandstatsmodelsdiffer by a non-trivial amount.Expected behavior:
Actual behavior:
statsmodelsreports SE ≈ 0.032 for the slope andduckregreports SE ≈ 0.0251.What I tried:
n_bootstrapsReplication with simulated data: