lrd/dataAnalysis.Rmd at master · adamSales/lrd · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
---
title: "LRD paper Appendix C, Data Analysis for AP Example"
author: "Adam C Sales & Ben B Hansen"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: html_document
---

Initialization.
If the variable `paperdir` is supplied, LaTeX code for the tables is saved there, for inclusion in the main paper; otherwise, the code is saved in the current working directory.


```{r setup, include=FALSE,cache=FALSE}
if(!exists('paperdir')) paperdir <- '.'
knitr::opts_chunk$set(echo = TRUE,error=FALSE, warning=FALSE,cache=FALSE,fig.path=paste0(paperdir,'/figure/'))
options(scipen=1, digits=3)
```

General dependencies.
```{r}
#print(getwd())
library('knitr')
library(ggplot2)
library(xtable)
library(robustbase)
library(rdd)
source("R/functions.r")
source("R/data.r")
source("R/ddsandwich.r")
```


```{r load-data,warning=FALSE,message=FALSE}


ciChar <- function(ci,est=FALSE){
    #ci <- round(ci,2)
    ci.out <- paste('(',round2(ci[1]),', ',round2(ci[2]),')',sep='')
    if(est) ci.out <- c(ci.out,as.character(ci[3]))
    ci.out
}

round2 <- function(x) sprintf('%.2f',x)#round(x,2)

nfunc <- function(bw) sum(abs(dat$R)<bw,na.rm=TRUE)

Wfunc <- function(W)
    paste0('[',round2(W[1]), ', ',round2(W[2]),')')


```


Load data. This routine will download and unzip the Lindo et al. replication
material into the `extdata` subdirectory.
```{r}
if(!is.element('dat',ls())){
  extdata_dir <- 'extdata'
  LSO_dta_location <- paste0(extdata_dir,'/data_for_analysis.dta')
  if(!file.exists(LSO_dta_location)){
    loc2 <- fetchLSOdata(extdata_dir)
    stopifnot(loc2==LSO_dta_location)
  }
  dat <- foreign::read.dta(LSO_dta_location)
  dat$dist_from_cut <- round(dat$dist_from_cut,2)
  dat$hsgrade_pct[dat$hsgrade_pct==100]=99.5
  dat$lhsgrade_pct=qlogis(dat$hsgrade_pct/100)
  dat$R <- dat$dist_from_cut
  dat$Z <- dat$gpalscutoff
}
```

Total sample size, and number of "compliers" (students whose actual AP
status matched what would have been predicted from first-year GPA)
```{r}
ncomp <- with(dat,sum(gpalscutoff& !probation_year1))
ntot <- nrow(dat)
```

Create RDD plots. First of the outcome (subsequent GPA):
```{r rddFig, fig.height=3.9,fig.width=3.9,dpi=300}

figDat <- aggregate(dat[,c('nextGPA','lhsgrade_pct')],by=list(R=dat$R),
                    FUN=mean,na.rm=TRUE)
figDat$n <- as.vector(table(dat$R))

ggplot(figDat,aes(R,nextGPA,size=n))+
  geom_point()+
  geom_vline(xintercept=0,linetype='dashed',size=1.5)+
  xlab('First-Year GPA (Distance from Cutoff)')+ylab('Avg Subsequent GPA')+
  scale_size_continuous(range=c(0.2,2),guide=FALSE)+
  theme(text = element_text(size=12))
```
then a covariate (High-School GPA):
```{r hs_gpaFig}
ggplot(figDat,aes(R,lhsgrade_pct,size=n))+geom_point()+geom_vline(xintercept=0,linetype='dotted',size=2)+xlab('First-Year GPA (Distance from Cutoff)')+ylab('Avg logit(hsgrade_pct)')+scale_size_continuous(range=c(0.2,2),guide=FALSE)+theme(text = element_text(size=12))

```

The McCrary density test failure and recovery described in Section 4.1
```{r}
ggplot(figDat,aes(R,n))+
  geom_point()+
  geom_vline(xintercept=0,linetype='dashed',size=.5)+
  xlab('First-Year GPA (Distance from Cutoff)')+
  ylab('Number of Observations')+
  scale_size_continuous(range=c(0.2,2),guide=FALSE)+
  theme(text = element_text(size=12))

(mccrary1 <- rdd::DCdensity(dat$R,-0.005, bin=0.01,plot=FALSE) )
( mccraryDougnut <- rdd::DCdensity(dat$R[dat$R!=0],-0.005, bin=0.01,plot=FALSE) )
```

The Frandsen (2016) test for manipulation when the
running variable is discrete, when the cutoff is the maximum GPA
receiving probation, or the minimum GPA not receiving probation:
```{r}
(frandsen1 <- frandsen(dat$R,cutoff=-0.01,BW=0.5))
(frandsen2 <- frandsen(dat$R,cutoff=0,BW=0.5))
```


## main analysis ##

The sh method uses `lmrob`, which in turn requires a random
seed.  For confidence interval and estimation routines it's
helpful to use the same seed throughout, as this means the
S-estimation initializers will always be sampling the same
subsets of the sample.


```{r main,cache=TRUE}
set.seed(201705)
lmrob_seed <- .Random.seed


SHmain <- sh(subset(dat,R!=0),BW=0.5,outcome='nextGPA',Dvar='probation_year1')
unlist(SHmain)

# No-donut variant (not discussed in text)
SHnodo <- sh(dat, BW=0.5, outcome='nextGPA',Dvar='probation_year1')

SHdataDriven <- sh(dat=subset(dat,R!=0),outcome='nextGPA')
unlist(SHdataDriven)

SHcubic <- sh(dat=subset(dat,R!=0),BW=0.5,outcome='nextGPA',rhs='~Z+poly(R,3)')
unlist(SHcubic)

SHitt <- sh(dat=subset(dat,R!=0),BW=0.5,outcome='nextGPA', Dvar=NULL)
unlist(SHitt) # (AS has no aversion to scatalogical humor)
```

Create Table 1:
```{r table1}

resultsTab <-
 do.call('rbind',
  lapply(list(main=SHmain,data_driven=SHdataDriven,cubic=SHcubic,ITT=SHitt),
   function(res) c(round2(res$CI[3]),
                   ciChar(res$CI[1:2]),
                   W=Wfunc(res$W),
                   n=prettyNum(res$n,',',trim=TRUE))))

colnames(resultsTab) <- c('Estimate','95\\% CI','$\\mathcal{W}$','$n$')


kable(resultsTab)


resultsTab <- cbind(Specification=c('Main','Adaptive $\\mathcal{W}$','Cubic','ITT'),resultsTab)

print(xtable(resultsTab,align='rlcccc'),
  file=paste0(paperdir,"/tab-results.tex"), floating=F,
  include.rownames=FALSE,
  sanitize.colnames.function=function(x) x,
  sanitize.text.function=function(x) x)

```

Results from two alternative methods, creating Table 2:
```{r tabAlt,cache=TRUE}
CFT <- cft(subset(dat,R!=0),BW=NULL,outcome='nextGPA')
IK <- ik(dat,outcome='nextGPA')

altTab <-
 do.call('rbind',
  lapply(list(`Local Linear`=IK,Limitless=SHitt,`Local Permutation`=CFT),
   function(res) c(round2(res$CI[3]),
    ciChar(res$CI[1:2]),
    W=Wfunc(res$W),
    n=prettyNum(res$n,',',trim=TRUE))))


colnames(altTab) <- c('Estimate','95\\% CI','$\\mathcal{W}$','$n$')

kable(altTab)

altTab <- cbind(Method=rownames(altTab),altTab)

print(xtable(altTab,align='rlcccc'),
  file=paste0(paperdir,"/tab-alt.tex"), floating=F,
  include.rownames=FALSE,
  sanitize.colnames.function=function(x) x)

```


## Examine robustness weights

If there are regions of the data of high influence, the robust fitter
should reject or downweight more frequently in those regions, and we'll see dips
on the plot of robustness weights vs R.

Here is the plot corresponding to the main analysis presented in the paper.
```{r weightMod}
lmrob_main <- lmrob(nextGPA~Z+R,
              offset=(SHmain$CI[3]*probation_year1),
              data=dat,subset=(R!=0 & abs(R)<.5),
              method='MM',
              control=lmrob.control(seed=lmrob_seed,
                            k.max=500, maxit.scale=500)
              )
```

Robustness weights are mostly near 1, never below .25.
```{r weightSummary}
robwts_main <- weights(lmrob_main, type="robustness")
summary(robwts_main)
```

Not too much pattern to the robustness weights --
although the lowest values do occur at slightly above
the cutpoint, where we'd see savvy students whose
rose above the cut due to savvyness.

```{r weightPlot, fig.height=3.9,fig.width=3.9,dpi=300}
require(mgcv)
ggp_main <- ggplot(data.frame(R=lmrob_main$model$R,
                              robweights=robwts_main),
                   aes(x=R,y=robweights))
ggp_main + geom_point(alpha=.1) + stat_smooth()+theme(text = element_text(size=12))+ylab('Robustness Weights')
```

When we fit without omitting R=0 students, here is
the best fitting version of the model.

```{r weightNoDonut}
lmrob_nodo <- lmrob(nextGPA~Z+R,
              offset=(SHnodo$CI[3]*probation_year1),
              data=dat,subset=(abs(R)<.5),
              method='MM',
              control=lmrob.control(seed=lmrob_seed,
                                    k.max=500, maxit.scale=500)
      )

robwts_nodo <- weights(lmrob_nodo, type="robustness")
```

Do the observations at R=0 stand out?
With no donut, robustness weights have a slight tendency
to be lower among observations at R=0.

```{r weightNoDonutSummary}
by(robwts_nodo,
   lmrob_nodo$model$R==0, summary)
t.test(wt~atcut, data.frame(wt=robwts_nodo,
                            atcut=(lmrob_nodo$model$R==0)),
       var.equal=F, alternative="g")
```

The plot is similar to that of the main analysis,
with some low robustness weight observations as R=0
but also plenty of ordinary weight observations there.
```{r weightNoDonutPlot}
ggp_nodo <- ggplot(data.frame(R=lmrob_nodo$model$R,
                              robweights=robwts_nodo),
                   aes(x=R,y=robweights))
ggp_nodo + geom_point(alpha=.1) + stat_smooth()
```


Save results:
```{r save}
save(list=ls(),file=paste0('RDanalysis-',format(Sys.time(),"%m%d%H%M"),'.RData'))
```

Session information
```{r}
sessionInfo()
```