-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.R
More file actions
182 lines (144 loc) · 6.86 KB
/
analysis.R
File metadata and controls
182 lines (144 loc) · 6.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#Library and loading----
library(glmnet)
library(glmnetUtils)
library(pROC)
library(rsample)
library(ModelMetrics)
library(ggplot2)
library(tidyverse)
library(modelsummary)
set.seed(438)
df <- read_csv("data/broward_agg.csv",
na = "NA")
df <- df |>
mutate(race = as.factor(race),
sex = as.factor(sex),
charge_degree = as.factor(charge_degree),
charge_id = as.factor(charge_id),
drug_type = as.factor(drug_type))
#not getting rid of other races because we need to consider overall accuracy for this first part
#TOP 1: The Paper's Linear Models----
#1: 7-predictor logistic algorithm assessment (uses full dataset)
get_metrics <- function(y){
#store relevant values
accuracy <- mean(y$y_act == y$y_hat) #overall accuracy for this round
acc_black <- mean(y$y_act[y$race == 2] == y$y_hat[y$race == 2])
acc_white <- mean(y$y_act[y$race == 1] == y$y_hat[y$race == 1])
#Confusion Matrix Black and white
cm_black <- confusionMatrix(y$y_act[(y$race == 2)], y$y_hat[(y$race == 2)])
cm_white <- confusionMatrix(y$y_act[(y$race == 1)], y$y_hat[(y$race == 1)])
#False Pos and False Neg
fn_black <- cm_black[2,1]/sum(cm_black[2,]) #2nd row is "actually true" row
fn_white <- cm_white[2,1]/sum(cm_white[2,])
fp_black <- cm_black[1,2]/sum(cm_black[1,]) #1st row is "actually false" row
fp_white <- cm_white[1,2]/sum(cm_white[1,])
#matrix
cbind(accuracy, acc_black, acc_white, fn_black, fp_black, fn_white, fp_white)
}
sev_df <- NULL
for (i in 1:1000) { #i could make this parallel....
#random split
splitobj <- initial_split(df, prop = 4/5) #initial object, 80/20 split
train <- training(splitobj)
test <- testing(splitobj)
#train model
sev_pred <- glm(two_year_recid ~ age + sex + priors_count + juv_fel_count +
juv_misd_count + charge_degree + charge_id, data=train, family="binomial")
#make prediction
test$charge_id[which(!(test$charge_id %in% unique(train$charge_id)))] <- NA #editing test data
#fixes error with new levels...cant predict for these, so just ignore?
y_hat <- predict(sev_pred, newdata=test, type="response")
y_hat <- as.numeric(y_hat > 0.5) #binary predictions
#get ids in this test batch
y <- df[which(df$id %in% test$id),] |>
select(race, two_year_recid) |> #add race variables so i can filter by this condition
rename(y_act = two_year_recid)
#combine obs
y$y_hat <- y_hat
y <- na.omit(y, cols="y_hat")#get rid of NA rows...
sev_df <- rbind(sev_df, get_metrics(y))
}
sev_df <- data.frame(sev_df)
write_csv(sev_df, "sev_est.csv")
#2: 2-predictor logistic algorithm assessment (uses full dataset)
#bootstrap standard errors
two_df <- NULL
for (i in 1:1000) { #bootstrapped = need to resample?
#random split
splitobj <- initial_split(df, prop = 4/5) #initial object, 80/20 split
train <- training(splitobj)
test <- testing(splitobj)
#train <- sample_n(df, size = floor(0.8*dim(df)[1]), replace = TRUE) #bootstrap
#test <- sample_n(df, size = floor(0.2*dim(df)[1]), replace = TRUE) #bootstrap
#train model
two_pred <- glm(two_year_recid ~ age + priors_count, data=train, family="binomial")
#make prediction
y_hat <- predict(two_pred, newdata=test, type="response") #needs to include race info
y_hat <- as.numeric(y_hat > 0.5) #binary predictions
#get ids in this test batch
y <- df[which(df$id %in% test$id),] |>
#to use bootstrap version, need to change from which to get duplicates
select(race, two_year_recid) |> #add race variables so i can filter by this condition
rename(y_act = two_year_recid)
#combine obs
y$y_hat <- y_hat
#make df
two_df <- rbind(two_df, get_metrics(y))
}
two_df <- data.frame(two_df)
write_csv(two_df, "two_est.csv")
#TOP 2: Given race, white people have a negative correlation with decile score, while Black people have a uniform one----
dec_white <- df |>
filter(race == 1,
drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance")) |>
group_by(compas_decile_score) |>
count(race, compas_decile_score)
chisq.test(dec_white$n) #probability uniform
chisq.test(dec_white$n[1:9]) #probability uniform without 10th decile
dec_black <- df |>
filter(race == 2,
drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance")) |>
group_by(compas_decile_score) |>
count(compas_decile_score)
chisq.test(dec_black$n) #probability uniform dist
chisq.test(dec_black$n[1:9]) #probability uniform without 10th decile
#Looking at Race and Drug Interactions----
#specifically interaction btwn charge and not charged
df_sub <- df |>
mutate(drug_related = if_else(aggregated == "Drug-Related", TRUE, FALSE)) |>
filter(race %in% 1:2,
drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance", NA))
glm_compare_comp <- glm(compas_guess ~ race*drug_related, df_sub, family="binomial")
glm_compare_act <- glm(two_year_recid ~ race*drug_related, df_sub, family="binomial")
glms_sub <- list("COMPAS Guess" = glm_compare_comp,
"Actual 2-Year Recidivism" = glm_compare_act)
modelsummary(c(glms_sub),
coef_map = c("(Intercept)" = "$\\beta_0$ (White)",
"race2" = "$\\beta_1$ (Black)",
"drug_relatedTRUE" = "$\\beta_2$ (Drug-Related Charge)",
"race2:drug_relatedTRUE" = "$\\beta_3$ (Black and Drug-Related Charge)"),
gof_map = c("nobs"))
#looking at decile
df_sub <- df_sub |>
mutate(drug_related = if_else(aggregated == "Drug-Related", TRUE, FALSE)) |>
filter(race %in% 1:2,
drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance", NA))|>
mutate(drug_type = if_else(is.na(drug_type), "None", drug_type),
drug_type = as.factor(drug_type))
df_sub$drug_type <- relevel(df_sub$drug_type, ref = "None")
lm_drug_indicator <- lm(compas_decile_score ~ drug_type, df_sub)
lm_drug_w_race <- lm(compas_decile_score ~ drug_type*race, df_sub)
lms <- list("Drug Interaction with Decile" = lm_drug_indicator,
"Race and Drug Interaction with Decile" = lm_drug_w_race)
modelsummary(c(lms),
coef_map = c("(Intercept)" = "Intercept",
"drug_typeCannabis/Marijuana" = "Cannabis Charge (Indicator)",
"drug_typeCocaine" = "Cocaine (Indicator)",
"drug_typeControlled Substance" = "Controlled Substance (Indicator)",
"race2" = "Black (Indicator)",
"drug_typeCannabis/Marijuana:race2" = "Cannabis Charge and Black (Interaction)",
"drug_typeCocaine:race2" = "Cocaine and Black (Interaction)",
"drug_typeControlled Substance:race2" = "Controlled Substance and Black (Interaction)"),
gof_map = c("nobs", "r.squared", "rmse"),
statistic = c("std.error",
"p = {p.value}"))