aqrd23/analysis.R at main · odebbie/aqrd23 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#Library and loading----
library(glmnet)
library(glmnetUtils)
library(pROC)
library(rsample)
library(ModelMetrics)
library(ggplot2)
library(tidyverse)
library(modelsummary)


set.seed(438)

df <- read_csv("data/broward_agg.csv",
               na = "NA")

df <- df |>
  mutate(race = as.factor(race),
         sex = as.factor(sex),
         charge_degree = as.factor(charge_degree),
         charge_id = as.factor(charge_id),
         drug_type = as.factor(drug_type))

#not getting rid of other races because we need to consider overall accuracy for this first part

#TOP 1: The Paper's Linear Models----
#1: 7-predictor logistic algorithm assessment (uses full dataset)

get_metrics <- function(y){
  #store relevant values
  accuracy <- mean(y$y_act == y$y_hat) #overall accuracy for this round
  acc_black <- mean(y$y_act[y$race == 2] == y$y_hat[y$race == 2])
  acc_white <- mean(y$y_act[y$race == 1] == y$y_hat[y$race == 1])
  #Confusion Matrix Black and white
  cm_black <- confusionMatrix(y$y_act[(y$race == 2)], y$y_hat[(y$race == 2)])
  cm_white <- confusionMatrix(y$y_act[(y$race == 1)], y$y_hat[(y$race == 1)])

  #False Pos and False Neg
  fn_black <- cm_black[2,1]/sum(cm_black[2,]) #2nd row is "actually true" row
  fn_white <- cm_white[2,1]/sum(cm_white[2,])
  fp_black <- cm_black[1,2]/sum(cm_black[1,]) #1st row is "actually false" row
  fp_white <- cm_white[1,2]/sum(cm_white[1,])

  #matrix
  cbind(accuracy, acc_black, acc_white, fn_black, fp_black, fn_white, fp_white)
}
sev_df <- NULL
for (i in 1:1000) { #i could make this parallel....
  #random split
  splitobj <- initial_split(df, prop = 4/5) #initial object, 80/20 split
  train <- training(splitobj)
  test <- testing(splitobj)

  #train model
  sev_pred <- glm(two_year_recid ~ age + sex + priors_count + juv_fel_count +
                    juv_misd_count + charge_degree + charge_id, data=train, family="binomial")
  #make prediction
  test$charge_id[which(!(test$charge_id %in% unique(train$charge_id)))] <- NA #editing test data
  #fixes error with new levels...cant predict for these, so just ignore?

  y_hat <- predict(sev_pred, newdata=test, type="response")
  y_hat <- as.numeric(y_hat > 0.5) #binary predictions

  #get ids in this test batch
  y <- df[which(df$id %in% test$id),] |>
    select(race, two_year_recid) |> #add race variables so i can filter by this condition
    rename(y_act = two_year_recid)

  #combine obs
  y$y_hat <- y_hat
  y <- na.omit(y, cols="y_hat")#get rid of NA rows...

  sev_df <- rbind(sev_df, get_metrics(y))

}
sev_df <- data.frame(sev_df)
write_csv(sev_df, "sev_est.csv")

#2: 2-predictor logistic algorithm assessment (uses full dataset)
#bootstrap standard errors
two_df <- NULL
for (i in 1:1000) { #bootstrapped = need to resample?
  #random split
  splitobj <- initial_split(df, prop = 4/5) #initial object, 80/20 split
  train <- training(splitobj)
  test <- testing(splitobj)

  #train <- sample_n(df, size = floor(0.8*dim(df)[1]), replace = TRUE) #bootstrap
  #test <- sample_n(df, size = floor(0.2*dim(df)[1]), replace = TRUE) #bootstrap

  #train model
  two_pred <- glm(two_year_recid ~ age + priors_count, data=train, family="binomial")

  #make prediction
  y_hat <- predict(two_pred, newdata=test, type="response") #needs to include race info
  y_hat <- as.numeric(y_hat > 0.5) #binary predictions

  #get ids in this test batch
  y <- df[which(df$id %in% test$id),] |>
    #to use bootstrap version, need to change from which to get duplicates
    select(race, two_year_recid) |> #add race variables so i can filter by this condition
    rename(y_act = two_year_recid)

  #combine obs
  y$y_hat <- y_hat

  #make df
  two_df <- rbind(two_df, get_metrics(y))

}

two_df <- data.frame(two_df)
write_csv(two_df, "two_est.csv")


#TOP 2: Given race, white people have a negative correlation with decile score, while Black people have a uniform one----
dec_white <- df |>
  filter(race == 1,
         drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance")) |>
  group_by(compas_decile_score) |>
  count(race, compas_decile_score)
chisq.test(dec_white$n) #probability uniform
chisq.test(dec_white$n[1:9]) #probability uniform without 10th decile

dec_black <- df |>
  filter(race == 2,
         drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance")) |>
  group_by(compas_decile_score) |>
  count(compas_decile_score)
chisq.test(dec_black$n) #probability uniform dist
chisq.test(dec_black$n[1:9]) #probability uniform without 10th decile

#Looking at Race and Drug Interactions----

#specifically interaction btwn charge and not charged
df_sub <- df |>
  mutate(drug_related = if_else(aggregated == "Drug-Related", TRUE, FALSE)) |>
  filter(race %in% 1:2,
         drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance", NA))

glm_compare_comp <- glm(compas_guess ~ race*drug_related, df_sub, family="binomial")
glm_compare_act <- glm(two_year_recid ~ race*drug_related, df_sub, family="binomial")

glms_sub <- list("COMPAS Guess" = glm_compare_comp,
                 "Actual 2-Year Recidivism" = glm_compare_act)

modelsummary(c(glms_sub),
             coef_map = c("(Intercept)" = "$\\beta_0$ (White)",
                          "race2" = "$\\beta_1$ (Black)",
                          "drug_relatedTRUE" = "$\\beta_2$ (Drug-Related Charge)",
                          "race2:drug_relatedTRUE" = "$\\beta_3$ (Black and Drug-Related Charge)"),
             gof_map = c("nobs"))


#looking at decile

df_sub <- df_sub |>
  mutate(drug_related = if_else(aggregated == "Drug-Related", TRUE, FALSE)) |>
  filter(race %in% 1:2,
         drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance", NA))|>
        mutate(drug_type = if_else(is.na(drug_type), "None", drug_type),
               drug_type = as.factor(drug_type))
df_sub$drug_type <- relevel(df_sub$drug_type, ref = "None")

lm_drug_indicator <- lm(compas_decile_score ~ drug_type, df_sub)
lm_drug_w_race <- lm(compas_decile_score ~ drug_type*race, df_sub)

lms <- list("Drug Interaction with Decile" = lm_drug_indicator,
            "Race and Drug Interaction with Decile" = lm_drug_w_race)
modelsummary(c(lms),
             coef_map = c("(Intercept)" = "Intercept",
                          "drug_typeCannabis/Marijuana" = "Cannabis Charge (Indicator)",
                          "drug_typeCocaine" = "Cocaine (Indicator)",
                          "drug_typeControlled Substance" = "Controlled Substance (Indicator)",
                          "race2" = "Black (Indicator)",
                          "drug_typeCannabis/Marijuana:race2" = "Cannabis Charge and Black (Interaction)",
                          "drug_typeCocaine:race2" = "Cocaine and Black (Interaction)",
                          "drug_typeControlled Substance:race2" = "Controlled Substance and Black (Interaction)"),
             gof_map = c("nobs", "r.squared", "rmse"),
             statistic = c("std.error",
                           "p = {p.value}"))