equiticity-census-data/creating_datasets_and_index.R at master · NUstat/equiticity-census-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
library(tidyverse)
library(tidycensus)
library(rgdal)
library(rgeos)
library(sp)
library(magrittr)
library(raster)
library(sf)
library(httr)
library(XML)
library(viridis)
library(patchwork)
library(openxlsx)
library(corrplot)
library(janitor)


# Reading in community area shapefile
#This data was obtained from https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Community-Areas-current-/cauq-8yn6
neighborhoods <- read_sf("neighborhoods/geo_export_9c969155-1ad2-4497-b263-d7a54abec1f1.shp")

#Reading in data from Chicago Data Snapshots
#This data was obtained from https://datahub.cmap.illinois.gov/dataset/community-data-snapshots-raw-data/resource/8c4e096e-c90c-4bef-9cf1-9028d094296e
#Note that there is an error in the TOT_POP column, population doesn't match the data in the Chicago
#Data Snpshots pdf and is in decimals

acs_0 <- read_csv("data/ACS_Chicago_Agg.csv")

pop_fixed <- read_csv("data/population_fixed.csv")

acs_01 <- full_join(acs_0, pop_fixed, by = c("GEOID" = "comm_num"))

#Getting rid of empty columns
acs <- acs_01[ , colSums(is.na(acs_01)) < nrow(acs_01)]

#Creating percentage variables
acs <- acs %>%
  mutate(black_perc = (BLACK / pop_fixed)*100,
         hisp_perc = (HISP / pop_fixed)*100,
         unemp_perc = (UNEMP / pop_fixed) * 100,
         white_perc = (WHITE/pop_fixed)*100,
         asian_perc = (ASIAN/pop_fixed)*100,
         avg_hh_size = POP_HH / TOT_HH,
         bach_perc = (BACH / pop_fixed)*100,
         not_eng_perc = (NOT_ENGLISH/pop_fixed) * 100)


#Turning community area identifier numbers into numeric type
neighborhoods$area_num_1 <- as.numeric(neighborhoods$area_num_1)

#Joining ACS data with mapping data
acs_neighborhoods <- full_join(neighborhoods, acs, by = c("area_num_1" = "GEOID"))

#Reading in Divvy station location data and putting longitude and latitude columns first
#This data was obtained from https://data.cityofchicago.org/Transportation/Divvy-Bicycle-Stations-All-Map/bk89-9dk7
test_data <- read_csv("data/Divvy_Bicycle_Stations.csv") %>%
  dplyr::select(Longitude, Latitude, everything())

#Reading in community area shapefile
#This data was obtained from https://datahub.cmap.illinois.gov/dataset/community-data-snapshots-raw-data/resource/8c4e096e-c90c-4bef-9cf1-9028d094296e
neighborhood_map <- read_sf("neighborhoods/geo_export_9c969155-1ad2-4497-b263-d7a54abec1f1.shp")

#Changing to an sf object
test_data_sf <- st_as_sf(test_data, coords = c('Longitude', 'Latitude'), crs = st_crs(neighborhood_map))

#Mapping stations to community areas
test_data <- test_data_sf %>%
  mutate(intersection = as.integer(st_intersects(geometry, neighborhood_map)),
         neighborhood = if_else(is.na(intersection), '', neighborhood_map$community[intersection]))


#Turning into a tibble
test_data_done <- tibble(test_data)

#Grouping by community area to find number of stations and docks in service for each community area
try <- test_data_done %>%
  group_by(neighborhood) %>%
  summarize(n = n(), num_docks_in_service = sum(`Docks in Service`))

#Getting rid of first row which has Evanston data
try <- try[-1,]

#Joining bike data to the ACS and community area map data
map_try <- full_join(try, acs_neighborhoods, by = c("neighborhood" = "community"))

#Replacing missing values with zero
map_try[is.na(map_try)] <- 0


#Reading in community area density, area, and pop data from Wikipedia and turning into a tibble
url <- "https://en.wikipedia.org/wiki/Community_areas_in_Chicago"

r <- GET(url)

doc <- readHTMLTable(doc=content(r, "text"))

pop_table <- doc[1]$`Chicago community areas by number, population, and area[8]`
pop_table <- tail(pop_table, 78)
pop_table <- as_tibble(head(pop_table, 77))

#Making better column names and fixing some punctuation issues
colnames(pop_table) <- c("num", "neighborhood", "pop", "area_sq_mi", "area_sq_km", "density_sq_mi", "density_sq_km")
pop_table$neighborhood <- toupper(pop_table$neighborhood)
pop_table$neighborhood[32] <- "LOOP"
pop_table$neighborhood[76] <- "OHARE"

#Joining wiki data with previous data table and turning columns that should be numeric into numeric
pop_data <- full_join(pop_table, map_try, by = c("neighborhood" = "neighborhood"))
pop_data$num <- parse_number(pop_table$num)
pop_data$pop <- parse_number(pop_table$pop)
pop_data$area_sq_mi <- parse_number(pop_table$area_sq_mi)
pop_data$area_sq_km <- parse_number(pop_table$area_sq_km)
pop_data$density_sq_mi <- parse_number(pop_table$density_sq_mi)
pop_data$density_sq_km <- parse_number(pop_table$density_sq_km)

#Adding ratio variables, #bike stations / population, #docks in service to population
pop_data <- pop_data %>%
  mutate(station_pop_ratio = n/pop_fixed,
         dock_pop_ratio = num_docks_in_service/pop_fixed)

#Creating some ratio variables: # bike stations to area in sq mi and # bike stations to population density
pop_data <- pop_data %>%
  mutate(bike_area = n/area_sq_mi,
         bike_density = n/density_sq_mi)


#Changing column name to be more descriptive
names(pop_data)[names(pop_data) == 'n'] <- 'num_of_divvy_stations'


#Reading in census tract-level ACS data
acs_tract <- read_csv("data/census_team_2015_2019_dat.csv")

#Grabbing ID number and corresponding community area number from census tract map data
#This data was obtained from https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Census-Tracts-2010/5jrd-6zik
tract_neighborhoods <- read_csv("data/CensusTractsTIGER2010.csv") %>%
  dplyr::select(TRACTCE10, COMMAREA)

#Grabbing disability related columns from census tract data
acs_disabled <- acs_tract %>%
  dplyr::select(`census tract code`, ends_with("with a disability (estimate)"))

#Summing counts split by age and sex into one column with the total count of disabled people
acs_disabled$tot_disabled <- rowSums(acs_disabled[, 2:13])

#Doing the same thing but with the non-disabled people columns
acs_non_disabled <- acs_tract %>%
  dplyr::select(`census tract code`, ends_with("no disability (estimate)"))

acs_non_disabled$tot_non_disabled <- rowSums(acs_non_disabled[, 2:13])

#Joining the two into a single disability frame
acs_dis <- full_join(acs_disabled, acs_non_disabled, by = c("census tract code" = "census tract code"))

#Joining disability data with rest of ACS tract-level data and turning count into percent
acs_tract_disabled <- full_join(acs_tract, acs_dis) %>%
  mutate(disabled_perc = (tot_disabled / (tot_disabled + tot_non_disabled)) * 100)

#Modifying ID number for joining
acs_tract_disabled$id_num = str_sub(acs_tract_disabled$`census geographic identifier`, -11)

#Reading in census tract shapefile
#This data was obtained from https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Census-Tracts-2010/5jrd-6zik
map_tracts <- read_sf("census_tracts/geo_export_c8ec6a1a-b3b4-490e-9319-f32777c20c5b.shp")

#Joining census tract shape file with ACS w/disability data, creating a variable for percent of 18-35 year olds that are disabled
acs_disabled_0 <- left_join(map_tracts, acs_tract_disabled, by = c("geoid10" = "id_num")) %>%
  mutate(young_disabled_adult_perc = ((`female: 18 to 34 years: with a disability (estimate)` + `male: 18 to 34 years: with a disability (estimate)`) / (`female: 18 to 34 years (estimate)` + `male: 18 to 34 years (estimate)`))*100)

#Joining with tract to neighborhood correspondence table
acs_disabled <- full_join(acs_disabled_0, tract_neighborhoods, by = c("tractce10" = "TRACTCE10"))

#Turning joined table into a tibble, selecting desired columns, and grouping census tracts into community areas
acs_neighborhoods_disabled <- acs_disabled %>%
  as_tibble() %>%
  dplyr::select(commarea, tot_disabled, ends_with("with a disability (estimate)")) %>%
  group_by(commarea) %>%
  summarise_each(list(sum))

#Joining with wikipedia data and cleaning up column names
acs_agg_0 <- full_join(acs_neighborhoods_disabled, pop_data, by = c("commarea" = "area_numbe")) %>%
  clean_names()

#Turning into a tibble and creating percent variables

acs_agg <- acs_agg_0 %>%
  as_tibble() %>%
  mutate(non_white_perc = ((pop_fixed - white) / pop_fixed) * 100,
         disabled_perc = (tot_disabled / pop_fixed) * 100,
         single_parent_perc = (ct_sp_wchild / tot_hh) * 100,
         not_very_well_eng_perc = (ling_iso / pop_fixed) * 100,
         no_internet_perc = (no_internet / tot_hh) * 100,
         unemp_perc = (unemp / pop_fixed) * 100,
         renting_hh_perc = (rent_occ_hu / tot_hh) * 100,
         less_hs_perc = (lt_hs / pop_fixed) * 100,
         no_vech_perc = (no_veh / tot_hh) * 100,
         walk_bike_perc = (walk_bike / pop_fixed) * 100,
         bach_perc = (bach / pop_fixed) * 100) %>%
  dplyr::select(!c(geometry, area, area_num_1, comarea, comarea_id, perimeter, geog))


#Selecting desired variables from poverty-level data table and cleaning column names
poverty <- read_csv("data/poverty_data.csv") %>%
  dplyr::select(GEOID, TRACTA, AMC3E001, AMC3E002, AMC3E003, AMC3E004, AMC3E005, AMC3E006,
                AMC3E007, AMC3E008, AMC3E009, AMC3E010, AMC3E011, AMC3E012, AMC3E013) %>%
  clean_names()

#Modifying tract ID number for joining
poverty$geo_id_join = str_sub(poverty$geoid, -11)

#Joining to tract-neighborhood correspondence table and cleaning column names
acs_agg_pov <- left_join(tract_neighborhoods, poverty, by = c("TRACTCE10" = "tracta")) %>%
  clean_names()

#Turning into a tibble, selecting desired variables, and aggregating by community area
acs_agg_p <- acs_agg_pov %>%
  as_tibble() %>%
  dplyr::select(commarea, amc3e001, amc3e002, amc3e003, amc3e004, amc3e005, amc3e006,
                amc3e007, amc3e008, amc3e009, amc3e010, amc3e011, amc3e012, amc3e013) %>%
  group_by(commarea) %>%
  summarise_each(list(sum))

#Turning community area ID number into numeric
acs_agg$commarea <- parse_number(acs_agg$commarea)

#Joining aggregated poverty data with other aggregated ACS data and creating a variable for
#percent of families living below the poverty line
acs_final <- full_join(acs_agg_p, acs_agg, by = c("commarea" = "commarea")) %>%
  mutate(under_pov_perc = ((amc3e002 + amc3e003 + amc3e004) / amc3e001) * 100)

#Reading in data on how people commute to work and cleaning column names
#This data was obtained from the census bureau data portal
commute <- read_csv("data/commute.csv") %>%
  dplyr::select(starts_with("Estimate"), id) %>%
  clean_names()

#Turning tract ID number into numeric
commute$id_num <- parse_number(str_sub(commute$id, -11))

#Reading in community area to tract correspondence data and cleaning column names
tract_neigh <- read_csv("data/CensusTractsTIGER2010.csv") %>%
  dplyr::select(GEOID10, COMMAREA) %>%
  clean_names()

#Joining correspondence table to commuting data and cleaning column names
acs_commute <- left_join(tract_neigh, commute, by = c("geoid10" = "id_num")) %>%
  clean_names()

#Selecting desired variables and aggregating by community area
acs_commute <- acs_commute %>%
  as_tibble() %>%
  dplyr::select(commarea, estimate_total, estimate_total_car_truck_or_van) %>%
  group_by(commarea) %>%
  summarise_each(list(sum))

#Joining ACS data to commuting data, creating variable for percent of commuters that
#commute by car, truck, or van
acs_done <- full_join(acs_commute, acs_final, by = c("commarea" = "commarea")) %>%
  mutate(car_truck_commute_perc = (estimate_total_car_truck_or_van / estimate_total) * 100)

#Reading in ACS tract level data with info on what industries people work in and
#selecting variables that likely cover office jobs

# We tried this but due to hazy variable definitions, we often ended up with percetnages
# over 100.  Leaving this code in but commented out in case whoever picks this us
# next has a better idea of how to fix it
#
# acs_new <- read_csv("data/census_team_2015_2019_dat_new.csv") %>%
#   dplyr::select(`census geographic identifier`,`occupation total (estimate)`,
#                 `wholesale trade (estimate)`, `retail trade (estimate)`, `information (estimate)`,
#                 `finance and insurance, and real estate, and rental and leasing (estimate)`,
#                 `professional, scientific, and management, and administrative, and waste management services (estimate)`,
#                 `educational services, and health care and social assistance (estimate)`,
#                 `public administration (estimate)`,
#                  `management, business, science, and arts occupations (estimate)`,
#                  `sales and office occupations (estimate)`)
#
# #Creating a variable for a rough estimate of office jobs
# acs_new$office_jobs <- rowSums(acs_new[, 3:9])
#
# #Modifying tract ID number for joining
# acs_new$id_num = str_sub(acs_new$`census geographic identifier`, -11)
#
# #Turning ID number to numeric
# acs_new$id_num <- parse_number(acs_new$id_num)
#
# #Joining with tract to community area correspondence table and cleaning column names
# acs_done <- left_join(tract_neigh, acs_new, by = c("geoid10" = "id_num")) %>%
#   clean_names()
#
# #Turning into tibble, selecting desired variables, and aggregating by community area
# acs_done <- acs_done %>%
#   as_tibble() %>%
#   dplyr::select(commarea, office_jobs, occupation_total_estimate) %>%
#   group_by(commarea) %>%
#   summarise_each(list(sum))

#Joining with commuting data
# acs_done <- full_join(acs_done, acs_commute, by = c("commarea" = "commarea"))


#Factor analysis with relevant variables with factor loadings > .3
fac <- factanal(~non_white_perc + disabled_perc + no_vech_perc + renting_hh_perc +
                  single_parent_perc + no_internet_perc + bach_perc +
                  unemp_perc + under_pov_perc + medinc,
                factors = 1, data = acs_done, scores = "regression")


#Adding column for the index
acs_done$index <- fac$scores

#Turning into tibble and normalizing index
acs_done_for_csv <- acs_done %>%
  as_tibble() %>%
  mutate(index = (index - min(index)) / (max(index) - min(index))) %>%
  dplyr::select(-c(num, pop)) %>%
  rename(community_id = commarea, tot_families = amc3e001,
         inc_pov_lt_.5 = amc3e002, inc_pov_.5_to_.74 = amc3e003, inc_pov_.75_to_.99 = amc3e004,
         inc_pov_1_to_1.24 = amc3e005, inc_pov_1.25_to_1.49 = amc3e006, inc_pov_1.5_to_1.74 = amc3e007,
         inc_pov_1.75_to_1.84 = amc3e008, inc_pov_1.85_to_1.99 = amc3e009, inc_pov_2_to_2.99 = amc3e010,
         inc_pov_3_to_3.99 = amc3e011, inc_pov_4_to_4.99 = amc3e012, inc_pov_5_and_up = amc3e013,
         msh_index = index, population = pop_fixed, total_commuters = estimate_total,
         total_commuters_car_truck_or_van = estimate_total_car_truck_or_van,
         comm_area_name = neighborhood)

#Getting rid of "estimate" suffix
colnames(acs_done) <- gsub("estimate","", colnames(acs_done))

comm_dens <- read_csv("data/comm_density_2021.csv") %>%
  dplyr::select(-c(X1))


acs_done_dens <- full_join(acs_done_for_csv, comm_dens, by = c("comm_area_name" = "community_area")) %>%
  head(-1)


#Writing to csv file
wb <- createWorkbook()

addWorksheet(wb, sheetName = "Sheet1")

writeData(wb, sheet = 1, x = acs_done_dens)

saveWorkbook(wb, file = "curated_data/final_demographic_data_no_geometry_with_connectivity.xlsx")


#################

#Adding geometry back in to demographic data and saving as rds

acs_geo <- full_join(acs_done_dens, neighborhoods, by = c("community_id" = "area_num_1")) %>%
  dplyr::select(-c(shape_area.y, shape_len.y)) %>%
  rename(shape_area = shape_area.x, shape_len = shape_len.x)

write_rds(acs_geo, "curated_data/final_demographic_data_with_geometry_and_connectivity.rds")


#Adding mapping geometry to divvy connectivity data and writing to rds

# comm_dens_geo <- full_join(neighborhoods, comm_dens, by = c("community" = "community_area"))
#
# write_rds(comm_dens_geo, "curated_data/final_demographic_data_with_geometry_and connectivity.rds")

#Joining density/connectivity divvy data to CMAP Community Data Snapshots and writing to excel file
snapshot_connectivity <- full_join(acs_0, comm_dens, by = c("GEOG" = "community_area"))

wb <- createWorkbook()

addWorksheet(wb, sheetName = "Sheet1")

writeData(wb, sheet = 1, x = snapshot_connectivity)

saveWorkbook(wb, file = "curated_data/snapshots_with_connectivity.xlsx")

#Creating dataset used by viz & comm team by changing variable names, making some
#proportion variables, and selected the desired columns

acs_data <- read_rds("curated_data/final_demographic_data_with_geometry.rds")

for_viz_0 <- acs_data %>%
  mutate(prop_white = white/pop, non_white = (pop - white),
         prop_non_white = non_white / pop, prop_poverty = under_pov_perc/100,
         prop_disabled = tot_disabled / pop) %>%
  rename(community_id = commarea, num_stations = num_of_divvy_stations,
         num_stations_capita = station_pop_ratio, num_bikes = num_docks_in_service,
         num_bikes_capita = dock_pop_ratio, median_income = medinc, msh_index_score = msh_index,
         inc_gt_150k = inc_gt_150, disabled = tot_disabled, single_parent_hh = ct_sp_wchild)

for_viz <- for_viz_0 %>%
  dplyr::select(community_id, community, geometry, num_stations, num_stations_capita,
                num_bikes, num_bikes_capita, population, white, prop_white,
                non_white, prop_non_white, median_income, inc_lt_25k, inc_25_50k,
                inc_50_75k, inc_75_100k, inc_100_150k, inc_gt_150k, prop_poverty, disabled,
                prop_disabled, msh_index_score, tot_hh, unemp, unemp_perc, non_white_perc, disabled_perc,
                no_veh, no_vech_perc, rent_occ_hu, renting_hh_perc, single_parent_hh,
                single_parent_perc, no_internet, no_internet_perc, bach, bach_perc,
                under_pov_perc)

#Writing viz&comm data to rds

write_rds(for_viz, "curated_data/selected_data_for_viz_com.rds")

for_viz_csv <- for_viz %>%
  dplyr::select(-c(geometry))

#Writing viz&comm data to xlsx without the mapping geometry column

wb <- createWorkbook()

addWorksheet(wb, sheetName = "Sheet1")

writeData(wb, sheet = 1, x = for_viz_csv)

saveWorkbook(wb, file = "curated_data/for_viz_no_geometry.xlsx")

just_index <- for_viz_csv %>%
  dplyr::select(community_id, community, msh_index_score)

wb <- createWorkbook()

addWorksheet(wb, sheetName = "Sheet1")

writeData(wb, sheet = 1, x = just_index)

saveWorkbook(wb, file = "curated_data/community_index.xlsx")