-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClean_Address.R
More file actions
108 lines (88 loc) · 3.19 KB
/
Clean_Address.R
File metadata and controls
108 lines (88 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Successfully geocoded rows only (fewer rows once de-duplicated)
address_clean_v1 <- geo_addresses_v1 %>%
  filter(match == "Match") %>%
  distinct()

# Addresses the batch geocoder could not match: keep just the original
# id and address so they can be re-geocoded one at a time below
unmatched_addresses <- geo_addresses_v1 %>%
  filter(match != "Match") %>%
  distinct() %>%
  select(id_add_orig, old.address)
## add in parallel
#Single Address Geocoding
# Re-geocode each unmatched address individually, in parallel via furrr.
# Guarded by exists() so the slow API calls are not repeated when the
# script is re-sourced in the same session.
# NOTE(review): safe_geocode_single_address is presumably a
# safely()/possibly()-wrapped geocoder returning one list per address;
# confirm its return structure before changing the unlist/map_dfr step.
if(!exists("geo_addresses_v2")){
print("Processing single addresses. May take a while.")
geo_addresses_v2 <- furrr::future_map2(unmatched_addresses$old.address,
unmatched_addresses$id_add_orig,
safe_geocode_single_address)
# Flatten each per-address result list into a one-row data frame and
# row-bind them all into a single data frame.
geo_addresses_v2<-map_dfr(geo_addresses_v2, unlist)
}
#Split Geocoded Data by Match Type
#there are remaining unmatched addresses,
#duplicate matches, and also some correct matches.
#The sum of these should be the same as
#the number of rows in the previous `unmatched_addresses` dataset.

# Singly matched addresses: derive the street from the matched address
# and coerce tigerLine/zip to numeric so they bind cleanly with
# address_clean_v1 downstream.
# (NOTE(review): as.numeric(zip) drops leading zeros -- harmless for
# Seattle 9xxxx ZIPs, but revisit if this script is reused elsewhere.)
# Fix: the original selected new.address1 and then immediately dropped
# it with select(-new.address1); the redundant pair has been removed.
address_clean_v2 <- geo_addresses_v2 %>%
  mutate(street = str_split_fixed(new.address, ",", n = 2)[, 1]) %>%
  select(id_add_orig, old.address, new.address,
         street, city, state, zip,
         lon, lat, tigerLine, side) %>%
  filter(!is.na(new.address)) %>%
  mutate(tigerLine = as.numeric(tigerLine),
         zip = as.numeric(zip),
         match.type = "Single")

# Addresses that returned two candidate matches (columns suffixed 1/2)
unmatched_duplicates <- geo_addresses_v2 %>%
  filter(!is.na(lon1)) %>%
  select(id_add_orig, old.address, ends_with(c("1", "2")))

# Addresses with no candidate match at all
unmatched_addresses_v2 <- geo_addresses_v2 %>%
  filter(is.na(new.address) & is.na(new.address1)) %>%
  select(id_add_orig, old.address)
#remove duplicates - doesn't need to be rerun
# Interactively resolve addresses that geocoded to two candidates: for
# each one the user picks candidate 1, candidate 2, or 3 ("none").
# Guarded by exists() so the manual menu step is not repeated.
if (!exists("geo_address_v3")) {
  data <- unmatched_duplicates %>%
    mutate(keep = NA_integer_)
  # seq_len() (not 1:nrow) so zero duplicate rows skips the loop
  # instead of iterating over c(1, 0).
  for (i in seq_len(nrow(data))) {
    print(paste0("Original Address: ", data$old.address[i]))
    print(paste("1:", data$new.address1[i]))
    print(paste("2:", data$new.address2[i]))
    print("3: None of the above")
    data$keep[i] <- menu(c("1", "2", "3"),
                         title = "Which matched address to keep?")
  }
  # Reshape the paired candidate columns (e.g. lon1/lon2) into long
  # form; addNum ("1"/"2") identifies which candidate each row was.
  geo_address_v3 <- data %>%
    pivot_longer(cols = ends_with(c("1", "2")),
                 names_to = c(".value", "addNum"),
                 names_pattern = "(.+)(.)")
}
# BUG FIX: these two derived tables were previously built inside the
# exists() guard above, so a cached geo_address_v3 left them undefined
# on re-runs and the later joins failed. They are cheap to recompute,
# so build them unconditionally.
# Rows where the user kept a candidate (addNum "1"/"2" == keep 1/2
# via implicit character/integer coercion, as before).
address_clean_v3 <- geo_address_v3 %>%
  filter(addNum == keep) %>%
  select(-addNum, -keep) %>%
  mutate(tigerLine = as.numeric(tigerLine),
         zip = as.numeric(zip),
         match.type = "Single Dup")
# Rows where the user rejected both candidates.
unmatched_addresses_v3 <- geo_address_v3 %>%
  filter(keep == 3) %>%
  select(id_add_orig, old.address) %>%
  distinct()
# Collect every address that still lacks a usable geocode and export it
# for manual correction. Guarded so the CSV is written only once.
if (!exists("address_tofix")) {
  still_unmatched <- full_join(unmatched_addresses_v2,
                               unmatched_addresses_v3)
  address_tofix <- still_unmatched %>%
    merge_orig_address() %>%
    mutate(match.type = "Unmatched")
  #choose easy location to find file
  write_csv(address_tofix, "Data/seattle_address_tofix21.csv")
}
# Stack every cleaned subset (batch-matched, single re-geocoded,
# manually de-duplicated, still-unmatched, and missing-address rows)
# into one table.
address_clean <- merge_orig_address(address_clean_v1) %>%
  full_join(address_clean_v2) %>%
  full_join(merge_orig_address(address_clean_v3)) %>%
  full_join(address_tofix) %>%
  full_join(address_na)

# Fall back to the original address when geocoding produced none, then
# assign one cleaned-address id ("CA.<n>") per distinct new.address.
address_clean <- address_clean %>%
  mutate(new.address = coalesce(new.address, old.address)) %>%
  group_by(new.address) %>%
  mutate(id_add_clean = paste0("CA.", cur_group_id())) %>%
  ungroup()