-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpre-processing.R
More file actions
181 lines (141 loc) · 5.93 KB
/
pre-processing.R
File metadata and controls
181 lines (141 loc) · 5.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Pre-processing
library(tidyverse)
library(readr)
library(stringr)
# To use this script:
# source("pre-processing.R")
load_data <- function(){
# Load the data in global environment
sale_listings <<- read_csv("data/sale_listings.csv")
}
clean_data <- function(){
sale_listings <<- sale_listings %>%
# Make sure we have valid prices
filter(price > 1)%>%
# Reasonable range of sale prices
filter(price < quantile(price, 0.95)) %>%
# Filter out unreasonable numbers
filter(size_sqft != 588527) %>%
filter(bedrooms != 99) %>%
filter(bathrooms != 66) %>%
# Make unreasonable sizes NA
mutate(size_sqft = ifelse(size_sqft < 30, NA, size_sqft))
#import zipcode data
library(zipcodeR)
data("zip_code_db")
zip_code <- zip_code_db %>%
select(zipcode, major_city, county, state, lat, lng)
sale_listings <<- sale_listings %>%
mutate(addr_zip = ifelse(addr_zip < 10000, paste0("0", addr_zip), addr_zip)) %>%
rename("zipcode" = "addr_zip") %>%
mutate(zipcode = as.character(zipcode))
#join sale_listings with zipcode data
sale_listings <<- left_join(sale_listings, zip_code, by = "zipcode")
#these two zipcodes did not have info in zip_code_db, so use original ones in sale_listings
sale_listings <<- sale_listings %>%
mutate(lat = ifelse(zipcode %in% c("11249", "11466"), addr_lat, lat),
lng = ifelse(zipcode %in% c("11249", "11466"), addr_lon, lng)) %>%
select(id, property_id, unittype, listing_description, bedrooms, bathrooms,
size_sqft, price, addr_street, addr_unit, zipcode, floor_count,
year_built, is_historic, major_city:lng)%>%
rename(long=lng)
#5763 property_id has duplicates
duplicate_id <- sale_listings %>%
filter(!is.na(property_id)) %>%
group_by(property_id) %>%
summarize(n=n()) %>%
filter(n > 1)
#12,786 listings should eventually only keep 5763
#make size_sqft 0 == NA
duplicate_listing <- sale_listings %>%
mutate(size_sqft = ifelse(size_sqft == 0, NA, size_sqft)) %>%
filter(property_id %in% duplicate_id$property_id)
#count row NA
na_row <- as.data.frame(rowSums(is.na(duplicate_listing))) %>%
rename("na_row" = "rowSums(is.na(duplicate_listing))")
duplicate_na <- cbind.data.frame(duplicate_listing, na_row)
#choose listing that has less NA attributes
#11,334 listings
duplicate_listing_2 <- duplicate_na %>%
group_by(property_id) %>%
filter(na_row == min(na_row))
length_des <- as.data.frame(nchar(duplicate_listing_2$listing_description)) %>%
rename("len_des" = "nchar(duplicate_listing_2$listing_description)") %>%
mutate(len_des = as.numeric(len_des))
duplicate_des <- cbind.data.frame(duplicate_listing_2, length_des)
#keep the one with longer description
no_duplicate <- duplicate_des %>%
mutate(len_des = ifelse(is.na(len_des) == TRUE, 0, len_des)) %>%
group_by(property_id) %>%
filter(len_des == max(len_des)) %>%
#then keep the one with more bedrooms
filter(bedrooms == max(bedrooms)) %>%
arrange(property_id) %>%
#lastly keep distinct ids
distinct(property_id, .keep_all = TRUE) %>%
select(-na_row, -len_des)
#eventually there should be about 52638 listings
sale_listings <<- sale_listings %>%
filter(!property_id %in% duplicate_id$property_id) %>%
rbind.data.frame(no_duplicate)
sale_listings <<- sale_listings %>%
filter(!is.na(state))%>%
mutate(unittype = as.factor(unittype),
is_historic = as.factor(is_historic),
major_city = as.factor(major_city),
county = as.factor(county),
state = as.factor(state)) %>%
#we looked at the two listings that have NA in is_historic and they were not historic (built in 2007);
#resolve NA for random forest models
mutate(is_historic = replace_na(is_historic, 0))
# Group cities to reduce number of levels
# Look at median price of each city
median_price_city <- aggregate(sale_listings$price,
by = list(sale_listings$major_city),
median)
# Create groups based on median prices
median_price_city$price_group <- cut(median_price_city$x, 10, include.lowest=TRUE,
labels = seq(10,1))
city_group <- median_price_city%>%
select(-x) %>%
rename(major_city = Group.1,
city_group = price_group)
sale_listings <<- left_join(sale_listings, city_group, by = "major_city") %>%
mutate(city_group = as.factor(city_group))
}
impute_data <- function(){
library(mice)
# Create a new data frame for imputation
sale_listings_mice <- sale_listings %>%
mutate(log_size = log(size_sqft))
# Only impute NA values for the log_size column
na_matrix <- is.na(sale_listings_mice)
na_matrix[,-ncol(sale_listings_mice)] <- FALSE
# Create a predictor matrix
# Predictors: "unittype", "bedrooms", "bathrooms", "floor_count", "major_city", "state"
predictorMatrix <- make.predictorMatrix(sale_listings_mice)
predictorMatrix[,c(1, 2, 4, 7:11, 13, 14, 16, 18:20)] <- 0
predictorMatrix[1:19,] <- 0
# Imputation with mice
sale_listings_imp <- mice(sale_listings_mice, m=1,
method = "norm.predict",
predictorMatrix = predictorMatrix,
where = na_matrix,
seed = 410)
# Fill in the generated values
sale_listings_imputed <- complete(sale_listings_imp)
# Clean up and un-log the log_size variable
sale_listings_imputed <- sale_listings_imputed %>%
rename(size_sqft_na = size_sqft) %>%
mutate(size_sqft = exp(log_size)) %>%
select(-log_size)
return(sale_listings_imputed)
}
ml_setup <- function(data){
#set seed for spliting train/test
set.seed(410)
# For now, split the data into training/test sets
train <<- data %>% sample_frac(.70)
test <<- data %>% anti_join(train)
# Implement 5-fold CV in the future
}