-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_processing.R
More file actions
89 lines (82 loc) · 4.23 KB
/
text_processing.R
File metadata and controls
89 lines (82 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Text processing
library(tidyverse)
library(readr)
library(stringr)
library(tidytext)
#' Derive keyword flags and a sentiment score from listing descriptions.
#'
#' Adds one 0/1 indicator column per keyword pattern found in
#' `listing_description`, joins an AFINN sentiment score per listing `id`
#' (0 when a listing has no scored words), and converts every integer
#' column to a factor. The result is assigned to `sale_listings_ss` with
#' `<<-` (existing callers rely on that side effect) and is also returned
#' invisibly.
#'
#' @param listings Data frame with `id` and `listing_description` columns.
#'   Defaults to the `sale_listings_imputed` object visible to the function.
#' @return The augmented data frame, invisibly.
text_var <- function(listings = sale_listings_imputed) {
  stopifnot(
    is.data.frame(listings),
    all(c("id", "listing_description") %in% names(listings))
  )

  # Listings without a description get the placeholder "0" so that every
  # row is kept and simply matches none of the patterns below.
  descriptions <- as.character(listings$listing_description)
  descriptions[is.na(descriptions)] <- "0"
  listings$listing_description <- descriptions

  # str_detect() pairs strings and patterns element-wise, so a character
  # vector of patterns does NOT mean "any of these". Alternatives are joined
  # with `|` into a single pattern instead. The short abbreviations carry
  # word boundaries so "w/dishwasher" or "crowd" are not counted, and both
  # cases of "d" are accepted so "W/D" and "Washer/Dryer" are found.
  washer_dryer <- paste(
    c(
      "\\b[Ww][Dd]\\b",
      "\\b[Ww]/[Dd]\\b",
      "\\b[Ww]&[Dd]\\b",
      "\\b[Ww] & [Dd]\\b",
      "[Ww]asher/[Dd]ryer",
      "[Ww]asher / [Dd]ryer",
      "[Ww]asher and [Dd]ryer",
      "[Ww]asher&[Dd]ryer",
      "[Ww]asher & [Dd]ryer",
      "[Ww]asher-[Dd]ryer",
      "[Ww]asher [Dd]ryer"
    ),
    collapse = "|"
  )

  # One regex per indicator column, in output column order. Patterns are
  # substring matches, so a stem such as "[Rr]enovat" already covers
  # "renovated"/"renovation"; a trailing `*` would only make the last letter
  # optional (e.g. "offe" inside "coffee").
  feature_patterns <- c(
    stainless_steel = "[Ss]tainless.[Ss]teel",
    hw_floors = "[Hh]ardwood.[Ff]loors?",
    wd = washer_dryer,
    steel_app = "[Ss]teel.[Aa]ppliances?",
    fitness = "[Ff]itness",
    # top 10 words with highest price average and highest frequency
    marble = "[Mm]arble",
    master = "[Mm]aster",
    # views = "[Vv]iews" (disabled)
    custom = "[Cc]ustom",
    floor = "[Ff]loor",
    private = "[Pp]rivate",
    window = "[Ww]indow",
    dining = "[Dd]ining",
    offer = "[Oo]ffer",
    # light = "[Ll]ight" (disabled)
    sqft = "[Ss]quare.[Ff](eet|oot)",
    hudson_river = "[Hh]udson.[Rr]iver",
    renovate = "[Rr]enovat",
    closet_space = "[Cc]loset.[Ss]pace",
    spacious = "[Ss]pacious",
    storage = "[Ss]torage",
    roof_deck = "[Rr]oof.[Dd]eck",
    # "park" or "parks" as a whole word ending; excludes "parking". The
    # earlier "[\\>|s]" was a character class of the literals >, | and s, so
    # a plain "park" followed by a space or period was never matched.
    park = "[Pp]arks?\\b",
    balcony = "[Bb]alcon",
    courtyard = "[Cc]ourtyard",
    view = "[Vv]iew",
    natural_light = "[Nn]atural.[Ll]ight",
    en_suite = "[Ee]n.[Ss]uite",
    pet_friendly = "[Pp]et.[Ff]riendly",
    tree_lined = "[Tt]ree.[Ll]ined",
    central_park = "[Cc]entral.[Pp]ark"
  )
  # outdoor_space (park | balcony | courtyard | roof_deck | central_park |
  # tree_lined) is intentionally not derived here (disabled).

  for (feature in names(feature_patterns)) {
    listings[[feature]] <- as.integer(
      str_detect(descriptions, feature_patterns[[feature]])
    )
  }

  # Sentiment: tokenize descriptions, drop stop words, keep only words that
  # have an AFINN value, and sum those values per listing.
  afinn_lex <- get_sentiments("afinn")
  sentiment_summary <- listings %>%
    select(id, listing_description) %>%
    unnest_tokens(output = word, input = listing_description) %>%
    anti_join(stop_words, by = "word") %>%
    inner_join(afinn_lex, by = "word") %>%
    group_by(id) %>%
    summarise(score = sum(value))

  # Listings with no emotion-associated words get score = 0.
  # NOTE(review): mutate_if() turns EVERY integer column into a factor, not
  # only the flags created above -- confirm that is intended for any integer
  # columns already present in the input.
  result <- listings %>%
    left_join(sentiment_summary, by = "id") %>%
    mutate(score = replace_na(score, 0)) %>%
    mutate_if(is.integer, as.factor)

  sale_listings_ss <<- result
  invisible(result)
}