-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathR_visualization.R
More file actions
120 lines (83 loc) · 4.68 KB
/
R_visualization.R
File metadata and controls
120 lines (83 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Case-Study Title: Storytelling with Data (decision making and reasoning based on data visualization)
# Data Analysis Methodology: CRISP-DM
# Dataset: Sales data of a retail company in the USA
# Case Goal: Compare men and women within each city: does the total revenue generated for the company differ by gender?
### Required Library ----
# Install ggplot2 only when it is missing, so that re-running the script does
# not re-download the package each time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library("ggplot2")
### Read Data from File ----
# Load the sales records; the path is resolved against the working directory.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
data <- read.csv("CS_02_01.csv", header = TRUE)

# Fail early, with a clear message, if a column used later in the script is
# missing from the file.
stopifnot(all(c("City", "Gender", "Revenue") %in% names(data)))

# Quick inspection of the raw data.
dim(data)
head(data)
summary(data)
summary(data[, c("Gender", "Revenue")])
data$Revenue # raw Revenue values, before any cleaning
### Data PreProcessing ----
# Data Cleaning
# Strip the "$" sign and any "," thousands separators so that values such as
# "$1,234.50" convert to numbers instead of becoming NA.
data$Revenue <- as.numeric(gsub("[$,]", "", data$Revenue))

# Gender is a category; as a factor, summary() reports counts per level.
data$Gender <- factor(data$Gender)

summary(data[, c("Gender", "Revenue")])
# Data Preparation
# Wide table: one row per city, one column per gender, each cell holding the
# summed revenue for that city/gender pair.
rev_by_city_gender <- with(data, tapply(Revenue, list(City, Gender), sum))
city_gender_rev_1 <- as.data.frame(rev_by_city_gender)
city_gender_rev_1

# Combined revenue of both genders, then rows sorted by it in ascending order.
city_gender_rev_1$total_rev <- city_gender_rev_1$F + city_gender_rev_1$M
ascending <- order(city_gender_rev_1$total_rev)
city_gender_rev_1 <- city_gender_rev_1[ascending, ]

# Move the city names out of the row names into a factor whose levels follow
# the sorted row order, so plots list the cities by total revenue.
sorted_cities <- rownames(city_gender_rev_1)
city_gender_rev_1$city <- factor(sorted_cities, levels = sorted_cities)
rownames(city_gender_rev_1) <- NULL

# Long table: one row per city/gender pair (all "F" rows first, then "M").
long_parts <- lapply(c("F", "M"), function(g) {
  data.frame(
    city = city_gender_rev_1$city,
    revenue = city_gender_rev_1[[g]],
    gender = g,
    stringsAsFactors = FALSE
  )
})
city_gender_rev_2 <- do.call(rbind, long_parts)
city_gender_rev_2 # appropriate dataframe for ggplot2
### Plot without a story to tell ----
# Horizontal bars of revenue per city, with the two genders side by side.
# The argument must be lower-case `position`: a capitalised `Position` is not a
# geom_col() parameter, so it is ignored and the bars are stacked instead.
ggplot(city_gender_rev_2, aes(y = city, x = revenue, fill = gender)) +
  geom_col(position = "dodge")
### Plot with a story to tell ----
# Dot plot: one point per gender on each city's row, joined by a line so the
# gap between the two revenues can be read directly.
ggplot(data = city_gender_rev_2, mapping = aes(x = revenue, y = city)) +
  geom_point(mapping = aes(colour = gender)) +
  geom_line(mapping = aes(group = city))
# Relative gap between the genders in each city: how much larger (in percent,
# rounded to 2 decimals) the higher revenue is than the lower one.
rev_f <- city_gender_rev_1$F
rev_m <- city_gender_rev_1$M
city_gender_rev_1$diff_percent <- round(
  (pmax(rev_m, rev_f) / pmin(rev_m, rev_f) - 1) * 100,
  2
)
city_gender_rev_1

# Gender with the higher revenue in each city ("M" unless F is strictly larger).
city_gender_rev_1$gender_max <- ifelse(rev_f > rev_m, "F", "M")

# Cities where the gap between men and women is 20% or more.
is_big_gap <- city_gender_rev_1$diff_percent >= 20
big_diff_cities <- city_gender_rev_1$city[is_big_gap]

# Rows of the long table for those cities, carrying the per-city gap and the
# leading gender looked up from the wide table.
highlight <- city_gender_rev_2[city_gender_rev_2$city %in% big_diff_cities, ]
wide_row <- match(highlight$city, city_gender_rev_1$city)
highlight$diff_percent <- city_gender_rev_1$diff_percent[wide_row]
highlight$gender_max <- city_gender_rev_1$gender_max[wide_row]
highlight

# Keep one row per highlighted city (the leading gender's row) for the labels.
plot_labels <- highlight[highlight$gender == highlight$gender_max, ]
# Layered dot plot: every city is drawn faded, then the big-gap cities are
# redrawn at full opacity with a "+<gap>%" label next to the leading gender.
plot <- ggplot(data = city_gender_rev_2, mapping = aes(x = revenue, y = city)) +
  # Background: all cities, faded.
  geom_line(mapping = aes(group = city), alpha = 0.3) +
  geom_point(mapping = aes(colour = gender), size = 1.5, alpha = 0.3) +
  # Foreground: highlighted cities only.
  geom_line(data = highlight, mapping = aes(group = city)) +
  geom_point(data = highlight, mapping = aes(colour = gender), size = 2) +
  # Gap labels, coloured by the gender with the higher revenue.
  geom_text(
    data = plot_labels,
    mapping = aes(
      label = paste0("+", diff_percent, "%"),
      colour = gender_max
    ),
    size = 3,
    hjust = -0.5
  )
plot
# Final plot
# Adds readable scales, a title and subtitle, and a minimal theme to the
# highlighted chart.
# NOTE(review): the figures quoted in the subtitle and the x-axis limits are
# hard-coded; re-check them whenever the underlying data changes.
plot <- plot +
  # Labels are assigned by position to the gender values ("F", "M").
  scale_color_discrete(labels = c("Female", "Male")) +
  scale_x_continuous(
    labels = scales::dollar,
    expand = c(0.02, 0),
    limits = c(0, 10500),
    breaks = seq(0, 10000, by = 2500)
  ) +
  scale_y_discrete(expand = c(.02, 0)) +
  labs(
    title = "Total Revenue by City and Gender",
    subtitle = "Out of 23 cities, eight locations experience a 20% or greater difference \nin revenue generated by males versus females. Hidalgo experiences the \ngreatest difference with females generating 86% more revenue than males."
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    legend.justification = c(0, 1),
    # `legend.position.inside` only takes effect when the legend position is
    # "inside"; without this line the legend stays at the default position and
    # the coordinates below are ignored.
    legend.position = "inside",
    legend.position.inside = c(.1, 1.075),
    legend.background = element_blank(),
    legend.direction = "horizontal",
    plot.title = element_text(size = 20, margin = margin(b = 10)),
    plot.subtitle = element_text(size = 10, color = "darkslategrey", margin = margin(b = 25)),
    plot.caption = element_text(size = 8, margin = margin(t = 10), color = "grey70", hjust = 0)
  )
plot