# Source: R_Visualization/R_visualization.R at main · Mobin-Ghanbari/R_Visualization · GitHub
# Case-Study Title: Storytelling with Data (decision making and reasoning based on data visualization)
# Data Analysis Methodology: CRISP-DM
# Dataset: Sales data of a retail company in the USA
# Case Goal: Compare men and women in each city: does the total revenue generated for the company differ by gender?

### Required Library ----
# Install ggplot2 only when it is not already available, so re-running the
# script does not re-download the package on every execution.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library("ggplot2")


### Read Data from File ----

# Load the raw sales data; the path is relative to the current working
# directory. TRUE is spelled out because T is an ordinary, reassignable variable.
data <- read.csv("CS_02_01.csv", header = TRUE)

# Quick look at the size, the first rows and the per-column summaries.
dim(data)
head(data)
summary(data)

# Summary of the two columns the cleaning step below works on.
summary(data[, c("Gender", "Revenue")])

# Print the raw Revenue column to inspect its format before cleaning.
data$Revenue

### Data PreProcessing ----
# Data Cleaning
# Strip the "$" currency symbol and any "," thousands separators
# (e.g. "$1,234.50") before converting, so as.numeric() does not turn
# such values into NA.
data$Revenue <- as.numeric(gsub("[$,]", "", data$Revenue))
# Treat Gender as a categorical variable; summary() then reports counts per level.
data$Gender <- factor(data$Gender)

# Re-check both columns after cleaning.
summary(data[, c("Gender", "Revenue")])


# Data Preparation
# Total revenue per city (rows) and gender (columns). The result is a data
# frame whose row names are the city names and whose columns are the Gender
# levels (the code below expects them to be "F" and "M").
city_gender_rev_1 <- as.data.frame(tapply(data$Revenue,
                                          list(data$City, data$Gender),
                                          sum))
city_gender_rev_1


# Combined revenue of both genders in each city.
city_gender_rev_1$total_rev <- city_gender_rev_1$F + city_gender_rev_1$M

# Sort the cities by ascending total revenue.
city_gender_rev_1 <- city_gender_rev_1[order(city_gender_rev_1$total_rev), ]

# Store the city name as a factor whose level order is the (already sorted)
# row order, so ggplot2 draws the cities in order of total revenue. The sorted
# names are used directly as levels; passing reorder()'s factor result as
# `levels` only worked because the rows had just been sorted.
city_gender_rev_1$city <- rownames(city_gender_rev_1)
city_gender_rev_1$city <- factor(city_gender_rev_1$city,
                                 levels = city_gender_rev_1$city)


rownames(city_gender_rev_1) <- NULL

# Reshape to long format: one row per (city, gender) pair with columns
# city, revenue and gender. Female rows come first, then male rows.
c1 <- city_gender_rev_1[, c("city", "F")]
colnames(c1)[2] <- "revenue"
c1$gender <- "F"
c2 <- city_gender_rev_1[, c("city", "M")]
colnames(c2)[2] <- "revenue"
c2$gender <- "M"
city_gender_rev_2 <- rbind(c1, c2)
city_gender_rev_2  # appropriate dataframe for ggplot2


### Plot without a story to tell ----
# Horizontal bar chart of revenue per city with one bar per gender placed
# side by side. The argument must be lower-case `position`: a capitalised
# `Position` is not a geom_col() parameter, so it is ignored and the bars
# are stacked instead of dodged.
ggplot(city_gender_rev_2, aes(y = city, x = revenue, fill = gender)) +
  geom_col(position = "dodge")

### Plot with a story to tell ----
# One point per (city, gender) coloured by gender, plus a line per city
# joining its two points.
ggplot(data = city_gender_rev_2, mapping = aes(x = revenue, y = city)) +
  geom_point(mapping = aes(colour = gender)) +
  geom_line(mapping = aes(group = city))


# Percentage by which the larger of the two gender revenues exceeds the
# smaller one in each city, rounded to two decimals. Inside with(), M and F
# resolve to the data-frame columns.
city_gender_rev_1$diff_percent <- with(
  city_gender_rev_1,
  round((pmax(M, F) / pmin(M, F) - 1) * 100, 2)
)
city_gender_rev_1


# Gender with the higher revenue in each city ("M" when F is not greater).
city_gender_rev_1$gender_max <- with(
  city_gender_rev_1,
  ifelse(F > M, "F", "M")
)

# Cities whose revenue gap between men and women is 20% or more.

big_diff_cities <- with(city_gender_rev_1, city[diff_percent >= 20])

# Long-format rows of those cities only; used for the highlighted layers
# and the labels of the plot.
highlight <- city_gender_rev_2[city_gender_rev_2$city %in% big_diff_cities, ]
# Look up each highlighted row's city in the wide table and copy over its
# gap percentage and its leading gender.
highlight[["diff_percent"]] <-
  city_gender_rev_1$diff_percent[match(highlight$city, city_gender_rev_1$city)]
highlight[["gender_max"]] <-
  city_gender_rev_1$gender_max[match(highlight$city, city_gender_rev_1$city)]

highlight


# Keep one row per highlighted city: the row of the gender with the higher
# revenue, which is where the label is drawn.
plot_labels <- highlight[with(highlight, gender == gender_max), ]


# Base canvas: revenue on x, city on y, for every (city, gender) pair.
plot <- ggplot(data = city_gender_rev_2,
               mapping = aes(x = revenue, y = city))

# Background layers: all cities, drawn faded.
plot <- plot +
  geom_line(mapping = aes(group = city), alpha = 0.3) +
  geom_point(mapping = aes(colour = gender), size = 1.5, alpha = 0.3)

# Foreground layers: only the cities in `highlight`, drawn at full opacity.
plot <- plot +
  geom_line(data = highlight, mapping = aes(group = city)) +
  geom_point(data = highlight, mapping = aes(colour = gender), size = 2)

# Label each highlighted city with its gap percentage, coloured by the
# gender that has the higher revenue.
plot <- plot +
  geom_text(data = plot_labels,
            mapping = aes(colour = gender_max,
                          label = paste0("+", diff_percent, "%")),
            size = 3,
            hjust = -0.5)

plot


# Final plot: readable legend labels, dollar-formatted x axis, title and
# subtitle, and a minimal theme.
plot <- plot + scale_color_discrete(labels = c("Female", "Male")) +
  scale_x_continuous(labels = scales::dollar, expand = c(0.02, 0),
                     limits = c(0, 10500),
                     breaks = seq(0, 10000, by = 2500)) +
  scale_y_discrete(expand = c(.02, 0)) +
  labs(title = "Total Revenue by City and Gender",
       subtitle = "Out of 23 cities, eight locations experience a 20% or greater difference \nin revenue generated by males versus females. Hidalgo experiences the \ngreatest difference with females generating 86% more revenue than males.") +
  theme_minimal() +
  theme(axis.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        legend.title = element_blank(),
        legend.justification = c(0, 1),
        # legend.position.inside is only honoured when the legend position
        # is "inside"; without this line the legend stays at the theme's
        # default position and the coordinates below are ignored.
        legend.position = "inside",
        legend.position.inside = c(.1, 1.075),
        legend.background = element_blank(),
        legend.direction = "horizontal",
        plot.title = element_text(size = 20, margin = margin(b = 10)),
        plot.subtitle = element_text(size = 10, color = "darkslategrey", margin = margin(b = 25)),
        plot.caption = element_text(size = 8, margin = margin(t = 10), color = "grey70", hjust = 0))
plot