Coding-Sample/data.R at main · marinefuji/Coding-Sample · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
##Marine Fujisawa

## Sample Project: Chosen because it includes data cleaning, web scraping, data visualization, and code that produces a Shiny app allowing the user to interact with the data.

# The first step is collecting and cleaning the data.
# After the data is obtained, it is cleaned to ensure consistency across the different datasets.
# Next, I create a scatterplot to visualize some relationships within the data. I also create a choropleth map to be used in the Shiny app.

# Setting work directory
# The absolute path below only exists on the original author's machine, so it
# is applied only when present. Otherwise the current working directory is
# kept; it must then be the repository root for the relative "data/" and
# "images/" paths used later in this script to resolve.
project_dir <- "/Users/marinefujisawa1/Documents/GitHub/Coding-Sample"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project directory not found; using ", getwd())
}
#load libraries
library(tidyverse)
library(dplyr)

# Import first dataset: GDP of countries
gdp <- read.csv("data/GDP_countries.csv")
# View(gdp)

# Clean data
# First, remove last 5 rows as they are blank
gdp <- head(gdp, -5)

# We want to know economic recovery from the COVID crisis, so only the
# 2019-2023 data is kept. Columns 5-11 are dropped by position
# (presumably the earlier years -- confirm against the CSV layout).
gdp <- gdp[, -c(5:11)]

# Next, replace ".." values with NA.
# Only character columns can contain the ".." placeholder, and recent dplyr
# versions make na_if() error when asked to compare a numeric column against
# a string, so the replacement is restricted to character columns.
gdp <- gdp %>%
  mutate(across(where(is.character), ~ na_if(.x, "..")))

# Rename the last five columns to plain year labels so they are easier to use
years <- c("2019", "2020", "2021", "2022", "2023")
colnames(gdp)[(ncol(gdp) - 4):ncol(gdp)] <- years

# Convert the GDP values in the year columns to numeric
gdp <- gdp %>%
  mutate(across(all_of(years), as.numeric))

## Add some new metrics to measure how well each country recovered from
## COVID shocks.
# recovery2023 - recovery percentage, using 2019 GDP as the baseline:
#   [(GDP2023 - GDP2020) / (GDP2019 - GDP2020)] * 100
#   (Inf or NaN when GDP2019 equals GDP2020, since the denominator is zero.)
# decline - size of the shock experienced in 2020:
#   [(GDP2019 - GDP2020) / GDP2019] * 100
gdp <- gdp %>%
  mutate(
    recovery2023 = ((`2023` - `2020`) / (`2019` - `2020`)) * 100,
    decline = ((`2019` - `2020`) / `2019`) * 100
  )


# Second dataset: export diversification index
export <- read.csv("data/Export_diversification_index.csv")
# View(export)

# Only the 2019 diversification figures are needed, so columns 2 through 57
# are discarded to keep the table readable
export <- export[, -(2:57)]

# The first row holds no data; discard it
export <- export[-1, ]

# Attach ISO3 country codes so this table can be merged with the GDP data
# install.packages("countrycode")
library(countrycode)
export$country_iso <- countrycode(
  sourcevar = export$Economy_Label,
  origin = "country.name",
  destination = "iso3c"
)

# More columns that are not needed: positions 4 through 17
export <- export[, -(4:17)]
# One leftover column survived the positional drop; remove it by name
export <- select(export, -X2018_Diversification_Index_MissingValue)

# Web scraping: pull a second economic diversification index from the web
library(rvest)
url <- "https://economicdiversification.com/the-index/#"
index_website <- read_html(url)

# Parse every HTML table on the page, then keep only the first one
econ_diverse <- index_website %>%
  html_table(fill = TRUE)
econ_diverse <- econ_diverse[[1]]

# Add ISO3 country codes derived from the Countries column
econ_diverse <- econ_diverse %>%
  mutate(country_iso = countrycode(Countries, "country.name", "iso3c"))

# Finally, merging all data into one dataset.
# The GDP country-code column is renamed so that all three tables share the
# "country_iso" key. merge() keeps only keys present in both inputs by
# default, so countries missing from any table are dropped.
gdp <- gdp %>%
  rename(country_iso = Country.Code)
initial_merge <- merge(gdp, export, by = "country_iso")

final_data <- merge(initial_merge, econ_diverse, by = "country_iso")

# View() needs an interactive session; skip it when the script is run
# non-interactively (e.g. via Rscript) so the run does not fail here.
if (interactive()) {
  View(final_data)
}

# Save the merged dataset to the working directory
write.csv(final_data, "final_data.csv", row.names = FALSE)

## Next, I created some scatterplots. Here, I am showing one of them (so the file doesn't get too long).

# First plot: scatterplot, using export diversification
# I wanted to create a plot without outliers.
# Outliers are rows whose recovery2023 lies more than iqr_multiplier * IQR
# below Q1 or above Q3. The fence used here is 2 * IQR, which is wider than
# the conventional 1.5 * IQR rule.
iqr_multiplier <- 2
Q1 <- quantile(final_data$recovery2023, 0.25, na.rm = TRUE)
Q3 <- quantile(final_data$recovery2023, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1

# filter() also drops rows where recovery2023 is NA
final_data_no_outliers <- final_data %>%
  filter(
    recovery2023 >= (Q1 - iqr_multiplier * IQR_value) &
      recovery2023 <= (Q3 + iqr_multiplier * IQR_value)
  )

# Plot without outliers: export diversification (x) against GDP decline (y),
# with a fitted linear trend line
plot1 <- ggplot(
  final_data_no_outliers,
  aes(x = X2019_Diversification_Index_Value, y = decline)
) +
  geom_point(color = "olivedrab") +
  geom_smooth(method = "lm", formula = y ~ x, color = "tan4") +
  labs(
    title = "Export Diversification vs GDP Decline Percentage Post COVID",
    x = "Export Diversification",
    y = "GDP Decline Percentage"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

# Explicit print() so the plot is also drawn when the script is source()d
print(plot1)

# Make sure the output directory exists before saving
dir.create("images", showWarnings = FALSE)
ggsave(
  filename = "images/plot1.png",
  plot = plot1,
  device = "png"
)

# Next, I created a choropleth map to be used later.
# install.packages("sf")
library(sf)
# install.packages("spData")
library(spData)
library(countrycode)

# World geometries from spData, keyed by ISO3 code so they can be joined to
# the merged dataset. The left join keeps every geometry in world_map; rows
# without a match in final_data get NA for the joined columns.
world_map <- spData::world
world_map$country_iso <- countrycode(world_map$name_long, "country.name", "iso3c")
world_map_data <- world_map %>%
  left_join(final_data, by = "country_iso")

choropleth1 <- ggplot(world_map_data) +
  geom_sf(aes(fill = Average)) + # The variable to color by
  scale_fill_gradient(low = "seashell", high = "tomato3") +
  labs(title = "Choropleth of Economic Diversification Index") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Explicit print() so the map is also drawn when the script is source()d
print(choropleth1)

# Make sure the output directory exists before saving
dir.create("images", showWarnings = FALSE)
ggsave(
  filename = "images/choropleth1.png",
  plot = choropleth1,
  device = "png"
)