Final_Project_AYT/data_wrangling.R at main · andiyoga34/Final_Project_AYT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#setting your working directory (adjust to your own)
setwd("/Users/Lenovo/Documents/GitHub/DAP2-final-project-andiyoga34/Data/Raw Files/")

# Load required libraries

library(tidyverse)


# Downloading GDP Per Capita dataset from Wolrd Bank web into designated folder and the ZIP file name
zip_file <- "/Users/Lenovo/Documents/GitHub/DAP2-final-project-andiyoga34/Data/Raw Files/worldbank_gdp_per_capita.zip"

# Check if the ZIP file already exists before downloading
if (!file.exists(zip_file)) {
  # If the ZIP file doesn't exist, download it
  url <- "https://api.worldbank.org/v2/en/indicator/NY.GDP.PCAP.CD?downloadformat=csv"
  download.file(url, zip_file, mode = "wb")
  print("Dataset downloaded.")
} else {
  print("Dataset already downloaded. Skipping download.")
}

# Unzip the ZIP file
unzip(zip_file, exdir = "/Users/Lenovo/Documents/GitHub/DAP2-final-project-andiyoga34/Data/Raw Files/")


# Function to load, clean, rename, and reshape data
load_and_reshape <- function(file_path, value_name, years = 2015:2023) {
  read.csv(file_path, skip = 4) %>%
    rename(Country_Name = Country.Name, Country_Code = Country.Code) %>%  # Rename columns
    select(Country_Name, Country_Code, paste0("X", years)) %>%
    pivot_longer(
      cols = starts_with("X"),
      names_to = "Year",
      values_to = value_name
    ) %>%
    mutate(Year = as.numeric(sub("X", "", Year)))
}

# Load and reshape datasets with renamed columns
debt_long <- load_and_reshape("Debt.csv", "Debt", 2015:2022)
gdp_long <- load_and_reshape("GDP.csv", "GDP", 2015:2023)
gdp_per_capita_long <- load_and_reshape("API_NY.GDP.PCAP.CD_DS2_en_csv_v2_77536.csv", "GDP_Per_Capita") #adjust the automatically retrieved file name accordingly
revenue_long <- load_and_reshape("Revenue.csv", "Revenue", 2015:2023)

# Merge datasets and removing NAs
merged_data <- reduce(
  list(debt_long, gdp_long, gdp_per_capita_long, revenue_long),
  ~ left_join(.x, .y, by = c("Country_Name", "Country_Code", "Year"))
) %>%
  drop_na(Country_Name, Country_Code)  # Handle NAs early

# Add category based on GDP per capita
merged_data <- merged_data %>%
  mutate(Category = case_when(
    Country_Name == "World" ~ "World",
    GDP_Per_Capita >= 12000 ~ "Developed",
    GDP_Per_Capita < 12000 ~ "Developing",
    TRUE ~ NA_character_
  ))

# Adjust GDP to billions
merged_data <- merged_data %>%
  mutate(GDP = GDP / 1e9)

# Function to summarize data
summarize_metric <- function(data, metric, start_year, end_year, agg_fn = mean) {
  data %>%
    filter(Year >= start_year & Year <= end_year) %>%
    group_by(Year, Category) %>%
    summarize(
      !!metric := agg_fn(.data[[metric]], na.rm = TRUE),
      .groups = "drop"
    ) %>%
    drop_na()
}

# Summarize datasets
debt_summary <- summarize_metric(merged_data, "Debt", 2015, 2022)
gdp_summary <- summarize_metric(merged_data, "GDP", 2015, 2023, sum)
revenue_summary <- summarize_metric(merged_data, "Revenue", 2015, 2022)


#exporting the merged_data as csv file to the designated folder
write.csv(merged_data,  file = "/Users/Lenovo/Documents/GitHub/DAP2-final-project-andiyoga34/Data/Final, clean data/merged_data.csv", row.names = FALSE)