diff --git a/1-data/.DS_Store b/1-data/.DS_Store
new file mode 100644
index 0000000..fcd9d76
Binary files /dev/null and b/1-data/.DS_Store differ
diff --git a/raw-data/cohort.csv b/1-data/raw-data/cohort.csv
similarity index 100%
rename from raw-data/cohort.csv
rename to 1-data/raw-data/cohort.csv
diff --git a/2-analysis/assignment1_analysis.R b/2-analysis/assignment1_analysis.R
new file mode 100644
index 0000000..49355ca
--- /dev/null
+++ b/2-analysis/assignment1_analysis.R
@@ -0,0 +1,114 @@
+# Assignment 2 analysis: density plot of cost, descriptive table (Table 1),
+# and logistic regression of cardiac events on smoking, adjusted for female
+# and age. All paths are relative, so run the script from the project root.
+
+# load libraries
+library(tidyverse)
+library(GGally)
+library(ggplot2)
+
+# read in cohort data
+d <- read.csv("1-data/raw-data/cohort.csv")
+
+# class(d$smoke)
+# class(d$female)
+# class(d$age)
+# class(d$cost)
+# class(d$cardiac)
+
+# check if any rows have missing values; the group means below are computed
+# without na.rm, so stop here instead of writing NA into the table
+rows_with_na <- d[rowSums(is.na(d)) > 0, ]
+stopifnot(nrow(rows_with_na) == 0)
+
+###############################################################################
+# plot density of cost by cardiac event status
+###############################################################################
+
+# create categorical variable for cardiac status
+d <- d %>%
+  mutate(cardiac_cat = case_when(cardiac == 0 ~ "No cardiac event",
+                                 cardiac == 1 ~ "Cardiac event"))
+
+# create density plot of cost by cardiac event status
+cost_density_plot <- ggplot(d, aes(x = cost, fill = cardiac_cat)) +
+  geom_density(alpha = 0.7) +
+  ggtitle("Density plot of cost by cardiac event status") +
+  labs(fill = "") +
+  xlab("Cost") +
+  ylab("Density") +
+  theme_minimal()
+
+# save density plot
+ggsave(filename = "density_plot_cost_cardiac.jpg", plot = cost_density_plot, path = "3-output")
+
+# remove categorical variable for cardiac event
+d <- d %>% select(-cardiac_cat)
+
+###############################################################################
+# create table to describe variables
+###############################################################################
+
+# calculate stats for those with cardiac events
+d_cardiac <- d %>% filter(cardiac == 1) %>%
+  select(-cardiac) %>%
+  summarise(across(everything(), mean)) %>%
+  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Mean/% (In those with cardiac events)")
+
+d_cardiac <- d_cardiac %>% mutate("N (Cardiac events)" = nrow(d %>% filter(cardiac == 1)))
+
+# variables that should be percentages
+percentage_vars <- c("smoke", "female")
+
+# convert Mean/% column values from mean to percentage if relevant
+d_cardiac <- d_cardiac %>%
+  mutate("Mean/% (In those with cardiac events)" = ifelse(Variable %in% percentage_vars,
+    paste0(sprintf("%.1f", `Mean/% (In those with cardiac events)` * 100), "%"),
+    sprintf("%.1f", `Mean/% (In those with cardiac events)`)))
+
+# calculate stats for those without cardiac events
+d_control <- d %>% filter(cardiac == 0) %>%
+  select(-cardiac) %>%
+  summarise(across(everything(), mean)) %>%
+  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Mean/% (In those without cardiac events)")
+
+d_control <- d_control %>% mutate("N (No cardiac events)" = nrow(d %>% filter(cardiac == 0)))
+
+# convert Mean/% column values from mean to percentage if relevant
+d_control <- d_control %>%
+  mutate("Mean/% (In those without cardiac events)" = ifelse(Variable %in% percentage_vars,
+    paste0(sprintf("%.1f", `Mean/% (In those without cardiac events)` * 100), "%"),
+    sprintf("%.1f", `Mean/% (In those without cardiac events)`)))
+
+# join columns
+stats <- d_control %>%
+  left_join(d_cardiac, by = "Variable")
+
+# label variables by name rather than by position, so each label stays with
+# its variable even if the column order of the cohort file changes
+variable_labels <- c(smoke = "Smoker",
+                     female = "Is female",
+                     age = "Age in years",
+                     cost = "Cost of treatment (in dollars)")
+
+stats <- stats %>%
+  mutate(Variable = unname(variable_labels[Variable]))
+
+# save table 1 (relative path, like the other inputs and outputs)
+write.csv(stats, "3-output/table1.csv", row.names = FALSE)
+
+###############################################################################
+# regression analysis
+###############################################################################
+
+# run logistic regression
+cardiac_model <- glm(cardiac ~ smoke + female + age, data = d, family = "binomial")
+
+print(summary(cardiac_model))
+
+# print OR and CI for cardiac events by smoking status
+OR_smoke <- exp(coef(cardiac_model)["smoke"])
+print(OR_smoke)
+
+CI <- exp(confint(cardiac_model))
+print(CI)
diff --git a/3-output/.DS_Store b/3-output/.DS_Store
new file mode 100644
index 0000000..7fc5e76
Binary files /dev/null and b/3-output/.DS_Store differ
diff --git a/3-output/density_plot_cost_cardiac.jpg b/3-output/density_plot_cost_cardiac.jpg
new file mode 100644
index 0000000..2fd7b78
Binary files /dev/null and b/3-output/density_plot_cost_cardiac.jpg differ
diff --git a/3-output/table1.csv b/3-output/table1.csv
new file mode 100644
index 0000000..edc4088
--- /dev/null
+++ b/3-output/table1.csv
@@ -0,0 +1,5 @@
+"Variable","Mean/% (In those without cardiac events)","N (No cardiac events)","Mean/% (In those with cardiac events)","N (Cardiac events)"
+"Smoker","13.8%",4750,"53.2%",250
+"Is female","57.8%",4750,"11.6%",250
+"Age in years","43.9",4750,"45.5",250
+"Cost of treatment (in dollars)","9127.5",4750,"9892.3",250
diff --git a/Assignment2.Rproj b/Assignment2.Rproj
new file mode 100644
index 0000000..a3e66b8
--- /dev/null
+++ b/Assignment2.Rproj
@@ -0,0 +1,14 @@
+Version: 1.0
+ProjectId: 2fe8802a-f9b3-4f07-ad26-7ccd6f52f3c7
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/README.md b/README.md
index bba956e..0a882ab 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,34 @@
 # Assignment #2 Repository
 
-This repository includes the simulated data for Assignment #2. Fork this repository and add your analysis as described in the canvas assignment.
+This repository uses the simulated data for Assignment #2 from the public repository https://github.com/MethodsForReproducibleHealthResearch/Assignment2.
 
-The csv file for `cohort` in the `raw-data` folder includes 5,000 observations with variables `smoke`, `female`, `age`, `cardiac`, and `cost`.
+Variables in the dataset (my interpretation):
+
+ - smoke: binary variable indicating whether the patient is a smoker (1) or non-smoker (0)
+ - female: binary variable indicating whether the patient is female (1) or male (0)
+ - age: continuous variable indicating patient's age in years
+ - cardiac: binary variable indicating any cardiac event (1) or no event (0)
+ - cost: continuous variable indicating cost of hospitalization in dollars
+
+Below is a description of the repository structure:
+
+ - 1-data: contains raw data in the "raw-data" folder
+ - 2-analysis: contains the analysis script
+ - 3-output: contains tables and figures generated by the analysis script
+
+Methods:
+
+I used a generalized linear model with a binomial family to evaluate the association between
+being a smoker and having any cardiac event. The model was adjusted for the potential confounders
+gender and age.
+
+Summary of results:
+
+- Table 1 contains descriptive statistics for variables grouped by cardiac event status. There were no missing data.
+- The logistic regression results show that being a smoker is associated with 8.3-fold higher adjusted odds of
+  having a cardiac event compared to not being a smoker (95% CI: 6.3, 10.9). The model was adjusted for gender and age.
+- The density plot "density_plot_cost_cardiac" shows that on average those who experienced any cardiac event
+  have higher costs compared to those who did not.
+
+Statement:
+I did not use generative AI technology (e.g., ChatGPT) to complete any portion of the work.
\ No newline at end of file