diff --git a/1-data/.DS_Store b/1-data/.DS_Store
new file mode 100644
index 0000000..fcd9d76
Binary files /dev/null and b/1-data/.DS_Store differ
diff --git a/raw-data/cohort.csv b/1-data/raw-data/cohort.csv
similarity index 100%
rename from raw-data/cohort.csv
rename to 1-data/raw-data/cohort.csv
diff --git a/2-analysis/assignment1_analysis.R b/2-analysis/assignment1_analysis.R
new file mode 100644
index 0000000..49355ca
--- /dev/null
+++ b/2-analysis/assignment1_analysis.R
@@ -0,0 +1,105 @@
+# run in a fresh R session from the project root (no rm(list = ls()) needed)
+# load libraries
+library(tidyverse)
+library(GGally)
+library(ggplot2)
+
+# read in cohort data (path is relative to the project root)
+d <- read.csv("1-data/raw-data/cohort.csv")
+
+# class(d$smoke)
+# class(d$female)
+# class(d$age)
+# class(d$cost)
+# class(d$cardiac)
+
+# rows with missing values; stop if any, as later means assume complete data
+missing <- d[rowSums(is.na(d)) > 0, ]
+stopifnot(nrow(missing) == 0)
+
+###############################################################################
+# plot density of cost by cardiac event status
+###############################################################################
+
+# Label cardiac status on a plotting copy only, so `d` never carries the
+# temporary column (values other than 0/1 become NA: case_when has no default).
+plot_data <- d %>%
+  mutate(cardiac_cat = case_when(cardiac == 0 ~ "No cardiac event",
+                                 cardiac == 1 ~ "Cardiac event"))
+
+# Overlaid, semi-transparent densities of cost, one per cardiac status.
+# Named cost_density_plot so the base graphics function plot() is not masked.
+cost_density_plot <- ggplot(plot_data, aes(x = cost, fill = cardiac_cat)) +
+  geom_density(alpha = 0.7) +
+  labs(title = "Density plot of cost by cardiac event status",
+       x = "Cost", y = "Density", fill = "") +
+  theme_minimal()
+
+# Save with an explicit size: without width/height, ggsave() uses the size of
+# the current graphics device, so the figure would differ between sessions.
+# 7 x 7 in is what ggsave() falls back to when no device is open.
+ggsave(filename = "density_plot_cost_cardiac.jpg", plot = cost_density_plot,
+       path = "3-output", width = 7, height = 7, units = "in")
+
+###############################################################################
+# create table to describe variables
+###############################################################################
+
+# variables whose mean is a proportion and is reported as a percentage
+percentage_vars <- c("smoke", "female")
+
+# display labels keyed by column name (independent of csv column order)
+labels <- c(smoke = "Smoker", female = "Is female", age = "Age in years",
+            cost = "Cost of treatment (in dollars)")
+
+# Summarise one cardiac-status group as one row per variable.
+#   data:     cohort data frame with a `cardiac` column
+#   status:   value of `cardiac` that selects the group (0 or 1)
+#   mean_col: name of the output column with the formatted mean or percentage
+#   n_col:    name of the output column with the number of rows in the group
+# Returns a table with columns Variable, <mean_col>, <n_col>.
+summarise_group <- function(data, status, mean_col, n_col) {
+  group <- data %>%
+    filter(cardiac == .env$status) %>%
+    select(-cardiac)
+  group %>%
+    summarise(across(everything(), mean)) %>%
+    pivot_longer(everything(), names_to = "Variable", values_to = "value") %>%
+    mutate(
+      # proportions are shown as percentages, everything else as a plain mean
+      "{mean_col}" := ifelse(Variable %in% percentage_vars,
+                             paste0(sprintf("%.1f", value * 100), "%"),
+                             sprintf("%.1f", value)),
+      "{n_col}" := nrow(group)
+    ) %>%
+    select(-value)
+}
+
+d_control <- summarise_group(d, 0, "Mean/% (In those without cardiac events)",
+                             "N (No cardiac events)")
+d_cardiac <- summarise_group(d, 1, "Mean/% (In those with cardiac events)",
+                             "N (Cardiac events)")
+
+# join columns; label variables by name, keeping the raw name if unlabelled
+stats <- d_control %>%
+  left_join(d_cardiac, by = "Variable") %>%
+  mutate(Variable = coalesce(unname(labels[Variable]), Variable))
+
+# save table 1 (relative to the project root, like the other paths)
+write.csv(stats, "3-output/table1.csv", row.names = FALSE)
+
+###############################################################################
+# regression analysis
+###############################################################################
+
+# logistic regression of cardiac on smoke, female and age
+glm.fit <- glm(cardiac ~ smoke + female + age, data = d, family = "binomial")
+summary(glm.fit)
+# odds ratio for smoking: exponentiated coefficient of `smoke`
+OR_smoke <- exp(coef(glm.fit)["smoke"])
+print(OR_smoke)
+# exponentiated confint() limits for every model term, not only `smoke`
+CI <- exp(confint(glm.fit))
+print(CI)
+
+
diff --git a/3-output/.DS_Store b/3-output/.DS_Store
new file mode 100644
index 0000000..7fc5e76
Binary files /dev/null and b/3-output/.DS_Store differ
diff --git a/3-output/density_plot_cost_cardiac.jpg b/3-output/density_plot_cost_cardiac.jpg
new file mode 100644
index 0000000..2fd7b78
Binary files /dev/null and b/3-output/density_plot_cost_cardiac.jpg differ
diff --git a/3-output/table1.csv b/3-output/table1.csv
new file mode 100644
index 0000000..edc4088
--- /dev/null
+++ b/3-output/table1.csv
@@ -0,0 +1,5 @@
+"Variable","Mean/% (In those without cardiac events)","N (No cardiac events)","Mean/% (In those with cardiac events)","N (Cardiac events)"
+"Smoker","13.8%",4750,"53.2%",250
+"Is female","57.8%",4750,"11.6%",250
+"Age in years","43.9",4750,"45.5",250
+"Cost of treatment (in dollars)","9127.5",4750,"9892.3",250
diff --git a/Assignment2.Rproj b/Assignment2.Rproj
new file mode 100644
index 0000000..a3e66b8
--- /dev/null
+++ b/Assignment2.Rproj
@@ -0,0 +1,14 @@
+Version: 1.0
+ProjectId: 2fe8802a-f9b3-4f07-ad26-7ccd6f52f3c7
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/README.md b/README.md
index bba956e..0a882ab 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,34 @@
 # Assignment #2 Repository
 
-This repository includes the simulated data for Assignment #2. Fork this repository and add your analysis as described in the canvas assignment.
+This repository uses the simulated data for Assignment #2 from the public repository https://github.com/MethodsForReproducibleHealthResearch/Assignment2. 
 
-The csv file for `cohort` in the `raw-data` folder includes 5,000 observations with variables `smoke`, `female`, `age`, `cardiac`, and `cost`.
+Variables in the dataset (my interpretation):
+
+  - smoke: binary variable indicating that the patient is a smoker (1) or non-smoker (0)
+  - female: binary variable indicating that the patient is female (1) or male (0)
+  - age: continuous variable indicating patient's age in years 
+  - cardiac: binary variable indicating any cardiac event (1) and no event (0)
+  - cost: continuous variable indicating cost of hospitalization in dollars 
+
+Below is a description of the repository structure:
+
+          1-data: contains raw data in the "raw-data" folder
+          2-analysis: contains the analysis script
+          3-output: contains tables and figures generated by the analysis script
+
+Methods:
+
+I used logistic regression (a generalized linear model with a binomial family) to evaluate the
+association between being a smoker and having any cardiac event. The model was adjusted for the
+potential confounders gender and age.
+
+Summary of results:
+
+- Table 1 contains descriptive statistics for variables grouped by cardiac event status. There was no missing data. 
+- The logistic regression results show that being a smoker is associated with 8.3-fold higher adjusted odds of
+  having a cardiac event compared to not being a smoker (95% CI: 6.3, 10.9). The model was adjusted for gender and age.
+- The density plot "density_plot_cost_cardiac" shows that on average those who experienced any cardiac event 
+  have higher costs compared to those who did not. 
+
+Statement:
+I did not use generative AI technology (e.g., ChatGPT) to complete any portion of the work.
\ No newline at end of file