-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_split.Rmd
More file actions
77 lines (64 loc) · 2.3 KB
/
data_split.Rmd
File metadata and controls
77 lines (64 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
---
title: "Untitled"
author: "Reuben"
date: "2024-12-03"
output: html_document
---
```{r setup, include=FALSE}
# Set the default chunk option echo = TRUE for the chunks that follow, so that
# their source code is shown in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
```
# Clear environment
```{r}
# Remove every object that ls() lists in the current environment (names that
# start with "." are not listed by default, so they are kept).
# NOTE(review): this does not detach packages or reset options, so it is not a
# full session reset, and it deletes the user's objects when the chunk is run
# interactively. Consider dropping it and restarting R before knitting instead.
rm(list = ls())
```
# Load patient data
```{r}
# Read the patient table from the working directory. read.delim() defaults to
# a tab separator and treats the first line as the header.
patient_data <- read.delim(file = "ws3_grampian_patient_data.txt")
```
# Plot distribution of treatment responses
```{r}
library(ggplot2)

# Keep columns 1 and 20 of the patient table. The plot below needs a column
# named Response.to.Treatment, so fail early with a clear message if the
# positional selection did not pick it up.
# NOTE(review): column 1 is presumably the patient identifier - confirm.
metadata <- as.data.frame(patient_data[, c(1, 20)])
stopifnot("Response.to.Treatment" %in% names(metadata))

# Drop the first two rows of the table.
# NOTE(review): presumably these are non-patient rows in the input file -
# confirm against ws3_grampian_patient_data.txt.
metadata <- metadata[-c(1, 2), ]

# Bar plot of the number of rows per treatment response type, with the count
# printed above each bar. The fill legend is hidden because it would only
# repeat the x axis.
ggplot(
  metadata,
  aes(
    x = as.factor(Response.to.Treatment),
    fill = as.factor(Response.to.Treatment)
  )
) +
  geom_bar() +
  # after_stat(count) is the supported replacement for the deprecated
  # ..count.. notation.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  # Readable axis title, matching the plots of the two data subsets.
  labs(x = "Response to Treatment") +
  scale_fill_hue(c = 100) +
  theme(legend.position = "none")
```
# Split the data into two subsets: one for the unsupervised MOFA models, one for the supervised model
```{r}
library(caret)

# Bar plot of the number of rows per treatment response type in `data`, with
# the count printed above each bar and the fill legend hidden.
# `data` must contain a Response.to.Treatment column.
plot_response_counts <- function(data) {
  ggplot(
    data,
    aes(
      x = as.factor(Response.to.Treatment),
      fill = as.factor(Response.to.Treatment)
    )
  ) +
    geom_bar() +
    # after_stat(count) is the supported replacement for the deprecated
    # ..count.. notation.
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
    labs(x = "Response to Treatment") +
    scale_fill_hue(c = 100) +
    theme(legend.position = "none")
}

# Fix the random seed so the split is reproducible.
set.seed(123)

# Split the data in half: one half for the unsupervised MOFA models and one
# half for the supervised model. createDataPartition() samples within each
# treatment response type, so both halves get a similar number of every type.
trainIndex <- createDataPartition(
  metadata$Response.to.Treatment,
  p = 0.5,
  list = FALSE,
  times = 1
)

# Rows selected by the partition go to the unsupervised subset; all remaining
# rows go to the supervised subset.
unsupervised_data <- metadata[trainIndex, ]
supervised_data <- metadata[-trainIndex, ]

# Check the distribution of the response variable in each split.
print("Unsupervised")
print(table(unsupervised_data$Response.to.Treatment))
print("Supervised")
print(table(supervised_data$Response.to.Treatment))

# Plot the response distribution of both subsets.
plot_response_counts(unsupervised_data)
plot_response_counts(supervised_data)
```
# Save data subsets
```{r}
# Write each subset to its own RDS file in the working directory. The file
# names carry no extension and are kept exactly as they are.
saveRDS(object = unsupervised_data, file = "unsupervised_ID")
saveRDS(object = supervised_data, file = "supervised_ID")
```