-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathApplication.R
More file actions
105 lines (75 loc) · 2.92 KB
/
Application.R
File metadata and controls
105 lines (75 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Package setup ----
# Third-party packages this script needs from CRAN. ggplot2 is pulled in as a
# dependency of the tidyverse; MASS and class normally ship with R itself.
required_packages <- c(
  "tidyverse", "visdat", "depth", "fda.usc", "ddalpha", "DepthProc"
)

# Install only the packages that are not available yet, instead of
# re-downloading and reinstalling everything on every run of the script.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Attach order is kept as in the original script so that name masking between
# packages (e.g. MASS::select over dplyr::select) is unchanged.
library(tidyverse)
library(visdat)
library(ggplot2)
library(depth)
library(DepthProc)
library(MASS)
library(class)
library(fda.usc)
library(ddalpha)
# Load and clean the stroke dataset ----
# Read the CSV. The original notes say BMI marks missing values as "n/a";
# declaring "N/A" as an NA token lets readr treat such cells as missing while
# parsing (NOTE(review): confirm the exact token used in the file).
stroke <- read_csv(
  "healthcare-dataset-stroke-data.csv",
  na = c("", "NA", "N/A")
)

# Quick look at the raw data.
head(stroke)
str(stroke)

# Fallback: if bmi still came in as text (a different missing-value token),
# coerce it to numeric; anything unparseable becomes NA with a warning.
stroke$bmi <- as.numeric(stroke$bmi)
vis_dat(stroke)

# Count missing values per column (the original run reported 201, which was
# judged an acceptable amount to drop), then drop the incomplete rows.
colSums(is.na(stroke))
stroke <- tidyr::drop_na(stroke)

# Gender contains 'Other'; remove it so gender becomes a binary variable.
stroke <- dplyr::filter(stroke, gender != "Other")

# Alternative that was considered: treat smoking status as an ordered factor.
# stroke$smoking_status <- factor(stroke$smoking_status, order = TRUE,
# levels = c('never smoked', 'formerly smoked', 'smokes', 'Unknown'))

# Map a categorical vector to numeric codes through a named lookup vector.
# Values that are not among the names of `codes` become NA.
encode_levels <- function(x, codes) {
  unname(codes[as.character(x)])
}

# Numeric encoding of the categorical predictors:
#   gender:         Male = 0, Female = 1
#   ever_married:   No = 0, Yes = 1
#   Residence_type: Rural = 0, Urban = 1
#   smoking_status: never smoked = 0, formerly smoked = 1, smokes = 2;
#                   Unknown = 1, i.e. the middle value, because the true
#                   status is not known.
# dplyr verbs are namespace-qualified because MASS is attached after the
# tidyverse and masks some dplyr names.
stroke <- stroke %>%
  dplyr::mutate(
    gender = encode_levels(gender, c("Male" = 0, "Female" = 1)),
    ever_married = encode_levels(ever_married, c("Yes" = 1, "No" = 0)),
    Residence_type = encode_levels(
      Residence_type, c("Rural" = 0, "Urban" = 1)
    ),
    smoking_status = encode_levels(
      smoking_status,
      c("never smoked" = 0, "formerly smoked" = 1, "smokes" = 2, "Unknown" = 1)
    )
  )

# Incomplete rows were already dropped, so any NA at this point means a
# category was not covered by the lookup tables above. Fail loudly instead of
# passing missing values on to the classifier.
if (anyNA(stroke)) {
  stop(
    "Encoding produced NA values: an unexpected category is present.",
    call. = FALSE
  )
}

# work_type is removed because it has no natural numeric encoding.
stroke <- stroke[, setdiff(colnames(stroke), "work_type")]

# Check that no categorical columns remain.
vis_dat(stroke)
view(stroke)
############## Training and prediction with the DD-classifier #############
amount <- nrow(stroke)
print(amount)

# Fix the RNG seed so the train/test split (and therefore the reported
# confusion matrices) can be reproduced from run to run.
set.seed(123)

# 80/20 split into training and test rows. floor() makes the truncation of the
# fractional sample size explicit; seq_len() is safe even for zero rows.
ii <- sample(seq_len(amount), floor(0.8 * amount))

# Plain data.frame so that single-column indexing returns a vector.
stroke <- as.data.frame(stroke)

# Column positions used by the original script: predictors in columns 2-10,
# class label in column 11.
# NOTE(review): positional indices depend on the cleaning steps above leaving
# the column order unchanged; consider selecting by name once confirmed.
feature_cols <- 2:10
label_col <- 11

group.train <- factor(stroke[ii, label_col]) # training labels
x.train <- stroke[ii, feature_cols] # training predictors

# Two DD-classifiers on half-space ("HS") depth, differing only in the
# classification rule applied to the depth-vs-depth representation.
out1 <- classif.DD(group.train, x.train, depth = "HS", classif = "lda")
out2 <- classif.DD(group.train, x.train, depth = "HS", classif = "DD3")
summary(out1)
summary(out2)

x.test <- stroke[-ii, feature_cols] # test predictors
group.test <- stroke[-ii, label_col] # test labels

# Predictions on the held-out rows.
pred1 <- predict(out1, x.test)
pred2 <- predict(out2, x.test)

# Confusion matrices (predicted vs. actual).
table(pred1, group.test)
table(pred2, group.test)