# Hackathons/deep_learning.R at master · 100rabh1401/Hackathons · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Working directory that holds adult.data.txt and adult.test.txt.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path <- "F:\\Padayi\\R code\\Deep learning"
setwd(path)

#load libraries
library(data.table)
library(mlr)

#set variable names
# read.table() is called with its default check.names = TRUE, so the
# hyphenated names below end up with dots (e.g. "education.num") in the
# loaded tables.
setcol <- c("age",
            "workclass",
            "fnlwgt",
            "education",
            "education-num",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
            "native-country",
            "target")

#load data
# Both files are read as header-less comma-separated text. The string " ?"
# (including its leading space) is treated as a missing value, and character
# columns are kept as character for the cleaning steps that follow.
train <- read.table("adult.data.txt", header = FALSE, sep = ",",
                    col.names = setcol, na.strings = c(" ?"),
                    stringsAsFactors = FALSE)
# The first line of the test file is skipped.
test <- read.table("adult.test.txt", header = FALSE, sep = ",",
                   col.names = setcol, skip = 1, na.strings = c(" ?"),
                   stringsAsFactors = FALSE)

#View(train)
# Convert both data frames to data.tables by reference.
setDT(train)
setDT(test)

#Data Sanity
# Dimensions and column types of both tables.
dim(train)
dim(test)

str(train)
str(test)


#checking missing values
# Percentage of NA values in each column of a table.
na_percent <- function(tbl) {
  vapply(tbl, function(col) sum(is.na(col)) / length(col), numeric(1)) * 100
}

# Overall count of missing vs. non-missing cells, then the per-column
# percentage of missing values, for train and for test.
table(is.na(train))
na_percent(train)
table(is.na(test))
na_percent(test)

#Check Target variable
# Share of rows per target value in each table; a lopsided split means the
# classes are imbalanced.
train[, .N / nrow(train), by = target]
test[, .N / nrow(test), by = target]

#Remove extra characters
# Drop the trailing "." from the test labels so they can match the train
# labels. Anchoring on a literal "." (instead of chopping the last character
# blindly) leaves labels that are already clean untouched.
# NOTE(review): assumes the extra character is "." as in the UCI adult.test
# file - confirm for your copy of the data.
test[, target := sub("\\.$", "", target)]


#remove leading whitespaces
library(stringr)
# Character columns are detected on train and the same columns are cleaned in
# BOTH tables; trimming only train would leave every test value (including
# target) with a leading space, so train and test levels would never match.
char_col <- colnames(train)[vapply(train, is.character, logical(1))]

for (col in char_col) {
  set(train, j = col, value = str_trim(train[[col]], side = "left"))
  set(test, j = col, value = str_trim(test[[col]], side = "left"))
}

#set all characters variables as factor
fact_col <- colnames(train)[vapply(train, is.character, logical(1))]

for (col in fact_col) {
  set(train, j = col, value = factor(train[[col]]))

  # Build the test factor on train's levels so shared categories get the same
  # coding; values seen only in test are appended rather than turned into NA.
  train_levels <- levels(train[[col]])
  test_values <- unique(test[[col]][!is.na(test[[col]])])
  test_only <- sort(setdiff(test_values, train_levels))
  set(test, j = col,
      value = factor(test[[col]], levels = c(train_levels, test_only)))
}

#impute missing values

# One shared imputation spec: integer columns get the median, factor columns
# get the mode. The target column is excluded from imputation.
# NOTE(review): test is imputed from its own statistics here; consider
# mlr::reimpute(test, imp1$desc) to reuse the values learned on train.
impute_spec <- list(integer = imputeMedian(), factor = imputeMode())

imp1 <- impute(train, target = "target", classes = impute_spec)
train <- setDT(imp1$data)

imp2 <- impute(test, target = "target", classes = impute_spec)
test <- setDT(imp2$data)

#load the package
# library() stops with an error if h2o is not installed; require() would only
# return FALSE and let the script fail later at h2o.init().
library(h2o)

#start h2o
# Start (or connect to) a local H2O instance.
# NOTE(review): nthreads = -1 requests all available cores per the h2o docs;
# max_mem_size = "20G" is hard-coded - lower it on machines with less RAM.
localH2o <- h2o.init(nthreads = -1, max_mem_size = "20G")

#load data on H2o
# Copy the cleaned tables into the H2O cluster.
trainh2o <- as.h2o(train)
testh2o <- as.h2o(test)

#set variables
# y is the response column; x is every other column of the training frame.
y <- "target"
x <- setdiff(colnames(trainh2o), y)

#train the model - default network topology
# No `hidden` argument is passed, so h2o.deeplearning() uses its default
# hidden layers (c(200, 200) per the h2o documentation) - this is NOT a model
# without hidden layers. Inputs are standardized, training runs for 100 epochs
# with 5-fold cross-validation, and variable importances are computed.
deepmodel <- h2o.deeplearning(
  x = x,
  y = y,
  training_frame = trainh2o,
  standardize = TRUE,
  model_id = "deep_model",
  activation = "Rectifier",
  epochs = 100,
  seed = 1,
  nfolds = 5,
  variable_importances = TRUE
)

#compute variable importance and performance
# Plot the 20 most important features, then report cross-validated metrics.
h2o.varimp_plot(deepmodel, num_of_features = 20)
# Original author's note: 84.5 % CV accuracy
h2o.performance(deepmodel, xval = TRUE)