-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmachinelearning
More file actions
123 lines (109 loc) · 4.78 KB
/
machinelearning
File metadata and controls
123 lines (109 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#########################################################################################
#### REPLICATION OF "Predicting Conflict in Space and Time" by ##########################
#### Nils B. Weidmann and Michael D. Ward, then using Random Forest on same data ########
#########################################################################################
# Set working directory
# NOTE(review): this absolute path only exists on the original author's
# machine; adjust it (or run the script from the project folder) before use.
setwd("/Users/kristopher/Desktop/Kristopher/PredictinPoliticalScience/PredictingConflictSpaceandTime")
#### Import data ####
# The first line of the file holds the column names. `TRUE` is spelled out
# because the shorthand `T` is an ordinary variable that can be reassigned.
data <- read.table("bosnia_replication.txt", header = TRUE)
#### Lag conflict ####
library(dplyr)
# Add one- and two-period lags of CONFLICT within each municipality (MUNID).
# NOTE(review): lag() shifts by row position, so this assumes the rows of each
# municipality are already in chronological order -- confirm against the file.
# `dplyr::lag` is named explicitly so stats::lag can never be picked up instead,
# and ungroup() stops the MUNID grouping from leaking into later steps.
data <- data %>%
  group_by(MUNID) %>%
  mutate(
    CONFLICTt1 = dplyr::lag(CONFLICT, 1),
    CONFLICTt2 = dplyr::lag(CONFLICT, 2)
  ) %>%
  ungroup()
# Complete-case copy: drop every row with at least one missing value. This is
# taken BEFORE the lags are zero-filled, so rows whose lags are undefined are
# removed from data1.
data1 <- na.omit(data)
# In the full data set, treat an undefined lag as "no conflict" (0).
data$CONFLICTt1[is.na(data$CONFLICTt1)] <- 0
data$CONFLICTt2[is.na(data$CONFLICTt2)] <- 0
# Model 1: binomial regression replicating the baseline model of Table 1
# (p. 889), estimated on the complete-case data (data1).
model1 <- glm(
  CONFLICT ~ TOTPOP91 + ELF91 + BORDER + terrmean + CONFLICTt1 + CONFLICTt2,
  family = binomial,
  data = data1
)
summary(model1)
# Model 1b: same specification, but estimated on the full data set in which
# missing lagged conflict values were set to 0 instead of being dropped.
# Original author's note: better fit.
model2 <- glm(
  CONFLICT ~ TOTPOP91 + ELF91 + BORDER + terrmean + CONFLICTt1 + CONFLICTt2,
  family = binomial,
  data = data
)
summary(model2)
#### PLOTTING RESULTS FROM TABLE 1 ####
# Predicted vs. observed Y for model1: column 1 holds the fitted values,
# column 2 the response stored on the fitted model object.
model1plot <- cbind(model1[["fitted.values"]], model1[["y"]])
plot(
  model1plot,
  xlab = "Predikerte verdier",
  ylab = "Observert Y",
  main = "Predikert vs observert Y"
)
########################################################################################
#### Plotting predictors ####
library(ggplot2)
# qplot() is deprecated in current ggplot2 releases, so every scatter plot is
# built with ggplot() + geom_point(), which draws the same figure.
#### Total population
# against CONFLICT
ggplot(data, aes(x = TOTPOP91, y = CONFLICT)) +
  geom_point()
# against terrmean
qq1 <- ggplot(data, aes(x = TOTPOP91, y = terrmean, colour = CONFLICT)) +
  geom_point()
qq1 + geom_smooth(method = "lm", formula = y ~ x) # negative trend
glm(terrmean ~ TOTPOP91, data = data)
# against ELF91
qq2 <- ggplot(data, aes(x = TOTPOP91, y = ELF91, colour = CONFLICT)) +
  geom_point()
qq2 + geom_smooth(method = "lm", formula = y ~ x) # slightly positive trend
# Regress the plotted y (ELF91) on the plotted x (TOTPOP91) so the printed
# coefficients describe the same line as the smoother above, mirroring the
# terrmean ~ TOTPOP91 fit.
glm(ELF91 ~ TOTPOP91, data = data)
# against BORDER
# Original author's note: conflicts only in non-neighbours.
ggplot(data, aes(x = TOTPOP91, y = BORDER, colour = CONFLICT)) +
  geom_point()
# against CONFLICT lagged by one period
ggplot(data, aes(x = TOTPOP91, y = CONFLICTt1, colour = CONFLICT)) +
  geom_point()
#### terrmean
# qplot() is deprecated in current ggplot2 releases; ggplot() + geom_point()
# draws the same scatter plots.
# against CONFLICT
ggplot(data, aes(x = terrmean, y = CONFLICT)) +
  geom_point()
# against ELF91
qq3 <- ggplot(data, aes(x = terrmean, y = ELF91, colour = CONFLICT)) +
  geom_point()
qq3 + geom_smooth(method = "lm", formula = y ~ x) # slightly positive trend
# against BORDER
ggplot(data, aes(x = terrmean, y = BORDER, colour = CONFLICT)) +
  geom_point()
########################################################################################
# After looking at the data, there seem to be too few conflicts and no clear
# causal relationship to make good predictions, but trying anyway..
########################################################################################
#### RANDOM FOREST ####
library(caret)
library(kernlab)
# Split into training and testing data sets: 75% of the rows go to training,
# chosen at random. The RNG is seeded first so the same rows are selected on
# every run; without it the split (and every result derived from it) changes
# each time the script is executed.
set.seed(1337)
inTrain <- createDataPartition(y = data$CONFLICT, p = 0.75, list = FALSE)
# With list = FALSE the index comes back as a one-column matrix. `data` is a
# tibble after the dplyr steps, and tibble (>= 3.0) deprecates matrix row
# subscripts, so the index is flattened to a plain vector before subsetting.
training <- data[as.vector(inTrain), ]
testing <- data[-as.vector(inTrain), ]
##### Building the model ####
# Load the random forest implementation
library(randomForest)
# Fix the RNG state so the fitted forest is the same on every run
set.seed(1337)
# Fit a classification forest: the outcome is CONFLICT converted to a factor,
# the predictors are the same six variables used in the baseline regression.
modelFit <- randomForest(
  factor(CONFLICT) ~ TOTPOP91 + ELF91 + BORDER +
    terrmean + CONFLICTt1 + CONFLICTt2,
  data = training
)
# Plot the fitted forest and label the curves with the column names of its
# err.rate matrix
plot(modelFit)
legend(
  "topright",
  legend = colnames(modelFit$err.rate),
  col = 1:3,
  fill = 1:3
)
# We see that the model predicts "no conflict" better than "conflict". Which variables are most important?
#### Variable importance ####
# Extract the importance matrix from the fitted forest
importance <- importance(modelFit)
# One row per predictor: its name and its MeanDecreaseGini score rounded to
# two decimals
varImportance <- data.frame(
  Variables = rownames(importance),
  Importance = round(importance[, "MeanDecreaseGini"], digits = 2)
)
# Attach a "#1", "#2", ... label, where #1 is the highest importance score
rankImportance <- mutate(
  varImportance,
  Rank = paste0("#", dense_rank(desc(Importance)))
)
# Horizontal bar chart of the scores, bars ordered by importance, with the
# rank label printed in red at the base of each bar
ggplot(
  rankImportance,
  aes(x = reorder(Variables, Importance), y = Importance, fill = Importance)
) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(x = Variables, y = 0.5, label = Rank),
    hjust = 0,
    vjust = 0.55,
    size = 4,
    colour = "red"
  ) +
  labs(x = "Variables") +
  coord_flip() +
  theme_bw()
#### Prediction on test dataset ####
# Predict the conflict class for every row of the held-out test set
prediction <- predict(modelFit, newdata = testing)
# Collect the result in a data frame with two columns: the municipality id
# (MUNID) and the predicted conflict class. Nothing is written to disk here.
solution <- data.frame(
  ID = testing[["MUNID"]],
  Conflict = prediction
)
# Conclusion: too few conflicts and too little pattern in the data for the model to predict precisely..