## CHEATSHEET ####
## MODERN STATISTICS AND BIG DATA ANALYSIS ####
## LIBRARIES ####
library(pdfCluster)
library(fpc)
library(factoextra)
library(cluster)
library(smacof)
library(mclust)
library(fda)
library(funFEM)
#####
set.seed(1234)
kmeans(x, centers=k, iter.max = 100, nstart = 100, trace=FALSE)
NbClust::NbClust(data = NULL, diss = NULL, distance = "euclidean", min.nc = 2, max.nc = 20,
method = NULL)
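# A hedged concrete call (x an assumed numeric matrix, scaled beforehand);
# index="all" aggregates ~30 validity indices and reports a majority vote:
nb <- NbClust::NbClust(data=scale(x), distance="euclidean",
                       min.nc=2, max.nc=10, method="kmeans", index="all")
nb$Best.nc # best K suggested by each index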
## GAP STATISTIC ####
cluster::clusGap(x, FUNcluster, K.max, B = 100, d.power = 2,
spaceH0 = c("scaledPCA", "original"),
SE.factor = 2, method="globalSEmax",
nstart=100,...)
print(obj, method = "globalSEmax", SE.factor = 2, ...)
fviz_gap_stat(obj)
gapnc <- function(data,FUNcluster=kmeans,
K.max=10, B = 100, d.power = 2,
spaceH0 ="scaledPCA",
method ="globalSEmax", SE.factor = 2,...){
# As in original clusGap function the ... arguments are passed on
# to the clustering method FUNcluster (kmeans).
# Run clusGap
  gap1 <- clusGap(data,FUNcluster,K.max, B, d.power,spaceH0,...)
# Find optimal number of clusters; note that the method for
# finding the optimum and the SE.factor q need to be specified here.
nc <- maxSE(gap1$Tab[,3],gap1$Tab[,4],method, SE.factor)
# Re-run kmeans with optimal nc.
kmopt <- kmeans(data,nc,...)
out <- list()
out$gapout <- gap1
out$nc <- nc
out$kmopt <- kmopt
out
}
# The output of clusGap is in component gapout.
# The optimal number of clusters is in component nc.
# The optimal kmeans output is in component kmopt.
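# A hedged usage sketch for gapnc() above (x an assumed numeric data matrix):
set.seed(1234)
gapres <- gapnc(scale(x), K.max=10, nstart=100)
print(gapres$gapout, method="globalSEmax", SE.factor=2)
gapres$nc # chosen number of clusters
plot(gapres$gapout) # gap curve with standard-error bars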
set.seed(1234)
factoextra::fviz_nbclust(x, FUNcluster = hcut, k.max = 20, hc_method = "ward.D2",
method = "gap_stat", maxSE = list(method = "globalmax", SE.factor = 2))
# cluster.stats can compute S (the within-cluster sum of squares) for any clustering:
kmb <- fpc::cluster.stats(dist(p05),kmbundestag5$cluster)
kmb$within.cluster.ss
# With Euclidean distances this equals kmbundestag5$tot.withinss from kmeans
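# A quick hedged check of that equivalence (Euclidean distances assumed):
all.equal(kmb$within.cluster.ss, kmbundestag5$tot.withinss)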
#####
## DISSIMILARITIES ####
scale(x)
dist(x, method = c("euclidean","manhattan"))
as.matrix(d)[1,2] # Check a single dissimilarity value (d a "dist" object)
cluster::daisy(x, metric = "euclidean") # Handles NAs
# mahalanobis() only computes distances from one fixed center,
# so build the full distance matrix row by row:
mahalm <- matrix(0,ncol=nrow(olive),nrow=nrow(olive))
olivecov <- cov(olive)
for (i in 1:nrow(olive)){
mahalm[i,] <- mahalanobis(olive,as.numeric(olive[i,]),olivecov)}
# Mahalanobis distances are affine invariant, so it makes no difference whether the data are scaled.
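# A quick hedged check of that invariance claim (olive assumed numeric):
solive <- scale(olive)
d_raw <- mahalanobis(olive, colMeans(olive), cov(olive))
d_scaled <- mahalanobis(solive, colMeans(solive), cov(solive))
all.equal(d_raw, d_scaled) # TRUE up to numerical tolerance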
dist(veronica,method="binary") # Jaccard (asymmetric)
dist(veronica,method="manhattan")/583 # Simple matching (SMC) for 0-1 dummies: Manhattan / number of variables (583 here)
nomclust::sm(x)
1-abs(cor(x)) # Largest dissim. for r = 0
0.5-cor(x)/2 # Largest dissim. for r = -1
as.dist(cordist)
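# Hedged sketch: full pipeline for clustering *variables* by correlation
# (x assumed numeric; here r = 0 gets the largest dissimilarity):
cordist <- as.dist(1-abs(cor(x)))
varhc <- hclust(cordist, method="average")
plot(varhc)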
daisy(housing, metric="gower", type=list(asymm=c(2,4), symm=c(3,6)))
#####
## AAHC ####
# Complete Linkage favours within-cluster homogeneity at the expense of separation;
# Single Linkage favours between-cluster separation at the expense of homogeneity
# (contrast sketch below):
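# Hedged contrast sketch (x assumed numeric):
diss <- dist(scale(x))
hc_complete <- hclust(diss, method="complete") # compact clusters
hc_single <- hclust(diss, method="single") # separation first; prone to chaining
par(mfrow=c(1,2)); plot(hc_complete, hang=-1); plot(hc_single, hang=-1); par(mfrow=c(1,1))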
hclust(diss, method = "average")
cutree(hclust, k)
plot(hclust, hang=-1, xlab="",sub="" ,cex=0.6, cex.axis=1)
b <- ggdendrogram(out.eucl, rotate=FALSE, theme_dendro=FALSE, labels=FALSE) # needs library(ggdendro)
print(b + ggtitle("Ward"))
tail(cbind(hclust$merge, hclust$height),13)
plot(3:20,rev(hclust$height)[3:20],type="b",
xlab="K",ylab="Height",
cex.lab=0.8,cex.axis=0.7,
main="Scree Plot", cex.main=0.9); grid()
plot(x=1:10,y=rev(tail(avg$height,10)),type="b", xlab="K",ylab="Height",cex.lab=0.8,cex.axis=0.7,
main="Scree Plot", cex.main=0.9); grid()
#####
## SILHOUETTE ####
# The ASW criterion is well suited for PAM and Average Linkage
fviz_nbclust(x, FUNcluster = hcut, k.max = 20, hc_method = "complete",
hc_metric="euclidean", method = "silhouette")
pasw <- rep(NA,30)
pclusk <- list()
psil <- list()
# Look at K between 2 and 30:
for (k in 2:30){
# PAM clustering:
pclusk[[k]] <- pam(diss,k)
# Computation of silhouettes (partition vector as argument):
psil[[k]] <- silhouette(pclusk[[k]],dist=diss)
# ASW needs to be extracted:
pasw[k] <- summary(psil[[k]])$avg.width}
plot(1:30,pasw,type="l",xlab="Number of clusters",ylab="Average silhouette width")
plot(psil[[5]])
mlr3viz::autoplot(preds, task, type = "sil")
#####
## VISUALIZATION ####
## PCA is connected to the covariance matrix and Euclidean distance, so it suits k-means
## Made for continuous variables; in high dimensions the first two PCs may carry only a tiny % of the variation
princomp(x)
plot(x=pca$scores[,1], y=pca$scores[,2],
     col=km$cluster+1, pch=km$cluster+1, # km: any clustering with a $cluster vector
     cex=1.1, xlab="PC1 (44.3%)", ylab="PC2 (27.8%)", main="PC1 / PC2 - plot")
mlr3viz::autoplot(preds, task, type = "pca", frame = TRUE)
clusym[]; clucols(mod$cluster) # cluster plotting symbols/colours (helper objects, not in base R)
mds <- smacof::mds(diss, ndim = 2, type = "ratio")
plot(mds$conf, type="p", asp=1, main="MDS Plot",
     xlab=paste("Stress:", round(mds$stress*100,2), "%"))
# MDS represents the dissimilarities in Euclidean space as well as possible
# MDS artifacts may display results in a misleading way (depending on stress)
# Stress under 10% is good
# Above 20%, quite a bit of information is not represented
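# Hedged sketch for judging how faithful the configuration is (mds as fitted above):
plot(mds, plot.type="Shepard") # dissimilarities vs fitted distances
plot(mds, plot.type="stressplot") # stress contribution per point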
# I decided to use the Jaccard distance also for AFLP genes:
vveronica <- dist(t(x),method="binary")
varclust <- hclust(vveronica,method="average")
# As a clustering this is pretty messy,
# but still it can be used to impose an order of genes.
heatmap(as.matrix(x),Rowv=as.dendrogram(average), # "average": hclust of the observations
Colv=as.dendrogram(varclust),
col=grey(seq(1,0,-0.01))) # OR
# heatmap, rows ordered by clusters,
# columns by earlier variable clustering
heatmap(veronicam[order(veronicabernm$flexout[[6]]@cluster),],
Rowv=NA,Colv=as.dendrogram(varclust),
RowSideColors=palette()[veronicabernm$flexout[[6]]@cluster]
[order(veronicabernm$flexout[[6]]@cluster)],
col=c(0,1),scale="none")
clusplot(pam)
#####
## MODEL BASED CLUSTERING ####
molive <- mclust::Mclust(x,G=1:15, modelNames = c("VVV"))
summary(molive$BIC)
plot(molive)
molive$classification
# Clustering vector
molive$parameters
# Estimated parameters
molive$z
# Matrix of posterior probabilities p_ik that point i was generated
# by mixture component k
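# Hedged sketch: point-wise clustering uncertainty from the posteriors
# (molive the Mclust fit above; 1 minus the largest posterior per point):
uncert <- 1 - apply(molive$z, 1, max)
all.equal(as.numeric(uncert), as.numeric(molive$uncertainty)) # same thing
plot(sort(uncert), type="h", ylab="Uncertainty")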
prod(molive$parameters$variance$shape)==1 # shape is normalised so that its product is 1
factoextra::fviz_mclust_bic(molive, model.names = NULL, shape = 1, lwd=3,
color = "model", palette = NULL, legend = NULL,cex=2,
main = "Mixture Model Selection", xlab = "Number of Components",
ylab = "BIC")
### To look more closely at the best covariance models, rerun with only those:
selected_mix <- Mclust(swdbcc, G=1:10, verbose=TRUE, modelNames=c('VVV','VEV','EVV'))
#####
## MIXTURES OF SKEW AND HEAVY-TAILED ####
# Different mixture models fit the same data in different ways, and it is hard to say which is best
# Mixture families can approximate each other well, which makes choosing among them hard
# A skew shape can be read as one cluster, or as one symmetric core plus other components
library(EMMIXskew) # note: archived on CRAN; may need installing from the archive
skewmix <- list(); bicvals <- numeric(12) # containers for the fits and BIC values
for (i in 1:12){
print(i)
tryattempts <- 3
trycounter <- 1
tst <- try(skewmix[[i]] <- EmSkew(x,g=i,distr="mst",ncov=3))
while((is.null(tst) | inherits(tst,"try-error")) & trycounter<tryattempts+1){
print("Error, try again")
tst <- try(skewmix[[i]] <- EmSkew(x,g=i,distr="mst",ncov=3))
trycounter <- trycounter+1
}
trycounter <- 1
while((is.null(tst) | inherits(tst,"try-error")) & trycounter<tryattempts+1){
print("Error, try again")
tst <- try(skewmix[[i]] <- EmSkew(x,g=i,distr="mst",ncov=4))
trycounter <- trycounter+1
}
trycounter <- 1
while((is.null(tst) | inherits(tst,"try-error")) & trycounter<tryattempts+1){
print("Error, try again")
tst <- try(skewmix[[i]] <- EmSkew(x,g=i,distr="mst",ncov=2))
trycounter <- trycounter+1
}
bicvals[i] <- skewmix[[i]]$bic
#ariarea[i] <- adjustedRandIndex(skewmix[[i]]$clust,oliveoil$macro.area)
#ariregion[i] <- adjustedRandIndex(skewmix[[i]]$clust,oliveoil$region)
}
#####
## LATENT CLASS ANALYSIS ####
## For categorical data (also binary)
## Earlier alternative: AHC on simple-matching/Jaccard dissimilarities + MDS
set.seed(1234)
out <- fpc::flexmixedruns(x, continuous=0, discrete=ncol(x), n.cluster=1:10,
                          simruns = 100, verbose=TRUE, allout=FALSE)
which.min(out$bicvals) # Which 1:10 model is the best?
out$optimalk; out$optsummary
plot(1:10,out$bicvals,type="l", xlab="Number of clusters", ylab="BIC")
str(out$flexout, max.level = 2)
# if allout=TRUE, flexout[[]] list of flexmix output objects for all numbers of components
out$flexout[[k]]@cluster # Clustering
out$flexout[[k]]@prior # Mixing parameters
str(out$flexout[[k]]@components) # Model Object Structure
out$flexout[[k]]@components$Comp.1[[1]]@parameters$pp # zeta (category probability) parameters for component 1
out$flexout[[k]]@components[[5]][[1]]@parameters$pp # the same for component 5, by index
f <- cbind(x1,x2,x3,x4)~1
set.seed(1234)
poLCA::poLCA(f, x, nclass=3, maxiter = 5000, nrep = 70)
# nclass K is fixed in advance
a$predclass # Clustering
a$bic
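# Hedged sketch: since nclass is fixed per call, loop over a range and compare BIC
# (f and x as above; nrep restarts guard against local optima):
set.seed(1234)
lcabic <- sapply(2:6, function(k)
  poLCA::poLCA(f, x, nclass=k, maxiter=5000, nrep=10, verbose=FALSE)$bic)
plot(2:6, lcabic, type="b", xlab="Number of classes", ylab="BIC")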
#####
## FUNCTIONAL DATA ####
# Raw data plot:
plot(1:ncol(x),x[1,],type="l",ylab="",xlab="",main="", ylim = c(min(x),max(x)))
for(i in 2:nrow(x)){
points(1:ncol(x),x[i,],type="l")
}
# Constructing B-spline basis
bbasis <- create.bspline.basis(c(1,ncol(x)),nbasis=10) # with p=10, d=4
# Splines approximating data as linear combinations of B-spline basis
fd10 <- Data2fd(1:ncol(x),y=t(as.matrix(x)),basisobj=bbasis)
# Plot basis
plot(bbasis)
# Smooth splines for data with smooth mean function:
plot(fd10)
meanx <- mean.fd(fd10)
lines(meanx,col=2,lwd=5)
# Show smooth fit of individual countries
plotfit.fd(t(x),1:ncol(x),fd10,index=79,cex.pch=0.5)
## FUNCTIONAL PCA
fpca <- pca.fd(fd10, nharm = 5)
plot(fpca$harmonics) # PCs phi_k
fpca$varprop # Percentage of variance
cumsum(fpca$varprop) # Cumulative percentage of variance
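# Hedged sketch: choose nharm from the cumulative variance explained:
plot(cumsum(fpca$varprop), type="b", xlab="Harmonic",
     ylab="Cumulative variance proportion"); abline(h=0.9, lty=2)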
plot.pca.fd(fpca, expand = 0)
pairs(fpca$scores,col=clust,pch=clusym[clust])
# Create a functional data object holding the PCA approximations
# (each curve = mean curve + scores times harmonics):
pcaapprox <- fpca$harmonics
pcaapprox$coefs <- fpca$harmonics$coefs %*% fpca$scores[1,] + meanx$coefs
for (i in 2:nrow(fpca$scores)){
pcacoefi <- fpca$harmonics$coefs %*% fpca$scores[i,] + meanx$coefs
pcaapprox$coefs <- cbind(pcaapprox$coefs, pcacoefi)
}
dimnames(pcaapprox$coefs)[[2]] <- covid21[,1]
plotfit.fd(t(x),1:ncol(x),pcaapprox,index=79,cex.pch=0.5)
## funFEM
set.seed(1234567)
femmodels <- c("DkBk", "DkB", "DBk","DB", "AkjBk",
               "AkjB", "AkBk", "AkB", "AjBk", "AjB", "ABk","AB")
nmodels <- length(femmodels)
femresults <- list() # Save output for all models in femmodels
bestk <- bestbic <- numeric(0)
# bestk: vector of best K for each model.
# bestbic: Best BIC value for each model.
K=2:10 # Numbers of clusters K to try out.
fembic <- matrix(NA,nrow=nmodels,ncol=max(K))
# fembic will hold all BIC values for models (rows) and K (columns);
# NA for those that cannot be fitted.
for (i in 1:nmodels){ # This takes a long time!!
print(femmodels[i])
femresults[[i]] <- funFEM(fd10,model=femmodels[i],K=K)
fembic[i,K] <- femresults[[i]]$allCriterions$bic
bestk[i] <- which(fembic[i,]==max(fembic[i,K],na.rm=TRUE))
bestbic[i] <- max(fembic[i,K],na.rm=TRUE)
}
besti <- which.max(bestbic)
femresult <- femresults[[besti]] # best model overall
# Print the countries in each cluster:
for(i in 1:femresult$K){
print(i)
print(covid21[femresult$cls==i,1])
}
pairs(fpca$scores,col=femresult$cls,pch=19)
# Visualisation of the discriminative subspace U
fdproj <- t(fd10$coefs) %*% femresult$U
pairs(fdproj,col=femresult$cls,pch=19)
plot(fdproj,col=femresult$cls,pch=19,xlab="DC 1",ylab="DC 2")
# Plot the cluster mean curves
clmeans <- fd10; clmeans$coefs <- t(femresult$prms$my)
plot(clmeans,lwd=3) # col= and lwd= don't seem to be respected here
legend(100,10,legend=1:8,col=c(1:6,1:2),lty=c(1:5,1:3)) # matches plot.fd's col/lty cycling for 8 clusters
# Plot individual clusters and mean curves
par(ask=TRUE)
for (k in 1:femresult$K){
plot(1:ncol(x),x[1,],type="l", ylim=c(0,25),ylab="")
for(i in 2:nrow(x)){
points(1:ncol(x),x[i,],type="l",col=as.integer(femresult$cls[i]==k)) # col=0 hides other clusters
}
meank <- colMeans(x[femresult$cls==k,]) # cluster mean curve, computed once per cluster
points(1:ncol(x),meank,type="l",lwd=5,col=2)
}
par(ask=FALSE)
#####
## ROBUST STATISTICS ####
mad(x)
huber <- robustbase::huberM(x, k=1.5, se=TRUE)
huber$s # scale: MAD by default
huber$SE # standard error of the location estimate
mcdd <- robustbase::covMcd(x) # default alpha = 0.5
mcdd75 <- robustbase::covMcd(x, alpha = 0.75)
# Use the reweighted estimates; do not use the RAW components
# robustbase has a plot.mcd method, so plot(mcdd) gives outlier
# diagnostic plots, but it has some problems:
# tol=1e-20 needs to be added, because otherwise it gives an error.
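# Hedged sketch of that diagnostic plot with the tolerance workaround above:
plot(mcdd, which="dd", tol=1e-20) # robust vs classical distance-distance plot
mcdd$center; mcdd$cov # reweighted estimates (preferred over $raw.center / $raw.cov)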
plot(1:nrow(x),sqrt(mcdd$mah),type="n",xlab="Observation",
     ylab="Robust Mahalanobis distance") # mcdd$mah holds squared distances
text(1:nrow(x),sqrt(mcdd$mah),rownames(dortmund),cex=0.7)
abline(sqrt(qchisq(0.99,7)),0,col=2) # 0.99 chi-squared cutoff; 7 = number of variables
plot(sqrt(mcdd75$mah),sqrt(mcdd$mah),xlim=c(0,30),ylim=c(0,30),
     xlab="Robust Mahalanobis distance (alpha=0.75)",
     ylab="Robust Mahalanobis distance (alpha=0.5)")
abline(sqrt(qchisq(0.99,7)),0,col=2)
abline(v=sqrt(qchisq(0.99,7)),col=2)
robustbase::lmrob(y~x1+x2+x3, method="MM",data=regdata3) # MM-estimator
par(mfrow=c(2,3))
plot(mm)
plot(1:nrow(x),mm$rweights,xlab="Observation",ylab="Robustness weight")
summary(mm)
#####