聚类分析

层次聚类

#01Jun2018

library(tidyverse)
library(DataExplorer)
library(NbClust)

# ---- Hierarchical clustering of the nutrient data (average linkage) ----

# Load the nutrient data set shipped with flexclust and take a first look:
# a missing-value profile per column and boxplots of the raw variables.
data(nutrient, package = "flexclust")
profile_missing(nutrient)
boxplot(nutrient)

# Standardise every column so that no single variable dominates the
# distance computation, then inspect the pairwise distance matrix.
nutrient.scaled <- scale(nutrient)
boxplot(nutrient.scaled)
d <- dist(nutrient.scaled)
# A logical label argument suppresses the row/column labels of the heatmap.
heatmap(as.matrix(d), labCol = FALSE, labRow = FALSE)

# Shuffle the rows (seeded, so the permutation is reproducible) and rebuild
# the distance matrix from the shuffled data.
# sample(n) draws the same permutation as sample(1:n, n) without the fragile
# 1:n idiom, and the index no longer shadows base::order().
set.seed(1234)
shuffled_rows <- sample(nrow(nutrient.scaled))
nutrient.scaled <- nutrient.scaled[shuffled_rows, ]
d <- dist(nutrient.scaled)

# Average-linkage agglomerative clustering; a negative `hang` lines all the
# leaf labels up at the bottom of the dendrogram.
fit.average <- hclust(d, method = "average")
plot(fit.average, hang = -1.5, cex = 0.8,
     main = "Average Linkage Clustering")

# Let NbClust vote on the number of clusters (2 to 15 candidates).
nc <- NbClust(nutrient.scaled, distance = "euclidean",
              min.nc = 2, max.nc = 15, method = "average")

# Cut the tree into two clusters and report the cluster sizes.
clusters <- cutree(fit.average, k = 2)
table(clusters)

# Back to a single-panel layout before redrawing the dendrogram
# (NOTE(review): NbClust appears to leave a multi-panel layout - confirm).
par(mfrow = c(1, 1))
plot(fit.average, hang = -1, cex = 0.8,
     main = "Average Linkage Clustering \n 2 Cluster Solution")
# Outline the two clusters on the dendrogram.
rect.hclust(fit.average, k = 2)

K-means 聚类

## K-means clustering of the nutrient data
library(NbClust)

# Start from a freshly loaded, standardised copy of the data.
data(nutrient, package = "flexclust")
nutrient.scaled <- scale(nutrient)

# Let NbClust vote on the number of clusters (2 to 15 candidates).
set.seed(1234)
nc <- NbClust(nutrient.scaled, min.nc = 2, max.nc = 15, method = "kmeans")

# Fit k-means with 3 centers. k-means depends on its random starting
# centers, so try 25 starts and keep the best one.
set.seed(1234)
fit.km <- kmeans(nutrient.scaled, centers = 3, nstart = 25)
# Printing the fit shows cluster sizes, centers, assignments and the
# within-cluster sums of squares; summary() on a kmeans object would only
# list the length/class/mode of its components.
print(fit.km)


## K-means clustering of the USArrests data
library(ggplot2)
library(cluster)
library(factoextra)

# Load the data, drop incomplete rows and preview the first six rows.
data("USArrests")
USArrests <- na.omit(USArrests)
head(USArrests, n = 6)

# Standardise the variables before any distance-based analysis.
df <- scale(USArrests)

# Assess clustering tendency: Hopkins statistic from n = 40 sampled points,
# plus the ordered dissimilarity plot.
res <- get_clust_tendency(df, n = 40, graph = TRUE)
res$plot
res$hopkins_stat

# Gap statistic for k = 1..10 with 500 bootstrap reference sets; each
# k-means fit uses 25 random starts.
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25, K.max = 10, B = 500)
fviz_gap_stat(gap_stat)

# Final model: 4 clusters, 25 random starts.
km.res <- kmeans(df, centers = 4, nstart = 25)
# Plot the clusters using the same scaled matrix the model was fitted on.
fviz_cluster(km.res, data = df)

使用eclust进行kmeans聚类或层次聚类

library(cluster)
library(factoextra)

# Load the data, drop incomplete rows and standardise the variables.
data("USArrests")
USArrests <- na.omit(USArrests)
df <- scale(USArrests)

# Enhanced k-means: no k is supplied, so eclust() chooses it (the fit
# carries the gap statistic plotted below). nstart = 25 is forwarded to
# kmeans() so each fit is the best of 25 random starts.
set.seed(1234)
res.km <- eclust(df, FUNcluster = "kmeans", nstart = 25)
fviz_gap_stat(res.km$gap_stat)
fviz_silhouette(res.km)

# Enhanced hierarchical clustering, again with k left for eclust() to choose.
res.hc <- eclust(df, FUNcluster = "hclust")
fviz_dend(res.hc, rect = TRUE)  # dendrogram with a rectangle around each cluster
fviz_silhouette(res.hc)
fviz_cluster(res.hc)