library(fpc)#使用kmeansruns函数
library(mlbench)#使用数据
library(ggplot2)
# Load the Vehicle data set that ships with mlbench.
data("Vehicle", package = "mlbench")

# Feature table: every column except the label column "Class" (column 19).
# Selecting by name avoids depending on the column position.
data <- Vehicle[, setdiff(names(Vehicle), "Class")]
set.seed(1111)

## Combine opel and saab into a single "car" class.
# Assigning the same label to both levels merges them, leaving bus/car/van.
class <- Vehicle[["Class"]]
levels(class)[levels(class) %in% c("saab", "opel")] <- "car"

# Bar chart of the merged class counts. The merged labels live in the
# standalone vector `class`, not in `Vehicle`, so give ggplot an explicit
# data frame instead of relying on aes() falling back to the global
# environment. print() makes the plot appear when the file is source()d too.
print(ggplot(data = data.frame(class = class), aes(x = class)) + geom_bar())

# k-means for k = 1..8 on the scaled features, 100 runs per k; the number of
# clusters is chosen with the "ch" criterion and the criterion values are
# printed (critout = TRUE). The original call ended with a trailing empty
# argument, which is removed here.
fit <- kmeansruns(
  data,
  krange = 1:8,
  criterion = "ch",
  runs = 100,
  scaledata = TRUE,
  critout = TRUE
)

# Inspect the components of the result object.
attributes(fit)

# Cross-tabulate the cluster assignment against the merged class labels.
table(fit$cluster, class)
# Recorded output:
#   class
#     bus car van
#   1 161 195 195
#   2  57 234   4
## Cluster 2 possibly represents car?
# 模糊k均值法:允许样本属于多个簇
library(fclust)

# Fuzzy k-means on `data` with k = 2 clusters, fuzziness parameter m = 2,
# 10 random starts (RS) and standardisation switched on (stand = 1).
fit2 <- FKM(
  data,
  k = 2,
  m = 2,
  RS = 10,
  stand = 1
)

# Inspect the components of the result object.
attributes(fit2)

# `clus` holds, per observation, the assigned cluster and its membership
# degree; show the first rows.
memberships <- fit2$clus
head(memberships)

# Cross-tabulate the assigned cluster (first column) against the class labels.
table(memberships[, 1], class)
# Recorded output:
#   class
#     bus car van
#   1 158 186 195
#   2  60 243   4

### Cluster-validity indices such as the (fuzzy) silhouette can be used to
### judge the best value of k.
Fclust.index(fit2, index = "SIL.F")
# Recorded output:
#   The default value alpha=1 has been set for computing SIL.F
#   [1] 0.6353147
# 系统聚类分析,即层次聚类
library(pvclust)

# Standardise the features. Note that this overwrites `data` with the scaled
# matrix returned by scale().
data <- scale(data)
set.seed(2021)

# Hierarchical clustering (Ward linkage on Euclidean distances) with
# bootstrap-based p-values from 5000 resamples.
# NOTE(review): according to the pvclust documentation the *columns* of the
# input are clustered, i.e. the features here rather than the individual
# vehicles -- confirm that this is what is intended.
fit3 <- pvclust(
  data,
  method.dist = "euclidean",
  method.hclust = "ward.D",
  nboot = 5000
)
## pvclust carries out the clustering itself through hclust.

print(fit3)
# In the printed result `au` is the approximately unbiased p-value, `bp` the
# bootstrap probability, and `se.au` the standard error of the au estimate.

# Draw the dendrogram and outline the clusters selected at alpha = 0.95.
plot(fit3)
pvrect(fit3, alpha = 0.95)
# 从数据集中随机抽取子集数据,然后对这些子集数据进行聚类分析。再进行大量多次的循环运算,计算每个聚类簇出现的次数比例(自助概率BP)。
# 利用不同抽样规模的重抽样来估计每个聚类簇的p值,得到AU值(近似无偏P值)。
# 高AU值的聚类簇表示受到数据的高度支持。
# [image.png] au 的P值95%的区间
# [image.png]
# 基于模型聚类,假定数据是服从高斯分布的
library(mclust)
library(dplyr)
set.seed(1111)

# Reset `data` to the raw feature columns (drop the class label, column 19).
data <- Vehicle[, -19]

# Candidate numbers of mixture components and covariance models, defined once
# so the fit, the BIC table and the plot cannot drift apart.
cluster_counts <- 1:8
model_names <- c("EII", "VII", "EEI", "EVI", "VEI", "VVI")

# Fit Gaussian mixture models with 1-8 components for each listed model; the
# best model is chosen by the Bayesian information criterion (BIC).
# (For help, run ?Mclust interactively; the stray help call that used to sit
# here has been removed from the script.)
fit4 <- Mclust(
  as.matrix(data),
  G = cluster_counts,
  modelNames = model_names
)

attributes(fit4)
fit4$modelName  # best model
fit4$BIC        # choose the number of components from the BIC values
head(fit4$BIC)

# Rebuild the BIC values as a plain matrix (rows = number of components,
# columns = model), then as a data frame with the component count in `num`.
bic <- as.matrix(fit4$BIC)
attributes(bic)
bic <- matrix(
  data = bic[seq_len(length(cluster_counts) * length(model_names))],
  nrow = length(cluster_counts),
  ncol = length(model_names),
  byrow = FALSE,
  dimnames = list(cluster_counts, model_names)
)
bic <- as.data.frame(bic)
bic <- mutate(bic, num = cluster_counts)

# Long format (one row per component count and model) so that one pair of
# geoms draws every model instead of a copy-pasted pair per model.
bic_long <- data.frame(
  num = rep(bic$num, times = length(model_names)),
  model = rep(model_names, each = nrow(bic)),
  BIC = unlist(bic[model_names], use.names = FALSE)
)

# BIC against the number of components, one coloured line per model.
# The y axis shows BIC; the previous label "GIC" was a typo.
print(
  ggplot(data = bic_long, aes(x = num, y = BIC, colour = model)) +
    geom_line() +
    geom_point() +
    ylab("BIC")
)
# The curves flatten out from 2 onwards, so choose 2 or 3 clusters.
# [image.png]
# 网友评论