||
# Cluster analysis ----
# Packages required for k-means clustering:
#   - factoextra
#   - cluster
# The built-in R data set USArrests is used.
# install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
data("USArrests")
# Remove any missing values (i.e. NA, "not available") that might be present
# in the data.
USArrests <- na.omit(USArrests)
# View the first 6 rows of the data.
head(USArrests, n = 6)
## Murder Assault UrbanPop Rape ## Alabama 13.2 236 58 21.2 ## Alaska 10.0 263 48 44.5 ## Arizona 8.1 294 80 31.0 ## Arkansas 8.8 190 50 19.5 ## California 9.0 276 91 40.6 ## Colorado 7.9 204 78 38.7
# Descriptive statistics, one row per variable of USArrests.
# vapply() is used column-wise so the data frame is not coerced to a matrix.
desc_stats <- data.frame(
  Min = vapply(USArrests, min, numeric(1)), # minimum
  Med = vapply(USArrests, median, numeric(1)), # median
  Mean = vapply(USArrests, mean, numeric(1)), # mean
  SD = vapply(USArrests, sd, numeric(1)), # standard deviation
  Max = vapply(USArrests, max, numeric(1)) # maximum
)
desc_stats <- round(desc_stats, 1) # keep one decimal place
head(desc_stats)

# The variables differ widely in mean and variance, so standardize them.
df <- scale(USArrests)

# Assess clustering tendency: get_clust_tendency() computes the Hopkins
# statistic (sampling n = 40 points) and, with graph = TRUE, a plot.
res <- get_clust_tendency(df, n = 40, graph = TRUE)
res$hopkins_stat
## [1] 0.6559125
# Visualize the dissimilarity matrix.
res$plot
# NOTE(review): the original note claimed a Hopkins statistic < 0.5 means the
# data are highly clusterable, but the value printed above is about 0.66.
# Recent factoextra documentation describes values well above 0.5 (close to 1)
# as indicating clusterable data -- confirm against the installed version.
# The dissimilarity plot also suggests that the data are clusterable.
#
# Estimate the number of clusters: k-means needs the number of clusters to be
# specified up front, so clusGap() is used to compute the gap statistic for
# estimating the optimal number, and fviz_gap_stat() visualizes it.
set.seed(123)
# Compute the gap statistic.
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25, K.max = 10, B = 500)
# Plot the result.
fviz_gap_stat(gap_stat)
# The gap-statistic plot suggests four clusters (k = 4) as the best choice.
# Run the clustering.
set.seed(123)
km.res <- kmeans(df, centers = 4, nstart = 25)
head(km.res$cluster, 20)
## Alabama Alaska Arizona Arkansas California Colorado ## 1 4 4 1 4 4 ## Connecticut Delaware Florida Georgia Hawaii Idaho ## 3 3 4 1 3 2 ## Illinois Indiana Iowa Kansas Kentucky Louisiana ## 4 3 2 3 2 1 ## Maine Maryland ## 2 4
# Visualize clusters using factoextra.
fviz_cluster(km.res, data = USArrests)
# Inspect the cluster silhouette information.
sil <- silhouette(km.res$cluster, dist(df))
# Label the rows with the state names so observations can be identified.
rownames(sil) <- rownames(USArrests)
head(sil[, 1:3])
## cluster neighbor sil_width ## Alabama 1 4 0.48577530 ## Alaska 4 1 0.05825209 ## Arizona 4 3 0.41548326 ## Arkansas 1 3 0.11870947 ## California 4 3 0.43555885 ## Colorado 4 3 0.32654235
# Visualize the silhouette plot.
fviz_silhouette(sil)
## cluster size ave.sil.width ## 1 1 8 0.39 ## 2 2 13 0.37 ## 3 3 16 0.34 ## 4 4 13 0.27
# The plot shows negative silhouette widths; use the silhouette() result to
# find out which observations they belong to.
neg_sil_index <- which(sil[, "sil_width"] < 0)
# drop = FALSE keeps the matrix layout even when only one row matches.
sil[neg_sil_index, , drop = FALSE]
## cluster neighbor sil_width ## Missouri 4 3 -0.07318144
# eclust(): enhanced cluster analysis ----
# Advantages of eclust() over other clustering packages, as listed by the
# author:
#   - it simplifies the clustering workflow
#   - it can compute both hierarchical and partitioning clustering
#   - it automatically computes the optimal number of clusters
#   - it automatically provides a silhouette plot
#   - it can be combined with ggplot2 to draw polished graphics

# K-means clustering with eclust() ----
# Compute k-means.
res.km <- eclust(df, "kmeans")
# Gap statistic plot.
fviz_gap_stat(res.km$gap_stat)
# Hierarchical clustering with eclust() ----
# Enhanced hierarchical clustering.
res.hc <- eclust(df, "hclust") # compute hclust
fviz_dend(res.hc, rect = TRUE) # dendrogram
# The following R code produces the silhouette plot; the scatter plot of the
# hierarchical clustering comes afterwards.
fviz_silhouette(res.hc) # silhouette plot
## cluster size ave.sil.width ## 1 1 19 0.26 ## 2 2 19 0.28 ## 3 3 12 0.43
# Scatter plot of the hierarchical clustering result.
fviz_cluster(res.hc)
###########################
# Hierarchical clustering with eclust(), starting from package loading ----
library(factoextra)
library(cluster)
data("USArrests")
USArrests <- na.omit(USArrests)
# Standardize the variables before clustering.
df <- scale(USArrests)
# Enhanced hierarchical clustering.
res.hc <- eclust(df, "hclust") # compute hclust
# Gap statistic plot.
fviz_gap_stat(res.hc$gap_stat)
fviz_dend(res.hc, rect = TRUE) # dendrogram
# The following R code produces the silhouette plot; the scatter plot of the
# hierarchical clustering comes afterwards.
fviz_silhouette(res.hc) # silhouette plot
## cluster size ave.sil.width ## 1 1 19 0.26 ## 2 2 19 0.28 ## 3 3 12 0.43
# Scatter plot of the hierarchical clustering result.
fviz_cluster(res.hc)
# Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
# GMT+8, 2024-12-25 15:45
# Powered by ScienceNet.cn
# Copyright © 2007- 中国科学报社