我正在使用一个名为productQuality1.1的CSV数据集,其中包含5列,其中位数是我的产品质量绩效,用于确定聚类结果。我已经发现最好的k聚类数是2。如何获得数据的聚类结果?我在下面粘贴了我的数据的位置:
structure(list(weld.type.ID = 1:33, weld.type = structure(c(29L,
11L, 16L, 4L, 28L, 17L, 19L, 5L, 24L, 27L, 21L, 32L, 12L, 20L,
26L, 25L, 3L, 7L, 13L, 22L, 33L, 1L, 9L, 10L, 18L, 15L, 31L,
8L, 23L, 2L, 14L, 6L, 30L), .Label = c("1,40,Material A", "1,40S,Material C",
"1,80,Material A", "1,STD,Material A", "1,XS,Material A", "10,10S,Material C",
"10,160,Material A", "10,40,Material A", "10,40S,Material C",
"10,80,Material A", "10,STD,Material A", "10,XS,Material A",
"13,40,Material A", "13,40S,Material C", "13,80,Material A",
"13,STD,Material A", "13,XS,Material A", "14,40,Material A",
"14,STD,Material A", "14,XS,Material A", "15,STD,Material A",
"15,XS,Material A", "2,10S,Material C", "2,160,Material A", "2,40,Material A",
"2,40S,Material C", "2,80,Material A", "2,STD,Material A", "2,XS,Material A",
"4,80,Material A", "4,STD,Material A", "6,STD,Material A", "6,XS,Material A"
), class = "factor"), alpha = c(281L, 196L, 59L, 96L, 442L, 98L,
66L, 30L, 68L, 43L, 35L, 44L, 23L, 14L, 24L, 38L, 8L, 8L, 5L,
19L, 37L, 38L, 6L, 11L, 29L, 6L, 16L, 6L, 16L, 3L, 4L, 9L, 12L
), beta = c(7194L, 4298L, 3457L, 2982L, 4280L, 3605L, 2229L,
1744L, 2234L, 1012L, 1096L, 1023L, 1461L, 1303L, 531L, 233L,
630L, 502L, 328L, 509L, 629L, 554L, 358L, 501L, 422L, 566L, 403L,
211L, 159L, 268L, 167L, 140L, 621L), Median = c(0.0375507383753025,
0.043546015959685, 0.0166888869351212, 0.0310875876067419, 0.0935470294716035,
0.0263798143584636, 0.0286213698125569, 0.0167296957822645, 0.029403369311426,
0.0404683392593359, 0.0306699148693358, 0.0409507113292405, 0.0152814823151512,
0.0103834693100336, 0.0426953962552843, 0.139335880048896, 0.0120333156133183,
0.0150573864235556, 0.0140547965388361, 0.0354001989345449, 0.0551110033888123,
0.0636987097619679, 0.0156058684578843, 0.0208640835981798, 0.0636580207464108,
0.00992440459162821, 0.0374531528739036, 0.0262100640799903,
0.0898729525910631, 0.00989157442426205, 0.0215577154517479,
0.0584418091169483, 0.0184528408043719)), class = "data.frame", row.names = c(NA,
-33L))
我想您或多或少知道有两个聚类,并且您想查看聚类是否使您对Median变量有良好的分离。
首先我们看一下您的数据框:
summary(productQuality1.1)
weld.type.ID weld.type alpha beta
Min. : 1 1,40,Material A : 1 Min. : 3.00 Min. : 140
1st Qu.: 9 1,40S,Material C : 1 1st Qu.: 9.00 1st Qu.: 403
Median :17 1,80,Material A : 1 Median : 24.00 Median : 621
Mean :17 1,STD,Material A : 1 Mean : 54.24 Mean :1383
3rd Qu.:25 1,XS,Material A : 1 3rd Qu.: 44.00 3rd Qu.:1744
Max. :33 10,10S,Material C: 1 Max. :442.00 Max. :7194
(Other) :27
Median
Min. :0.009892
1st Qu.:0.016689
Median :0.029403
Mean :0.036686
3rd Qu.:0.042695
Max. :0.139336
您只能使用alpha和beta,因为ID,weld.type是唯一的条目(例如标识符)。我们这样做:
clus = kmeans(productQuality1.1[,c("alpha","beta")],2)
productQuality1.1$cluster = factor(clus$cluster)
请注意,我使用的alpha和beta值一开始的比例非常不同。我们可以可视化聚类:
ggplot(productQuality1.1,aes(x = alpha,y = beta,col = cluster))+ geom_point()
仅使用kmeans将这些观察结果分成2个集群并不是一件容易的事,因为其中一些具有很高的alpha / beta值。我们还可以查看您的中位数值如何分布:
ggplot(productQuality1.1,aes(x = alpha,y = beta,col = Median))+geom_point()+ scale_color_viridis_c()
最后我们看中值:
ggplot(productQuality1.1,aes(x = Median,col = cluster))+ geom_density()
我会说第二组中的一些中位数较高,但是有些不容易分开。鉴于我们在散点图中看到的内容,可能不得不更多地考虑如何使用您拥有的alpha和beta值。