“手工”PCA 上的向量与输出双图不匹配

问题描述 投票:0回答:1

我使用 R 包 PCAtools 生成基因计数宏基因组数据的 PCA。当使用 PCAtools 可视化双图时,载荷的大小适合数据分散(我理解向量长度代表对分散的影响)。见下文:

ex
是归一化基因计数表子集。
dput(ex)
如下。

library(PCAtools)
p <- pca(ex,removeVar = 0.1)
biplot(p,lab=NULL,hline = 0, vline = 0,
       showLoadings = T)
# this gives a figure with 3 loadings, automatically made from PCAtools.

然后,我提取旋转数据和变量载荷,以使用 ggplot2 生成我自己的 PCA 图。然而,我的图上的加载向量长度很小,并且与 PCAtools 双图不匹配。

# select matching loadings
loadplot <- c("K02469","K10118","K20444")
# extract PC scores and loadings
PC <- p$rotated
load <-p$loadings
load <- na.omit(load[loadplot,])

# image with ggplot2
p.pcaNORM <- ggplot(data = PC, aes(x = PC1, y = PC2)) +
  geom_point(aes(),shape = 21,colour = "black", size = 5) +
  geom_text(data=load, aes(x=PC1, y=PC2, label=row.names(load)),
            inherit.aes = FALSE, size=3) +
  geom_segment(data=load, aes(x=0, xend=load$PC1, y=0, yend=load$PC2), inherit.aes = FALSE,
               arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
  labs(x = "PC1 27.8%", y = "PC2 12.3%", title="PCA of Gene Count Differential Abundances") +
  geom_hline(yintercept = 0, linetype="dashed",color="black") +
  geom_vline(xintercept=0, linetype="dashed",color="black") +
  theme_linedraw() +
  theme(legend.position  = "bottom", 
        legend.box       = "horizontal",
        legend.direction = "horizontal",
        panel.background = element_blank(),
        axis.text.x      = element_text(size=8),
        axis.text.y      = element_text(size=8),
        plot.title       = element_text(size = 10))
print(p.pcaNORM)

然后您可能会看到,例如,加载“K20444”在 PCAtools 双图中在 -1 和 -2.5 之间结束,但在我的 ggplot2 双图中在 0 和 -1 之间结束。有谁知道发生了什么事吗?

> dput(ex) structure(list(Ga0598246 = c(6.56539918955656, 6.07685285958297, 
9.80857543461825, 7.45380707677968, 5.50253732907563, 0, 1.00547341301176, 
9.57886787328451, 5.91763530872237, 9.13761113968035), Ga0598239 = c(5.37714039215198, 
4.44627250098116, 9.12527549535166, 7.37818173071451, 4.22343869037075,  0, 0, 9.41186693570199, 2.04265725161515, 9.06211714113395), 
    Ga0598242 = c(7.23329507168685, 3.30011916965994, 9.14571484611697, 
    6.63420100118826, 5.33389579265697, 0, 0, 9.23317518972483, 
    4.83457149887647, 9.02564207931812), Ga0598241 = c(6.42805923331295, 
    4.41343389152249, 9.17585857319441, 6.53724813980237, 5.95254094365946, 
    0, 1.55299220383005, 9.34416529172944, 5.9067619585435, 9.35272687344133
    ), Ga0598244 = c(6.51891536576828, 4.56535598153963, 9.40792503580225, 
    6.60909974507472, 5.37234302403178, 0, 0.989803762135471, 
    9.61255610440809, 5.76126935753228, 9.26497019795582), Ga0598240 = c(6.59592709583215, 
    5.0116546308929, 9.10583727581318, 6.45845808171092, 6.32419402380456, 
    0, 1.97474341675587, 9.59380242030398, 6.25206560327315, 
    9.37141719913754), Ga0598243 = c(6.48685136970719, 4.54940912022517, 
    8.95163946868835, 6.2487649672149, 6.21130294335921, 0, 1.97225049345733, 
    9.55911910161097, 6.0719500604804, 9.61218376859803), Ga0598247 = c(3.42017839411825, 
    5.22468821931174, 9.43811837974567, 8.49700801179772, 5.02416088318049, 
    0, 0, 10.2938603383254, 0, 8.70421802755197), Ga0598259 = c(6.85309401728228, 
    5.08842002313584, 9.16500402830579, 6.20640673224555, 5.24441419037537, 
    0, 0, 9.46180201100927, 5.5134145812396, 9.53162068178508
    ), Ga0598260 = c(6.7006085700961, 4.80510960111303, 9.06424555064897, 
    7.05413314135196, 5.56105321976388, 0, 0, 9.94666671304116, 
    5.99044411371985, 9.14986983395412)), row.names = c("K20444",  "K01711", "K10119", "K10118", "K02469", "K03046", "K14358", "K00945",  "K12454", "K01129"), class = "data.frame")
r ggplot2 pca
1个回答
0
投票

问题在于

biplot
在引擎盖下缩放载荷,即首先将载荷向量缩放到旋转数据的范围。其次,向量的长度缩放为
lengthLoadingsArrowsFactor=
参数的值,默认为
1.5

library(PCAtools)
library(ggplot2)

lengthLoadingsArrowsFactor <- 1.5

r <- min(
  (max(p$rotated[["PC1"]]) - min(p$rotated[["PC1"]]) /
    (max(p$loadings[["PC1"]]) - min(p$loadings[["PC1"]]))),
  (max(p$rotated[["PC2"]]) - min(p$rotated[["PC2"]]) /
    (max(p$loadings[["PC2"]]) - min(p$loadings[["PC2"]])))
)

ggplot(data = PC, aes(x = PC1, y = PC2)) +
  geom_point(aes(), shape = 21, colour = "black", size = 5) +
  geom_text(
    data = load, aes(
      x = PC1 * r * lengthLoadingsArrowsFactor,
      y = PC2 * r * lengthLoadingsArrowsFactor,
      label = row.names(load)
    ),
    inherit.aes = FALSE, size = 3
  ) +
  geom_segment(
    data = load, aes(
      x = 0, xend = PC1 * r * lengthLoadingsArrowsFactor,
      y = 0, yend = PC2 * r * lengthLoadingsArrowsFactor
    ), inherit.aes = FALSE,
    arrow = arrow(length = unit(0.25, "cm")), colour = "grey"
  ) +
  labs(x = "PC1 27.8%", y = "PC2 12.3%", title = "PCA of Gene Count Differential Abundances") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  theme_linedraw() +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.direction = "horizontal",
    panel.background = element_blank(),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    plot.title = element_text(size = 10)
  ) +
  xlim(-2.5, 6.25) +
  ylim(-2.5, 1.5)


biplot(p,
  lab = NULL, hline = 0, vline = 0, showLoadings = TRUE
)

© www.soinside.com 2019 - 2024. All rights reserved.