我使用 R 包 PCAtools 生成基因计数宏基因组数据的 PCA。当使用 PCAtools 可视化双图时,载荷的大小适合数据分散(我理解向量长度代表对分散的影响)。见下文:
ex
是归一化基因计数表子集。 dput(ex)
如下。
library(PCAtools)
p <- pca(ex,removeVar = 0.1)
biplot(p,lab=NULL,hline = 0, vline = 0,
showLoadings = T)
# this gives a figure with 3 loadings, automatically made from PCAtools.
然后,我提取旋转数据和变量载荷,以使用 ggplot2 生成我自己的 PCA 图。然而,我的图上的加载向量长度很小,并且与 PCAtools 双图不匹配。
# select matching loadings
loadplot <- c("K02469","K10118","K20444")
# extract PC scores and loadings
PC <- p$rotated
load <-p$loadings
load <- na.omit(load[loadplot,])
# image with ggplot2
p.pcaNORM <- ggplot(data = PC, aes(x = PC1, y = PC2)) +
geom_point(aes(),shape = 21,colour = "black", size = 5) +
geom_text(data=load, aes(x=PC1, y=PC2, label=row.names(load)),
inherit.aes = FALSE, size=3) +
geom_segment(data=load, aes(x=0, xend=load$PC1, y=0, yend=load$PC2), inherit.aes = FALSE,
arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
labs(x = "PC1 27.8%", y = "PC2 12.3%", title="PCA of Gene Count Differential Abundances") +
geom_hline(yintercept = 0, linetype="dashed",color="black") +
geom_vline(xintercept=0, linetype="dashed",color="black") +
theme_linedraw() +
theme(legend.position = "bottom",
legend.box = "horizontal",
legend.direction = "horizontal",
panel.background = element_blank(),
axis.text.x = element_text(size=8),
axis.text.y = element_text(size=8),
plot.title = element_text(size = 10))
print(p.pcaNORM)
然后您可能会看到,例如,加载“K20444”在 PCAtools 双图中在 -1 和 -2.5 之间结束,但在我的 ggplot2 双图中在 0 和 -1 之间结束。有谁知道发生了什么事吗?
> dput(ex) structure(list(Ga0598246 = c(6.56539918955656, 6.07685285958297,
9.80857543461825, 7.45380707677968, 5.50253732907563, 0, 1.00547341301176,
9.57886787328451, 5.91763530872237, 9.13761113968035), Ga0598239 = c(5.37714039215198,
4.44627250098116, 9.12527549535166, 7.37818173071451, 4.22343869037075, 0, 0, 9.41186693570199, 2.04265725161515, 9.06211714113395),
Ga0598242 = c(7.23329507168685, 3.30011916965994, 9.14571484611697,
6.63420100118826, 5.33389579265697, 0, 0, 9.23317518972483,
4.83457149887647, 9.02564207931812), Ga0598241 = c(6.42805923331295,
4.41343389152249, 9.17585857319441, 6.53724813980237, 5.95254094365946,
0, 1.55299220383005, 9.34416529172944, 5.9067619585435, 9.35272687344133
), Ga0598244 = c(6.51891536576828, 4.56535598153963, 9.40792503580225,
6.60909974507472, 5.37234302403178, 0, 0.989803762135471,
9.61255610440809, 5.76126935753228, 9.26497019795582), Ga0598240 = c(6.59592709583215,
5.0116546308929, 9.10583727581318, 6.45845808171092, 6.32419402380456,
0, 1.97474341675587, 9.59380242030398, 6.25206560327315,
9.37141719913754), Ga0598243 = c(6.48685136970719, 4.54940912022517,
8.95163946868835, 6.2487649672149, 6.21130294335921, 0, 1.97225049345733,
9.55911910161097, 6.0719500604804, 9.61218376859803), Ga0598247 = c(3.42017839411825,
5.22468821931174, 9.43811837974567, 8.49700801179772, 5.02416088318049,
0, 0, 10.2938603383254, 0, 8.70421802755197), Ga0598259 = c(6.85309401728228,
5.08842002313584, 9.16500402830579, 6.20640673224555, 5.24441419037537,
0, 0, 9.46180201100927, 5.5134145812396, 9.53162068178508
), Ga0598260 = c(6.7006085700961, 4.80510960111303, 9.06424555064897,
7.05413314135196, 5.56105321976388, 0, 0, 9.94666671304116,
5.99044411371985, 9.14986983395412)), row.names = c("K20444", "K01711", "K10119", "K10118", "K02469", "K03046", "K14358", "K00945", "K12454", "K01129"), class = "data.frame")
问题在于
biplot
在引擎盖下缩放载荷,即首先将载荷向量缩放到旋转数据的范围。其次,向量的长度缩放为 lengthLoadingsArrowsFactor=
参数的值,默认为 1.5
。
library(PCAtools)
library(ggplot2)
lengthLoadingsArrowsFactor <- 1.5
r <- min(
(max(p$rotated[["PC1"]]) - min(p$rotated[["PC1"]]) /
(max(p$loadings[["PC1"]]) - min(p$loadings[["PC1"]]))),
(max(p$rotated[["PC2"]]) - min(p$rotated[["PC2"]]) /
(max(p$loadings[["PC2"]]) - min(p$loadings[["PC2"]])))
)
ggplot(data = PC, aes(x = PC1, y = PC2)) +
geom_point(aes(), shape = 21, colour = "black", size = 5) +
geom_text(
data = load, aes(
x = PC1 * r * lengthLoadingsArrowsFactor,
y = PC2 * r * lengthLoadingsArrowsFactor,
label = row.names(load)
),
inherit.aes = FALSE, size = 3
) +
geom_segment(
data = load, aes(
x = 0, xend = PC1 * r * lengthLoadingsArrowsFactor,
y = 0, yend = PC2 * r * lengthLoadingsArrowsFactor
), inherit.aes = FALSE,
arrow = arrow(length = unit(0.25, "cm")), colour = "grey"
) +
labs(x = "PC1 27.8%", y = "PC2 12.3%", title = "PCA of Gene Count Differential Abundances") +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
theme_linedraw() +
theme(
legend.position = "bottom",
legend.box = "horizontal",
legend.direction = "horizontal",
panel.background = element_blank(),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8),
plot.title = element_text(size = 10)
) +
xlim(-2.5, 6.25) +
ylim(-2.5, 1.5)
biplot(p,
lab = NULL, hline = 0, vline = 0, showLoadings = TRUE
)