按条件对重复样本进行分组的 PCA 图

问题描述 投票:0回答:1

我正在尝试根据 3 个条件(每个条件 3 个重复)得到的表达数据绘制 PCA 图。我已经设法画出了一张图,但在按条件着色和分组时遇到了困难,我怀疑是我的数据排列方式有误。

我运行了下面的代码,但在为每个样本着色这一步卡住了。我想按 O、FO 和 F 这 3 个条件(每个条件包含 3 个重复)进行着色,然后为这 3 个条件分别画出椭圆。任何帮助将不胜感激。

数据表:

# dput() dump of the expression table: a tibble with 16 rows (one per gene)
# and 10 columns -- Gene_ID plus nine numeric sample columns
# (O1-O3, FO1-FO3, F1-F3).
# Note: gene-EHS42_RS00070 (row 14) is 0 in every sample column.
structure(list(Gene_ID = c("gene-EHS42_RS00005", "gene-EHS42_RS00010", 
"gene-EHS42_RS00015", "gene-EHS42_RS00020", "gene-EHS42_RS00025", 
"gene-EHS42_RS00030", "gene-EHS42_RS00035", "gene-EHS42_RS00040", 
"gene-EHS42_RS00045", "gene-EHS42_RS00050", "gene-EHS42_RS00055", 
"gene-EHS42_RS00060", "gene-EHS42_RS00065", "gene-EHS42_RS00070", 
"gene-EHS42_RS00075", "gene-EHS42_RS00080"), O1 = c(757.784, 
896.264, 123.429, 985.022, 85.8583, 111.718, 10.7002, 152.577, 
17.7682, 1086.55, 2826.57, 109.637, 43.1502, 0, 3158.45, 2271.19
), O2 = c(723, 897.502, 157.31, 1075.96, 106.999, 118.593, 10.8549, 
137.093, 19.2265, 1142.01, 2841.09, 91.1191, 63.1088, 0, 2981.31, 
2136.32), O3 = c(724.17, 875.258, 133.573, 1155.09, 74.4442, 
107.826, 16.365, 164.105, 29.4387, 751.156, 2822.42, 93.7586, 
37.7846, 0, 2978.32, 2045.64), FO1 = c(688.876, 922.35, 135.935, 
1223.9, 119.83, 93.1258, 17.7483, 324.379, 77.5033, 862.804, 
2524.59, 95.5171, 53.9344, 0, 2455.88, 1462.5), FO2 = c(869.985, 
1185.33, 194.729, 882.644, 177.953, 135.183, 21.7251, 296.909, 
58.101, 1247, 2511.67, 114.952, 63.6875, 0, 1433.23, 904.294), 
    FO3 = c(840.392, 1195.88, 165.721, 937.342, 170.775, 145.854, 
    23.9473, 285.05, 44.2553, 1402.51, 2737.45, 100.696, 73.0917, 
    0, 1419.96, 1051.12), F1 = c(1718.91, 1729.51, 341.759, 1324.52, 
    86.4022, 264.029, 30.6917, 169.219, 37.1905, 1987.85, 1370.75, 
    97.2895, 69.3806, 0, 3641.66, 2916.67), F2 = c(1919.41, 1666.16, 
    323.399, 850.732, 67.4236, 271.421, 18.9667, 184.824, 18.0931, 
    1617.57, 1449.76, 86.3241, 48.5885, 0, 2524.14, 1730.51), 
    F3 = c(1951.07, 1850.52, 376.333, 1157.23, 41.8972, 277.754, 
    32.3741, 177.472, 34.1986, 1039.71, 874.081, 78.1316, 58.6108, 
    0, 3424.35, 2758.01)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -16L))

然后我运行的代码:

# Inspect the structure and the first rows of the expression table.
str(PcA_Plot_Data)
head(PcA_Plot_Data)

# PCA on the nine numeric sample columns (column 1 is Gene_ID).
# NOTE(review): the rows of this table are genes, so each observation -- and
# therefore each point in the plot below -- is a gene, not a sample.
# The centering argument of prcomp() is spelled `center`; `centre` is not a
# formal argument and would be disregarded.
expression.pca <- prcomp(PcA_Plot_Data[, 2:10],
                         center = TRUE,
                         scale. = TRUE)
summary(expression.pca)

library(ggfortify)
# NOTE(review): `colour` is still empty here. It presumably needs the name of
# a grouping column in `data`, and this table has no such column yet -- this
# is the open point of the question.
expression.pca.plot <- autoplot(expression.pca,
                                data = PcA_Plot_Data,
                                colour = '')
r ggplot2 bioinformatics ggfortify
1个回答
0
投票

你是对的,按照惯例,行应该是基因,列应该是样本。但您是在未转置的数据上运行 PCA 的,这样图中的每个点对应的是一个基因;而我猜您希望每个样本在最终图中对应一个点,因此需要先对数据进行转置。以下是一个最小示例:

# By convention, gene expression data is kept as a numeric matrix/data.frame
# with the gene IDs as row names rather than as a column.
# Start from the table in the question; a plain data.frame is used because
# row names are set on it next.
data <- as.data.frame(PcA_Plot_Data)
rownames(data) <- data$Gene_ID
data$Gene_ID <- NULL

# Run PCA on the transposed data so that each sample (not each gene) is one
# observation, i.e. one point in the plot.
pca <- prcomp(t(data))

# Derive the condition from each sample name by stripping the trailing
# replicate number (O1 -> O, FO2 -> FO, F3 -> F). Anchoring at the end of the
# name leaves digits elsewhere in a name alone and handles any replicate
# number.
groups <- sub("[0-9]+$", "", colnames(data))

# Scatter plot of the samples on the first two principal components,
# coloured by condition.
library(ggplot2)
to_plot <- data.frame(pca$x, group = groups)

ggplot(data = to_plot, aes(x = PC1, y = PC2, color = group)) +
  geom_point(size = 3)

© www.soinside.com 2019 - 2024. All rights reserved.