从不同组的每个样本(cols)具有单个表达式值(行)的两个数据帧,我想计算每组的平均值和中位数。 我的解决方案似乎有点冗长,我想知道是否有更优雅的解决方案。
# expression values
genes <- paste("gene",1:1000,sep="")
x <- list(
A = sample(genes,300),
B = sample(genes,525),
C = sample(genes,440),
D = sample(genes,350)
)
# expression dataframe
crete_exp_df <- function(gene_nr, sample_nr){
df <- replicate(sample_nr, rnorm(gene_nr))
rownames(df) <- paste("Gene", c(1:nrow(df)))
colnames(df) <- paste("Sample", c(1:ncol(df)))
return(df)
}
exp1 <- crete_exp_df(50, 20)
exp2 <- crete_exp_df(50, 20)
# sample annotation
san <- data.frame(
id = colnames(exp1),
group = sample(1:4, 20, replace = TRUE))
# get ids of samples per group
ids_1 <- san %>% filter(group == 1) %>% pull(id)
ids_2 <- san %>% filter(group == 2) %>% pull(id)
ids_3 <- san %>% filter(group == 3) %>% pull(id)
ids_4 <- san %>% filter(group == 4) %>% pull(id)
id_list <- list(group1 = ids_1, group2 = ids_2, group3 = ids_3, group4 = ids_4)
# fct means df1
get_means_exp1 <- function(id){
apply(exp1[, id], 1, mean, na.rm = T)
}
# fct means df2
get_means_exp2 <- function(id){
apply(exp2[, id], 1, mean, na.rm = T)
}
# lapply on df1
list_means_exp1 <- lapply(id_list, get_means_exp1)
means_exp1 <- as.data.frame(list_means_exp1)
# lapply on df2
list_means_exp2 <- lapply(id_list, get_means_exp2)
means_exp2 <- as.data.frame(list_means_exp2)
我想这可以更优雅地解决。具体来说,如何获取每个组的 id 并编写一个适用于 df 的函数。 期待从您的解决方案中学习,非常感谢!
在使用
apply(., 1, FUN)
之前,检查是否有可用的向量化函数总是明智的,因为它们要快得多。对于行的算术平均值,有base::rowMeans
。对于中位数,我们可以使用matrixStats::rowMedians
。对于行,您也可以使用 matrixStats::rowMeans2
,速度稍快。在这里使用 vapply
是有意义的,它类似于 lapply
,但可以方便地生成矩阵并且在 *apply
系列中是最快的,因为我们可以预分配内存。 (注意: 我用set.seed(42)
来创建你的数据。)
所以也许您正在寻找这个:
vapply(id_list, \(x) rowMeans(exp1[, x]), numeric(dim(exp1)[1]))
# group1 group2 group3 group4
# Gene 1 -1.35631700 -0.328620048 0.160795323 -0.01011904
# Gene 2 0.33985130 0.432482763 -0.169343033 0.13019294
# Gene 3 0.46623064 0.154045975 0.362607622 0.58710492
# Gene 4 0.17049403 -0.036744170 -0.056742305 1.10934764
# Gene 5 -0.15515465 0.237211068 -0.426415836 -0.50977736
vapply(id_list, \(x) matrixStats::rowMedians(exp1[, x], useNames=TRUE), numeric(dim(exp1)[1]))
# group1 group2 group3 group4
# Gene 1 -1.22551737 -0.41642403 0.470862918 -1.782411e-01
# Gene 2 0.05680326 0.62277321 -0.512487033 3.943679e-01
# Gene 3 0.58009311 -0.10696651 0.149054062 9.345673e-01
# Gene 4 0.09852832 0.12774134 -0.573525823 1.046751e+00
# Gene 5 -0.44076823 0.11716389 -0.381682466 -8.480807e-01
所以,我使用了您提供的数据生成过程,并提出了一个更简单的解决方案。我将 exp1 更改为数据框,以整洁的格式 (
pivot_longer()
),从 san 数据框中添加组,最后应用简单的 dplyr
语法来总结您的数据。
library(tidyverse)
as.data.frame(exp1) %>%
rownames_to_column("Gene") %>%
pivot_longer(cols= 2:21, names_to = "id", values_to = "Values") %>%
left_join(., san) %>%
group_by(group) %>%
summarise(mean= mean(Values),
median= median(Values))
#> Joining with `by = join_by(id)`
#> # A tibble: 4 × 3
#> group mean median
#> <int> <dbl> <dbl>
#> 1 1 0.0803 0.0568
#> 2 2 -0.0383 -0.0387
#> 3 3 -0.00929 0.0356
#> 4 4 -0.0840 -0.0306
考虑到您的评论,只需按基因分组即可获得预期的输出。
library(tidyverse)
as.data.frame(exp1) %>%
rownames_to_column("Gene") %>%
pivot_longer(cols= 2:21, names_to = "id", values_to = "Values") %>%
left_join(., san) %>%
group_by(group, Gene) %>%
summarise(mean= mean(Values),
median= median(Values))
#> Joining with `by = join_by(id)`
#> `summarise()` has grouped output by 'group'. You can override using the
#> `.groups` argument.
#> # A tibble: 200 × 4
#> # Groups: group [4]
#> group Gene mean median
#> <int> <chr> <dbl> <dbl>
#> 1 1 Gene 1 -0.0642 -0.122
#> 2 1 Gene 10 0.0151 0.563
#> 3 1 Gene 11 -0.0585 -0.0367
#> 4 1 Gene 12 -0.978 -0.917
#> 5 1 Gene 13 -1.01 -1.37
#> 6 1 Gene 14 0.160 -0.394
#> 7 1 Gene 15 -0.295 -0.689
#> 8 1 Gene 16 0.774 0.729
#> 9 1 Gene 17 -0.356 -0.336
#> 10 1 Gene 18 -0.741 -0.103
#> # … with 190 more rows
创建于 2023-04-13 与 reprex v2.0.2
作为一个扩展性很好的替代方案,您可以使用
data.table
.
### load data.table
library(data.table)
### convert data.frames to data.table
exp1 <- as.data.table(exp1)[,Genes:=rownames(exp1),]
san <- as.data.table(san)
### switch to long format
exp1 <- melt(exp1, id.vars = "Genes", variable.name = "id", value.name = "Expression")
### join based on sample id
exp1Join <- merge.data.table(exp1, san, by = "id")
### compute statistics of choice
exp1Join[,.(mean=mean(Expression), median=median(Expression)),by=.(group, Genes)]
当然,如果您想收集所有数据并根据整个数据集(不同的实验)进行计算,您也可以在组合表中进行所有操作。
exp1 <- as.data.table(exp1)[,`:=`(Genes=rownames(exp1), Experiment=1),]
exp2 <- as.data.table(exp2)[,`:=`(Genes=rownames(exp2), Experiment=2),]
exp1 <- melt(exp1, id.vars = c("Genes", "Experiment"), variable.name = "id", value.name = "Expression")
exp2 <- melt(exp2, id.vars = c("Genes", "Experiment"), variable.name = "id", value.name = "Expression")
### combine tables
expCombined <- rbindlist(l = list(exp1, exp2))
expCombined <- merge.data.table(expCombined, san, by = "id")
### compute the mean, median, sd and sample size for every combination of gene, group and experiment
expCombined[,.(mean=mean(Expression),
median=median(Expression),
sd=sd(Expression),
N=.N),
by=.(group, Genes, Experiment)]
# group Genes Experiment mean median sd N
# 1: 1 Gene 1 1 -0.29234057 -0.24008726 0.6278528 5
# 2: 1 Gene 2 1 -0.74158796 -0.82441474 0.6289399 5
# 3: 1 Gene 3 1 -0.49293277 -0.30616603 1.1442834 5
# 4: 1 Gene 4 1 -0.33610311 -0.43948117 0.5331471 5
# 5: 1 Gene 5 1 0.68955333 0.60701836 0.9475727 5
# ---
#396: 4 Gene 46 2 1.17036249 1.17036249 0.4885201 2
#397: 4 Gene 47 2 0.64894986 0.64894986 0.1122624 2
#398: 4 Gene 48 2 -1.61083175 -1.61083175 0.6319153 2
#399: 4 Gene 49 2 -0.07673634 -0.07673634 0.7263174 2
#400: 4 Gene 50 2 -0.37240955 -0.37240955 0.8037523 2
另外,作为比较,我根据原始帖子、提供的 Tidyverse 解决方案和
vapply
方法对 exp1 进行了一个小测试。显然,当数据集很大时,这样的基准测试更有意义。
Unit: microseconds
expr min lq mean median uq max neval cld
TidyWay 57902.546 61651.077 76529.3966 67526.432 79027.012 172911.906 100 a
DTWay 2159.780 2490.218 3225.3781 2592.081 2960.918 17196.365 100 b
OrgWay 7459.775 8249.155 10667.4395 9224.186 11740.072 27480.962 100 c
VApplyWay 87.618 133.598 168.3478 146.398 189.990 782.736 100 b