我有一个包含 21 列的 data.frame,每三列是同一个时间点的三次重复测量(triplicate)。我想计算每组三个重复的平均值。
geneName t11 t12 t13 t21 t22 t23 t31 t32 t33 t41 t42 t43 t51 t52 t53 t61 t62 t63
gene1 gene1 3296 5133 3466 2166 1759 2099 1916 1379 1570 2533 1794 1016 800 79 648 99 60 152
gene2 gene2 4210 5505 4173 2736 2748 3052 2409 1944 2237 1158 3475 1488 4023 102 940 265 365 124
...
在上面的示例中,我想分别计算 t1(t11、t12、t13)、t2(t21、t22、t23)、t3(t31、t32、t33)等每组三列的平均值。
这是一个基本的 R 方法 -
# Average the replicate columns of `df` per time point (base R).
# Column positions that hold identifiers rather than measurements; they are
# left out of the averaging and re-attached to the result unchanged.
columns_not_included <- 1
# Keep only the measurement columns.
tmp <- df[-columns_not_included]
# Dropping the trailing digit of each column name maps t11, t12, t13 to the
# group "t1"; split.default() then yields one data frame of replicate columns
# per group and rowMeans() averages each of them row by row.
# lapply() is used instead of sapply() so the result is always one column per
# group: sapply() would collapse to a plain vector when `df` has a single row.
cbind.data.frame(
  df[columns_not_included],
  lapply(
    split.default(tmp, sub('\\d$', '', names(tmp))),
    rowMeans,
    na.rm = TRUE
  )
)
# geneName t1 t2 t3 t4 t5 t6
#1 gene1 3965.000 2008.000 1621.667 1781.000 509.000 103.6667
#2 gene2 4629.333 2845.333 2196.667 2040.333 1688.333 251.3333
sub('\\d$', '', names(tmp))
将删除列名称的最后一个数字,以便 t11
、t12
和 t13
都将仅返回 t1
,对于传递给 split.default
的其他组也是如此,将数据拆分为基于这些组的数据框列表。对于每个组,我们使用 rowMeans
计算行均值。
tidyverse
解决方案:
library(tidyverse)
# Reshape so every replicate column feeds the output column named by the
# regex capture (its "t" plus one character), then average per gene.
df |>
  pivot_longer(
    cols = -geneName,
    names_to = ".value",
    names_pattern = "(t.)"
  ) |>
  summarise(across(everything(), mean), .by = geneName)
# # A tibble: 2 × 7
# geneName t1 t2 t3 t4 t5 t6
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 gene1 3965 2008 1622. 1781 509 104.
# 2 gene2 4629. 2845. 2197. 2040. 1688. 251.
另一种基本 R 方法:
# Build a matrix of group labels shaped like the value columns of `df`: the
# first two characters of each column name (t11 -> "t1") form a one-row matrix
# that is repeated once per row of `df`. drop = FALSE keeps it a matrix when
# `df` has a single row, which row() below requires.
g1 <- t(substr(names(df[-1]), 1, 2))[rep(1, nrow(df)), , drop = FALSE]
# unlist() flattens the values column by column, matching the layout of g1, so
# tapply() averages each (row, time point) cell; the identifier column is then
# bound back on the left.
cbind(df[1], tapply(unlist(df[-1]), list(row(g1), g1), mean))
geneName t1 t2 t3 t4 t5 t6
1 gene1 3965.000 2008.000 1621.667 1781.000 509.000 103.6667
2 gene2 4629.333 2845.333 2196.667 2040.333 1688.333 251.3333
# Example data: one identifier column followed by three replicate columns for
# each of six time points, parsed from inline text with a header row.
df <- read.table(
  header = TRUE,
  text =
"geneName t11 t12 t13 t21 t22 t23 t31 t32 t33 t41 t42 t43 t51 t52 t53 t61 t62 t63
gene1 3296 5133 3466 2166 1759 2099 1916 1379 1570 2533 1794 1016 800 79 648 99 60 152
gene2 4210 5505 4173 2736 2748 3052 2409 1944 2237 1158 3475 1488 4023 102 940 265 365 124"
)
library(tidyverse)
# Long format: one row per gene and original column. The column name is cut
# to its first two characters (t11 -> "t1") while grouping, and the values are
# averaged within each gene / time-point pair.
df |>
  pivot_longer(cols = -geneName) |>
  group_by(geneName, name = substr(name, 1, 2)) |>
  summarise(mean = mean(value))
# A tibble: 12 × 3
# Groups: geneName [2]
geneName name mean
<chr> <chr> <dbl>
1 gene1 t1 3965
2 gene1 t2 2008
3 gene1 t3 1622.
4 gene1 t4 1781
5 gene1 t5 509
6 gene1 t6 104.
7 gene2 t1 4629.
8 gene2 t2 2845.
9 gene2 t3 2197.
10 gene2 t4 2040.
11 gene2 t5 1688.
12 gene2 t6 251.
这是另一种选择:
library(tidyverse)
# Average runs of consecutively numbered columns of `test_data`.
test_data |>
  # The digits after "t" become a numeric `time` column (t11 -> 11).
  pivot_longer(
    cols = -geneName,
    names_to = "time",
    names_pattern = "t(\\d+)",
    names_transform = as.numeric
  ) |>
  arrange(geneName, time) |>
  # A new group starts whenever `time` is not exactly one more than the
  # previous row's value; the running count of such breaks is the group id.
  mutate(
    group = cumsum(time - lag(time, default = first(time) - 1) != 1)
  ) |>
  nest(data = -c(geneName, group)) |>
  # Label each run by its first and last time value and take its mean.
  transmute(
    geneName = geneName,
    time_range = map_chr(
      data,
      \(d) glue::glue("t{mn}-t{mx}", mn = min(d$time), mx = max(d$time))
    ),
    mean = map_dbl(data, \(d) mean(d$value))
  ) |>
  pivot_wider(names_from = time_range, values_from = mean)
#> # A tibble: 2 x 7
#> geneName `t11-t13` `t21-t23` `t31-t33` `t41-t43` `t51-t53` `t61-t63`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 gene1 3965 2008 1622. 1781 509 104.
#> 2 gene2 4629. 2845. 2197. 2040. 1688. 251.
尽管我喜欢 Ronak 的巧妙答案,但我仍然更喜欢使用 tidyverse 的语法(grammar)进行计算。我发现它更干净、更清晰。
但是谢谢你的帮助
# Per-gene means of each triplicate, returned as a base data.frame with the
# gene names as row names.
# The original chain was broken after `group_by(gene)` (no pipe, and `gene` is
# not a column), so rowwise() ran without data. rowwise() alone supplies the
# per-row grouping needed for mean(c(...)).
test.mean <- test_data |>
  dplyr::rowwise() |>
  mutate(
    meant1 = mean(c(t11, t12, t13)),
    meant2 = mean(c(t21, t22, t23)),
    meant3 = mean(c(t31, t32, t33)),
    meant4 = mean(c(t41, t42, t43)),
    meant5 = mean(c(t51, t52, t53)),
    meant6 = mean(c(t61, t62, t63))
  ) |>
  # Drop the row-wise grouping before reshaping into a plain data frame.
  ungroup() |>
  # Keep the identifier and the newly computed mean columns only.
  select(matches("geneName|mean")) |>
  column_to_rownames("geneName") |>
  as.data.frame()