我有一个代码段,我正在尝试对其进行优化以使其运行得更快。
df1 <- df %>%
rowwise() %>%
mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value) %>%
filter(oddsRatio > 1 & fisher < pval) %>%
mutate(test_direction = binom.test(x = forward, n = counts, p = 0.5)$p.value) %>%
filter(test_direction < 0.05)
我一直在尝试用
替换第一个变异/过滤器对df1 <- df %>%
{.$fisher = fisher.test(matrix(c(.$counts, .$nt1_not_t2,
.$nt2_not_t1, .$not_t1_or_t2), nrow = 2, ncol = 2))$p.value; .} %>%
vctrs::vec_slice(., .$oddsRatio > 1 & .$fisher < pval)
然而,矩阵试图使用整个数据集,而不是具有 4 个值。
Warning message:
In matrix(c(.$counts, .$nt1_not_t2, .$nt2_not_t1, .$not_t1_or_t2), :
data length differs from size of matrix: [12184 != 2 x 2]
不确定我是否可以添加一些语法来避免这种情况,或者另一种更高效的方法来实现这一点。
任何建议都很好
编辑:: 使用 profvis()
[![分析][1]][1]
一些样本数据
df <- structure(list(counts = c(114L, 55L, 57L, 95L, 514L, 65L, 694L,
28L, 148L, 122L, 240L, 38L, 260L, 65L, 40L, 12L, 32L, 134L, 42L,
16L, 37L, 33L, 63L, 16L, 20L, 13L, 12L, 17L, 15L, 26L, 548L,
31L, 467L, 202L, 1696L, 1422L, 219L, 362L, 417L, 1449L, 2142L,
241L, 128L, 1812L, 3281L, 677L, 1006L, 137L, 67L, 161L), forward = c(61L,
37L, 32L, 47L, 233L, 43L, 568L, 15L, 75L, 76L, 149L, 27L, 177L,
34L, 33L, 7L, 22L, 81L, 24L, 8L, 19L, 22L, 55L, 8L, 12L, 8L,
7L, 10L, 12L, 13L, 407L, 17L, 260L, 119L, 861L, 906L, 144L, 199L,
195L, 645L, 1223L, 166L, 73L, 844L, 2727L, 341L, 529L, 86L, 36L,
87L), oddsRatio = c(2.81719639416155, 2.56249627110554, 3.0012284711951,
3.2379086619481, 3.28262910798122, 1.78506192857701, 1.23683314379245,
1.47200829293576, 1.60268857356236, 2.54327837666455, 2.65317932754055,
2.7443152244971, 2.41170031230883, 1.39183344640434, 1.67403290633562,
2.45502917152859, 1.52227974689146, 1.67590893004601, 1.5285951005419,
1.88542317834637, 1.64599293496765, 1.23362495081645, 0.884202671972712,
1.15874072081511, 1.23428571428571, 0.918351141954909, 0.942141312184571,
1.63698770491803, 1.92011125705014, 1.67230461700034, 6.63377209451277,
1.98838889786539, 3.21071805754822, 2.3432554142601, 3.03013725938027,
11.6868669158062, 3.25457032130808, 3.5341881506799, 2.37982532860213,
11.383770310192, 2.98290705092543, 1.43986166989779, 1.32458721885379,
1.70316762338472, 1.20465352503506, 1.32048669611991, 1.38391148677588,
1.70567801561587, 1.11338513259357, 1.18942080378251), not_t1_or_t2 = c(16736L,
17180L, 17230L, 16989L, 12236L, 16869L, 4192L, 17243L, 15631L,
16569L, 15416L, 17344L, 14981L, 16665L, 17139L, 17533L, 17201L,
15903L, 17066L, 16400L, 12543L, 12161L, 4345L, 15346L, 14742L,
15391L, 15681L, 15977L, 16568L, 14576L, 14024L, 14291L, 13801L,
14039L, 11687L, 13734L, 14104L, 13968L, 13703L, 13701L, 10576L,
13757L, 14009L, 9868L, 3491L, 12503L, 11656L, 14064L, 14137L,
13875L), nt1_not_t2 = c(755L, 814L, 812L, 774L, 355L, 804L, 175L,
841L, 721L, 747L, 629L, 831L, 609L, 804L, 829L, 857L, 837L, 735L,
827L, 69L, 48L, 52L, 22L, 69L, 65L, 72L, 73L, 68L, 70L, 59L,
3609L, 4126L, 3690L, 3955L, 2461L, 2735L, 3938L, 3795L, 3740L,
2708L, 2015L, 3916L, 4029L, 2345L, 876L, 3480L, 3151L, 4020L,
4090L, 3996L), nt2_not_t1 = c(897L, 453L, 403L, 644L, 5397L,
764L, 13441L, 390L, 2002L, 1064L, 2217L, 289L, 2652L, 968L, 494L,
100L, 432L, 1730L, 567L, 2017L, 5874L, 6256L, 14072L, 3071L,
3675L, 3026L, 2736L, 2440L, 1849L, 3841L, 321L, 54L, 544L, 306L,
2658L, 611L, 241L, 377L, 642L, 644L, 3769L, 588L, 336L, 4477L,
10854L, 1842L, 2689L, 281L, 208L, 470L)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame")) ```
[1]: https://i.stack.imgur.com/a1ebG.png
根据 Axeman 在这里的评论,使用不同方法的一个小基准:
library(dplyr)
library(purrr)
my_fisher <- function(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2) {
fisher.test(
matrix(
c(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2),
nrow = 2,
ncol = 2
)
)$p.value
}
我们使用
purrr
s pmap
-函数和使用 furrr
s future_pmap
,pmap
的平行版本来应用这个函数:
# purrr
df %>%
mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
# furrr, choose workers based on actual hardware
library(furrr)
plan("future::multisession", workers = 4)
df %>%
mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
基准测试:
library(microbenchmark)
microbenchmark(
orig = df %>%
rowwise() %>%
mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value),
purrr = df %>%
mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher)),
furrr = df %>%
mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
)
退货
Unit: milliseconds
expr min lq mean median uq max neval
orig 108.7756 110.3752 116.08041 113.84570 118.3328 224.2154 100
purrr 105.9916 108.0498 114.80299 113.86020 116.7421 224.0461 100
furrr 86.1203 89.5472 93.77818 92.63145 96.1540 122.7134 100
所以使用
furrr
似乎在时间/速度方面有一点优势。