优化数据框上的重复 Fisher 测试

Question

我有一个代码段，我正在尝试对其进行优化以使其运行得更快。

df1 <- df %>%
rowwise() %>%
    mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
                                         nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value) %>% 
    filter(oddsRatio > 1 & fisher < pval) %>% 
    mutate(test_direction = binom.test(x = forward, n = counts, p = 0.5)$p.value) %>%
    filter(test_direction < 0.05)

我一直在尝试用

替换第一个变异/过滤器对

df1 <- df %>%
{.$fisher = fisher.test(matrix(c(.$counts, .$nt1_not_t2,
                                     .$nt2_not_t1, .$not_t1_or_t2), nrow = 2, ncol = 2))$p.value; .} %>% 
    vctrs::vec_slice(., .$oddsRatio > 1 & .$fisher < pval)

然而，矩阵试图使用整个数据集，而不是具有 4 个值。

Warning message:
In matrix(c(.$counts, .$nt1_not_t2, .$nt2_not_t1, .$not_t1_or_t2),  :
  data length differs from size of matrix: [12184 != 2 x 2]

不确定我是否可以添加一些语法来避免这种情况，或者另一种更高效的方法来实现这一点。

任何建议都很好

编辑：：使用 profvis()

[![分析][1]][1]

一些样本数据

df <- structure(list(counts = c(114L, 55L, 57L, 95L, 514L, 65L, 694L, 
28L, 148L, 122L, 240L, 38L, 260L, 65L, 40L, 12L, 32L, 134L, 42L, 
16L, 37L, 33L, 63L, 16L, 20L, 13L, 12L, 17L, 15L, 26L, 548L, 
31L, 467L, 202L, 1696L, 1422L, 219L, 362L, 417L, 1449L, 2142L, 
241L, 128L, 1812L, 3281L, 677L, 1006L, 137L, 67L, 161L), forward = c(61L, 
37L, 32L, 47L, 233L, 43L, 568L, 15L, 75L, 76L, 149L, 27L, 177L, 
34L, 33L, 7L, 22L, 81L, 24L, 8L, 19L, 22L, 55L, 8L, 12L, 8L, 
7L, 10L, 12L, 13L, 407L, 17L, 260L, 119L, 861L, 906L, 144L, 199L, 
195L, 645L, 1223L, 166L, 73L, 844L, 2727L, 341L, 529L, 86L, 36L, 
87L), oddsRatio = c(2.81719639416155, 2.56249627110554, 3.0012284711951, 
3.2379086619481, 3.28262910798122, 1.78506192857701, 1.23683314379245, 
1.47200829293576, 1.60268857356236, 2.54327837666455, 2.65317932754055, 
2.7443152244971, 2.41170031230883, 1.39183344640434, 1.67403290633562, 
2.45502917152859, 1.52227974689146, 1.67590893004601, 1.5285951005419, 
1.88542317834637, 1.64599293496765, 1.23362495081645, 0.884202671972712, 
1.15874072081511, 1.23428571428571, 0.918351141954909, 0.942141312184571, 
1.63698770491803, 1.92011125705014, 1.67230461700034, 6.63377209451277, 
1.98838889786539, 3.21071805754822, 2.3432554142601, 3.03013725938027, 
11.6868669158062, 3.25457032130808, 3.5341881506799, 2.37982532860213, 
11.383770310192, 2.98290705092543, 1.43986166989779, 1.32458721885379, 
1.70316762338472, 1.20465352503506, 1.32048669611991, 1.38391148677588, 
1.70567801561587, 1.11338513259357, 1.18942080378251), not_t1_or_t2 = c(16736L, 
17180L, 17230L, 16989L, 12236L, 16869L, 4192L, 17243L, 15631L, 
16569L, 15416L, 17344L, 14981L, 16665L, 17139L, 17533L, 17201L, 
15903L, 17066L, 16400L, 12543L, 12161L, 4345L, 15346L, 14742L, 
15391L, 15681L, 15977L, 16568L, 14576L, 14024L, 14291L, 13801L, 
14039L, 11687L, 13734L, 14104L, 13968L, 13703L, 13701L, 10576L, 
13757L, 14009L, 9868L, 3491L, 12503L, 11656L, 14064L, 14137L, 
13875L), nt1_not_t2 = c(755L, 814L, 812L, 774L, 355L, 804L, 175L, 
841L, 721L, 747L, 629L, 831L, 609L, 804L, 829L, 857L, 837L, 735L, 
827L, 69L, 48L, 52L, 22L, 69L, 65L, 72L, 73L, 68L, 70L, 59L, 
3609L, 4126L, 3690L, 3955L, 2461L, 2735L, 3938L, 3795L, 3740L, 
2708L, 2015L, 3916L, 4029L, 2345L, 876L, 3480L, 3151L, 4020L, 
4090L, 3996L), nt2_not_t1 = c(897L, 453L, 403L, 644L, 5397L, 
764L, 13441L, 390L, 2002L, 1064L, 2217L, 289L, 2652L, 968L, 494L, 
100L, 432L, 1730L, 567L, 2017L, 5874L, 6256L, 14072L, 3071L, 
3675L, 3026L, 2736L, 2440L, 1849L, 3841L, 321L, 54L, 544L, 306L, 
2658L, 611L, 241L, 377L, 642L, 644L, 3769L, 588L, 336L, 4477L, 
10854L, 1842L, 2689L, 281L, 208L, 470L)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame")) ```


  [1]: https://i.stack.imgur.com/a1ebG.png

Answer 1

根据 Axeman 在这里的评论，使用不同方法的一个小基准：

library(dplyr)
library(purrr)

my_fisher <- function(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2) {
  fisher.test(
    matrix(
      c(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), 
      nrow = 2, 
      ncol = 2
      )
  )$p.value
}

我们使用

purrr

s

pmap

-函数和使用

furrr

s

future_pmap

，

pmap

的平行版本来应用这个函数：

# purrr
df %>% 
  mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))

# furrr, choose workers based on actual hardware
library(furrr)
plan("future::multisession", workers = 4)

df %>% 
  mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))

基准测试：

library(microbenchmark)

microbenchmark(
  orig = df %>%
    rowwise() %>%
    mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
                                         nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value),
  purrr = df %>% 
    mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher)),
  furrr = df %>% 
    mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
)

退货

Unit: milliseconds
  expr      min       lq      mean    median       uq      max neval
  orig 108.7756 110.3752 116.08041 113.84570 118.3328 224.2154   100
 purrr 105.9916 108.0498 114.80299 113.86020 116.7421 224.0461   100
 furrr  86.1203  89.5472  93.77818  92.63145  96.1540 122.7134   100

所以使用

furrr

似乎在时间/速度方面有一点优势。

优化数据框上的重复 Fisher 测试

问题描述投票：0回答：1

1个回答

最新问题

优化数据框上的重复 Fisher 测试

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1