我最近提了一个问题,并得到了一个有用的答案;但由于我的数据集较大(约 14K 行),下面的代码运行得非常慢。
如何使用 mcmapply(如这里的示例所示)来加快速度?
这是代码:
# For every row, pool the `animals.2` strings of all rows whose `year` falls in
# the five-year window ending at that row's year, then count how many pooled
# "; "-separated tokens also appear among that row's `animals.1` tokens.
within(df, {
  count <- mapply(
    function(yr, own) {
      in_window <- year %in% (yr - 4):yr
      pooled <- paste(animals.2[in_window], collapse = "; ")
      sum(strsplit(pooled, "; ")[[1]] %in% strsplit(own, "; ")[[1]])
    },
    year, animals.1
  )
})
下面是一个使用并行计算的解决方案。这里使用 mclapply 而不是 mcmapply,因为它更快一点。
library(parallel)
library(dplyr)
library(microbenchmark)
# Four-row template with "; "-separated animal lists in both columns.
df <- data.frame(
  animals.1 = c("cat; dog; bird", "dog; bird", "bird", "dog"),
  animals.2 = c("cat; dog; bird", "dog; bird; seal", "bird", ""),
  stringsAsFactors = FALSE
)

# Stack 10000 copies of the template to get a benchmark-sized data frame.
df <- do.call(rbind, replicate(10000, df, simplify = FALSE))

# Give every row its own consecutive year, starting at 2000.
df$year <- seq(2000, 2000 + nrow(df) - 1)
# Single-threaded reference implementation.
#
# Adds a `count` column to `df`. For each row, the `animals.2` strings of all
# rows whose `year` lies within the five years ending at that row's year are
# joined with "; ", split back into tokens, and the tokens that also occur in
# that row's "; "-separated `animals.1` list are counted.
#
# Args:
#   df: data frame with columns `year`, `animals.1` and `animals.2`.
# Returns: `df` with the additional `count` column.
st_func <- function(df) {
  within(df, {
    count <- mapply(
      function(yr, own) {
        pooled <- paste(animals.2[year %in% (yr - 4):yr], collapse = "; ")
        window_tokens <- strsplit(pooled, "; ")[[1]]
        own_tokens <- strsplit(own, "; ")[[1]]
        sum(window_tokens %in% own_tokens)
      },
      year, animals.1
    )
  })
}
# Parallel version of the per-row five-year window count.
#
# For each row i, the `animals.2` strings of all rows whose `year` lies in
# (year[i] - 4):year[i] are joined with "; ", split back into tokens, and the
# tokens that also occur in the "; "-separated `animals.1[i]` are counted.
# Rows are processed independently via parallel::mclapply().
#
# Args:
#   df: data frame with columns `year`, `animals.1` and `animals.2`.
#   mc.cores: number of worker processes handed to mclapply(); defaults to 4.
#     mclapply() does not support more than one core on Windows, so the
#     computation falls back to a single core there.
# Returns: `df` with an added `count` column (an empty integer column when
#   `df` has no rows).
mc_func <- function(df, mc.cores = 4L) {
  n_rows <- nrow(df)

  # Nothing to iterate over: avoid 1:0-style indexing and return early.
  if (n_rows == 0L) {
    df$count <- integer(0)
    return(df)
  }

  if (.Platform$OS.type == "windows") {
    mc.cores <- 1L
  }

  # Pull the columns out once so the workers do not re-extract them per row.
  years <- df$year
  own_lists <- df$animals.1
  pool_lists <- df$animals.2

  per_row <- mclapply(seq_len(n_rows), function(i) {
    yr <- years[i]
    pooled <- paste(pool_lists[years %in% (yr - 4):yr], collapse = "; ")
    sum(strsplit(pooled, "; ")[[1]] %in% strsplit(own_lists[i], "; ")[[1]])
  }, mc.cores = mc.cores)

  df$count <- unlist(per_row)
  df
}
# Sanity check: the parallel and the single-threaded implementation must
# produce exactly the same data frame.
identical(mc_func(df), st_func(df)) # TRUE
# Time both implementations, evaluating each expression 5 times.
microbenchmark(mc_func(df), st_func(df), times=5)
Unit: seconds
expr min lq mean median uq max neval cld
mc_func(df) 8.588759 8.637135 9.101409 8.91779 8.924929 10.43843 5 a
st_func(df) 30.090307 30.107282 30.440877 30.45653 30.696706 30.85356 5 b