我有一个大型数据框,需要对其进行聚合,并获取数值列的最小值和最大值。在引入 NA 值之前,运行速度相当快。引入 NA 之后,仅聚合前 2000 行所花的时间,就与此前聚合 300 万行的完整数据框所花的时间差不多;对含有 NA 的完整数据框进行聚合则需要运行数天。我可以做些什么来加快速度吗?
library(stringi)
library(dplyr)
# Synthetic benchmark data: 3 million rows keyed by two random string ids.
n <- 3e6

# Pools of random 15-letter lowercase ids: 40,000 for `h`, 10,000 for `sp`.
h <- stri_rand_strings(40000, 15, pattern = "[a-z]")
sp <- stri_rand_strings(10000, 15, pattern = "[a-z]")

# Numeric columns: uniform draws rounded to whole numbers.
y <- round(runif(n, min = 1800, max = 2000))
r <- round(runif(n, min = 1, max = 100))

# Every row draws its ids from the pools with replacement.
df <- data.frame(
  h = sample(h, size = n, replace = TRUE),
  sp = sample(sp, size = n, replace = TRUE),
  y = y,
  r = r
)
# Per (h, sp) pair: total of r plus the smallest and largest y.
# Reported to take about 12 seconds on the full data.
df %>%
  group_by(h, sp) %>%
  summarise(
    r = sum(r),
    min_y = min(y),
    max_y = max(y)
  )
# Mark y as missing in 1,500,000 randomly chosen rows.
is.na(df$y) <- sample(nrow(df), size = 1500000)
# Same aggregation restricted to the first 2000 rows; reported to take about
# as long as the full-data run.
# The data frame has no column `x` (its columns are h, sp, y, r), so min(x) /
# max(x) would either fail or pick up an unrelated global; aggregate `y`.
# Note: min()/max() return NA for any group whose y contains an NA unless
# na.rm = TRUE is passed.
head(df, 2000) %>%
  group_by(h, sp) %>%
  summarize(n = sum(r), min_y = min(y), max_y = max(y))
在我的机器上,引入 NA 后我没有看到明显的变慢。
# df1: NA-free benchmark data built from the same id pools and numeric columns.
df1 <- data.frame(
  h = sample(h, size = n, replace = TRUE),
  sp = sample(sp, size = n, replace = TRUE),
  y = y,
  r = r
)

# df2: copy of df1 with y set to NA in 1,500,000 randomly chosen rows.
df2 <- df1
df2$y <- replace(df2$y, sample(nrow(df2), size = 1500000), NA)
#' Aggregate a data frame per (h, sp) pair.
#'
#' @param df A data frame with columns `h`, `sp`, `y` and `r`.
#' @return One row per distinct (h, sp) combination with `r` (sum of `r`),
#'   `min_y` and `max_y` (minimum and maximum of `y`; no `na.rm` is applied).
f <- function(df) {
  summarise(
    df,
    r = sum(r),
    min_y = min(y),
    max_y = max(y),
    .by = c(h, sp)
  )
}
# Time the aggregation on the NA-free data (df1), then on the copy whose y
# column contains NAs (df2).
system.time(f(df1))
system.time(f(df2))
性能
> system.time(f(df1))
user system elapsed
14.94 0.49 15.48
> system.time(f(df2))
user system elapsed
15.20 0.18 15.42