这里是microbenchmark:
microbenchmark::microbenchmark(
apply=apply(df.L[2:3], 1, function(x) mean(df.L$value[seq(x[1], x[2])])),
FUNV=with(df.L, sapply(FUNV(start, finish), function(x) mean(value[x]))),
FUNV2=with(df.L, sapply(FUNV2(start, finish), function(x) mean(value[x]))),
FUNV3=with(df.L, sapply(FUNV3(start, finish), function(x) mean(value[x]))),
data.table={ ## see Cole's answer
dt.L[, id := .I]
dt.L[dt.L, on=.(id >= start, id <= finish), .(i.id, i.value, mean_col=mean(x.value)),
by=.EACHI, allow.cartesian=T]},
times=3L)
# Unit: seconds
# expr min lq mean median uq max neval cld
# apply 26.736665 26.740363 28.701785 26.744062 29.68435 32.624629 3 c
# FUNV 24.983665 26.513645 28.007959 28.043625 29.52011 30.996587 3 c
# FUNV2 15.371551 16.031383 16.848238 16.691215 17.58658 18.481949 3 b
# FUNV3 14.156043 14.266123 14.436663 14.376203 14.57697 14.777744 3 b
# data.table 2.138956 2.323735 2.426432 2.508515 2.57017 2.631825 3 a
经过测试:library(data.table)
dt <- data.table(value=c(10, 1:9), start=c(1, 4, 5, 8, 6, 3, 2, 1, 9, 4),
finish=c(3, 7, 8, 9, 10, 10, 4, 10, 10, 8))
df <- as.data.frame(df)
set.seed(42)
df.L <- df[sample(1:nrow(df), 1e6, replace=TRUE), ]
dt.L <- dt[sample(1:nrow(dt), 1e6, replace=TRUE), ]
这里是完成平均值的这一特定任务的另一种方法。 “ mean_column”的值可以计算为(running_sum[finish[i]] - running_sum[start[i] - 1]) / (finish[i] - start[i] + 1)
:
cs = cumsum(dt$value) # cumulative sum
s = dt$start - 1 # starting indices - 1
f = dt$finish # ending indices
# curent sums at all starting indices
cs.s = s
i = which(s > 0)
cs.s[i] = cs[s]
# current sums at all ending indices
cs.f = cs[f]
# subtract and divide
(cs.f - cs.s) / (f - s)
#[1] 2.0 5.5 6.5 8.5 8.0 6.5 3.0 5.5 9.5 6.0
时间代码:
library(data.table)
set.seed(0L)
nr <- 1e5L
dt <- data.table(id=1L:nr, value=1L:nr, start=sample(nr, nr, TRUE), finish=sample(nr, nr, TRUE))
dt[, c("start", "finish") := .(pmin(start, finish), pmax(start, finish))]
library(Rcpp)
cppFunction("
NumericVector rngmean(IntegerVector start, IntegerVector finish, NumericVector value) {
int sz = value.size();
int i, j;
double sum = 0.0;
NumericVector csum(sz);
NumericVector res(sz);
csum[0] = value[0];
for (i=1; i<sz; i++) {
csum[i] = value[i] + csum[i-1];
}
for (i=0; i<sz; i++) {
if (start[i]==1) {
res[i] = csum[finish[i] - 1];
} else {
res[i] = (csum[finish[i] - 1] - csum[start[i] - 2]) / (finish[i] - start[i] + 1);
}
}
return(res);
}
")
mtd0 <- function() {
dt[dt, on=.(id>=start, id<=finish), allow.cartesian=TRUE, by=.EACHI, mean(x.value)]$V1
}
mtd1 <- function() {
dt[, {
cs <- cumsum(as.numeric(value))
(cs[finish] - cs[start] + value[start]) / (finish - start + 1)
}]
}
mtd2 <- function() {
dt[, rngmean(start, finish, value)]
}
microbenchmark::microbenchmark(times=1L, mtd0(), mtd1(), mtd2())
时间:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0() 17431.150342 17431.150342 17431.150342 17431.150342 17431.150342 17431.150342 1
mtd1() 4.520483 4.520483 4.520483 4.520483 4.520483 4.520483 1
mtd2() 2.466647 2.466647 2.466647 2.466647 2.466647 2.466647 1
并且当nr = 20e6时,
microbenchmark::microbenchmark(times=1L, mtd1(), mtd2())
时间:
Unit: milliseconds
expr min lq mean median uq max neval
mtd1() 1402.2282 1402.2282 1402.2282 1402.2282 1402.2282 1402.2282 1
mtd2() 711.9264 711.9264 711.9264 711.9264 711.9264 711.9264 1
这里是基本的R解决方案。
您可以定义自定义函数f
,然后使用apply()
f <- function(v,s,d) mean(v[s:d])
val_vector <- dt$value
dt$mean <- apply(dt, 1, function(v) f(val_vector,v["start"],v["finish"]))
诸如此类
> dt
value start finish mean
1 1 1 3 2.0
2 2 4 7 5.5
3 3 5 8 6.5
4 4 8 9 8.5
5 5 6 10 8.0
6 6 3 10 6.5
7 7 2 4 3.0
8 8 1 10 5.5
9 9 9 10 9.5
10 10 4 8 6.0
这对您有用吗?
library(tidyverse)
dt <- data.table(value=1:10,
start = c(1,4,5,8,6,3,2,1,9,4),
finish = c(3,7,8,9,10,10,4,10,10,8))
dt %>% mutate(mean = (finish + start)/2)