我如何对数据表中特定列的不同子集取平均值?

问题描述 投票:3回答:5

给出一个示例数据框:

dt <- data.table(value=1:10,start=c(1,4,5,8,6,3,2,1,9,4),finish=c(3,7,8,9,10,10,4,10,10,8))

我想添加一个新列,其名称可能为mean_column。此列的第i行应具有值

mean( value[ seq( from = start[i], to=finish[i] ) ] )

我正在处理的真实数据有2000万行,所以我需要找到一种快速的方法来进行此计算。

编辑:data.table中的value列不必像示例中那样是有序序列。此列中的每个值都可以为正数。

r dataframe dplyr data.table
5个回答
3
投票

这是一种使用中的非等值联接的方法。

dt <- data.table(value=c(10,1:9),start=c(1,4,5,8,6,3,2,1,9,4),finish=c(3,7,8,9,10,10,4,10,10,8))
dt[, id := .I]

dt[dt,
   on = .(id >= start,
          id <= finish),
   .(i.id, i.value, mean_col = mean(x.value)),
   by = .EACHI,
   allow.cartesian = T]

       id    id  i.id i.value mean_col
    <int> <int> <int>   <num>    <num>
 1:     1     3     1      10 4.333333
 2:     4     7     2       1 4.500000
 3:     5     8     3       2 5.500000
 4:     8     9     4       3 7.500000
 5:     6    10     5       4 7.000000
 6:     3    10     6       5 5.500000
 7:     2     4     7       6 2.000000
 8:     1    10     8       7 5.500000
 9:     9    10     9       8 8.500000
10:     4     8    10       9 5.000000

尝试2,000,000行,这在我的计算机上花费4秒钟,并提供与@ jay.sf相同的答案。

n = 2e6
dt <- data.table(value = sample(1000L, n, TRUE), start = sample(n, n, TRUE))
dt[, finish := start + sample(30, n, TRUE)]
dt[finish > n, finish := n]

system.time({
dt[, id := .I]
  dt[dt,
     on = .(id >= start,
            id <= finish),
     .(i.id, i.value, mean_col = mean(x.value)),
     by = .EACHI,
     allow.cartesian = T]
})

##    user  system elapsed 
##   3.78    0.01    3.69 

#jay.sf base approach
system.time({
  FUNV3 <- Vectorize(function(x, y) x:y)
dt$mean.column2 <- with(dt, sapply(FUNV3(start, finish), function(x) mean(value[x])))
})

##   user  system elapsed 
##  24.45    0.04   24.72 

all.equal(dt$mean.column2,   dt[dt,
                                on = .(id >= start,
                                       id <= finish),
                                .(i.id, i.value, mean_col = mean(x.value)),
                                by = .EACHI,
                                allow.cartesian = T]$mean_col)

##[1] TRUE

3
投票

您可以使用apply方法。 1e6行大约需要20秒。

dt$mean.column <- apply(dt[2:3], 1, function(x) 
  mean(dt$value[seq(x[1], x[2])]))
#    value start finish mean.column
# 1      1     1      3         2.0
# 2      2     4      7         5.5
# 3      3     5      8         6.5
# 4      4     8      9         8.5
# 5      5     6     10         8.0
# 6      6     3     10         6.5
# 7      7     2      4         3.0
# 8      8     1     10         5.5
# 9      9     9     10         9.5
# 10    10     4      8         6.0

大约要快[[30%,不过] >>,如果我们Vectorize seq函数是这样的:FUNV <- Vectorize(function(x, y) seq(x, y)) dt$mean.column2 <- with(dt, sapply(FUNV(start, finish), function(x) mean(value[x]))) stopifnot(all.equal(dt$mean.column, dt$mean.column2))

[Edit:

实际上,FUNV()可以使用比seq()更快的东西来[[改良]],例如seq.int:FUNV2 <- Vectorize(function(x, y) seq.int(x, y)) FUNV3 <- Vectorize(function(x, y) x:y) 这里是
microbenchmark:

microbenchmark::microbenchmark( apply=apply(df.L[2:3], 1, function(x) mean(df.L$value[seq(x[1], x[2])])), FUNV=with(df.L, sapply(FUNV(start, finish), function(x) mean(value[x]))), FUNV2=with(df.L, sapply(FUNV2(start, finish), function(x) mean(value[x]))), FUNV3=with(df.L, sapply(FUNV3(start, finish), function(x) mean(value[x]))), data.table={ ## see Cole's answer dt.L[, id := .I] dt.L[dt.L, on=.(id >= start, id <= finish), .(i.id, i.value, mean_col=mean(x.value)), by=.EACHI, allow.cartesian=T]}, times=3L) # Unit: seconds # expr min lq mean median uq max neval cld # apply 26.736665 26.740363 28.701785 26.744062 29.68435 32.624629 3 c # FUNV 24.983665 26.513645 28.007959 28.043625 29.52011 30.996587 3 c # FUNV2 15.371551 16.031383 16.848238 16.691215 17.58658 18.481949 3 b # FUNV3 14.156043 14.266123 14.436663 14.376203 14.57697 14.777744 3 b # data.table 2.138956 2.323735 2.426432 2.508515 2.57017 2.631825 3 a

经过测试:
library(data.table)
dt <- data.table(value=c(10, 1:9), start=c(1, 4, 5, 8, 6, 3, 2, 1, 9, 4), 
                 finish=c(3, 7, 8, 9, 10, 10, 4, 10, 10, 8))
df <- as.data.frame(df)
set.seed(42)
df.L <- df[sample(1:nrow(df), 1e6, replace=TRUE), ]
dt.L <- dt[sample(1:nrow(dt), 1e6, replace=TRUE), ]

这里是完成平均值的这一特定任务的另一种方法。 “ mean_column”的值可以计算为(running_sum[finish[i]] - running_sum[start[i] - 1]) / (finish[i] - start[i] + 1)
cs = cumsum(dt$value) # cumulative sum s = dt$start - 1 # starting indices - 1 f = dt$finish # ending indices # curent sums at all starting indices cs.s = s i = which(s > 0) cs.s[i] = cs[s] # current sums at all ending indices cs.f = cs[f] # subtract and divide (cs.f - cs.s) / (f - s) #[1] 2.0 5.5 6.5 8.5 8.0 6.5 3.0 5.5 9.5 6.0

时间代码:
library(data.table) set.seed(0L) nr <- 1e5L dt <- data.table(id=1L:nr, value=1L:nr, start=sample(nr, nr, TRUE), finish=sample(nr, nr, TRUE)) dt[, c("start", "finish") := .(pmin(start, finish), pmax(start, finish))] library(Rcpp) cppFunction(" NumericVector rngmean(IntegerVector start, IntegerVector finish, NumericVector value) { int sz = value.size(); int i, j; double sum = 0.0; NumericVector csum(sz); NumericVector res(sz); csum[0] = value[0]; for (i=1; i<sz; i++) { csum[i] = value[i] + csum[i-1]; } for (i=0; i<sz; i++) { if (start[i]==1) { res[i] = csum[finish[i] - 1]; } else { res[i] = (csum[finish[i] - 1] - csum[start[i] - 2]) / (finish[i] - start[i] + 1); } } return(res); } ") mtd0 <- function() { dt[dt, on=.(id>=start, id<=finish), allow.cartesian=TRUE, by=.EACHI, mean(x.value)]$V1 } mtd1 <- function() { dt[, { cs <- cumsum(as.numeric(value)) (cs[finish] - cs[start] + value[start]) / (finish - start + 1) }] } mtd2 <- function() { dt[, rngmean(start, finish, value)] } microbenchmark::microbenchmark(times=1L, mtd0(), mtd1(), mtd2())

时间:

Unit: milliseconds
   expr          min           lq         mean       median           uq          max neval
 mtd0() 17431.150342 17431.150342 17431.150342 17431.150342 17431.150342 17431.150342     1
 mtd1()     4.520483     4.520483     4.520483     4.520483     4.520483     4.520483     1
 mtd2()     2.466647     2.466647     2.466647     2.466647     2.466647     2.466647     1

并且当nr = 20e6时,

microbenchmark::microbenchmark(times=1L, mtd1(), mtd2())

时间:

Unit: milliseconds
   expr       min        lq      mean    median        uq       max neval
 mtd1() 1402.2282 1402.2282 1402.2282 1402.2282 1402.2282 1402.2282     1
 mtd2()  711.9264  711.9264  711.9264  711.9264  711.9264  711.9264     1

这里是基本的R解决方案。
您可以定义自定义函数f,然后使用apply()

f <- function(v,s,d) mean(v[s:d]) val_vector <- dt$value dt$mean <- apply(dt, 1, function(v) f(val_vector,v["start"],v["finish"]))

诸如此类

> dt
   value start finish mean
1      1     1      3  2.0
2      2     4      7  5.5
3      3     5      8  6.5
4      4     8      9  8.5
5      5     6     10  8.0
6      6     3     10  6.5
7      7     2      4  3.0
8      8     1     10  5.5
9      9     9     10  9.5
10    10     4      8  6.0

这对您有用吗?
library(tidyverse) dt <- data.table(value=1:10, start = c(1,4,5,8,6,3,2,1,9,4), finish = c(3,7,8,9,10,10,4,10,10,8)) dt %>% mutate(mean = (finish + start)/2)

3
投票

2
投票

0
投票
© www.soinside.com 2019 - 2024. All rights reserved.