我想加速 data.table 中的这段代码。我想我可以做得更好。 这是典型的向后查找最后一个 TRUE 谓词,然后计算当前行的时间。这一切都是以小组为单位的。 我正在运行 300.000 行和 200.000 个组,并且需要计算许多这样的列(指标)。因此,速度对我来说很重要。
我做了一个较小的数据示例:
data <- data.table(
SYSKEY = c(
12, 13, 14, 15, 20,
22, 21, 24, 25, 26
),
Customer = c(
"John", "John", "John", "Tom", "Tom",
"Tom", "Sally", "Sally", "Sally", "Sally"
),
TRAN_DATTIM = as.Date(
c(
"28-02-2024", "28-02-2024", "02-03-2024", "02-03-2024", "02-03-2024",
"02-03-2024", "02-03-2024", "02-03-2024", "03-03-2024", "03-03-2024"
),
format="%d-%m-%Y", origin="01-01-1900"
),
Product = c(
"Eggs", "Milk", "Bread", "Butter","Eggs",
"Milk", "Bread", "Butter", "Eggs", "Wine"
)
)
我的函数和执行代码:
library(data.table)
build_recency <- function(
data,
name,
predicate,
aggregated_fun = "TIME",
gamma = 0.0001,
rolling_over
){
UseMethod("build_recency")
}
build_recency.data.table <- function(
data,
name,
predicate,
aggregated_fun = "TIME",
gamma = 0.01,
rolling_over = "PAN"
){
predicate <- enexpr(predicate)
data[,
PRED := fifelse(eval(predicate) == TRUE, 1L, 0L)
]
setorderv(
data,
c(rolling_over, "TRAN_DATTIM", "SYSKEY")
)
# make a left-join, that trail last TRUE predicate PRED in their
# rolling over's group
data[,c(
.SD[
# last timestamp with PRED==TRUE
PRED == 1, .(TRAN_DATTIM, SYSKEY)
][
.SD,
# assure that do not get itself
on = .(TRAN_DATTIM <= TRAN_DATTIM, SYSKEY < SYSKEY),
# in that cartesian product, get the latest.
# DT must be ordered by TRAN_DATTIM!
mult = "last",
# j
# return all .SD cols, plus new column x.TRAN_DATTIM
# I help me with env argument.
# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-programming.html#:~:text=Substituting%20lists%20of%20arbitrary%20length
cols,
env = list(
cols = I(c(colnames(.SD), "x.TRAN_DATTIM"))
)
]
),
by = rolling_over
][,
# building recency and cleaning auxiliar vars
`:=` (
name = {
x <- as.numeric(
difftime(
TRAN_DATTIM,
x.TRAN_DATTIM,
units = "days"
)
)
if(aggregated_fun == "EXP")
x <- exp(-gamma * x)
x
},
x.TRAN_DATTIM = NULL,
PRED = NULL
),
env = list(
gamma = I(gamma),
name = name
)
]
}
data2 <- build_recency(
data = data,
name = "RECENCY",
predicate = if_else(
Product == 'Eggs', TRUE, FALSE
),
gamma = 0.001,
rolling_over = 'Customer'
)
data2[]
预期结果:
Customer SYSKEY TRAN_DATTIM Product RECENCY
<char> <num> <Date> <char> <num>
1: John 12 2024-02-28 Eggs NA
2: John 13 2024-02-28 Milk 0
3: John 14 2024-03-02 Bread 3
4: Sally 21 2024-03-02 Bread NA
5: Sally 24 2024-03-02 Butter NA
6: Sally 25 2024-03-03 Eggs NA
7: Sally 26 2024-03-03 Wine 0
8: Tom 15 2024-03-02 Butter NA
9: Tom 20 2024-03-02 Eggs NA
10: Tom 22 2024-03-02 Milk 0
不是 data.table,但您可以使用混合包来非常快速地实现此目的。
library(tidyverse)
library(timeplyr)
library(collapse)
data %>%
mutate(id = fcumsum(Product == "Eggs", g = Customer)) %>%
mutate(RECENCY = time_elapsed(TRAN_DATTIM, "days", rolling = TRUE, g = pick(Customer, id))) %>%
arrange(Customer) %>%
mutate(RECENCY = if_else(id == 0, NA, RECENCY))
#> # A tibble: 10 × 6
#> SYSKEY Customer TRAN_DATTIM Product id RECENCY
#> <dbl> <chr> <date> <chr> <int> <dbl>
#> 1 12 John 2024-02-28 Eggs 1 NA
#> 2 13 John 2024-02-28 Milk 1 0
#> 3 14 John 2024-03-02 Bread 1 3
#> 4 21 Sally 2024-03-02 Bread 0 NA
#> 5 24 Sally 2024-03-02 Butter 0 NA
#> 6 25 Sally 2024-03-03 Eggs 1 NA
#> 7 26 Sally 2024-03-03 Wine 1 0
#> 8 15 Tom 2024-03-02 Butter 0 NA
#> 9 20 Tom 2024-03-02 Eggs 1 NA
#> 10 22 Tom 2024-03-02 Milk 1 0
创建于 2024-05-23,使用 reprex v2.1.0