我有一个带有日期和“组”的记录数据框。我想找到满足以下限制的记录的每种组合:
我的数据足够小,我应该能够找到每个组合。如果我可以在创建组合时应用约束,那么它应该保持大小易于管理。但到目前为止我只能创建所有组合然后进行过滤,这非常慢。
我猜
data.table
会有用吗?以下是 MRE:
library(tidyverse)
set.seed(0)
#Set some parameters to be used later
n_rows <- 1000
n_elig_groups <- 4
max_date_diff <- 8
#Create a data frame with id, date & group
df <- tibble(
id = 1:n_rows,
date = sample(seq.Date(ymd(20230101), ymd(20231231), by = "day"),
n_rows,
replace = TRUE),
group = rep(letters, length.out = n_rows)
)
#What are the eligible groups?
elig_groups <- letters[sample(1:26, n_elig_groups)]
#Find every combination of records within the eligible groups where:
#There is one and only one record from each group.
#Each record within the combination has a distinct date.
#All records within a combination are within x days of each other.
#Bad approach: this is technically correct but super inefficient
#Create every possible combination of records within the eligible groups
combos <- df %>%
filter(group == elig_groups[1]) %>%
select(-group) %>%
rename_all(function(x){paste0(x, "_", elig_groups[1])})
for(group_i in elig_groups[-1]){
combos <- combos %>%
expand_grid({
df %>%
filter(group == group_i) %>%
select(-group) %>%
rename_all(function(x){paste0(x, "_", group_i)})
})
}
#Now find the combinations which meet our constraints
valid_combos <- combos %>%
#First, pivot to get a row for every record
mutate(combo_id = row_number()) %>%
pivot_longer(-combo_id,
names_to = c(".value", "group"),
names_sep = "_") %>%
#Apply our constraints by combo
group_by(combo_id) %>%
filter(n_distinct(date) == n_elig_groups,
max(date) - min(date) <= max_date_diff) %>%
ungroup()
逻辑并没有更好(进行所有组合并保留好的组合),但是使用 data.table 它似乎更快..
library(data.table)
setDT(df)
dfg <- df[group %in% elig_groups]
df2 <- dfg[group == elig_groups[1]]
df2[, one := 1]
for (i in elig_groups[-1]){
current <- dfg[group == i][ , c(paste0("id", "_", i),
paste0("group", "_", i),
paste0("date", "_", i),
"one")
:= .(id, group, date, 1)][, 4:7]
df2 <- merge(df2,
current,
by = "one",
all = TRUE,
allow.cartesian = TRUE)
}
uni <- function (...){
x <- list(...)
sum(!duplicated(x))
}
date.cols <- names(which(sapply(df2,is.Date)))
df2[, date_max := do.call(pmax, .SD), .SDcols = date.cols]
df2[, date_min := do.call(pmin, .SD), .SDcols = date.cols]
valid_combos2 <- df2[date_max - date_min <= days(max_date_diff)]
valid_combos2[, n_date := do.call(uni, .SD), by = seq_len(nrow(valid_combos2)), .SDcols = date.cols]
valid_combos2 <- valid_combos2[n_date == 4]
更一般地说,对于最后一行:
valid_combos2 <- valid_combos2[n_date == length(elig_groups)]
在日期上使用
data.table
非等值连接形成有效的组合从属关系,然后过滤 id
和 group
,然后使用 igraph::cliques
查找大小 4 的所有组合:
library(data.table)
library(igraph)
combos <- matrix(
as.integer(
names(
unlist(
cliques(
graph_from_data_frame(
setDT(df)[,datePlus := date + max_date_diff][
df,
on = .(date > date, date <= datePlus),
.(id, i.id, group, i.group)
][
id != i.id & group != i.group
][,3:4 := NULL],
FALSE
),
n_elig_groups,
n_elig_groups
)
)
)
),
ncol = 4, byrow = TRUE
)
有近1M种组合:
dim(combos)
#> [1] 954110 4
第一个组合:
df[combos[1,], 1:3]
#> id date group
#> 1: 174 2023-05-06 r
#> 2: 184 2023-05-07 b
#> 3: 758 2023-05-05 d
#> 4: 783 2023-05-04 c