找到满足约束条件的每个组合

问题描述 投票:0回答:2

我有一个带有日期和“组”的记录数据框。我想找到满足以下限制的记录的每种组合:

  1. 每组只有一条记录。
  2. 组合中的每条记录都有不同的日期。
  3. 组合内的所有记录彼此之间的时间间隔在 x 天内。

我的数据足够小,我应该能够找到每个组合。如果我可以在创建组合时应用约束,那么它应该保持大小易于管理。但到目前为止我只能创建所有组合然后进行过滤,这非常慢。

我猜

data.table
会有用吗?以下是 MRE:

library(tidyverse)
set.seed(0)

#Set some parameters to be used later
n_rows <- 1000
n_elig_groups <- 4
max_date_diff <- 8


#Create a data frame with id, date & group
df <- tibble(
  id = 1:n_rows,
  date = sample(seq.Date(ymd(20230101), ymd(20231231), by = "day"),
                n_rows,
                replace = TRUE),
  group = rep(letters, length.out = n_rows)
)


#What are the eligible groups?
elig_groups <- letters[sample(1:26, n_elig_groups)]


#Find every combination of records within the eligible groups where:
  #There is one and only one record from each group.
  #Each record within the combination has a distinct date.
  #All records within a combination are within x days of each other.


#Bad approach: this is technically correct but super inefficient

#Create every possible combination of records within the eligible groups
combos <- df %>% 
  filter(group == elig_groups[1]) %>% 
  select(-group) %>% 
  rename_all(function(x){paste0(x, "_", elig_groups[1])})
for(group_i in elig_groups[-1]){
  combos <- combos %>% 
    expand_grid({
      df %>% 
        filter(group == group_i) %>% 
        select(-group) %>% 
        rename_all(function(x){paste0(x, "_", group_i)})
    })
}

#Now find the combinations which meet our constraints
valid_combos <- combos %>% 
  #First, pivot to get a row for every record
  mutate(combo_id = row_number()) %>% 
  pivot_longer(-combo_id,
               names_to = c(".value", "group"),
               names_sep = "_") %>% 
  #Apply our constraints by combo
  group_by(combo_id) %>% 
  filter(n_distinct(date) == n_elig_groups,
         max(date) - min(date) <= max_date_diff) %>% 
  ungroup()
r dataframe dplyr data.table combinations
2个回答
1
投票

逻辑并没有更好(进行所有组合并保留好的组合),但是使用 data.table 它似乎更快..


library(data.table)
setDT(df)
dfg <- df[group %in% elig_groups]
df2 <- dfg[group == elig_groups[1]]
df2[, one := 1]
for (i in elig_groups[-1]){
  current <- dfg[group == i][ , c(paste0("id", "_", i), 
                                  paste0("group", "_", i),
                                  paste0("date", "_", i),
                                  "one")
                              := .(id, group, date, 1)][, 4:7]
  df2 <- merge(df2, 
               current,
               by = "one",
               all = TRUE,
               allow.cartesian = TRUE)
    
  }

uni <- function (...){
  x <- list(...)
  sum(!duplicated(x))
      }
date.cols <- names(which(sapply(df2,is.Date)))
df2[, date_max := do.call(pmax, .SD), .SDcols = date.cols]
df2[, date_min := do.call(pmin, .SD), .SDcols = date.cols]

valid_combos2 <- df2[date_max - date_min <= days(max_date_diff)]
valid_combos2[, n_date := do.call(uni, .SD), by = seq_len(nrow(valid_combos2)),  .SDcols = date.cols]
valid_combos2 <- valid_combos2[n_date == 4]

更一般地说,对于最后一行:

valid_combos2 <- valid_combos2[n_date == length(elig_groups)]


0
投票

在日期上使用

data.table
非等值连接形成有效的组合从属关系,然后过滤
id
group
,然后使用
igraph::cliques
查找大小 4 的所有组合:

library(data.table)
library(igraph)

combos <- matrix(
  as.integer(
    names(
      unlist(
        cliques(
          graph_from_data_frame(
            setDT(df)[,datePlus := date + max_date_diff][
              df,
              on = .(date > date, date <= datePlus),
              .(id, i.id, group, i.group)
            ][
              id != i.id & group != i.group
            ][,3:4 := NULL],
            FALSE
          ),
          n_elig_groups,
          n_elig_groups
        )
      )
    )
  ),
  ncol = 4, byrow = TRUE
)

有近1M种组合:

dim(combos)
#> [1] 954110      4

第一个组合:

df[combos[1,], 1:3]
#>     id       date group
#> 1: 174 2023-05-06     r
#> 2: 184 2023-05-07     b
#> 3: 758 2023-05-05     d
#> 4: 783 2023-05-04     c
© www.soinside.com 2019 - 2024. All rights reserved.