我的目标是：当每个 ID 具有多个可能相互重叠的曝光范围时，为每个 ID 定义唯一的、互不重叠的范围（或时间间隔）。我发现 R 包 “IntervalSurgeon” 中的 “flatten” 函数可以完成这项任务。我的问题是：如何以 “data.table” 的方式高效地完成同样的任务，并得到相同的 “tab_out” 输出？
library(data.table)
library(IntervalSurgeon)
# Simulate K exposure intervals for each of N IDs, then collapse the
# overlapping intervals of every ID into non-overlapping ranges.
set.seed(2019)
N <- 3 # number of IDs
IDs <- paste0("ID", seq_len(N)) # unique IDs
K <- 4 # number of exposures per ID

# `starts` is drawn before `ends` so the random stream (and therefore the
# simulated data) is consumed in the same order as before.
DT <- data.table(
  IDs = rep(IDs, each = K),
  starts = sample(1:20, N * K, replace = TRUE)
)
DT[, ends := starts + sample(1:5, N * K, replace = TRUE)]

# Sort by reference instead of copying the table with DT[order(...)].
setorder(DT, IDs, starts)

# cbind() already yields a two-column matrix, so no extra as.matrix() is
# needed; the flattened matrix is turned into one row per merged interval.
tab_out <- DT[, as.data.table(flatten(cbind(starts, ends))), by = IDs]
DT
IDs starts ends
1: ID1 7 11
2: ID1 13 17
3: ID1 15 16
4: ID1 16 18
5: ID2 1 5
6: ID2 1 4
7: ID2 2 3
8: ID2 17 19
9: ID3 3 6
10: ID3 13 16
11: ID3 14 15
12: ID3 16 21
tab_out
IDs V1 V2
1: ID1 7 11
2: ID1 13 18
3: ID2 1 5
4: ID2 17 19
5: ID3 3 6
6: ID3 13 21
library( data.table )
library( intervals )
# Rebuild the example table from inline text; the first non-blank line of the
# literal supplies the column names (IDs, starts, ends).
DT <- fread(text = "
IDs starts ends
ID1 7 11
ID1 13 17
ID1 15 16
ID1 16 18
ID2 1 5
ID2 1 4
ID2 2 3
ID2 17 19
ID3 3 6
ID3 13 16
ID3 14 15
ID3 16 21")
code
#' Merge overlapping intervals into their union
#'
#' @param y A two-column table (or matrix) of interval bounds, start first and
#'   end second. It is coerced with `as.matrix()` and wrapped in
#'   `intervals::Intervals()`.
#' @return A data.table with one row per merged interval, as returned by
#'   `data.table::as.data.table()` on the `intervals::interval_union()` result.
myfun <- function(y) {
  merged <- intervals::interval_union(
    intervals::Intervals(as.matrix(y)),
    check_valid = TRUE
  )
  data.table::as.data.table(merged)
}

# Apply the merge separately for every ID. `.SDcols` pins the two bound
# columns so any additional column in DT cannot end up in the interval matrix.
DT[, myfun(.SD), by = .(IDs), .SDcols = c("starts", "ends")]
#    IDs V1 V2
# 1: ID1  7 11
# 2: ID1 13 18
# 3: ID2  1  5
# 4: ID2 17 19
# 5: ID3  3  6
# 6: ID3 13 21