我有一组按标识符分组、彼此可能重叠的区间,想把它们拆分成新的区间,使得同一标识符内的任意两个新区间要么完全重合(即起点和终点都相同),要么完全不重叠。这些新区间合起来应恰好覆盖原始区间的范围,并且不包含任何不属于原始区间的范围。
这需要相对快速的操作,因为我正在处理大量数据。
以下是一些示例数据:
library(data.table)
# Reproducible sample data: eight closed intervals [start1, end1] split
# across two identifiers (id1), plus one random observation per interval.
set.seed(1113)
start1 <- c(1, 7, 9, 17, 18, 1, 3, 20)
end1 <- c(10, 12, 15, 20, 23, 3, 5, 25)
id1 <- c(1, 1, 1, 1, 1, 2, 2, 2)
# Draw one value per row; the length must come from `id1` (the snippet
# previously referenced `id`, which is never defined in this script).
obs <- rnorm(length(id1))
x <- data.table(start1, end1, id1, obs)
> x
start1 end1 id1 obs
1: 1 10 1 -0.79701638
2: 7 12 1 -0.09251333
3: 9 15 1 -0.08118742
4: 17 20 1 -2.33312797
5: 18 23 1 0.26581138
6: 1 3 2 -0.34314127
7: 3 5 2 -0.17196880
8: 20 25 2 0.11614842
输出应该是这样的:
id1 start1 end1 i.start1 i.end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 7 8 7 12 -0.09251333
4: 1 9 10 1 10 -0.79701638
5: 1 9 10 7 12 -0.09251333
6: 1 9 10 9 15 -0.08118742
7: 1 11 12 7 12 -0.09251333
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
我尝试直接编程,但速度很慢。
这是另一种选择。
# Idea borrowed from https://stackoverflow.com/a/28938694/1989480
# Expects `x` to have columns start, end, id and obs.
#
# Step 1: label runs of chained/overlapping intervals within each id.
# The group counter g increases whenever the next row's start is greater than
# the running maximum of end (cummax(end)); the first row of each id gets 0.
# NOTE(review): this presumably requires x to be ordered by start within id,
# as the sample data is -- confirm before using on unsorted input.
x[, g := c(0L, cumsum(shift(start, -1L) > cummax(end))[-.N]), by=.(id)]
# Step 2: cut every group into non-overlapping pieces.
# NOTE(review): the +/- 1L arithmetic assumes integer-valued, inclusive
# endpoints -- confirm for other data.
itvl <- x[, {
# candidate breakpoints: each start and end plus its neighbour one unit outside
s <- sort(c(start - 1L, start, end, end + 1L))
# keep the breakpoints inside the group's span and pair consecutive values
# row-wise, giving columns V1 (piece start) and V2 (piece end)
as.data.table(matrix(s[s %between% c(min(start), max(end))], ncol=2L, byrow=TRUE))
}, by=.(id, g)]
# Step 3: non-equi join -- attach to every piece each original interval of the
# same id with start <= V1 <= end, and return the columns under the names the
# question asked for. allow.cartesian=TRUE permits a piece to match several
# original rows.
x[itvl, on=.(id, start<=V1, end>=V1),
.(id1=id, start1=V1, end1=V2, i.start1=x.start, i.end1=x.end, obs),
allow.cartesian=TRUE]
输出:
id1 start1 end1 i.start1 i.end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 7 8 7 12 -0.09251333
4: 1 9 10 1 10 -0.79701638
5: 1 9 10 7 12 -0.09251333
6: 1 9 10 9 15 -0.08118742
7: 1 11 12 7 12 -0.09251333
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
数据:
library(data.table)
# Sample data for the snippet above: the same eight intervals as in the
# question, stored under the column names start / end / id.
set.seed(1113)
id <- rep(c(1, 2), times = c(5L, 3L))
x <- data.table(
  start = c(1, 7, 9, 17, 18, 1, 3, 20),
  end = c(10, 12, 15, 20, 23, 3, 5, 25),
  id = id,
  obs = rnorm(n = length(id))
)
这是我的解决方案。它基于此处的算法(https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541),但使用 data.table、shift 和向量化的 ifelse 语句来提高效率。与该算法的另一个不同之处在于,我的代码可以对由 id_column 标识的多组数据分别执行此操作。我的方法也不跟踪每个新区间所属的原始行(即“属性”),因为之后可以用 foverlaps 轻松地把新区间合并回原始数据,没有必要单独记录。
foverlaps 同时还用于排除间隙。
如果你发现任何低效之处,请告诉我。
#' Split possibly-overlapping intervals into identical-or-disjoint pieces
#'
#' Every endpoint of every interval is turned into a breakpoint, consecutive
#' breakpoints are paired into candidate pieces, and the pieces are joined
#' back to the original rows with `foverlaps()`. Endpoints are treated as
#' inclusive values on a unit grid: a piece that follows an interval end
#' starts at `end + 1`, and a piece that precedes an interval start ends at
#' `start - 1`. Pieces lying in a gap between intervals overlap no row of `x`
#' and therefore do not appear in the result.
#'
#' @param x A data.table holding the intervals. It is not modified.
#' @param start_column Name (length-one character) of the interval start
#'   column in `x`.
#' @param end_column Name (length-one character) of the interval end column.
#' @param id_column Optional character vector of grouping columns; intervals
#'   are only split against intervals of the same group. Must not be named
#'   `variable`, `value`, `end`, `end_next` or `value_next`, which are used
#'   for internal working columns.
#' @return The data.table produced by `foverlaps(x, pieces)`: one row per
#'   (original interval, overlapping piece). `start_column` / `end_column`
#'   hold the piece, the `i.`-prefixed columns hold the original interval.
#'   Rows are ordered by id, piece start, original start and original end.
remove_overlaps <- function(x, start_column, end_column, id_column = NULL) {
  stopifnot(
    is.data.table(x),
    is.character(start_column), length(start_column) == 1L,
    is.character(end_column), length(end_column) == 1L,
    is.null(id_column) || is.character(id_column)
  )
  interval_cols <- c(start_column, end_column)
  join_cols <- c(id_column, interval_cols)

  # Long format: one row per endpoint, flagged as start (FALSE) or end (TRUE).
  xd <- melt(
    x[, join_cols, with = FALSE],
    id.vars = id_column,
    measure.vars = interval_cols
  )
  set(xd, j = "end", value = xd[["variable"]] == end_column)

  # Within an id, walk the endpoints in increasing order; at equal values a
  # start sorts before an end (FALSE < TRUE).
  setorderv(xd, c(id_column, "value", "end"))
  xd[
    ,
    c("end_next", "value_next") := list(
      shift(end, type = "lead"),
      shift(value, type = "lead")
    ),
    by = id_column
  ]

  # The last endpoint of each id has no successor (lead is NA), so it cannot
  # open a piece. The lead columns were computed per id above, so this filter
  # needs no per-group `.SD` subsetting.
  segs <- xd[!is.na(end_next)]

  # Plain arithmetic instead of ifelse(): a piece after an end starts one unit
  # later, a piece before a start stops one unit earlier. Unlike ifelse(),
  # this keeps the storage type and attributes of the endpoint columns.
  set(
    segs,
    j = ".seg_start",
    value = segs[["value"]] + as.integer(segs[["end"]])
  )
  set(
    segs,
    j = ".seg_end",
    value = segs[["value_next"]] - as.integer(!segs[["end_next"]])
  )

  # Drop empty pieces (e.g. between two identical breakpoints).
  keep <- segs[[".seg_end"]] >= segs[[".seg_start"]]
  temp <- segs[keep, c(id_column, ".seg_start", ".seg_end"), with = FALSE]
  setnames(temp, c(".seg_start", ".seg_end"), interval_cols)
  setkeyv(temp, join_cols)

  # Join pieces back to the original rows. `by.x` is spelled out so that an
  # unrelated key already set on `x` cannot change the join columns.
  out <- foverlaps(x, temp, by.x = join_cols, by.y = join_cols)
  setorderv(out, c(id_column, start_column, paste0("i.", interval_cols)))
  out
}
# Example call: expects `x` to contain the columns start1, end1 and id1
# (the layout of the question's sample data).
remove_overlaps(x, start_column="start1",end_column="end1",id_column="id1")
此外,顺便一提,我认为上面链接的页面中关于如何排除间隙的那条建议并不正确。
这个答案没有考虑间隙(输出中不应出现间隙),所以我对它做了改进:* 如果 e = false,则将 a 加入 S;如果 e = true,则从 S 中移除 a。* 定义 n' = n(若 e = false)或 n' = n + 1(若 e = true)。* 定义 m' = m - 1(若 f = false)或 m' = m(若 f = true)。* 如果 n' <= m' 且 (e 且非 f) = false,则输出 (n', m', S),否则不输出。 —— silentman.it,2018年8月23日 12:19
以下是该算法在 R 中的第二个实现版本:remove_overlaps 没有显式采用 silentman.it 的建议来排除间隙,而 remove_overlaps1 采用了该建议。请注意,这两个函数都会通过随后的 foverlaps 调用来排除间隙,因为 foverlaps 只返回与 x(原始数据)中的区间至少部分重叠的新区间。
library(data.table)
#' Split overlapping intervals, dropping every "end followed by start" piece
#'
#' Variant of `remove_overlaps()` that additionally applies the rule "emit a
#' piece only if (e & !f) is FALSE", i.e. it discards every candidate piece
#' that begins after an interval end and stops before the next interval
#' start.
#'
#' WARNING: that rule is too aggressive. With chained overlaps the discarded
#' piece can still be covered by another interval that is open at that
#' point; e.g. for [14761, 14776] and [14775, 14790] the piece 14777-14788 is
#' dropped although the second interval covers it. The behaviour is kept
#' as-is because this function exists to demonstrate that rule; use
#' `remove_overlaps()` when complete coverage is required.
#'
#' @param x A data.table holding the intervals. It is not modified.
#' @param start_column Name (length-one character) of the interval start
#'   column in `x`.
#' @param end_column Name (length-one character) of the interval end column.
#' @param id_column Optional character vector of grouping columns. Must not
#'   be named `variable`, `value`, `end`, `end_next` or `value_next`, which
#'   are used for internal working columns.
#' @return The data.table produced by `foverlaps(x, pieces)`: one row per
#'   (original interval, overlapping piece), ordered by id, piece start,
#'   original start and original end.
remove_overlaps1 <- function(x, start_column, end_column, id_column = NULL) {
  stopifnot(
    is.data.table(x),
    is.character(start_column), length(start_column) == 1L,
    is.character(end_column), length(end_column) == 1L,
    is.null(id_column) || is.character(id_column)
  )
  interval_cols <- c(start_column, end_column)
  join_cols <- c(id_column, interval_cols)

  # Long format: one row per endpoint, flagged as start (FALSE) or end (TRUE).
  xd <- melt(
    x[, join_cols, with = FALSE],
    id.vars = id_column,
    measure.vars = interval_cols
  )
  set(xd, j = "end", value = xd[["variable"]] == end_column)

  # Within an id, walk the endpoints in increasing order; at equal values a
  # start sorts before an end (FALSE < TRUE).
  setorderv(xd, c(id_column, "value", "end"))
  xd[
    ,
    c("end_next", "value_next") := list(
      shift(end, type = "lead"),
      shift(value, type = "lead")
    ),
    by = id_column
  ]

  # Keep rows that have a successor AND for which (e & !f) is FALSE, as per
  # the comment suggestion on the linked answer (see WARNING above).
  segs <- xd[!is.na(end_next) & !(end & !end_next)]

  # Plain arithmetic instead of ifelse(): a piece after an end starts one unit
  # later, a piece before a start stops one unit earlier. Unlike ifelse(),
  # this keeps the storage type and attributes of the endpoint columns.
  set(
    segs,
    j = ".seg_start",
    value = segs[["value"]] + as.integer(segs[["end"]])
  )
  set(
    segs,
    j = ".seg_end",
    value = segs[["value_next"]] - as.integer(!segs[["end_next"]])
  )

  # Drop empty pieces (e.g. between two identical breakpoints).
  keep <- segs[[".seg_end"]] >= segs[[".seg_start"]]
  temp <- segs[keep, c(id_column, ".seg_start", ".seg_end"), with = FALSE]
  setnames(temp, c(".seg_start", ".seg_end"), interval_cols)
  setkeyv(temp, join_cols)

  # foverlaps() returns only pieces that overlap some row of `x`, so any
  # remaining gap pieces are excluded here as well. `by.x` is spelled out so
  # that an unrelated key already set on `x` cannot change the join columns.
  out <- foverlaps(x, temp, by.x = join_cols, by.y = join_cols)
  setorderv(out, c(id_column, start_column, paste0("i.", interval_cols)))
  out
}
示例数据:
library(data.table)
# Sample data: five intervals for a single native_id. Each interval starts
# 14 units after the previous one and ends 15 units after its own start, so
# neighbouring intervals overlap at their edges.
x <- data.frame(
  native_id = rep("1", 5L),
  n_start_date = c(14761, 14775, 14789, 14803, 14817),
  n_end_date = c(14776, 14790, 14804, 14818, 14832),
  obs = c(
    31.668140525481,
    34.8623263656539,
    35.0841466093899,
    37.2281249364127,
    36.3726151694052
  ),
  stringsAsFactors = FALSE
)
# Convert to a data.table by reference.
setDT(x)
> x
native_id n_start_date n_end_date obs
1: 1 14761 14776 31.66814
2: 1 14775 14790 34.86233
3: 1 14789 14804 35.08415
4: 1 14803 14818 37.22812
5: 1 14817 14832 36.37262
结果:
> remove_overlaps(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14777 14788 14775 14790 34.86233
5: 1 14789 14790 14775 14790 34.86233
6: 1 14789 14790 14789 14804 35.08415
7: 1 14791 14802 14789 14804 35.08415
8: 1 14803 14804 14789 14804 35.08415
9: 1 14803 14804 14803 14818 37.22812
10: 1 14805 14816 14803 14818 37.22812
11: 1 14817 14818 14803 14818 37.22812
12: 1 14817 14818 14817 14832 36.37262
13: 1 14819 14832 14817 14832 36.37262
remove_overlaps1 的结果看起来不正确,它排除了过多的区间(例如缺少 14777–14788 这一段):
> remove_overlaps1(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14789 14790 14775 14790 34.86233
5: 1 14789 14790 14789 14804 35.08415
6: 1 14803 14804 14789 14804 35.08415
7: 1 14803 14804 14803 14818 37.22812
8: 1 14817 14818 14803 14818 37.22812
9: 1 14817 14818 14817 14832 36.37262
10: 1 14819 14832 14817 14832 36.37262