# 计算每组重叠的日期间隔数

##### 问题描述（投票：0，回答：4）

我有如下数据框 ``df``，其中每个 ``group`` 包含若干日期区间（``dput`` 输出见下文）。我想统计每个组内相互重叠的区间对的数量：

``````> df
group       from         to
1      A 2023-03-01 2023-03-02
2      A 2023-03-01 2023-03-03
3      A 2023-03-03 2023-03-07
4      A 2023-03-05 2023-03-08
5      A 2023-03-09 2023-03-10
6      A 2023-03-11 2023-03-11
7      B 2023-03-01 2023-03-02
8      B 2023-03-04 2023-03-06
9      B 2023-03-07 2023-03-07
10     B 2023-03-08 2023-03-11
11     B 2023-03-10 2023-03-12
12     B 2023-03-15 2023-03-16
``````

``````  group overlaying_intervals
1     A                    3
2     B                    1
``````

``df`` 的 ``dput`` 输出：

``````# Example data: 12 date intervals, six in group "A" and six in group "B".
# `from` and `to` are stored as "YYYY-MM-DD" character strings (not Date),
# and the row names are the character labels "1" through "12".
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "B", "B",
"B", "B", "B", "B"), from = c("2023-03-01", "2023-03-01", "2023-03-03",
"2023-03-05", "2023-03-09", "2023-03-11", "2023-03-01", "2023-03-04",
"2023-03-07", "2023-03-08", "2023-03-10", "2023-03-15"), to = c("2023-03-02",
"2023-03-03", "2023-03-07", "2023-03-08", "2023-03-10", "2023-03-11",
"2023-03-02", "2023-03-06", "2023-03-07", "2023-03-11", "2023-03-12",
"2023-03-16")), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
``````
r dataframe dplyr intervals
##### 4个回答
3

``````library(lubridate)
library(dplyr)
library(purrr)

# Count, per group, the unordered pairs of [from, to] intervals that overlap.
df %>%
  group_by(group) %>%
  mutate(
    span = interval(from, to),
    # For every interval, tally the intervals of its group that it overlaps,
    # then remove the match of the interval with itself.
    n_others = map_int(span, \(one) sum(int_overlaps(one, span))) - 1
  ) %>%
  # Each overlapping pair was tallied once from each of its two members,
  # so halve the total.
  summarize(overlaying_intervals = sum(n_others) / 2)
#> # A tibble: 2 × 2
#>   group overlaying_intervals
#>   <chr>                <dbl>
#> 1 A                        3
#> 2 B                        1
``````

2

``````by(df, df$group, \(x) {
  # Count the unordered pairs of overlapping [from, to] date ranges in one group.
  # Dates are converted to day numbers so they can be compared numerically.
  start <- as.numeric(as.Date(x$from))
  end <- as.numeric(as.Date(x$to))
  # starts_by_end[i, j] is TRUE when interval i starts on or before the day
  # interval j ends.
  starts_by_end <- outer(start, end, `<=`)
  # Two closed intervals share at least one day exactly when each of them
  # starts no later than the other one ends. Comparing endpoints avoids
  # expanding every interval into its days with apply(), whose result
  # collapses to a matrix when all intervals have the same length.
  hits <- starts_by_end & t(starts_by_end)
  # An interval always overlaps itself; do not count that.
  diag(hits) <- FALSE
  # The matrix is symmetric, so every pair appears twice.
  sum(hits) / 2
}) |> as.table() |> as.data.frame()
#   df.group Freq
# 1        A    3
# 2        B    1
``````

数据：

``````# Example data used by the snippet above: 12 date intervals, six per group
# ("A" and "B"); `from` and `to` are "YYYY-MM-DD" character strings.
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "B", "B",
"B", "B", "B", "B"), from = c("2023-03-01", "2023-03-01", "2023-03-03",
"2023-03-05", "2023-03-09", "2023-03-11", "2023-03-01", "2023-03-04",
"2023-03-07", "2023-03-08", "2023-03-10", "2023-03-15"), to = c("2023-03-02",
"2023-03-03", "2023-03-07", "2023-03-08", "2023-03-10", "2023-03-11",
"2023-03-02", "2023-03-06", "2023-03-07", "2023-03-11", "2023-03-12",
"2023-03-16")), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
``````

2

使用 ``data.table`` 的 ``foverlaps``：

``````setDT(df)
# For each group, count the unordered pairs of rows whose [from, to] ranges
# overlap, and return a two-column result: group label, then count.
df[, lapply(.SD, as.IDate), group] |> # parse from/to as IDate, keeping group
  setkey(from, to) |> # key on the interval columns used by foverlaps()
  split(by = "group") |>
  lapply(\(grp) {
    # which = TRUE yields the index pairs (xid, yid) of overlapping rows;
    # xid < yid drops self-matches and keeps each pair only once.
    foverlaps(grp, grp, which = TRUE)[xid < yid, .N]
  }) |>
  stack() |>
  rev() # stack() gives (values, ind); reverse to (ind, values)
``````

``````  ind values
1   A      3
2   B      1
``````

1

我会使用 ``ivs::iv_count_overlaps()`` 来解决这个问题。

ivs 是一个专门用于处理区间的软件包，因此非常适合这个问题。

请注意，ivs 假定区间是左闭右开的 ``[ )``，所以你需要在你的 ``to`` 日期上加 1：

``````library(dplyr, warn.conflicts = FALSE)
library(ivs)

# Example data: six date intervals for each of the groups "A" and "B".
# `from` and `to` are kept as "YYYY-MM-DD" strings at this point.
df <- tibble::tibble(
  group = rep(c("A", "B"), each = 6L),
  from = c(
    "2023-03-01", "2023-03-01", "2023-03-03",
    "2023-03-05", "2023-03-09", "2023-03-11",
    "2023-03-01", "2023-03-04", "2023-03-07",
    "2023-03-08", "2023-03-10", "2023-03-15"
  ),
  to = c(
    "2023-03-02", "2023-03-03", "2023-03-07",
    "2023-03-08", "2023-03-10", "2023-03-11",
    "2023-03-02", "2023-03-06", "2023-03-07",
    "2023-03-11", "2023-03-12", "2023-03-16"
  )
)

# Replace the `from`/`to` strings with a single ivs interval column.
# The end is shifted by one day so that the inclusive `to` date falls inside
# the right-open range built by iv(); .keep = "unused" then drops the two
# source columns because they were consumed by the expression.
df <- df %>%
  mutate(
    range = iv(as.Date(from), as.Date(to) + 1L),
    .keep = "unused"
  )

df
#> # A tibble: 12 × 2
#>    group                    range
#>    <chr>               <iv<date>>
#>  1 A     [2023-03-01, 2023-03-03)
#>  2 A     [2023-03-01, 2023-03-04)
#>  3 A     [2023-03-03, 2023-03-08)
#>  4 A     [2023-03-05, 2023-03-09)
#>  5 A     [2023-03-09, 2023-03-11)
#>  6 A     [2023-03-11, 2023-03-12)
#>  7 B     [2023-03-01, 2023-03-03)
#>  8 B     [2023-03-04, 2023-03-07)
#>  9 B     [2023-03-07, 2023-03-08)
#> 10 B     [2023-03-08, 2023-03-12)
#> 11 B     [2023-03-10, 2023-03-13)
#> 12 B     [2023-03-15, 2023-03-17)

# Per group, for every range count how many ranges of the group it overlaps:
# - the `- 1L` removes the overlap of each range with itself
# - the `/ 2` undoes the double counting of each pair (once per member)
df %>%
  summarise(
    count = sum(iv_count_overlaps(range, range) - 1L) / 2,
    .by = group
  )
#> # A tibble: 2 × 2
#>   group count
#>   <chr> <dbl>
#> 1 A         3
#> 2 B         1
``````