我想构造一个“累计和”,计算在相关观察开始之前已结束的一组观察的数量。
我更喜欢使用dplyr的答案,但是优先考虑低内存开销,因为这是一个巨大的数据集。
下面的MWE,我要在其中创建的变量称为“ prior_ended_obs”。
mwe <- data.frame(group = c("a", "a", "a", "a", "a", "a", "a", "a", "a", "b", "b", "b"),
start = c("1/1/1990", "1/2/1990", "1/3/1990", "1/3/1990", "1/3/1990", "1/4/1990", "1/5/1990", "1/6/1990", "1/7/1990", "1/1/1990", "1/2/1990", "1/3/1990"),
end = c("1/2/1990", "1/2/1990", "1/3/1990", "1/4/1990", "1/5/1990", "1/5/1990", "1/5/1990", "1/6/1990", "1/8/1990", "1/1/1990", "1/2/1990", "1/3/1990"),
prior_ended_obs = c(0, 0, 2, 2, 2, 3, 4, 7, 8, 0, 1, 2)) %>%
mutate(start = mdy(start),
end = mdy(end)) %>%
group_by(group) %>%
mutate(lag_end = lag(end),
ones = 1,
cumsum = cumsum(ones)-1)
尝试:
mwe <- data.frame(group = c("a", "a", "a", "a", "a", "a", "a", "a", "a", "b", "b", "b"),
start = c("1/1/1990", "1/2/1990", "1/3/1990", "1/3/1990", "1/3/1990", "1/4/1990", "1/5/1990", "1/6/1990", "1/7/1990", "1/1/1990", "1/2/1990", "1/3/1990"),
end = c("1/2/1990", "1/2/1990", "1/3/1990", "1/4/1990", "1/5/1990", "1/5/1990", "1/5/1990", "1/6/1990", "1/8/1990", "1/1/1990", "1/2/1990", "1/3/1990"),
prior_ended_obs = c(0, 0, 2, 2, 2, 3, 4, 7, 8, 0, 1, 2)) %>%
mutate(start = mdy(start),
end = mdy(end)) %>%
group_by(group) %>%
mutate(
prior_ended_obs = sapply(1:n(), function(x) sum(end[1:(x-1)] < start[x], na.rm = TRUE))
)
输出:
# A tibble: 12 x 4
# Groups: group [2]
group start end prior_ended_obs
<fct> <date> <date> <int>
1 a 1990-01-01 1990-01-02 0
2 a 1990-01-02 1990-01-02 0
3 a 1990-01-03 1990-01-03 2
4 a 1990-01-03 1990-01-04 2
5 a 1990-01-03 1990-01-05 2
6 a 1990-01-04 1990-01-05 3
7 a 1990-01-05 1990-01-05 4
8 a 1990-01-06 1990-01-06 7
9 a 1990-01-07 1990-01-08 8
10 b 1990-01-01 1990-01-01 0
11 b 1990-01-02 1990-01-02 1
12 b 1990-01-03 1990-01-03 2