我有一个 R 数据框,其中包含来自机构的数据。访客 (ID) 进入机构 (CREATE),然后被叫到不同的房间 (CALL),并离开这些房间 (DISCHARGE)。现在,我想计算某人在机构中的总体停留时间、他们在一个房间中的停留时间以及他们在两个房间之间等待的时间。我创建了一个示例 data.frame,其中最后三列已经包含我最终想要实现的所需结果。这些列在实际数据框中不存在。
对于
diff_since_create
,它只需要计算 ID 中每一行自 CREATE 以来的时间。
变量 time_in_room
应代表自上次“CALL”事件以来 id 的分钟数。这表示访客在房间内停留的时间。对于初始“CALL”,该时间从 0 开始,以“DISCHARGE”事件结束。
变量
time_waiting
应表示自上次“DISCHARGE”事件以来 id 的分钟数。这表示访客(通过 ID 标识)在房间外花费的时间。
“CALL”和“DISCHARGE”事件之间,或者“DISCHARGE”和“CALL”事件之间可以出现多行。
有人知道如何用 R 计算最后 3 列吗?我将非常感谢您的帮助!
# Sample data (dput() output), bound to `df` so that code referring to `df`
# can be run directly against it.
# Columns: id (visitor), time (POSIXct event time), action
# ("create" / "call" / "work" / "discharge"), room, followed by three columns
# (diff_since_create, time_in_room, time_waiting) holding the desired results
# as written by the author; time_in_room is a character column using "na".
df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), time = structure(c(1713164400,
1713164700, 1713165000, 1713165600, 1713165720, 1713165780, 1713166200,
1713166500, 1713167100, 1713164400, 1713164700, 1713165000, 1713165600,
1713166020, 1713166200, 1713166500, 1713166620, 1713167100), class = c("POSIXct",
"POSIXt"), tzone = ""), action = c("create", "call", "discharge",
"call", "work", "work", "discharge", "call", "discharge", "create",
"call", "discharge", "call", "work", "discharge", "call", "work",
"discharge"), room = c("", "room 1", "room 1", "room 2", "room 2",
"room 2", "room 2", "room 3", "room 3", "", "room 1", "room 1",
"room 2", "room 2", "room 2", "room 3", "room 3", "room 3"),
diff_since_create = c(0L, 5L, 10L, 20L, 22L, 23L, 30L, 35L,
45L, 0L, 5L, 10L, 20L, 20L, 30L, 35L, 37L, 45L), time_in_room = c("na",
"0", "5", "0", "2", "3", "10", "0", "10", "na", "0", "5",
"0", "7", "10", "0", "2", "10"), time_waiting = c(0L, 5L,
0L, 10L, 0L, 0L, 0L, 5L, 0L, 0L, 5L, 0L, 10L, 0L, 0L, 5L,
0L, 0L)), row.names = c(NA, -18L), class = "data.frame")
# Compute the three requested measures, in minutes, and place each one right
# after the corresponding expected column so they can be compared side by side:
#   dsc - minutes since the earliest row of the same id
#   tir - minutes since the most recent "call" of the same id
#   tw  - on "call" rows, minutes since the previous "discharge"/"create" row
#         of the same id; 0 on every other row
# Assumes rows are ordered by time within each id.
df |>
  mutate(
    dsc = (time - min(time)) / lubridate::dminutes(),
    .by = id,
    .after = diff_since_create
  ) |>
  # Number the visits per id: the counter goes up at every "call", so a second
  # visit to the same room gets its own group (grouping by room would merge
  # both visits and measure from the first call into that room).
  mutate(visit = cumsum(action == "call"), .by = id) |>
  mutate(
    # Rows before the first "call" (visit 0, e.g. "create") are not in a room.
    tir = if_else(
      visit == 0L,
      NA_real_,
      (time - min(time)) / lubridate::dminutes()
    ),
    .by = c(id, visit),
    .after = time_in_room
  ) |>
  # Mark the time on rows that start a waiting period, then carry that time
  # forward within each id.
  mutate(
    last_disch = if_else(
      action %in% c("discharge", "create"),
      time,
      lubridate::NA_POSIXct_
    )
  ) |>
  group_by(id) |>
  fill(last_disch) |>
  ungroup() |>
  mutate(
    tw = if_else(
      action == "call",
      (time - last_disch) / lubridate::dminutes(),
      0
    ),
    .after = time_waiting
  ) |>
  # Drop the helper columns.
  select(-last_disch, -visit)
结果(这大部分匹配。我认为第 14 行 diff_since_create 应该是 27。)
# A tibble: 18 × 10
id time action room diff_since_create dsc time_in_room tir time_waiting tw
<int> <dttm> <chr> <chr> <int> <dbl> <chr> <dbl> <int> <dbl>
1 1 2024-04-15 00:00:00 create "" 0 0 na NA 0 0
2 1 2024-04-15 00:05:00 call "room 1" 5 5 0 0 5 5
3 1 2024-04-15 00:10:00 discharge "room 1" 10 10 5 5 0 0
4 1 2024-04-15 00:20:00 call "room 2" 20 20 0 0 10 10
5 1 2024-04-15 00:22:00 work "room 2" 22 22 2 2 0 0
6 1 2024-04-15 00:23:00 work "room 2" 23 23 3 3 0 0
7 1 2024-04-15 00:30:00 discharge "room 2" 30 30 10 10 0 0
8 1 2024-04-15 00:35:00 call "room 3" 35 35 0 0 5 5
9 1 2024-04-15 00:45:00 discharge "room 3" 45 45 10 10 0 0
10 2 2024-04-15 00:00:00 create "" 0 0 na NA 0 0
11 2 2024-04-15 00:05:00 call "room 1" 5 5 0 0 5 5
12 2 2024-04-15 00:10:00 discharge "room 1" 10 10 5 5 0 0
13 2 2024-04-15 00:20:00 call "room 2" 20 20 0 0 10 10
14 2 2024-04-15 00:27:00 work "room 2" 20 27 7 7 0 0
15 2 2024-04-15 00:30:00 discharge "room 2" 30 30 10 10 0 0
16 2 2024-04-15 00:35:00 call "room 3" 35 35 0 0 5 5
17 2 2024-04-15 00:37:00 work "room 3" 37 37 2 2 0 0
18 2 2024-04-15 00:45:00 discharge "room 3" 45 45 10 10 0 0
一种略有不同的 tidyverse 写法,使用 dplyr::lag:
library(tidyverse)
conflicted::conflict_prefer("lag", winner = "dplyr") # or just `dplyr::lag`
# ----------------
# Per id:
#   visit_id          - running count of "call" rows (0 before the first call)
#   diff_since_create - time since the earliest row of the id
#   time_waiting      - on the row where visit_id steps up by one (a "call"),
#                       the time since the previous row; NA everywhere else
# Per (id, visit_id):
#   time_in_room      - time since the first row of that visit
# Assumes rows are ordered by time within each id.
df %>%
  as_tibble() %>%
  mutate(
    .by = id,
    visit_id = cumsum(action == "call"),
    diff_since_create = time - min(time),
    time_waiting = if_else(
      visit_id == lag(visit_id) + 1,
      time - lag(time),
      as.duration(NA)
    )
  ) %>%
  mutate(
    .by = c(id, visit_id),
    time_in_room = as.duration(time - min(time))
  ) %>%
  # Only the columns named here are kept; visit_id is a helper and is dropped.
  select(id:room, time_in_room, time_waiting)
输出:
# A tibble: 18 × 6
id time action room time_in_room time_waiting
<int> <dttm> <chr> <chr> <Duration> <Duration>
1 1 2024-04-15 04:00:00 create "" 0s NA
2 1 2024-04-15 04:05:00 call "room 1" 0s 300s (~5 minutes)
3 1 2024-04-15 04:10:00 discharge "room 1" 300s (~5 minutes) NA
4 1 2024-04-15 04:20:00 call "room 2" 0s 600s (~10 minutes)
5 1 2024-04-15 04:22:00 work "room 2" 120s (~2 minutes) NA
6 1 2024-04-15 04:23:00 work "room 2" 180s (~3 minutes) NA
7 1 2024-04-15 04:30:00 discharge "room 2" 600s (~10 minutes) NA
8 1 2024-04-15 04:35:00 call "room 3" 0s 300s (~5 minutes)
9 1 2024-04-15 04:45:00 discharge "room 3" 600s (~10 minutes) NA
10 2 2024-04-15 04:00:00 create "" 0s NA
11 2 2024-04-15 04:05:00 call "room 1" 0s 300s (~5 minutes)
12 2 2024-04-15 04:10:00 discharge "room 1" 300s (~5 minutes) NA
13 2 2024-04-15 04:20:00 call "room 2" 0s 600s (~10 minutes)
14 2 2024-04-15 04:27:00 work "room 2" 420s (~7 minutes) NA
15 2 2024-04-15 04:30:00 discharge "room 2" 600s (~10 minutes) NA
16 2 2024-04-15 04:35:00 call "room 3" 0s 300s (~5 minutes)
17 2 2024-04-15 04:37:00 work "room 3" 120s (~2 minutes) NA
18 2 2024-04-15 04:45:00 discharge "room 3" 600s (~10 minutes) NA
或者,像 @Jon Spring 那样,以分钟为单位表示:
> mutate(df, across(where(is.duration), \(x) x/dminutes()))
# A tibble: 18 × 6
id time action room time_in_room time_waiting
<int> <dttm> <chr> <chr> <dbl> <dbl>
1 1 2024-04-15 04:00:00 create "" 0 NA
2 1 2024-04-15 04:05:00 call "room 1" 0 5
3 1 2024-04-15 04:10:00 discharge "room 1" 5 NA
4 1 2024-04-15 04:20:00 call "room 2" 0 10
5 1 2024-04-15 04:22:00 work "room 2" 2 NA
6 1 2024-04-15 04:23:00 work "room 2" 3 NA
7 1 2024-04-15 04:30:00 discharge "room 2" 10 NA
8 1 2024-04-15 04:35:00 call "room 3" 0 5
9 1 2024-04-15 04:45:00 discharge "room 3" 10 NA
10 2 2024-04-15 04:00:00 create "" 0 NA
11 2 2024-04-15 04:05:00 call "room 1" 0 5
12 2 2024-04-15 04:10:00 discharge "room 1" 5 NA
13 2 2024-04-15 04:20:00 call "room 2" 0 10
14 2 2024-04-15 04:27:00 work "room 2" 7 NA
15 2 2024-04-15 04:30:00 discharge "room 2" 10 NA
16 2 2024-04-15 04:35:00 call "room 3" 0 5
17 2 2024-04-15 04:37:00 work "room 3" 2 NA
18 2 2024-04-15 04:45:00 discharge "room 3" 10 NA