我有一个包含 2 个分组变量的数据集,另外还想按月分组。每个月有 1 到 8 个观测值。我想新增一列,存放最近 3 个月的滚动平均值。例如,如果本月有 2 个观测值,上个月有 3 个,再上个月有 1 个,那么我想要的是最后这 6 行的平均值。
我尝试过 zoo、summarise,以及在 mutate 中使用 slide_dbl,但总是无法让每个组的 n() 正确。
我的数据:
REGION TYPE DATE RESULT
1 3226H5 Type A 2021-01-04 10:03:00 0.71
2 3226H5 Type A 2021-01-04 10:14:39 1.47
3 3226H5 Type A 2021-01-04 10:21:23 1.28
4 3226H5 Type A 2021-01-07 09:20:00 0.90
5 3226H5 Type A 2021-02-01 10:00:00 1.39
6 3226H5 Type A 2021-02-01 10:10:00 1.42
library(dplyr)
library(lubridate)
library(slider)
# Example data: 50 rows (row.names = c(NA, -50L)) with four columns --
# REGION and TYPE (character), DATE (POSIXct with tzone "UTC", stored as
# seconds since the epoch) and RESULT (numeric).
my_data <- structure(list(REGION = c("3226H5", "3226H5", "3226H5", "3226H5",
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5",
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5",
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5",
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5",
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5",
"3226H5", "3285", "3285", "3285", "3285", "3285", "3285", "3285",
"3285", "3285", "3285"), TYPE = c("Type A", "Type A", "Type A",
"Type A", "Type A", "Type A", "Type A", "Type A", "Type A", "Type A",
"Type A", "Type A", "Type A", "Type A", "Type A", "Type A", "Type A",
"Type A", "Type A", "Type A", "Type B", "Type B", "Type B", "Type B",
"Type B", "Type B", "Type B", "Type B", "Type B", "Type B", "Type B",
"Type B", "Type B", "Type B", "Type B", "Type B", "Type B", "Type B",
"Type B", "Type B", "Type B", "Type B", "Type B", "Type B", "Type B",
"Type B", "Type B", "Type B", "Type B", "Type B"), DATE = structure(c(1609754580,
1609755279, 1609755683, 1610011200, 1612173600, 1612174200, 1612174800,
1612432800, 1614593100, 1614594000, 1614594600, 1614851400, 1617617409,
1617618215, 1617618894, 1617875485, 1620040008, 1620041400, 1620042300,
1620295200, 1609754580, 1609755279, 1609755683, 1610011200, 1612173600,
1612174200, 1612174800, 1612432800, 1614593100, 1614594000, 1614594600,
1614851400, 1617617409, 1617618215, 1617618894, 1617875485, 1620040008,
1620041400, 1620042300, 1620295200, 1609748700, 1609758900, 1612169112,
1612177800, 1614588900, 1614599100, 1617612277, 1617622409, 1620034319,
1620042163), tzone = "UTC", class = c("POSIXct", "POSIXt")),
RESULT = c(0.71, 1.47, 1.28, 0.9, 1.39, 1.42, 1.54, 0.37,
2.94, 2.25, 2.01, 0.52, 10.5, 1.02, 1.16, 0.62, 58.67, 0.75,
0.94, 2.72, 0.012, 0.004, 0.005, 0.014, 0.014, 0.005, 0.006,
0.009, 0.013, 0.007, 0.006, 0.015, 0.022, 0.004, 0.004, 0.009,
0.021, 0.005, 0.005, 0.011, 0.012, 0.012, 0.01, 0.012, 0.019,
0.011, 0.013, 0.012, 0.016, 0.014)), row.names = c(NA, -50L
), class = "data.frame")
我的解决方案:
# Asker's attempt: count rows per REGION/TYPE/DATE, sort, then compute
# windowed statistics of RESULT within each REGION/TYPE group, indexed by
# the first day of each row's month.
# NOTE(review): `.after = months(6)` makes every window run from the row's
# month FORWARD by six months; a trailing window would use `.before`.
grouped_roll <- my_data %>%
  add_count(REGION, TYPE, DATE, name = "new_cases") %>%
  arrange(REGION, TYPE, DATE) %>%
  group_by(REGION, TYPE) %>%
  # Month bucket used as the sliding index.
  mutate(MONTHLY = floor_date(DATE, unit = "month")) %>%
  mutate(
    rolling_mean = slide_index_dbl(
      .x = RESULT,
      .i = MONTHLY,
      .f = mean,
      .after = months(6)
    ),
    rolling_sd = slide_index_dbl(
      .x = RESULT,
      .i = MONTHLY,
      .f = function(x) sd(x, na.rm = TRUE),
      .after = months(6)
    )
  )
# NOTE(review): lowercase view() is not base R (it presumably comes from
# tibble) -- confirm the package providing it is attached.
view(grouped_roll)
# Keep only the first row of every REGION/TYPE/MONTHLY combination so each
# month appears once per group, then print the (still grouped) result.
grouped_roll <- slice_head(
  group_by(grouped_roll, REGION, TYPE, MONTHLY),
  n = 1
)
grouped_roll
Groups: REGION, TYPE, MONTHLY [15]
REGION TYPE DATE RESULT new_cases MONTHLY rolling_mean rolling_sd
<chr> <chr> <dttm> <dbl> <int> <dttm> <dbl> <dbl>
1 3226H5 Type A 2021-01-04 10:03:00 0.71 1 2021-01-01 00:00:00 4.66 12.9
2 3226H5 Type A 2021-02-01 10:00:00 1.39 1 2021-02-01 00:00:00 5.55 14.4
3 3226H5 Type A 2021-03-01 10:05:00 2.94 1 2021-03-01 00:00:00 7.01 16.5
4 3226H5 Type A 2021-04-05 10:10:09 10.5 1 2021-04-01 00:00:00 9.55 20.1
如果要对前 2 个月加上当前月份的所有观测值(即共 3 个月的观测值)求平均,可以这样写:
library(dplyr) # provides %>%, arrange(), group_by() and mutate()
library(lubridate)
library(slider)

# Trailing 3-month statistics per REGION/TYPE: every row gets the mean of
# RESULT over its own month plus the two preceding months, together with
# the number of non-missing values that went into that window.
# The result keeps its REGION/TYPE grouping.
my_data %>%
  # slide_index_*() needs an increasing index, so sort before sliding.
  arrange(REGION, TYPE, DATE) %>%
  group_by(REGION, TYPE) %>%
  mutate(
    # First day of the row's month, used as the sliding index.
    MONTHLY = floor_date(DATE, "month"),
    rolling_mean = slide_index_dbl(
      RESULT,
      MONTHLY,
      mean,
      .before = months(2)
    ),
    # to show how many rows included
    rolling_count = slide_index_dbl(
      RESULT,
      MONTHLY,
      ~ sum(!is.na(.x)),
      .before = months(2)
    )
  )
结果
# A tibble: 50 × 7
# Groups: REGION, TYPE [3]
REGION TYPE DATE RESULT MONTHLY rolling_mean rolling_count
<chr> <chr> <dttm> <dbl> <dttm> <dbl> <dbl>
1 3226H5 Type A 2021-01-04 10:03:00 0.71 2021-01-01 00:00:00 1.09 4
2 3226H5 Type A 2021-01-04 10:14:39 1.47 2021-01-01 00:00:00 1.09 4
3 3226H5 Type A 2021-01-04 10:21:23 1.28 2021-01-01 00:00:00 1.09 4
4 3226H5 Type A 2021-01-07 09:20:00 0.9 2021-01-01 00:00:00 1.09 4
5 3226H5 Type A 2021-02-01 10:00:00 1.39 2021-02-01 00:00:00 1.14 8
6 3226H5 Type A 2021-02-01 10:10:00 1.42 2021-02-01 00:00:00 1.14 8
7 3226H5 Type A 2021-02-01 10:20:00 1.54 2021-02-01 00:00:00 1.14 8
8 3226H5 Type A 2021-02-04 10:00:00 0.37 2021-02-01 00:00:00 1.14 8
9 3226H5 Type A 2021-03-01 10:05:00 2.94 2021-03-01 00:00:00 1.4 12
10 3226H5 Type A 2021-03-01 10:20:00 2.25 2021-03-01 00:00:00 1.4 12
# … with 40 more rows
假设想要的是:
请注意,rollapplyr 的 width= 参数可以是一个宽度向量,因此我们可以直接把 cases 传给它。另外,yearmon 类在内部把年和月表示为"年 + 小数",其中 12 个月对应的小数依次为 0、1/12、…、11/12,所以 2 个月前的日期比当前的 yearmon 日期小 2/12。
library(dplyr)
library(zoo)
# Rolling 3-month mean and sd of RESULT per REGION/TYPE, reduced to one
# output row per group and month.
my_data %>%
  group_by(REGION, TYPE) %>%
  mutate(
    # Collapse timestamps to year-month so arithmetic works in months.
    DATE = as.yearmon(DATE),
    # Per-row window width: distance from the first row of the month two
    # months back to the current row. seq_len(n()) is used instead of
    # 1:n() so an empty group cannot yield c(1, 0).
    # NOTE(review): if the month two months back has no rows in the group,
    # nomatch = 1 starts the window at the group's first row, which can
    # reach further back than three months.
    cases = seq_len(n()) - match(DATE - 2/12, DATE, nomatch = 1) + 1, ##
    # rollapplyr() accepts a vector of widths, one per row.
    mean = rollapplyr(RESULT, cases, mean, fill = NA),
    sd = rollapplyr(RESULT, cases, sd, fill = NA)
  ) %>%
  # The last row of each month carries the statistics for the full window.
  group_by(DATE, .add = TRUE) %>%
  slice_tail(n = 1) %>%
  ungroup() %>%
  select(-RESULT)
输出结果:
# A tibble: 15 × 6
REGION TYPE DATE cases mean sd
<chr> <chr> <yearmon> <dbl> <dbl> <dbl>
1 3226H5 Type A Jan 2021 4 1.09 0.347
2 3226H5 Type A Feb 2021 8 1.14 0.425
3 3226H5 Type A Mar 2021 12 1.4 0.743
4 3226H5 Type A Apr 2021 12 2.14 2.73
5 3226H5 Type A May 2021 12 7.01 16.5
6 3226H5 Type B Jan 2021 4 0.00875 0.00499
7 3226H5 Type B Feb 2021 8 0.00863 0.00421
8 3226H5 Type B Mar 2021 12 0.00917 0.00415
9 3226H5 Type B Apr 2021 12 0.0095 0.00549
10 3226H5 Type B May 2021 12 0.0102 0.00638
11 3285 Type B Jan 2021 2 0.012 0
12 3285 Type B Feb 2021 4 0.0115 0.001
13 3285 Type B Mar 2021 6 0.0127 0.00320
14 3285 Type B Apr 2021 6 0.0128 0.00319
15 3285 Type B May 2021 6 0.0142 0.00293
请注意,cases 的另一种写法是把标记为 ## 的那一行替换为:
# Window width per row: row index minus the number of rows dated three or
# more months earlier (DATE must be sorted within the group).
cases = seq_len(n()) - findInterval(DATE - 3/12, DATE),
创建函数:
# Summarise the RESULT column of a data frame.
#
# .my_data: a data frame (or grouped data frame) with a numeric RESULT
#   column.
# Returns whatever summarise() produces for it, with three columns:
#   moving_SD (standard deviation), moving_avg (mean), both ignoring NA,
#   and num_observations (row count, including rows where RESULT is NA).
#
# The argument is now actually used; previously the body read the global
# `my_data`, so every call returned the same whole-dataset summary.
# NOTE(review): the name masks base::summary(); kept so existing calls
# such as `.f = summary` keep working.
summary <- function(.my_data) {
  summarise(
    .my_data,
    moving_SD = sd(RESULT, na.rm = TRUE),
    moving_avg = mean(RESULT, na.rm = TRUE),
    num_observations = n()
  )
}
用于:
.f = summary, .before = 5, .complete = FALSE))```