当每个组具有不同数量的值时,R 是否有办法获得滚动平均值?

问题描述 投票:0回答:3

我有一个包含 2 个分组的数据集,我也想按月分组。每个月有 1 到 8 次观察。我想要一个列,其中包含最近 3 个月的滚动平均值。所以如果这个月有 2 个 obs,上个月有 3 个,然后是 1 个,我想要最后 6 行的平均值。

我尝试使用 zoo、总结、使用 slide_dbl 进行变异,总是遇到让每个组的 n() 正确的问题。

我的数据:


 REGION   TYPE                DATE RESULT
1 3226H5 Type A 2021-01-04 10:03:00   0.71
2 3226H5 Type A 2021-01-04 10:14:39   1.47
3 3226H5 Type A 2021-01-04 10:21:23   1.28
4 3226H5 Type A 2021-01-07 09:20:00   0.90
5 3226H5 Type A 2021-02-01 10:00:00   1.39
6 3226H5 Type A 2021-02-01 10:10:00   1.42

library(slider)
library(lubridate)
my_data <- structure(list(REGION = c("3226H5", "3226H5", "3226H5", "3226H5", 
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", 
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", 
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", 
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", 
"3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", "3226H5", 
"3226H5", "3285", "3285", "3285", "3285", "3285", "3285", "3285", 
"3285", "3285", "3285"), TYPE = c("Type A", "Type A", "Type A", 
"Type A", "Type A", "Type A", "Type A", "Type A", "Type A", "Type A", 
"Type A", "Type A", "Type A", "Type A", "Type A", "Type A", "Type A", 
"Type A", "Type A", "Type A", "Type B", "Type B", "Type B", "Type B", 
"Type B", "Type B", "Type B", "Type B", "Type B", "Type B", "Type B", 
"Type B", "Type B", "Type B", "Type B", "Type B", "Type B", "Type B", 
"Type B", "Type B", "Type B", "Type B", "Type B", "Type B", "Type B", 
"Type B", "Type B", "Type B", "Type B", "Type B"), DATE = structure(c(1609754580, 
1609755279, 1609755683, 1610011200, 1612173600, 1612174200, 1612174800, 
1612432800, 1614593100, 1614594000, 1614594600, 1614851400, 1617617409, 
1617618215, 1617618894, 1617875485, 1620040008, 1620041400, 1620042300, 
1620295200, 1609754580, 1609755279, 1609755683, 1610011200, 1612173600, 
1612174200, 1612174800, 1612432800, 1614593100, 1614594000, 1614594600, 
1614851400, 1617617409, 1617618215, 1617618894, 1617875485, 1620040008, 
1620041400, 1620042300, 1620295200, 1609748700, 1609758900, 1612169112, 
1612177800, 1614588900, 1614599100, 1617612277, 1617622409, 1620034319, 
1620042163), tzone = "UTC", class = c("POSIXct", "POSIXt")), 
    RESULT = c(0.71, 1.47, 1.28, 0.9, 1.39, 1.42, 1.54, 0.37, 
    2.94, 2.25, 2.01, 0.52, 10.5, 1.02, 1.16, 0.62, 58.67, 0.75, 
    0.94, 2.72, 0.012, 0.004, 0.005, 0.014, 0.014, 0.005, 0.006, 
    0.009, 0.013, 0.007, 0.006, 0.015, 0.022, 0.004, 0.004, 0.009, 
    0.021, 0.005, 0.005, 0.011, 0.012, 0.012, 0.01, 0.012, 0.019, 
    0.011, 0.013, 0.012, 0.016, 0.014)), row.names = c(NA, -50L
), class = "data.frame")

我的解决方案:

# Asker's attempt: rolling mean and SD of RESULT per REGION/TYPE, indexed by
# calendar month, later reduced to one row per REGION/TYPE/month.
# NOTE(review): uses dplyr verbs and %>%, but the library() calls shown
# earlier attach only slider and lubridate -- confirm dplyr is loaded.
grouped_roll <- my_data %>%
  
  # new_cases = number of rows sharing the same REGION, TYPE and DATE value.
  add_count(REGION, TYPE, DATE, name = "new_cases") %>%

  # Sort by group keys and timestamp before the index-based slide.
  arrange(REGION, TYPE, DATE) %>%   
  
  group_by(REGION, TYPE)    %>%        
    
  mutate( 
     # Truncate each timestamp to the start of its month.
     MONTHLY = floor_date(DATE, "month"),
    rolling_mean = slide_index_dbl(RESULT,
      .i = MONTHLY,      # index on date 
      .f = mean,                      
      # NOTE(review): with only .after given, the window runs from the
      # current month forward through 6 months later; the question asks for
      # the most recent 3 months -- confirm this is not the intended window.
      .after = months(6)),             
    rolling_sd = slide_index_dbl(
      RESULT,
      .i = MONTHLY,
      # SD over the same forward-looking window, ignoring NA values.
      .f = ~ sd(.x, na.rm = TRUE),
      .after = months(6)
    )
  )

# Inspect the full per-row result interactively.
view(grouped_roll)

# Keep only the first row of every REGION/TYPE/MONTHLY group, then print.
grouped_roll <- grouped_roll %>%
  group_by(REGION, TYPE, MONTHLY) %>%
  slice(seq_len(1))
grouped_roll

Groups:   REGION, TYPE, MONTHLY [15]
   REGION TYPE   DATE                RESULT new_cases MONTHLY             rolling_mean rolling_sd
   <chr>  <chr>  <dttm>               <dbl>     <int> <dttm>                     <dbl>      <dbl>
 1 3226H5 Type A 2021-01-04 10:03:00  0.71          1 2021-01-01 00:00:00      4.66      12.9    
 2 3226H5 Type A 2021-02-01 10:00:00  1.39          1 2021-02-01 00:00:00      5.55      14.4    
 3 3226H5 Type A 2021-03-01 10:05:00  2.94          1 2021-03-01 00:00:00      7.01      16.5    
 4 3226H5 Type A 2021-04-05 10:10:09 10.5           1 2021-04-01 00:00:00      9.55      20.1    
r time-series mean summarize mutate
3个回答
0
投票

对于前 2 个月 + 当前月份的所有观察(即 3 个月的观察),您可能需要这样的东西:

library(dplyr); library(lubridate); library(slider)

# Rolling statistics over the current calendar month plus the two preceding
# months, computed separately for each REGION/TYPE combination. Every input
# row is kept; rows of the same month share one window and therefore the
# same rolling values. The result stays grouped by REGION and TYPE.
my_data %>%
  # slide_index_dbl() requires an ascending index within each group, so sort
  # explicitly instead of relying on the incoming row order.
  arrange(REGION, TYPE, DATE) %>%
  group_by(REGION, TYPE) %>%
  mutate(
    # Start of the month of each observation; used as the window index.
    MONTHLY = floor_date(DATE, "month"),
    rolling_mean = slide_index_dbl(
      RESULT,
      MONTHLY,
      mean,
      .before = months(2)
    ),
    # Number of non-missing RESULT values that fell into each window.
    rolling_count = slide_index_dbl(
      RESULT,
      MONTHLY,
      ~ sum(!is.na(.x)),
      .before = months(2)
    )
  )

结果

# A tibble: 50 × 7
# Groups:   REGION, TYPE [3]
   REGION TYPE   DATE                RESULT MONTHLY             rolling_mean rolling_count
   <chr>  <chr>  <dttm>               <dbl> <dttm>                     <dbl>         <dbl>
 1 3226H5 Type A 2021-01-04 10:03:00   0.71 2021-01-01 00:00:00         1.09             4
 2 3226H5 Type A 2021-01-04 10:14:39   1.47 2021-01-01 00:00:00         1.09             4
 3 3226H5 Type A 2021-01-04 10:21:23   1.28 2021-01-01 00:00:00         1.09             4
 4 3226H5 Type A 2021-01-07 09:20:00   0.9  2021-01-01 00:00:00         1.09             4
 5 3226H5 Type A 2021-02-01 10:00:00   1.39 2021-02-01 00:00:00         1.14             8
 6 3226H5 Type A 2021-02-01 10:10:00   1.42 2021-02-01 00:00:00         1.14             8
 7 3226H5 Type A 2021-02-01 10:20:00   1.54 2021-02-01 00:00:00         1.14             8
 8 3226H5 Type A 2021-02-04 10:00:00   0.37 2021-02-01 00:00:00         1.14             8
 9 3226H5 Type A 2021-03-01 10:05:00   2.94 2021-03-01 00:00:00         1.4             12
10 3226H5 Type A 2021-03-01 10:20:00   2.25 2021-03-01 00:00:00         1.4             12
# … with 40 more rows

0
投票

假设想要的是:

  • 每个地区/类型/年/月的一行输出
  • cases 是应用 rollapplyr 的行数

请注意,

rollapplyr
的 width= 参数可以是宽度的向量,即我们可以使用
cases
并且 yearmon 类在内部将年和月表示为年 + 分数,其中分数为 0、1/12、.. ., 12 个月的 11/12,所以 2 个月前比当前的 yearmon 日期少 2/12。

library(dplyr)
library(zoo)

# One output row per REGION/TYPE/year-month, holding the mean and standard
# deviation of RESULT over that month and the two months before it.
my_data %>%
  group_by(REGION, TYPE) %>%
  mutate(
    # Collapse timestamps to zoo year-month values.
    DATE = as.yearmon(DATE),
    # Per-row window width: from the first row dated two months earlier (or
    # the group's first row when there is none) through the current row.
    # seq_len(n()) is used rather than 1:n(), the safe-sequence idiom.
    cases = seq_len(n()) - match(DATE - 2/12, DATE, nomatch = 1) + 1, ##
    # rollapplyr() takes a vector of widths, one per row.
    mean = rollapplyr(RESULT, cases, mean, fill = NA),
    sd = rollapplyr(RESULT, cases, sd, fill = NA)
  ) %>%
  # The last row of each month is the one whose window covers the full month.
  group_by(DATE, .add = TRUE) %>%
  slice_tail(n = 1) %>%
  ungroup() %>%
  select(-RESULT)

给予:

# A tibble: 15 × 6
   REGION TYPE   DATE      cases    mean       sd
   <chr>  <chr>  <yearmon> <dbl>   <dbl>    <dbl>
 1 3226H5 Type A Jan 2021      4 1.09     0.347  
 2 3226H5 Type A Feb 2021      8 1.14     0.425  
 3 3226H5 Type A Mar 2021     12 1.4      0.743  
 4 3226H5 Type A Apr 2021     12 2.14     2.73   
 5 3226H5 Type A May 2021     12 7.01    16.5    
 6 3226H5 Type B Jan 2021      4 0.00875  0.00499
 7 3226H5 Type B Feb 2021      8 0.00863  0.00421
 8 3226H5 Type B Mar 2021     12 0.00917  0.00415
 9 3226H5 Type B Apr 2021     12 0.0095   0.00549
10 3226H5 Type B May 2021     12 0.0102   0.00638
11 3285   Type B Jan 2021      2 0.012    0      
12 3285   Type B Feb 2021      4 0.0115   0.001  
13 3285   Type B Mar 2021      6 0.0127   0.00320
14 3285   Type B Apr 2021      6 0.0128   0.00319
15 3285   Type B May 2021      6 0.0142   0.00293

请注意,

cases
的另一种表达方式是将标记为## 的行替换为:

cases = 1:n() - findInterval(DATE - 3/12, DATE), 

0
投票

创建函数:

# Summarise a data frame that has a RESULT column: standard deviation, mean
# (ignoring NA values) and row count, one row per group of the input.
# Reads the `.my_data` argument rather than a global `my_data`, so each call
# summarises the data it is actually handed.
# NOTE(review): this definition masks base::summary() wherever it is visible.
summary <- function(.my_data) {
  summarise(
    .my_data,
    moving_SD = sd(RESULT),
    moving_avg = mean(RESULT, na.rm = TRUE),
    num_observations = n()
  )
}

用于:

      .f = summary, .before = 5, .complete = FALSE))```
© www.soinside.com 2019 - 2024. All rights reserved.