在 R 中,我想创建一个新列,它是前 n 年(即排除当前年份)的移动平均值。然而,有些年份缺失了,在这种情况下,我希望在更少的年份上计算移动平均值。目前,我的代码忽略了 TIME_PERIOD 信息,而只是根据前 n 年的滞后进行计算。
希望我的示例代码能够清楚地说明我所追求的是什么
library(dplyr)
# Create example data frame
df <- data.frame(
NUTS = rep("XX", 12),
TIME_PERIOD = c(2008, 2009, 2010, 2011,
# note 2012 is missing
2013, 2014, 2015,
# note 2016 is missing
2017, 2018, 2019, 2020, 2021),
DATA = c(524288.2008, 1048576.201, 2097152.201, 4194304.201, 16777216.2, 33554432.2, 67108864.2, 268435456.2, 536870912.2, 1073741824, 2147483648, 4294967296),
INDICATOR_CODE = rep("Dummy", 12),
wanted_calculation= c(NA, NA, NA, "DATA average of 2008, 2009, 2010",
"DATA average of 2009, 2010, 2011",
"2012 is missing so DATA average of 2011, 2013",
"2012 is missing so DATA average of 2013, 2014",
"DATA average of 2013, 2014, 2015",
"2016 is missing so DATA average of 2015, 2017",
"2016 is missing so DATA average of 2017, 2018",
"DATA average of 2017, 2018, 2019",
"DATA average of 2018, 2019, 2020"
)
)
# Sort the dataframe by INDICATOR_CODE, NUTS, and TIME_PERIOD
df <- df %>% arrange(INDICATOR_CODE, NUTS, TIME_PERIOD)
# Calculate a moving average (of specified years ) excluding current year with dynamic column naming
moving_avg_length <- 3 # compare data with previous chosen number of years
df <- df %>%
group_by(INDICATOR_CODE, NUTS) %>%
mutate(my_bugged_moving_avg = sapply(1:n(), function(i) {
if (i <= moving_avg_length || INDICATOR_CODE[i] != INDICATOR_CODE[i-1] || NUTS[i] != NUTS[i-1]) { # resets the calculation when changing NUTS or Indicators
return(NA)
} else {
return(mean(DATA[(i - moving_avg_length):(i-1)], na.rm = TRUE))
}
})) %>%
ungroup()
wanted_result <- data.frame(
wanted_moving_avg = c(NA, NA, NA, 1223338.868, 2446677.534, 10485760.2, 25165824.2, 39146837.53, 167772160.2, 402653184.2, 626349397.5, 1252698795)
)
df <- df%>%
cbind(wanted_result)
rm(wanted_result)
这是一个带有
tidyr::complete
和 zoo::rollapply
的解决方案。
library(tidyverse)
library(zoo)
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
# Create example data frame
df <- data.frame(
NUTS = rep("XX", 12),
TIME_PERIOD = c(2008, 2009, 2010, 2011,
# note 2012 is missing
2013, 2014, 2015,
# note 2016 is missing
2017, 2018, 2019, 2020, 2021),
DATA = c(524288.2008, 1048576.201, 2097152.201, 4194304.201, 16777216.2, 33554432.2, 67108864.2, 268435456.2,
536870912.2, 1073741824, 2147483648, 4294967296),
INDICATOR_CODE = rep("Dummy", 12),
wanted_calculation= c(NA, NA, NA, "DATA average of 2008, 2009, 2010",
"DATA average of 2009, 2010, 2011",
"2012 is missing so DATA average of 2011, 2013",
"2012 is missing so DATA average of 2013, 2014",
"DATA average of 2013, 2014, 2015",
"2016 is missing so DATA average of 2015, 2017",
"2016 is missing so DATA average of 2017, 2018",
"DATA average of 2017, 2018, 2019",
"DATA average of 2018, 2019, 2020"
)
)
# Sort the dataframe by INDICATOR_CODE, NUTS, and TIME_PERIOD
df <- df %>% arrange(INDICATOR_CODE, NUTS, TIME_PERIOD)
# solution with `rollapply` and `lag`
df |>
complete(TIME_PERIOD = seq(min(df$TIME_PERIOD), max(df$TIME_PERIOD))) |>
mutate(
roll_mean = DATA |>
rollapply(width = 3, align = "right", FUN = mean, fill = NA, na.rm = TRUE) |>
lag()
) |>
select(-NUTS, -INDICATOR_CODE)
#> # A tibble: 14 × 4
#> TIME_PERIOD DATA wanted_calculation roll_mean
#> <dbl> <dbl> <chr> <dbl>
#> 1 2008 524288. <NA> NA
#> 2 2009 1048576. <NA> NA
#> 3 2010 2097152. <NA> NA
#> 4 2011 4194304. DATA average of 2008, 2009, 2010 1.22e6
#> 5 2012 NA <NA> 2.45e6
#> 6 2013 16777216. DATA average of 2009, 2010, 2011 3.15e6
#> 7 2014 33554432. 2012 is missing so DATA average of 2011, 2… 1.05e7
#> 8 2015 67108864. 2012 is missing so DATA average of 2013, 2… 2.52e7
#> 9 2016 NA <NA> 3.91e7
#> 10 2017 268435456. DATA average of 2013, 2014, 2015 5.03e7
#> 11 2018 536870912. 2016 is missing so DATA average of 2015, 2… 1.68e8
#> 12 2019 1073741824 2016 is missing so DATA average of 2017, 2… 4.03e8
#> 13 2020 2147483648 DATA average of 2017, 2018, 2019 6.26e8
#> 14 2021 4294967296 DATA average of 2018, 2019, 2020 1.25e9
创建于 2023-11-28,使用 reprex v2.0.2