年度数据插值至月度数据

问题描述 投票:0回答:2

我正在尝试将年度数据插入月度数据中。我在网上看到了几个例子,但我无法让它们工作 这是一个这是另一个,我遇到了各种各样的错误。

我的数据是这样的

structure(list(countryname = c("Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Angola", 
"Angola", "Angola", "Angola", "Angola", "Angola", "Angola", "Angola", 
"Angola", "Angola", "Angola", "Angola", "Angola", "Angola", "Angola", 
"Angola", "Angola", "Angola", "Angola", "Angola", "Anguilla", 
"Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla", 
"Anguilla", "Anguilla", "Anguilla"), year = c(1986, 1987, 1988, 
1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 
2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 
2011, 2012, 2013, 2014, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 
1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 
1976, 1977, 1978, 1979, 1980, 1981, 1982, 1984, 1985, 1986, 1987, 
1988, 1989, 1990, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 
2014, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 
2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 1990, 
1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999), total = c(286.511, 
344.328, 467.524, 539.787, 609.052, 690.367, 742.735, 793.093, 
946.521, 1017.982, 1121.778, 1185.47, 1248.148, 1376.097, 1502.349, 
1581.668, 1783.635, 2012.886, 2090.128, 2283.757, 2382.169, 2479.276, 
2578.016, 2555.033, 2582.489, 2656.015, 2747.218, 2888.874, 3009.389, 
1527, 2037, 2244, 2162, 1760, 1713, 1867, 2252, 2395, 2449, 2488.4, 
2872.3, 2853.1, 2928, 3660.3, 4280.4, 5214.8, 5016.4, 6243.1, 
5493.6, 6770.4, 9789.5, 9603.1, 10053, 4185.38, 4787.87, 6728.06, 
8773.05, 12698.5, 17579.82, 19972.779, 26547.45, 16855.486058, 
33202.5772169922, 48354.2870012583, 64742.29199, 84061.57373, 
42170.25521, 46008.7436596187, 49215.7766771853, 46365.3341990244, 
0.718514, 26.547404, 90.561909, 90.0346783, 441.87615963, 1845.82841034, 
7453.3254978, 23556.243282338, 54332.057918413, 89699.02591742, 
138396.82327341, 273708.10979785, 492519.2833002, 801048.13352684, 
1285702.23670599, 1532272.06111931, 1973567.61658344, 2451384.32434691, 
2819717.17874511, 2852043.16889253, 135.972994003056, 165.698, 
190, 200.98, 199.009, 212.701, 240.183, 282.136, 308.716, 358.751
), hh = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
12409.359888, 2669.05197895018, 34449.662, 3595.39872, 10187.97902, 
10224.02617, 11409.47128, 9743.61473, 8246.96039, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 50.331, 55.832, 79.786, 85.04, 93.316, 104.006, 126.771, 
147.306, 171.725), corp = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 4446.12617, 30533.525238042, 13904.6250012583, 
61146.89327, 73873.59471, 31946.22904, 34599.2723796187, 39472.1619471853, 
38118.3738090244, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 115.367, 134.168, 121.194, 
113.969, 119.385, 136.177, 155.365, 161.41, 187.026), agr = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1.08, 2.09, 1.415, 1.276, 
1.703, 1.85, 2.434, 2.385, 2.161), manufac = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, 0.103556200987511, 0.241631135637525, 
0.691812953819343, 1.02980888759802, 4.50900958466454, 5.02391402846355, 
4.91891954690677, 4.36805808887598, 9.904), real_estate = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.327, 13.813, 6.75, 
13.223, 11.223, 13.284, 13.509, 11.515, 11.234), whole_sale = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 77.119, 92.189, 91.358, 
79.837, 76.949, 79.75, 92.241, 95.118, 109.549), transport_comm = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14.572, 13.047, 12.774, 
11.398, 8.998, 10.174, 10.226, 10.132, 8.803), others = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 9.16544379901249, 12.7873688643625, 
8.20518704618064, 7.20519111240198, 16.0029904153354, 26.0950859715364, 
32.0360804530932, 37.891941911124, 45.375), error = c(NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_)), row.names = c(NA, 
-100L), class = c("tbl_df", "tbl", "data.frame"))

请注意,这是一组国家/地区,因此应针对每个国家/地区分别进行插值。

迄今为止我最好的尝试是调整第二个链接中的示例

expand_data <- function(x) {
  years <- min(x$year):max(x$year)
  months_data <- 1:12
  grid <- expand.grid(month=months_data, year=years)
  x$quarter <- 1
  merged <- merge(grid, x, by=c('year', 'month'), all.x=TRUE)  # Use merge instead of left_join
  merged$country <- x$country[1]
  return(merged)
}

interpolate_data <- function(data) {
  xout <- 1:nrow(data)
  y <- data$value
  interpolation <- approx(x=xout[!is.na(y)], y=y[!is.na(y)], xout=xout)
  data$value_interpolated <- interpolation$y
  return(data)
}

expand_and_interpolate <- function(x) interpolate_data(expand_data(x))

monthly_data <- credit_data %>%
  group_by(countryname) %>%
  do(expand_and_interpolate(.))

但是我收到这个错误

Error in fix.by(by.y, y) : 'by' must specify a uniquely valid column
r dplyr interpolation
2个回答
1
投票

此过程将

year
转换为
Date
类列,然后将其扩展为第一个和最后一个观察日期之间的每个月。从那里开始,它在所有其他字段中使用
approx

out <- credit_data |>
  mutate(date = as.Date(paste0(year, "-01-01"))) |>
  reframe(
    .by = countryname, 
    month = seq(min(date), max(date), by = "month"), 
    year = as.integer(format(month, format = "%Y")), 
    across(-c(year, date, month), 
           ~ if (any(!is.na(.x))) approx(date, .x, xout = month, na.rm = TRUE)$y else rep(.x[1], length(month)))
  )

filter(out, countryname == "Anguilla")[20:30,]
# # A tibble: 11 × 13
#    countryname month       year total    hh  corp   agr manufac real_estate whole_sale transport_comm others error
#    <chr>       <date>     <int> <dbl> <dbl> <dbl> <dbl>   <dbl>       <dbl>      <dbl>          <dbl>  <dbl> <chr>
#  1 Anguilla    1991-08-01  1991  180.  53.5  126.  1.67   0.184        13.6       85.9           13.7   11.3 NA   
#  2 Anguilla    1991-09-01  1991  182.  54.0  128.  1.75   0.195        13.7       87.2           13.6   11.6 NA   
#  3 Anguilla    1991-10-01  1991  184.  54.4  129.  1.84   0.207        13.7       88.4           13.4   11.9 NA   
#  4 Anguilla    1991-11-01  1991  186.  54.9  131.  1.92   0.219        13.7       89.7           13.3   12.2 NA   
#  5 Anguilla    1991-12-01  1991  188.  55.4  133.  2.00   0.230        13.8       90.9           13.2   12.5 NA   
#  6 Anguilla    1992-01-01  1992  190   55.8  134.  2.09   0.242        13.8       92.2           13.0   12.8 NA   
#  7 Anguilla    1992-02-01  1992  191.  57.9  133.  2.03   0.280        13.2       92.1           13.0   12.4 NA   
#  8 Anguilla    1992-03-01  1992  192.  59.8  132.  1.98   0.315        12.7       92.1           13.0   12.0 NA   
#  9 Anguilla    1992-04-01  1992  193.  61.8  131.  1.92   0.354        12.1       92.0           13.0   11.6 NA   
# 10 Anguilla    1992-05-01  1992  194.  63.8  130.  1.87   0.390        11.5       91.9           13.0   11.3 NA   
# 11 Anguilla    1992-06-01  1992  195.  65.8  129.  1.81   0.429        10.9       91.8           12.9   10.9 NA   

使用

.by=
需要
dplyr_1.1.0
或更新版本;如果您有旧版本,请从
mutate(.by=c(..), stuff)
更改为
group_by(..) |> mutate(stuff) |> ungroup()


1
投票

使用

by
你可以做这样的事情。

> res <- by(dat, dat$countryname, \(s) {
+   s <- transform(s, date=paste0(s$year, '-01-01'))
+   s <- merge(s, 
+              data.frame(
+                countryname=s$countryname[1],
+                year=s$year[1],
+                date=do.call('seq.Date',
+                             c(as.list(range(as.Date(s$date))), 'month')) |> as.character()),
+              all=TRUE)
+   s <- s[order(s$date), ]
+   s[4:12] <- lapply(4:12, \(j) tryCatch(
+     approx(s[[j]], xout=seq_len(nrow(s)))$y,
+     error=\(e) rep_len(NA_real_, nrow(s))
+   ))
+   s
+ }) |> c(make.row.names=FALSE) |>  do.call(what='rbind')
> head(res)
  countryname year       date    total        hh      corp agr manufac real_estate whole_sale transport_comm others error
1 Afghanistan 2006 2006-01-01 16855.49 12409.360  4446.126  NA      NA          NA         NA             NA     NA  <NA>
2 Afghanistan 2006 2006-02-01 18112.95 11660.105  6452.849  NA      NA          NA         NA             NA     NA  <NA>
3 Afghanistan 2006 2006-03-01 19370.42 10910.851  8459.572  NA      NA          NA         NA             NA     NA  <NA>
4 Afghanistan 2006 2006-04-01 20627.89 10161.597 10466.295  NA      NA          NA         NA             NA     NA  <NA>
5 Afghanistan 2006 2006-05-01 21885.36  9412.342 12473.018  NA      NA          NA         NA             NA     NA  <NA>
6 Afghanistan 2006 2006-06-01 23142.83  8663.088 14479.741  NA      NA          NA         NA             NA     NA  <NA>
> tail(res)
    countryname year       date    total       hh     corp      agr  manufac real_estate whole_sale transport_comm   others error
205    Anguilla 1991 1998-09-01 339.5068 162.3331 177.1737 2.247154 7.774792    11.34208   103.9986       9.314154 42.49690  <NA>
206    Anguilla 1991 1998-10-01 343.3556 164.2115 179.1442 2.229923 8.200633    11.32046   105.1087       9.211923 43.07252  <NA>
207    Anguilla 1991 1998-11-01 347.2045 166.0898 181.1146 2.212692 8.626475    11.29885   106.2188       9.109692 43.64814  <NA>
208    Anguilla 1991 1998-12-01 351.0533 167.9682 183.0851 2.195462 9.052317    11.27723   107.3288       9.007462 44.22376  <NA>
209    Anguilla 1991 1999-01-01 354.9022 169.8466 185.0555 2.178231 9.478158    11.25562   108.4389       8.905231 44.79938  <NA>
210    Anguilla 1999 1999-01-01 358.7510 171.7250 187.0260 2.161000 9.904000    11.23400   109.5490       8.803000 45.37500  <NA>

数据:

> dput(dat)
structure(list(countryname = c("Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Anguilla", "Anguilla", "Anguilla", 
"Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla"
), year = c(2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 
1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999), total = c(16855.486058, 
33202.5772169922, 48354.2870012583, 64742.29199, 84061.57373, 
42170.25521, 46008.7436596187, 49215.7766771853, 46365.3341990244, 
165.698, 190, 200.98, 199.009, 212.701, 240.183, 282.136, 308.716, 
358.751), hh = c(12409.359888, 2669.05197895018, 34449.662, 3595.39872, 
10187.97902, 10224.02617, 11409.47128, 9743.61473, 8246.96039, 
50.331, 55.832, 79.786, 85.04, 93.316, 104.006, 126.771, 147.306, 
171.725), corp = c(4446.12617, 30533.525238042, 13904.6250012583, 
61146.89327, 73873.59471, 31946.22904, 34599.2723796187, 39472.1619471853, 
38118.3738090244, 115.367, 134.168, 121.194, 113.969, 119.385, 
136.177, 155.365, 161.41, 187.026), agr = c(NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 1.08, 2.09, 1.415, 1.276, 1.703, 1.85, 2.434, 
2.385, 2.161), manufac = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
0.103556200987511, 0.241631135637525, 0.691812953819343, 1.02980888759802, 
4.50900958466454, 5.02391402846355, 4.91891954690677, 4.36805808887598, 
9.904), real_estate = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.327, 
13.813, 6.75, 13.223, 11.223, 13.284, 13.509, 11.515, 11.234), 
    whole_sale = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 77.119, 
    92.189, 91.358, 79.837, 76.949, 79.75, 92.241, 95.118, 109.549
    ), transport_comm = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    14.572, 13.047, 12.774, 11.398, 8.998, 10.174, 10.226, 10.132, 
    8.803), others = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 9.16544379901249, 
    12.7873688643625, 8.20518704618064, 7.20519111240198, 16.0029904153354, 
    26.0950859715364, 32.0360804530932, 37.891941911124, 45.375
    ), error = c(NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_)), row.names = c(62L, 
63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 92L, 93L, 94L, 95L, 96L, 
97L, 98L, 99L, 100L), class = c("tbl_df", "tbl", "data.frame"
))
© www.soinside.com 2019 - 2024. All rights reserved.