在没有足够的非 NA 值的情况下对不同组的值进行插值

问题描述 投票:0回答:2

我有一个数据框:对于位于给定距离 DIST.x 处的每个点 ID.x,它包含由 LimAm 和 LimAv 两列界定的区间内的所有观测值,即距离(DIST.y 列)、高度(Z 列)以及经移动平均校正后的高度(ma_Z 列)。我的目的是对每个 ID.x 组,以 DIST.y 为自变量对 ma_Z 做线性回归。这是我使用的代码:

# Fit one linear model of ma_Z against DIST.y per group (df_sl carries the
# "grouped_df" class, so do() runs the braces once per ID.x group) and keep
# only the intercept and the slope.
df_sl %>%
  do({
    fit <- lm(ma_Z ~ DIST.y, data = .)
    betas <- coef(fit)
    data.frame(int = betas[1], slope = betas[2])
  })

此代码可以正常工作。

但是,在进行回归之前,我想通过使用函数

zoo::na.approx()
进行插值来填充 ma_Z 列中的 NA。不幸的是,对于数据框中的某些 ID.x 组,我收到一条错误消息,指出没有足够的非 NA 值来进行插值(在本例中是 ID.x 为“188473”和“188474”的组)。

我不明白为什么 ID.x 为“188473”和“188474”的组各有两个非 NA 值,却仍然出现此错误消息;而 ID.x 为“9383”和“9384”的组各只有一个非 NA 值,却不会触发错误。

df_sl <- structure(list(ID.x = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 
2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 9383, 9383, 9384, 9384, 
188473, 188473, 188473, 188473, 188474, 188474, 188474, 188474
), DIST.x = c(0, 0, 0, 0, 0, 0, 0, 0, 11.515675154, 11.515675154, 
11.515675154, 11.515675154, 11.515675154, 11.515675154, 11.515675154, 
11.515675154, 21.823439218, 21.823439218, 21.823439218, 21.823439218, 
21.823439218, 21.823439218, 21.823439218, 21.823439218, 21.988363443, 
21.988363443, 21.988363443, 21.988363443, 21.988363443, 21.988363443, 
21.988363443, 21.988363443, 32.961571068, 32.961571068, 32.961571068, 
32.961571068, 32.961571068, 32.961571068, 32.961571068, 32.961571068, 
43.934778692, 43.934778692, 43.934778692, 43.934778692, 43.934778692, 
43.934778692, 43.934778692, 43.934778692, 0, 0, 13.891845289, 
13.891845289, 0, 0, 0, 0, 0, 0, 0, 0), ID.y = c(1, 2, 3, 4, 5, 
6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 
3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 
8, 9383, 9384, 9383, 9384, 188473, 188474, 188475, 188476, 188473, 
188474, 188475, 188476), LimAm = c(-375, -363.484324846, -353.176560782, 
-353.011636557, -342.038428932, -331.065221308, -330.991532192, 
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557, 
-342.038428932, -331.065221308, -330.991532192, -320.018324568, 
-375, -363.484324846, -353.176560782, -353.011636557, -342.038428932, 
-331.065221308, -330.991532192, -320.018324568, -375, -363.484324846, 
-353.176560782, -353.011636557, -342.038428932, -331.065221308, 
-330.991532192, -320.018324568, -375, -363.484324846, -353.176560782, 
-353.011636557, -342.038428932, -331.065221308, -330.991532192, 
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557, 
-342.038428932, -331.065221308, -330.991532192, -320.018324568, 
-375, -361.108154711, -375, -361.108154711, -375, -375, -375, 
-362.193750234, -375, -375, -375, -362.193750234), LimAv = c(375, 
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692, 
419.008467808, 429.981675432, 375, 386.515675154, 396.823439218, 
396.988363443, 407.961571068, 418.934778692, 419.008467808, 429.981675432, 
375, 386.515675154, 396.823439218, 396.988363443, 407.961571068, 
418.934778692, 419.008467808, 429.981675432, 375, 386.515675154, 
396.823439218, 396.988363443, 407.961571068, 418.934778692, 419.008467808, 
429.981675432, 375, 386.515675154, 396.823439218, 396.988363443, 
407.961571068, 418.934778692, 419.008467808, 429.981675432, 375, 
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692, 
419.008467808, 429.981675432, 375, 388.891845289, 375, 388.891845289, 
375, 375, 375, 387.806249766, 375, 375, 375, 387.806249766), 
    DIST.y = c(0, 11.515675154, 21.823439218, 21.988363443, 32.961571068, 
    43.934778692, 44.008467808, 54.981675432, 0, 11.515675154, 
    21.823439218, 21.988363443, 32.961571068, 43.934778692, 44.008467808, 
    54.981675432, 0, 11.515675154, 21.823439218, 21.988363443, 
    32.961571068, 43.934778692, 44.008467808, 54.981675432, 0, 
    11.515675154, 21.823439218, 21.988363443, 32.961571068, 43.934778692, 
    44.008467808, 54.981675432, 0, 11.515675154, 21.823439218, 
    21.988363443, 32.961571068, 43.934778692, 44.008467808, 54.981675432, 
    0, 11.515675154, 21.823439218, 21.988363443, 32.961571068, 
    43.934778692, 44.008467808, 54.981675432, 0, 13.891845289, 
    0, 13.891845289, 0, 0, 0, 12.806249766, 0, 0, 0, 12.806249766
    ), Z = c(193.07513428, 193.15454102, 192.17289734, 192.17289734, 
    190.82974243, 190.63618469, 190.63618469, 189.45043945, 193.07513428, 
    193.15454102, 192.17289734, 192.17289734, 190.82974243, 190.63618469, 
    190.63618469, 189.45043945, 193.07513428, 193.15454102, 192.17289734, 
    192.17289734, 190.82974243, 190.63618469, 190.63618469, 189.45043945, 
    193.07513428, 193.15454102, 192.17289734, 192.17289734, 190.82974243, 
    190.63618469, 190.63618469, 189.45043945, 193.07513428, 193.15454102, 
    192.17289734, 192.17289734, 190.82974243, 190.63618469, 190.63618469, 
    189.45043945, 193.07513428, 193.15454102, 192.17289734, 192.17289734, 
    190.82974243, 190.63618469, 190.63618469, 189.45043945, 353.19342041, 
    353.02838135, 353.19342041, 353.02838135, 344.16003418, 344.16003418, 
    344.16003418, 344.1892395, 344.16003418, 344.16003418, 344.16003418, 
    344.1892395), ma_Z = c(193.07513428, NA, 192.800857546667, 
    NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428, 
    NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875, 
    NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325, 
    NA, 191.0687522875, NA, 193.07513428, NA, 192.800857546667, 
    NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428, 
    NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875, 
    NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325, 
    NA, 191.0687522875, NA, 353.19342041, NA, 353.19342041, NA, 
    344.16003418, NA, 344.16003418, NA, 344.16003418, NA, 344.16003418, 
    NA)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(ID.x = c(1, 
2, 3, 4, 5, 6, 9383, 9384, 188473, 188474), .rows = structure(list(
    1:8, 9:16, 17:24, 25:32, 33:40, 41:48, 49:50, 51:52, 53:56, 
    57:60), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-10L), .drop = TRUE))
# Fill the NA gaps of ma_Z inside each ID.x group by linear interpolation
# along DIST.y; rule = 2 carries the nearest observed value past the ends.
df_sl %>%
  group_by(ID.x) %>%
  mutate(
    ma_Z = zoo::na.approx(ma_Z, x = DIST.y, na.rm = FALSE, rule = 2)
  )

我得到的错误:

Error in `mutate()`:
ℹ In argument: `ma_Z = ifelse(...)`.
ℹ In group 9: `ID.x = 188473`.
Caused by error in `approx()`:
! need at least two non-NA values to interpolate

我仍然需要保留这些含 NA 值的组来做线性回归(即使组内只有一个非 NA 值,回归也能运行)。此外,完整的数据集包含数百万条观测值,我无法逐一手动检查。

我有一个想法,如果非 NA 的数量低于 2,则使用

ifelse()
函数来绕过插值。但错误依旧。如果我把条件阈值提高到 > 2,就不再报错,但每组的所有观测值都会得到同一个值(也就是说根本没有进行插值)……

# Attempted workaround: interpolate only in groups holding more than one
# non-NA ma_Z value. The test is a single logical per group, and ifelse()
# returns a result shaped like its test, so only the first element of the
# selected branch comes back.
df_sl %>%
  group_by(ID.x) %>%
  mutate(
    ma_Z = ifelse(
      test = sum(!is.na(ma_Z)) > 1,
      yes = zoo::na.approx(ma_Z, DIST.y, na.rm = FALSE, rule = 2),
      no = ma_Z
    )
  )

如何修复整个数据集的错误并在 ma_Z 列中获得正确的插值?

r dplyr bigdata interpolation zoo
2个回答
0
投票

这是你想要的吗?

> df_sl |>
+   transform(ma_Z = ave(
+     ma_Z, ID.x, FUN = \(x) {
+       # Called once per ID.x group; x is that group's ma_Z vector.
+       # Branch on the number of observed (non-NA) values rather than on
+       # the group length, so every group size is handled.
+       n_obs <- sum(!is.na(x))
+       if (n_obs == 0L) {
+         # No observed value at all: the group stays all-NA.
+         rep_len(NA_real_, length(x))
+       } else if (n_obs == 1L) {
+         # approx() needs at least two points, so spread the single
+         # observed value over the whole group instead.
+         rep_len(x[!is.na(x)], length(x))
+       } else {
+         # Linear interpolation by row position (DIST.y is not used);
+         # rule = 2 carries the nearest observed value past the ends.
+         approx(x, xout = seq_along(x), rule = 2)$y
+       }
+     })
+   )
     ID.x   DIST.x   ID.y     LimAm    LimAv   DIST.y        Z     ma_Z
1       1  0.00000      1 -375.0000 375.0000  0.00000 193.0751 193.0751
2       1  0.00000      2 -363.4843 386.5157 11.51568 193.1545 192.9380
3       1  0.00000      3 -353.1766 396.8234 21.82344 192.1729 192.8009
4       1  0.00000      4 -353.0116 396.9884 21.98836 192.1729 192.4417
5       1  0.00000      5 -342.0384 407.9616 32.96157 190.8297 192.0825
6       1  0.00000      6 -331.0652 418.9348 43.93478 190.6362 191.5756
7       1  0.00000      7 -330.9915 419.0085 44.00847 190.6362 191.0688
8       1  0.00000      8 -320.0183 429.9817 54.98168 189.4504 191.0688
9       2 11.51568      1 -375.0000 375.0000  0.00000 193.0751 193.0751
10      2 11.51568      2 -363.4843 386.5157 11.51568 193.1545 192.9380
11      2 11.51568      3 -353.1766 396.8234 21.82344 192.1729 192.8009
12      2 11.51568      4 -353.0116 396.9884 21.98836 192.1729 192.4417
13      2 11.51568      5 -342.0384 407.9616 32.96157 190.8297 192.0825
14      2 11.51568      6 -331.0652 418.9348 43.93478 190.6362 191.5756
15      2 11.51568      7 -330.9915 419.0085 44.00847 190.6362 191.0688
16      2 11.51568      8 -320.0183 429.9817 54.98168 189.4504 191.0688
17      3 21.82344      1 -375.0000 375.0000  0.00000 193.0751 193.0751
18      3 21.82344      2 -363.4843 386.5157 11.51568 193.1545 192.9380
19      3 21.82344      3 -353.1766 396.8234 21.82344 192.1729 192.8009
20      3 21.82344      4 -353.0116 396.9884 21.98836 192.1729 192.4417
21      3 21.82344      5 -342.0384 407.9616 32.96157 190.8297 192.0825
22      3 21.82344      6 -331.0652 418.9348 43.93478 190.6362 191.5756
23      3 21.82344      7 -330.9915 419.0085 44.00847 190.6362 191.0688
24      3 21.82344      8 -320.0183 429.9817 54.98168 189.4504 191.0688
25      4 21.98836      1 -375.0000 375.0000  0.00000 193.0751 193.0751
26      4 21.98836      2 -363.4843 386.5157 11.51568 193.1545 192.9380
27      4 21.98836      3 -353.1766 396.8234 21.82344 192.1729 192.8009
28      4 21.98836      4 -353.0116 396.9884 21.98836 192.1729 192.4417
29      4 21.98836      5 -342.0384 407.9616 32.96157 190.8297 192.0825
30      4 21.98836      6 -331.0652 418.9348 43.93478 190.6362 191.5756
31      4 21.98836      7 -330.9915 419.0085 44.00847 190.6362 191.0688
32      4 21.98836      8 -320.0183 429.9817 54.98168 189.4504 191.0688
33      5 32.96157      1 -375.0000 375.0000  0.00000 193.0751 193.0751
34      5 32.96157      2 -363.4843 386.5157 11.51568 193.1545 192.9380
35      5 32.96157      3 -353.1766 396.8234 21.82344 192.1729 192.8009
36      5 32.96157      4 -353.0116 396.9884 21.98836 192.1729 192.4417
37      5 32.96157      5 -342.0384 407.9616 32.96157 190.8297 192.0825
38      5 32.96157      6 -331.0652 418.9348 43.93478 190.6362 191.5756
39      5 32.96157      7 -330.9915 419.0085 44.00847 190.6362 191.0688
40      5 32.96157      8 -320.0183 429.9817 54.98168 189.4504 191.0688
41      6 43.93478      1 -375.0000 375.0000  0.00000 193.0751 193.0751
42      6 43.93478      2 -363.4843 386.5157 11.51568 193.1545 192.9380
43      6 43.93478      3 -353.1766 396.8234 21.82344 192.1729 192.8009
44      6 43.93478      4 -353.0116 396.9884 21.98836 192.1729 192.4417
45      6 43.93478      5 -342.0384 407.9616 32.96157 190.8297 192.0825
46      6 43.93478      6 -331.0652 418.9348 43.93478 190.6362 191.5756
47      6 43.93478      7 -330.9915 419.0085 44.00847 190.6362 191.0688
48      6 43.93478      8 -320.0183 429.9817 54.98168 189.4504 191.0688
49   9383  0.00000   9383 -375.0000 375.0000  0.00000 353.1934 353.1934
50   9383  0.00000   9384 -361.1082 388.8918 13.89185 353.0284 353.1934
51   9384 13.89185   9383 -375.0000 375.0000  0.00000 353.1934 353.1934
52   9384 13.89185   9384 -361.1082 388.8918 13.89185 353.0284 353.1934
53 188473  0.00000 188473 -375.0000 375.0000  0.00000 344.1600 344.1600
54 188473  0.00000 188474 -375.0000 375.0000  0.00000 344.1600 344.1600
55 188473  0.00000 188475 -375.0000 375.0000  0.00000 344.1600 344.1600
56 188473  0.00000 188476 -362.1938 387.8062 12.80625 344.1892 344.1600
57 188474  0.00000 188473 -375.0000 375.0000  0.00000 344.1600 344.1600
58 188474  0.00000 188474 -375.0000 375.0000  0.00000 344.1600 344.1600
59 188474  0.00000 188475 -375.0000 375.0000  0.00000 344.1600 344.1600
60 188474  0.00000 188476 -362.1938 387.8062 12.80625 344.1892 344.1600

0
投票

问题在于组 188473 的 DIST 恒为零(即全部取值都是零)。目前尚不清楚在这种情况下您想要什么结果,但如果您只是想在这种情况下忽略 DIST,请使用

na.approx2
而不是
na.approx
,其中 na.approx2 是:

# Drop-in replacement for zoo::na.approx() that falls back to interpolating
# by position whenever the supplied locations `x` cannot support it.
#
# object  vector (or zoo object) whose NAs are to be filled.
# x       locations of the observations; defaults to the index of `object`.
# ...     forwarded unchanged to zoo::na.approx() (e.g. na.rm, rule).
#
# Returns whatever zoo::na.approx() returns for the chosen locations.
na.approx2 <- function(object, x = zoo::index(object), ...) {
  pos <- zoo::index(object)
  vals <- zoo::coredata(object)

  # All locations identical: x carries no spacing information. Counting
  # distinct values avoids var(), which is NA for a single observation and
  # relies on an exact floating-point comparison with 0.
  x_constant <- length(unique(x)) <= 1L

  # approx() collapses tied x values, so two or more observed values that
  # all share one location leave it with a single point and it stops with
  # "need at least two non-NA values to interpolate". Only checked for
  # plain vectors whose length matches x.
  tied_obs <- FALSE
  if (is.null(dim(vals)) && length(vals) == length(x)) {
    obs_x <- x[!is.na(vals)]
    tied_obs <- length(obs_x) >= 2L && length(unique(obs_x)) == 1L
  }

  if (x_constant || tied_obs) {
    zoo::na.approx(object, x = pos, ...)
  } else {
    zoo::na.approx(object, x = x, ...)
  }
}
© www.soinside.com 2019 - 2024. All rights reserved.