在调查包中的svyglm中使用na.exclude

Question

我有一个问题，几年前在这里被问过，但没有答案。似乎在用

svyglm

拟合模型时，无论是否设置

na.action

，NA都会被省略。这在尝试使用

predict

函数在现有 data.frames 中创建新列时也很重要。

重申旧帖子中给出的相同示例：

library('survey')
y  = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1)
x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 0, 1, 0, 0, 0, 2, 2, 2)
x2 = c(10, 21, 33, 55, 40, 30, 26, 84, NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87)
x3 = runif(20)
foo = data.frame(y, x1, x2, x3)

m1 = glm(y ~ x1 + x2, 
         family = binomial(logit), 
         na.action = na.exclude)

svy1 = svydesign(ids = ~ 0,
                 data = foo,
                 weights = ~ x3)

m2 <- svyglm(y ~ x1 + x2, 
             design = svy1, 
             family = binomial(logit), 
             na.action = na.exclude)

predict(m1)
predict(m2)

foo2 = foo
foo2$x2 = runif(nrow(foo), 15, 50)
foo2$x2[c(8, 15, 19)] = NA

predict(m1, newdata = foo2)
predict(m2, newdata = foo2)

predict.glm

在这里产生包含缺失值的预测，而

predict.svyglm

则不。我错过了什么吗？

Answer 1

原因是，

survey:::predict.svyglm

（

predict

派遣到的地方）没有

na.action

参数，而

stats:::predict.glm

有。所以显然

na.action

在前者中被忽略（参见源代码）并且

NA

默认被删除。

> args(survey:::predict.svyglm)
function (object, newdata = NULL, total = NULL, type = c("link", 
    "response", "terms"), se.fit = (type != "terms"), vcov = FALSE, 
    ...) 
NULL

> args(stats:::predict.glm)
function (object, newdata = NULL, type = c("link", "response", 
    "terms"), se.fit = FALSE, dispersion = NULL, terms = NULL, 
    na.action = na.pass, ...) 
NULL

您可以通过在其中实现

na.action

来重写该方法，例如这样：

predict.svyglm <- function(object, newdata=NULL, total=NULL, 
          type=c("link", "response", "terms"), se.fit=(type != "terms"), vcov=FALSE, 
          ...) {
  if (is.null(newdata)) 
    newdata <- model.frame(object$survey.design)
  type <- match.arg(type)
  na.act <- object$na.action
  object$na.action <- NULL
  if (type == "terms") {
    eta <- survey:::predterms(object, se=se.fit, ...)
    if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
      eta <- stats:::napredict.exclude(na.act, eta)
    }
    return(eta)
  }
  tt <- delete.response(terms(formula(object)))
  mf <- model.frame(tt, data=newdata, xlev=object$xlevels)
  mm <- model.matrix(tt, mf, contrasts.arg=object$contrasts)
  if (!is.null(total) && attr(tt, "intercept")) {
    mm[, attr(tt, "intercept")] <- mm[, attr(tt, "intercept")] * 
      total
  }
  eta <- drop(mm %*% coef(object))
  d <- drop(object$family$mu.eta(eta))
  eta <- switch(type, link=eta, response=object$family$linkinv(eta))
  if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
    eta <- stats:::napredict.exclude(na.act, eta)
  }
  if (se.fit) {
    if (vcov) {
      vv <- mm %*% vcov(object) %*% t(mm)
      if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
        vv <- stats:::napredict.exclude(na.act, vv)
        vv <- t(stats:::napredict.exclude(na.act, t(vv)))
      }
      attr(eta, "var") <- switch(type, link=vv, response=d * 
                                   (t(vv * d)))
    }
    else {
      vv <- drop(rowSums((mm %*% vcov(object)) * mm))
      if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
        vv <- stats:::napredict.exclude(na.act, vv)
      } 
      attr(eta, "var") <- switch(type, link=vv, response=drop(d * t(vv * d)))
    }
  }
  attr(eta, "statistic") <- type
  class(eta) <- "svystat"
  eta
}

用法

library(survey)

svy1 <- svydesign(ids=~ 0, data=foo, weights=~ x3)

## na.omit
predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.omit)) 
#        link     SE
# 1  -6.73770 3.6953
# 2  -9.38180 5.2227
# 3  -2.28047 1.3933
# 4   1.98296 1.4552
# 5  -0.92393 0.8891
# 6  -2.86185 1.6672
# 7   1.13880 1.4473
# 8  12.37876 7.0821
# 10 12.96013 7.3990
# 11 -4.79977 2.6585
# 12 -9.38180 5.2227
# 13 -8.99422 5.0145
# 14 -3.83081 2.1532
# 16 -1.82390 1.5194
# 17  1.27677 1.4769
# 18 12.37876 7.0821
# 19  9.85946 5.7148
# 20 12.96013 7.3990
# Warning message:
#   In eval(family$initialize) : non-integer #successes in a binomial glm!

## na.exclude
predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.exclude))
#        link     SE
# 1  -6.73770 3.6953
# 2  -9.38180 5.2227
# 3  -2.28047 1.3933
# 4   1.98296 1.4552
# 5  -0.92393 0.8891
# 6  -2.86185 1.6672
# 7   1.13880 1.4473
# 8  12.37876 7.0821
# 9        NA     NA
# 10 12.96013 7.3990
# 11 -4.79977 2.6585
# 12 -9.38180 5.2227
# 13 -8.99422 5.0145
# 14 -3.83081 2.1532
# 15       NA     NA
# 16 -1.82390 1.5194
# 17  1.27677 1.4769
# 18 12.37876 7.0821
# 19  9.85946 5.7148
# 20 12.96013 7.3990
# Warning message:
#   In eval(family$initialize) : non-integer #successes in a binomial glm!

资料：

foo <- structure(list(y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 
0, 1, 1, 1, 1, 1), x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 
0, 1, 0, 0, 0, 2, 2, 2), x2 = c(10, 21, 33, 55, 40, 30, 26, 84, 
NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87), x3 = c(0.560080145020038, 
0.260075693251565, 0.0398760288953781, 0.508186420192942, 0.496490396093577, 
0.372899192618206, 0.57026850595139, 0.968372587347403, 0.0817912963684648, 
0.722660716157407, 0.78440785780549, 0.0374867296777666, 0.201261537149549, 
0.332104755565524, 0.566964805591851, 0.010931215249002, 0.0853019708301872, 
0.46296559041366, 0.449913740158081, 0.759272540919483)), class = "data.frame", row.names = c(NA, 
-20L))

Answer 2

svyglm

与

glm

和

na.exclude

的行为相同（自版本 4.1-1 起）：与

glm

一样，它省略了对

NA

s 的观察，但将

NA

s 重新插入到残差中。在你的例子中

> length(resid(m1))
[1] 20
> length(resid(m2))
[1] 20
> length(fitted(m1))
[1] 20
> length(fitted(m2))
[1] 20

predict.svyglm

不支持

na.exclude

并且只会在您实际提供数据时为您提供预测。原因是

predict.svyglm

有能力计算预测的方差-协方差矩阵，这要求它们都存在。

如果您写信给维护者并要求

predict.svyglm

在未请求协方差矩阵时支持

na.exclude

，他可能会添加该功能。

Answer 3

谢谢你们的回复。 @jay.sf 我认为你修改后的功能是我需要的；它在我提供的示例中运行良好。但是，我现在意识到它似乎并没有在不同但相似的示例中产生预期的输出。使用 na.exclude，我希望预测值与提供的数据具有相同的长度，但在这里，它们不是。知道为什么吗？

y  = sample(c(0, 1), size = 100, replace = T)
x1 = sample(1:5, size = 100, replace = T)
x2 = runif(100)
x2[5] = NA
x3 = runif(100)
foo = data.frame(y, x1, x2, x3)

m1 = glm(y ~ x1 + x2, 
         family = 'binomial', 
         na.action = na.exclude)

svy = svydesign(ids = ~ 0,
                data = foo,
                weights = ~ x3)

m2 = svyglm(y ~ x1 + x2, 
            design = svy, 
            family = 'binomial')

m3 = svyglm(y ~ x1 + x2, 
            design = svy, 
            na.action = na.exclude,
            family = 'binomial')

# This works as expected:
length(predict(m1))
# [1] 100
length(predict(m2))
# [1] 99
length(predict(m3))
# [1] 100

foo2 = foo
foo2$x2[c(8, 15, 19)] = NA

# This doesn't:
length(predict(m1, newdata = foo2))
# [1] 100
length(predict(m2, newdata = foo2))
# [1] 96
length(predict(m3, newdata = foo2))
# [1] 97

编辑：只需对@jay.sf 的函数进行小幅修改即可解决此问题。只是改变

  mf = model.frame(tt, data = newdata, xlev = object$xlevels)

到

  mf = model.frame(tt, data = newdata, xlev = object$xlevels, na.action = na.act)

在调查包中的svyglm中使用na.exclude

问题描述投票：0回答：3

3个回答

用法

最新问题

在调查包中的svyglm中使用na.exclude

问题描述 投票：0回答：3

3个回答

用法

最新问题

问题描述投票：0回答：3