我有一个问题,几年前在这里被问过,但没有答案。似乎在用
svyglm
拟合模型时,无论是否设置na.action
,NA都会被省略。这在尝试使用 predict
函数在现有 data.frames 中创建新列时也很重要。
重申旧帖子中给出的相同示例:
library('survey')
y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1)
x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 0, 1, 0, 0, 0, 2, 2, 2)
x2 = c(10, 21, 33, 55, 40, 30, 26, 84, NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87)
x3 = runif(20)
foo = data.frame(y, x1, x2, x3)
m1 = glm(y ~ x1 + x2,
family = binomial(logit),
na.action = na.exclude)
svy1 = svydesign(ids = ~ 0,
data = foo,
weights = ~ x3)
m2 <- svyglm(y ~ x1 + x2,
design = svy1,
family = binomial(logit),
na.action = na.exclude)
predict(m1)
predict(m2)
foo2 = foo
foo2$x2 = runif(nrow(foo), 15, 50)
foo2$x2[c(8, 15, 19)] = NA
predict(m1, newdata = foo2)
predict(m2, newdata = foo2)
predict.glm
在这里产生包含缺失值的预测,而 predict.svyglm
则不。我错过了什么吗?
原因是,
survey:::predict.svyglm
(predict
派遣到的地方)没有na.action
参数,而stats:::predict.glm
有。所以显然 na.action
在前者中被忽略(参见源代码)并且 NA
默认被删除。
> args(survey:::predict.svyglm)
function (object, newdata = NULL, total = NULL, type = c("link",
"response", "terms"), se.fit = (type != "terms"), vcov = FALSE,
...)
NULL
> args(stats:::predict.glm)
function (object, newdata = NULL, type = c("link", "response",
"terms"), se.fit = FALSE, dispersion = NULL, terms = NULL,
na.action = na.pass, ...)
NULL
您可以通过在其中实现
na.action
来重写该方法,例如这样:
predict.svyglm <- function(object, newdata=NULL, total=NULL,
type=c("link", "response", "terms"), se.fit=(type != "terms"), vcov=FALSE,
...) {
if (is.null(newdata))
newdata <- model.frame(object$survey.design)
type <- match.arg(type)
na.act <- object$na.action
object$na.action <- NULL
if (type == "terms") {
eta <- survey:::predterms(object, se=se.fit, ...)
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
eta <- stats:::napredict.exclude(na.act, eta)
}
return(eta)
}
tt <- delete.response(terms(formula(object)))
mf <- model.frame(tt, data=newdata, xlev=object$xlevels)
mm <- model.matrix(tt, mf, contrasts.arg=object$contrasts)
if (!is.null(total) && attr(tt, "intercept")) {
mm[, attr(tt, "intercept")] <- mm[, attr(tt, "intercept")] *
total
}
eta <- drop(mm %*% coef(object))
d <- drop(object$family$mu.eta(eta))
eta <- switch(type, link=eta, response=object$family$linkinv(eta))
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
eta <- stats:::napredict.exclude(na.act, eta)
}
if (se.fit) {
if (vcov) {
vv <- mm %*% vcov(object) %*% t(mm)
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
vv <- stats:::napredict.exclude(na.act, vv)
vv <- t(stats:::napredict.exclude(na.act, t(vv)))
}
attr(eta, "var") <- switch(type, link=vv, response=d *
(t(vv * d)))
}
else {
vv <- drop(rowSums((mm %*% vcov(object)) * mm))
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
vv <- stats:::napredict.exclude(na.act, vv)
}
attr(eta, "var") <- switch(type, link=vv, response=drop(d * t(vv * d)))
}
}
attr(eta, "statistic") <- type
class(eta) <- "svystat"
eta
}
library(survey)
svy1 <- svydesign(ids=~ 0, data=foo, weights=~ x3)
## na.omit
predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.omit))
# link SE
# 1 -6.73770 3.6953
# 2 -9.38180 5.2227
# 3 -2.28047 1.3933
# 4 1.98296 1.4552
# 5 -0.92393 0.8891
# 6 -2.86185 1.6672
# 7 1.13880 1.4473
# 8 12.37876 7.0821
# 10 12.96013 7.3990
# 11 -4.79977 2.6585
# 12 -9.38180 5.2227
# 13 -8.99422 5.0145
# 14 -3.83081 2.1532
# 16 -1.82390 1.5194
# 17 1.27677 1.4769
# 18 12.37876 7.0821
# 19 9.85946 5.7148
# 20 12.96013 7.3990
# Warning message:
# In eval(family$initialize) : non-integer #successes in a binomial glm!
## na.exclude
predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.exclude))
# link SE
# 1 -6.73770 3.6953
# 2 -9.38180 5.2227
# 3 -2.28047 1.3933
# 4 1.98296 1.4552
# 5 -0.92393 0.8891
# 6 -2.86185 1.6672
# 7 1.13880 1.4473
# 8 12.37876 7.0821
# 9 NA NA
# 10 12.96013 7.3990
# 11 -4.79977 2.6585
# 12 -9.38180 5.2227
# 13 -8.99422 5.0145
# 14 -3.83081 2.1532
# 15 NA NA
# 16 -1.82390 1.5194
# 17 1.27677 1.4769
# 18 12.37876 7.0821
# 19 9.85946 5.7148
# 20 12.96013 7.3990
# Warning message:
# In eval(family$initialize) : non-integer #successes in a binomial glm!
资料:
foo <- structure(list(y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1), x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0,
0, 1, 0, 0, 0, 2, 2, 2), x2 = c(10, 21, 33, 55, 40, 30, 26, 84,
NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87), x3 = c(0.560080145020038,
0.260075693251565, 0.0398760288953781, 0.508186420192942, 0.496490396093577,
0.372899192618206, 0.57026850595139, 0.968372587347403, 0.0817912963684648,
0.722660716157407, 0.78440785780549, 0.0374867296777666, 0.201261537149549,
0.332104755565524, 0.566964805591851, 0.010931215249002, 0.0853019708301872,
0.46296559041366, 0.449913740158081, 0.759272540919483)), class = "data.frame", row.names = c(NA,
-20L))
svyglm
与 glm
和 na.exclude
的行为相同(自版本 4.1-1 起):与 glm
一样,它省略了对 NA
s 的观察,但将 NA
s 重新插入到残差中。在你的例子中
> length(resid(m1))
[1] 20
> length(resid(m2))
[1] 20
> length(fitted(m1))
[1] 20
> length(fitted(m2))
[1] 20
predict.svyglm
不支持 na.exclude
并且只会在您实际提供数据时为您提供预测。原因是predict.svyglm
有能力计算预测的方差-协方差矩阵,这要求它们都存在。
如果您写信给维护者并要求
predict.svyglm
在未请求协方差矩阵时支持 na.exclude
,他可能会添加该功能。
谢谢你们的回复。 @jay.sf 我认为你修改后的功能是我需要的;它在我提供的示例中运行良好。但是,我现在意识到它似乎并没有在不同但相似的示例中产生预期的输出。使用 na.exclude,我希望预测值与提供的数据具有相同的长度,但在这里,它们不是。知道为什么吗?
y = sample(c(0, 1), size = 100, replace = T)
x1 = sample(1:5, size = 100, replace = T)
x2 = runif(100)
x2[5] = NA
x3 = runif(100)
foo = data.frame(y, x1, x2, x3)
m1 = glm(y ~ x1 + x2,
family = 'binomial',
na.action = na.exclude)
svy = svydesign(ids = ~ 0,
data = foo,
weights = ~ x3)
m2 = svyglm(y ~ x1 + x2,
design = svy,
family = 'binomial')
m3 = svyglm(y ~ x1 + x2,
design = svy,
na.action = na.exclude,
family = 'binomial')
# This works as expected:
length(predict(m1))
# [1] 100
length(predict(m2))
# [1] 99
length(predict(m3))
# [1] 100
foo2 = foo
foo2$x2[c(8, 15, 19)] = NA
# This doesn't:
length(predict(m1, newdata = foo2))
# [1] 100
length(predict(m2, newdata = foo2))
# [1] 96
length(predict(m3, newdata = foo2))
# [1] 97
编辑:只需对@jay.sf 的函数进行小幅修改即可解决此问题。只是改变
mf = model.frame(tt, data = newdata, xlev = object$xlevels)
到
mf = model.frame(tt, data = newdata, xlev = object$xlevels, na.action = na.act)