我使用
R2jags
包构建了一个 JAGS 模型,以将二次函数拟合到经验数据并估计函数的参数。我想使用这个模型来预测来自栅格层的新数据帧的新观测值。问题是我不知道该怎么做。
第一次尝试:我尝试应用“第一次尝试”部分中的以下代码,但对我来说似乎不正确(见下图)。
我预计数据中会有更多噪音,这意味着数据中有更多波动。第二次尝试:然后,我使用新的数据框从
“第二次尝试”部分重新运行模型,其中我创建了带有 NA 的列 y(即响应变量),但我收到此错误消息:
Error in all$sims.array[, , "deviance", drop = FALSE] :
subscript out of bounds
In addition: Warning message:
In FUN(X[[i]], ...) : Failed to set trace monitor for deviance
There are no observed stochastic nodes
这是第一次根据 JAGS 模型进行预测,因此我们将不胜感激。
可重现的代码
library(geodata)
library(R2jags)
raw_data <- data.frame(T = c(-24, -22, -20, -19, -18, -16, -15, -12, -10 , -5, -2, -1, seq(1, 34, by = 1), 35, 37, 39, 41, 42, 43, 45, 46, 47),
dev = c(0, 0, 0, 0, 0, 0, 0, 0.06, 0.6, 0.9, 0.9, 1, rep(1, length(seq(1, 34, by = 1))), c(0.8, 0.8, 0.4, 0, 0, 0, 0, 0, 0)))
## plot(raw_data$T, raw_data$dev)
## Define a JAGS model
mod <- function(){
## Priors
q ~ dunif(0, 1)
T0 ~ dunif(-24, -2)
Tn ~ dunif(35, 47)
sigma ~ dunif(0, 1000)
tau <- 1 / (sigma * sigma)
## Likelihood
for(i in 1:n){
mu[i] <- -1 * q * (T[i] - T0) * (T[i] - Tn) * (Tn > T[i]) * (T0 < T[i]) * (-1 * q * (T[i] - T0) * (T[i] - Tn) < 1) + (-1 * q * (T[i] - T0) * (T[i] - Tn) > 1)
y[i] ~ dnorm(mu[i], tau)
}
}
## Group the data
data <- list(y = raw_data$dev, n = dim(raw_data)[1], T = raw_data$T)
## Define the parameters of interest to be estimated
param <- c("q", "T0", "Tn", "sigma", "mu")
## Define the initial values for the parameters of interest to be estimated
inits <- function(){list(q = 0.01, T0 = -20, Tn = 40, sigma = rlnorm(1))}
## Run the JAGS model
jags_mod <- R2jags::jags(data = data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)
第一次尝试:
## Import the raster layer
bio_r <- worldclim_country(country = "France", var="bio", res = 10, path = tempdir())
## plot(bio_r$wc2.1_30s_bio_1)
## Convert the raster layer to a data fame
T_df <- terra::as.data.frame(bio_r$wc2.1_30s_bio_1, xy = TRUE, na.rm = TRUE)
## summary(T_df)
colnames(T_df) <- c("x", "y", "T")
## Apply the quadratic function to the raster layer
q <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "q", "mean"]
T0 <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "T0", "mean"]
Tn <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "Tn", "mean"]
T_df$dev <- -1 * q * (T_df$T - T0) * (T_df$T - Tn) * (Tn > T_df$T) * (T0 < T_df$T) * (-1 * q * (T_df$T - T0) * (T_df$T - Tn) < 1) + (-1 * q * (T_df$T - T0) * (T_df$T - Tn) > 1)
## summary(T_df)
plot(T_df$T, T_df$dev, pch = 16)
第二次尝试:
T_df$dev <- NA
new_data <- list(y = T_df$dev, n = dim(T_df)[1], T = T_df$T)
jags_mod <- R2jags::jags(data = new_data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)
mu
,然后对于
mu
的每个值,从相关的正态分布中进行绘制。然后,您可以总结您案例中自变量 (
T
) 的每个新值的绘制情况。我就是这样做的。首先,我们将加载您的数据并运行模型。
library(geodata)
library(R2jags)
library(ggplot2)
raw_data <- data.frame(T = c(-24, -22, -20, -19, -18, -16, -15, -12, -10 , -5, -2, -1, seq(1, 34, by = 1), 35, 37, 39, 41, 42, 43, 45, 46, 47),
dev = c(0, 0, 0, 0, 0, 0, 0, 0.06, 0.6, 0.9, 0.9, 1, rep(1, length(seq(1, 34, by = 1))), c(0.8, 0.8, 0.4, 0, 0, 0, 0, 0, 0)))
## plot(raw_data$T, raw_data$dev)
## Define a JAGS model
mod <- function(){
## Priors
q ~ dunif(0, 1)
T0 ~ dunif(-24, -2)
Tn ~ dunif(35, 47)
sigma ~ dunif(0, 1000)
tau <- 1 / (sigma * sigma)
## Likelihood
for(i in 1:n){
mu[i] <- -1 * q * (T[i] - T0) * (T[i] - Tn) * (Tn > T[i]) * (T0 < T[i]) * (-1 * q * (T[i] - T0) * (T[i] - Tn) < 1) + (-1 * q * (T[i] - T0) * (T[i] - Tn) > 1)
y[i] ~ dnorm(mu[i], tau)
}
}
## Group the data
data <- list(y = raw_data$dev, n = dim(raw_data)[1], T = raw_data$T)
## Define the parameters of interest to be estimated
param <- c("q", "T0", "Tn", "sigma", "mu")
## Define the initial values for the parameters of interest to be estimated
inits <- function(){list(q = 0.01, T0 = -20, Tn = 40, sigma = rlnorm(1))}
## Run the JAGS model
jags_mod <- R2jags::jags(data = data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)
#> module glm loaded
#> Compiling model graph
#> Resolving undeclared variables
#> Allocating nodes
#> Graph information:
#> Observed stochastic nodes: 55
#> Unobserved stochastic nodes: 4
#> Total graph size: 622
#>
#> Initializing model
现在,获取新数据。
bio_r <- worldclim_country(country = "France", var="bio", res = 10, path = tempdir())
## Convert the raster layer to a data fame
T_df <- terra::as.data.frame(bio_r$wc2.1_30s_bio_1, xy = TRUE, na.rm = TRUE)
colnames(T_df) <- c("x", "y", "T")
T_df$dev <- NA
new_data <- list(y = T_df$dev, n = dim(T_df)[1], T = T_df$T)
导致计算量增大的一件事是新数据中有超过 160 万个点。然而,T
仅有大约 11k 个唯一值。因此,我们可以只使用唯一值,然后如果您需要让它们与原始 1.6M 值相对应,您可以将它们合并回
T
。
new_data_small <- list(T = sort(unique(new_data$T)))
new_data_small$n <- length(new_data_small$T)
从模型对象的参数中提取数据并将其转换为数据框。
pars <- as.mcmc(jags_mod)
pars <- do.call(rbind, pars)
pars <- as.data.frame(pars)
接下来,将 pars
的每一行设为列表中的不同元素,我们将其称为
sp
。
sp <- split(pars[,c("q", "T0", "Tn")], 1:nrow(pars))
现在,我们可以为 new_data 制作 post_mu
-
mu
的后验值。
post_mu <- sapply(sp, \(x)with(new_data_small, -1 * x$q * (T - x$T0) * (T - x$Tn) * (x$Tn > T) * (x$T0 < T) * (-1 * x$q * (T - x$T0) * (T - x$Tn) < 1) + (-1 * x$q * (T - x$T0) * (T - x$Tn) > 1)))
接下来,我们使用 mu
和
sigma
的后验值从每次后验绘制的正态分布中进行绘制。
post_obs <- sapply(1:ncol(post_mu), \(i)rnorm(nrow(post_mu), post_mu[,i], pars$sigma[i]))
接下来,我们总结后验均值和后验预测绘制以获得 95% 可信区间。
post_mu_sum <- apply(post_mu, 1, \(x)c(mean(x), median(x), quantile(x, .025), quantile(x, .975)))
post_mu_sum <- t(post_mu_sum)
colnames(post_mu_sum) <- c("mu_mean", "mu_median", "mu_lwr", "mu_upr")
ppd_sum <- apply(post_obs, 1, \(x)c(mean(x), median(x), quantile(x, .025), quantile(x, .975)))
ppd_sum <- t(ppd_sum)
colnames(ppd_sum) <- c("ppd_mean", "ppd_median", "ppd_lwr", "ppd_upr")
post_mu_sum <- as.data.frame(post_mu_sum)
post_mu_sum$T <- new_data_small$T
post_mu_sum <- cbind(post_mu_sum, ppd_sum)
最后,您可以绘制 T
与因变量的后验预测绘制之间的关系。
ggplot(post_mu_sum, aes(x=T, y=ppd_mean, ymin=ppd_lwr, ymax=ppd_upr)) +
geom_ribbon(alpha=.25) +
geom_line() +
theme_classic()
创建于 2024-04-27,使用 reprex v2.0.2
顺便说一句,我不鼓励您使用T
作为变量名。它是逻辑值
TRUE
的缩写,因此可能会导致混乱,特别是当您正在开发或与其他人共享代码时。