样条回归错误信息:“设计错误[idx, i - 1] <- (x[idx] - knots[i - 1])^3 : NAs are not allowed in subscripted assignments"

问题描述 投票:0回答:0

我计算了 R 中三次样条回归的系数,使用 SVD 方法而不使用内置 R 包。

作为我的数据,我从 R 数据集“airquality”中选择了 x=Ozone 和 y=Solar.R 列。

一开始一切都很好,我找到了系数,但样条图看起来像这样:

到目前为止我的代码:

dd <- na.omit(airquality)

plot(dd$Ozone, dd$Solar.R)
length(x)
length(y)

data(airquality)
x <- dd$Ozone
y <- dd$Solar.R

svd <- function(mat) {
  eigen <- eigen(t(mat) %*% mat)
  d <- eigen$values
  v <- eigen$vectors
  u <- mat %*% v / sqrt(d)
  return(list(u = u, d = d, v = v))
}

knots <- quantile(x, c(0.25, 0.5, 0.75))

design <- matrix(0, nrow = length(x), ncol = 4)
for (i in 2:length(knots)) {
  idx <- x >= knots[i - 1] & x <= knots[i]
  design[idx, i - 1] <- (x[idx] - knots[i - 1])^3
  design[idx, i] <- (x[idx] - knots[i - 1])^2
  design[idx, i + 1] <- (x[idx] - knots[i - 1])
}

svd_design <- svd(design)

coefficients <- svd_design$v %*% diag(1 / svd_design$d) %*% t(svd_design$u) %*% y

library(ggplot2)
df <- data.frame(x = x, y = y)
df <- df[order(df$x),]
df$spline <- design %*% coefficients
ggplot(df, aes(x, y)) + 
  geom_point() + 
  geom_line(aes(y = spline), color = "red")

我怀疑红线没有穿过数据是因为我没有选择正确的结

为了解决这个问题,我试图找到最佳结数:

 x <- dd$Ozone
y <- dd$Solar.R

# Define number of folds for cross-validation
n.folds <- 10

# Define function to calculate mean squared error (MSE)
mse <- function(y.true, y.pred) {
  mean((y.true - y.pred)^2)
}

# Define function to calculate the spline coefficients for a given set of knots
calc.coefficients <- function(x, y, knots) {
  # Check for NA or NaN values in input vectors
  if (any(is.na(x)) || any(is.nan(x))) {
    stop("Input vector x contains NA or NaN values")
  }
  if (any(is.na(y)) || any(is.nan(y))) {
    stop("Input vector y contains NA or NaN values")
  }
  if (any(is.na(knots)) || any(is.nan(knots))) {
    stop("Input vector knots contains NA or NaN values")
  }
  
  # Initialize design matrix
  design <- matrix(0, nrow = length(x), ncol = length(knots) + 3)
  
  # Populate design matrix
  design[, 1] <- 1
  for (i in 2:length(knots)) {
    idx <- which(is.finite(x) & x >= knots[i - 1] & x <= knots[i] & !is.na(y))
    if (length(idx) >= 2) {  # check if there are at least 2 data points in the current knot interval
      design[idx, i - 1] <- (x[idx] - knots[i - 1])^3
      design[idx, i] <- (x[idx] - knots[i - 1])^2
      design[idx, i + 1] <- (x[idx] - knots[i - 1])
    }
  }
  design[, length(knots) + 2] <- x^3
  design[, length(knots) + 3] <- x^2
  
  # Calculate spline coefficients
  idx <- which(rowSums(is.na(design)) == 0)
  if (length(idx) < (length(knots) + 3)) {
    stop("Insufficient data points to generate cubic spline")
  }
  svd <- svd(design[idx, ])
  coefficients <- svd$v %*% diag(1 / svd$d) %*% t(svd$u) %*% y[idx]
  return(coefficients)
}
# Define function to perform cross-validation and select optimal knots
cv.knots <- function(x, y, n.folds, knots.range) {
  # Create matrix to store cross-validation results
  cv.results <- matrix(0, nrow = length(knots.range), ncol = n.folds)
  # Loop over candidate knots and perform cross-validation
  for (i in seq_along(knots.range)) {
    knots <- quantile(x, knots.range[i])
    for (fold in 1:n.folds) {
      # Create training and test sets for cross-validation
      test.idx <- ((fold - 1) * floor(length(x) / n.folds) + 1):(fold * floor(length(x) / n.folds))
      train.idx <- setdiff(seq_along(x), test.idx)
      # Calculate spline coefficients on training set
      coefficients <- calc.coefficients(x[train.idx], y[train.idx], knots)
      # Predict on test set and calculate MSE
      y.pred <- calc.spline(x[test.idx], coefficients, knots)
      cv.results[i, fold] <- mse(y[test.idx], y.pred)
    }
  }
  # Calculate mean and standard error of MSE across folds for each candidate knot
  cv.mean <- apply(cv.results, 1, mean)
  cv.se <- apply(cv.results, 1, sd) / sqrt(n.folds)
  # Select knots with lowest cross-validation MSE
  best.knots <- knots.range[which.min(cv.mean)]
  return(list(best.knots = best.knots, cv.mean = cv.mean, cv.se = cv.se))
}

# Define function to calculate spline values for a given set of knots and coefficients
calc.spline <- function(x, coefficients, knots) {
  # Create design matrix with basis functions for each segment
  design <- matrix(0, nrow = length(x), ncol = 4)
for (i in 2:length(knots)) {
idx <- x >= knots[i - 1] & x <= knots[i]
design[idx, i - 1] <- (x[idx] - knots[i - 1])^3
design[idx, i] <- (x[idx] - knots[i - 1])^2
design[idx, i + 1] <- (x[idx] - knots[i - 1])
}
y <- design %*% coefficients
return(y)
}

knots.range <- seq(0.1, 0.9, by = 0.1)

cv.results <- cv.knots(x, y, n.folds, knots.range)
best.knots <- cv.results$best.knots

coefficients <- calc.coefficients(x, y, best.knots)
y.spline <- calc.spline(x, coefficients, best.knots)

plot(x, y, xlab = "Ozone", ylab = "Solar.R", main = "Cubic spline interpolation with optimal knots")
lines(x, y.spline, col = "red", lwd = 2)

但是,我收到此错误消息:

Error in design[idx, i - 1] <- (x[idx] - knots[i - 1])^3 : 
  NAs are not allowed in subscripted assignments

偶尔当我尝试修复代码时,这条消息:

Error in if (sum(idx) >= 2) { : missing value where TRUE/FALSE needed

感谢任何帮助,我不确定这里缺少什么。

r ggplot2 plot spline
© www.soinside.com 2019 - 2024. All rights reserved.