我计算了 R 中三次样条回归的系数,使用 SVD 方法而不使用内置 R 包。
作为我的数据,我从 R 数据集“airquality”中选择了 x=Ozone 和 y=Solar.R 列。
一开始一切都很好,我找到了系数,但样条图看起来像这样:
到目前为止我的代码:
dd <- na.omit(airquality)
plot(dd$Ozone, dd$Solar.R)
length(x)
length(y)
data(airquality)
x <- dd$Ozone
y <- dd$Solar.R
svd <- function(mat) {
eigen <- eigen(t(mat) %*% mat)
d <- eigen$values
v <- eigen$vectors
u <- mat %*% v / sqrt(d)
return(list(u = u, d = d, v = v))
}
knots <- quantile(x, c(0.25, 0.5, 0.75))
design <- matrix(0, nrow = length(x), ncol = 4)
for (i in 2:length(knots)) {
idx <- x >= knots[i - 1] & x <= knots[i]
design[idx, i - 1] <- (x[idx] - knots[i - 1])^3
design[idx, i] <- (x[idx] - knots[i - 1])^2
design[idx, i + 1] <- (x[idx] - knots[i - 1])
}
svd_design <- svd(design)
coefficients <- svd_design$v %*% diag(1 / svd_design$d) %*% t(svd_design$u) %*% y
library(ggplot2)
df <- data.frame(x = x, y = y)
df <- df[order(df$x),]
df$spline <- design %*% coefficients
ggplot(df, aes(x, y)) +
geom_point() +
geom_line(aes(y = spline), color = "red")
我怀疑红线没有穿过数据是因为我没有选择正确的结
为了解决这个问题,我试图找到最佳结数:
x <- dd$Ozone
y <- dd$Solar.R
# Define number of folds for cross-validation
n.folds <- 10
# Define function to calculate mean squared error (MSE)
mse <- function(y.true, y.pred) {
mean((y.true - y.pred)^2)
}
# Define function to calculate the spline coefficients for a given set of knots
calc.coefficients <- function(x, y, knots) {
# Check for NA or NaN values in input vectors
if (any(is.na(x)) || any(is.nan(x))) {
stop("Input vector x contains NA or NaN values")
}
if (any(is.na(y)) || any(is.nan(y))) {
stop("Input vector y contains NA or NaN values")
}
if (any(is.na(knots)) || any(is.nan(knots))) {
stop("Input vector knots contains NA or NaN values")
}
# Initialize design matrix
design <- matrix(0, nrow = length(x), ncol = length(knots) + 3)
# Populate design matrix
design[, 1] <- 1
for (i in 2:length(knots)) {
idx <- which(is.finite(x) & x >= knots[i - 1] & x <= knots[i] & !is.na(y))
if (length(idx) >= 2) { # check if there are at least 2 data points in the current knot interval
design[idx, i - 1] <- (x[idx] - knots[i - 1])^3
design[idx, i] <- (x[idx] - knots[i - 1])^2
design[idx, i + 1] <- (x[idx] - knots[i - 1])
}
}
design[, length(knots) + 2] <- x^3
design[, length(knots) + 3] <- x^2
# Calculate spline coefficients
idx <- which(rowSums(is.na(design)) == 0)
if (length(idx) < (length(knots) + 3)) {
stop("Insufficient data points to generate cubic spline")
}
svd <- svd(design[idx, ])
coefficients <- svd$v %*% diag(1 / svd$d) %*% t(svd$u) %*% y[idx]
return(coefficients)
}
# Define function to perform cross-validation and select optimal knots
cv.knots <- function(x, y, n.folds, knots.range) {
# Create matrix to store cross-validation results
cv.results <- matrix(0, nrow = length(knots.range), ncol = n.folds)
# Loop over candidate knots and perform cross-validation
for (i in seq_along(knots.range)) {
knots <- quantile(x, knots.range[i])
for (fold in 1:n.folds) {
# Create training and test sets for cross-validation
test.idx <- ((fold - 1) * floor(length(x) / n.folds) + 1):(fold * floor(length(x) / n.folds))
train.idx <- setdiff(seq_along(x), test.idx)
# Calculate spline coefficients on training set
coefficients <- calc.coefficients(x[train.idx], y[train.idx], knots)
# Predict on test set and calculate MSE
y.pred <- calc.spline(x[test.idx], coefficients, knots)
cv.results[i, fold] <- mse(y[test.idx], y.pred)
}
}
# Calculate mean and standard error of MSE across folds for each candidate knot
cv.mean <- apply(cv.results, 1, mean)
cv.se <- apply(cv.results, 1, sd) / sqrt(n.folds)
# Select knots with lowest cross-validation MSE
best.knots <- knots.range[which.min(cv.mean)]
return(list(best.knots = best.knots, cv.mean = cv.mean, cv.se = cv.se))
}
# Define function to calculate spline values for a given set of knots and coefficients
calc.spline <- function(x, coefficients, knots) {
# Create design matrix with basis functions for each segment
design <- matrix(0, nrow = length(x), ncol = 4)
for (i in 2:length(knots)) {
idx <- x >= knots[i - 1] & x <= knots[i]
design[idx, i - 1] <- (x[idx] - knots[i - 1])^3
design[idx, i] <- (x[idx] - knots[i - 1])^2
design[idx, i + 1] <- (x[idx] - knots[i - 1])
}
y <- design %*% coefficients
return(y)
}
knots.range <- seq(0.1, 0.9, by = 0.1)
cv.results <- cv.knots(x, y, n.folds, knots.range)
best.knots <- cv.results$best.knots
coefficients <- calc.coefficients(x, y, best.knots)
y.spline <- calc.spline(x, coefficients, best.knots)
plot(x, y, xlab = "Ozone", ylab = "Solar.R", main = "Cubic spline interpolation with optimal knots")
lines(x, y.spline, col = "red", lwd = 2)
但是,我收到此错误消息:
Error in design[idx, i - 1] <- (x[idx] - knots[i - 1])^3 :
NAs are not allowed in subscripted assignments
偶尔当我尝试修复代码时,这条消息:
Error in if (sum(idx) >= 2) { : missing value where TRUE/FALSE needed
感谢任何帮助,我不确定这里缺少什么。