# but cannot handle categorical variables
my_lm <- function(explanatory_matrix, response_vec) {
exp_mat <- as.matrix(explanatory_matrix)
intercept <- rep(1, nrow(exp_mat))
exp_mat <- cbind(exp_mat, intercept)
solve(t(exp_mat) %*% exp_mat) %*% (t(exp_mat) %*% response_vec)
}
上面的代码在解释矩阵中存在分类变量时将不起作用。我该如何实施?
这里是一个带有一个分类变量的数据集的示例:
set.seed(123)
x <- 1:10
a <- 2
b <- 3
y <- a*x + b + rnorm(10)
# categorical variable
x2 <- sample(c("A", "B"), 10, replace = T)
# one-hot encoding
x2 <- as.integer(d$x2 == "A")
xm <- matrix(c(x, x2, rep(1, length(x))), ncol = 3, nrow = 10)
ym <- matrix(y, ncol = 1, nrow = 10)
beta_hat <- MASS::ginv(t(xm) %*% xm) %*% t(xm) %*% ym
beta_hat
这给出了(注意系数的顺序-它与预测值列的顺序匹配:]:
[,1]
[1,] 1.9916754
[2,] -0.7594809
[3,] 3.2723071
与lm
的输出相同:
d <- data.frame(x = x,
x2 = x2,
y = y)
lm(y ~ ., data = d)
输出
# Call:
# lm(formula = y ~ ., data = d)
#
# Coefficients:
# (Intercept) x x2
# 3.2723 1.9917 -0.7595
对于分类处理,您应该使用一键编码。做类似的事情>>
formula <- dep_var ~ indep_var
exp_mat <- model.matrix(formula, explanatory_matrix)
solve(t(exp_mat) %*% exp_mat) %*% (t(exp_mat) %*% response_vec)