我想在线性模型中使用效应编码(effect coding),比较样本量不同的各组的“加权平均值”。在这个虚拟示例中,有一组包含 19 个观测值,平均值为 5.04;另外两组分别包含 25 和 26 个观测值,合并后的平均值为 6.10。如何指定对比(contrasts),才能使模型系数等于这两个均值之差?请参阅我代码中的问号。第二个对比用于比较 versicolor 和 virginica,我对它并不特别感兴趣。
注意:我问的不是未加权的对比,也就是说,不是像下面这样的设定:
# Unweighted coding shown as the counter-example: one column per contrast,
# one row per level of df$Species.
contrasts(df$Species) <- cbind(
  c(.6666, -.3333, -.3333),
  c(0, .5, -.5)
)
# Reproducible 70-row subsample of iris, keeping only the response and the
# grouping factor.
set.seed(5)
df <- iris[, c("Sepal.Length", "Species")]
df <- df[sample.int(nrow(df), size = 70), ]

# Number of sampled rows per species.
table(df$Species)

# Mean sepal length for setosa, then for all remaining rows pooled together.
with(df, mean(Sepal.Length[Species == "setosa"]))
with(df, mean(Sepal.Length[Species != "setosa"]))
# Placeholder kept from the question: the right-hand side is the unknown
# weighted contrast matrix being asked for, not a value to run as written.
contrasts(df$Species) <- ???
# Fit the one-way model of sepal length on species.
res <- lm(Sepal.Length ~ Species, data = df)

# Target quantity: mean of setosa minus the mean of all other rows pooled.
setosa_gap <- with(
  df,
  mean(Sepal.Length[Species == "setosa"]) -
    mean(Sepal.Length[Species != "setosa"])
)

# The second coefficient should reproduce that difference. Compare with a
# numeric tolerance (exact `==` on doubles can fail from rounding alone) and
# drop the coefficient's name so only the value is compared.
stopifnot(isTRUE(all.equal(setosa_gap, unname(coef(res)[2]))))
谢谢您的帮助!
library(emmeans)
# Same reproducible subsample and model as in the question.
set.seed(5)
df <- iris[sample.int(nrow(iris), 70), c("Sepal.Length", "Species")]
res <- lm(Sepal.Length ~ Species, data = df)

# Contrast weights: +1 for setosa, and for every other species minus its share
# of the non-setosa rows. The shares are computed from the sample instead of
# being hard-coded (previously 25/51 and 26/51), so they stay correct even if
# the random draw yields different group sizes.
n_per_species <- as.vector(table(df$Species))
is_setosa <- levels(df$Species) == "setosa"
contrast_weights <- -n_per_species / sum(n_per_species[!is_setosa])
contrast_weights[is_setosa] <- 1

# Custom contrast on the estimated marginal means: setosa versus the
# sample-size-weighted mean of the remaining species.
ctr <- contrast(
  emmeans(res, "Species"),
  method = list(weighted = contrast_weights)
)
# Simulate hang times for 89 observations spread over five grip conditions.
set.seed(9)
n <- 89

# Mean of the simulated hang time for each condition; the names double as the
# factor levels, in this order.
means <- c(
  "nothing" = 25,
  "t-shirt" = 35,
  "chalk ball" = 27,
  "chalk powder" = 22,
  "liquid chalk" = 21
)

# Random condition per observation, then condition mean plus normal noise
# (sd 10); abs() keeps every simulated time non-negative.
condition <- sample(names(means), size = n, replace = TRUE)
hangtime <- abs(means[condition] + rnorm(n, mean = 0, sd = 10))

chalk <- data.frame(
  condition = factor(condition, levels = names(means)),
  hangtime = hangtime
)
# Hypothesis matrix: each row gives the weights applied to the five condition
# means (in factor-level order) to form one model coefficient. Row 1 is the
# equally weighted average of all levels; rows 2-5 are the comparisons named
# further down.
hypotheses <- rbind(
  rep(1 / 5, 5),
  c(-1.5, -1.5, 1, 1, 1),
  c(-1, 1, 0, 0, 0),
  c(0, 0, -0.5, -0.5, 1),
  c(0, 0, -1, 1, 0)
)

# Inverting the hypothesis matrix yields the coding matrix; its first column
# belongs to the averaging row, so it is dropped and only the four comparison
# columns are used as contrasts.
mat <- solve(hypotheses)[, -1]

# Label rows with the factor levels and columns with the comparison names.
cont <- mat
dimnames(cont) <- list(
  levels(chalk$condition),
  c("nothingshirtVchalk", "nothingVshirt", "dryVliquid", "ballVpowder")
)
contrasts(chalk$condition) <- cont

# Fit the model with the custom coding and print the coefficient table.
summary(lm(hangtime ~ condition, data = chalk))