我有一个实验,其中参与者填写一份调查问卷(每个参与者对此都有一个 sumpdi 分数)和一个具有两种类型(参与者内部)条件(0 和 25)的实验。
我想在 sumpdi 分数和条件下绘制平均任务响应,但我在描述置信区间时遇到困难。
我首先汇总数据:
agg_forplot<- aggregate(Response ~ sumpdi+ Condition, df_forplot,
FUN=mean)
然后我在 ggplot2 中创建一个绘图:
Plot_C<-ggplot(agg_forplot, aes(x=sumpdi, y=Response, colour=Condition))+ geom_smooth(method='lm', se=T, fill='light blue')+geom_point()+
ylab("Mean % 'Tone Present' Responses")+ xlab('Summary PDI Score') + labs(color= 'Condition') +
scale_color_manual(labels = c("No tone", "25%"), values= c("#5CC0C0","#FF88FF"))+ mytheme
Plot_C
geom_smooth() 中的“se”设置为汇总 PDI 分数=0 附近的置信区间生成负值。但是,显然不可能得到负百分比。
为什么会发生这种情况?我该如何纠正?我尝试添加参数
+ylim(0,1)
但这会完全从图表的这一部分中删除置信区间。
编辑 这是 dput(agg_forplot) 的输出,用于显示我的数据:
structure(list(sumpdi = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 11), Condition = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), levels = c("0", "25", "50", "75"), class = "factor"),
Response = c(0.0636323497219099, 0.0851942941465814, 0.0885739075300753,
0.0917113758765922, 0.0329839417565963, 0.0784596540124087,
0.102004605958778, 0.0945945945945946, 0.0762726196516151,
0.0422535211267606, 0.263157894736842, 0.276791358215455,
0.343362166227905, 0.314720315633304, 0.264764719482896,
0.23228019576997, 0.291160062315599, 0.238023716545795, 0.540540540540541,
0.36038961038961, 0.117647058823529, 0.533333333333333)), row.names = c(NA,
-22L), class = "data.frame")
此后我还注意到,通过聚合 Pt_ID 来生成平均值可以防止负置信区间值(参见下图),但我不确定这是为什么:
agg_forplot<- aggregate(Response ~ sumpdi+ Condition+ Pt_ID, df_forplot, FUN=mean)
dput(agg_forplot) 的输出:
structure(list(sumpdi = c(5, 5, 6, 6, 4, 4, 3, 3, 6, 6, 0, 0,
6, 6, 0, 0, 5, 5, 0, 0, 0, 0, 6, 6, 3, 3, 2, 2, 3, 3, 3, 3, 0,
0, 5, 5, 0, 0, 3, 3, 8, 8, 2, 2, 2, 2, 4, 4, 7, 7, 3, 3, 0, 0,
6, 6, 0, 0, 5, 5, 3, 3, 3, 3, 1, 1, 5, 5, 6, 6, 5, 5, 4, 4, 5,
5, 0, 0, 1, 1, 2, 2, 8, 8, 2, 2, 11, 11, 4, 4, 5, 5, 9, 9, 5,
5, 0, 0, 1, 1, 1, 1, 0, 0), Condition = structure(c(1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L), levels = c("0", "25", "50", "75"), class = "factor"),
Pt_ID = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L,
12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L,
18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L,
24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L,
30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L,
36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L,
42L, 43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L,
48L, 49L, 49L, 50L, 50L, 51L, 51L, 52L, 52L), levels = c("1006",
"1007", "1013", "1014", "1017", "1020", "1021", "1022", "1024",
"1026", "1028", "1032", "1033", "1034", "1036", "1037", "1039",
"1040", "1041", "1042", "1043", "1045", "1046", "1047", "1051",
"1053", "1055", "1056", "1057", "1058", "1059", "1060", "1061",
"1063", "1064", "1065", "1066", "1069", "1071", "1072", "1073",
"1074", "1078", "1079", "1085", "1086", "1087", "1088", "1089",
"1090", "1092", "1093"), class = "factor"), Response = c(0.0882352941176471,
0.108108108108108, 0.230769230769231, 0.25, 0.117647058823529,
0.303030303030303, 0.0303030303030303, 0.24390243902439,
0.175675675675676, 0.472222222222222, 0.115942028985507,
0.166666666666667, 0.0289855072463768, 0.024390243902439,
0.0526315789473684, 0.0882352941176471, 0.0133333333333333,
0.307692307692308, 0.0571428571428571, 0.189189189189189,
0, 0.0571428571428571, 0.114285714285714, 0.27027027027027,
0.0571428571428571, 0.354838709677419, 0.0294117647058824,
0.129032258064516, 0.0571428571428571, 0.515151515151515,
0.0579710144927536, 0.0294117647058823, 0.027027027027027,
0.846153846153846, 0.0714285714285714, 0.263157894736842,
0.145161290322581, 0.131578947368421, 0.150684931506849,
0.255813953488372, 0.174603174603175, 0.333333333333333,
0.222222222222222, 0.34375, 0.0422535211267606, 0.0769230769230769,
0.0135135135135135, 0.114285714285714, 0.0945945945945946,
0.540540540540541, 0.0657894736842105, 0.315789473684211,
0.0273972602739726, 0.266666666666667, 0.0675675675675676,
0.233333333333333, 0.0410958904109589, 0.142857142857143,
0.0138888888888889, 0.166666666666667, 0.0833333333333333,
0.2, 0.228571428571429, 0.205882352941176, 0.0441176470588235,
0.0857142857142857, 0.028169014084507, 0.375, 0.0163934426229508,
0.193548387096774, 0.391304347826087, 0.5, 0, 0.194444444444444,
0.0625, 0.333333333333333, 0.0307692307692308, 0.457142857142857,
0.128571428571429, 0.205128205128205, 0.0933333333333333,
0.473684210526316, 0, 0.378378378378378, 0.0769230769230769,
0.542857142857143, 0.263157894736842, 0.533333333333333,
0, 0.36, 0.05, 0.3125, 0.0422535211267606, 0.117647058823529,
0, 0.257142857142857, 0.0151515151515152, 0.235294117647059,
0.157894736842105, 0.696969696969697, 0.0277777777777778,
0.424242424242424, 0.2, 0.378378378378378)), row.names = c(NA,
-104L), class = "data.frame")
您面临的根本问题是您试图对比例数据进行线性回归。这是行不通的,因为误差被假定为高斯分布,因此模型拟合将具有无意义的置信区间。
如果您有原始的二进制数据,但您的数据似乎采用比例格式,那么最好的方法是通过逻辑回归。您可以使用beta回归,但据我所知,
betareg
包不允许置信区间。
另一种选择是将响应变量转换为逻辑量表,拟合模型,获得预测,然后反转转换:
library(tidyverse)
df <- agg_forplot %>%
group_by(sumpdi, Condition) %>%
summarize(Response = mean(Response), .groups = 'drop')
mod <- lm(Response ~ sumpdi*Condition, within(df, Response <- qlogis(Response)))
pred_df <- data.frame(sumpdi = rep(seq(0, 11, length = 200), 2),
Condition = rep(c('0', '25'), each = 200))
preds <- predict(mod, pred_df, se.fit = TRUE)
pred_df$Response <- plogis(preds$fit)
pred_df$upper <- plogis(preds$fit + 1.96 * preds$se.fit)
pred_df$lower <- plogis(preds$fit - 1.96 * preds$se.fit)
ggplot(df, aes(sumpdi, Response, colour = Condition, group = Condition)) +
geom_ribbon(data = pred_df, aes(ymax = upper, ymin = lower,
fill = after_scale(alpha(colour, 0.2))),
linetype = 0) +
geom_line(data = pred_df) +
geom_point() +
xlab('Summary PDI Score') +
scale_color_manual(labels = c("No tone", "25%"),
values= c("orangered","deepskyblue4")) +
scale_y_continuous("Mean % 'Tone Present' Responses", limits = c(0, 0.6)) +
theme_classic()