ggplot 中 geom_smooth() 的负置信区间

Question

我有一个实验，其中参与者填写一份调查问卷（每个参与者对此都有一个 sumpdi 分数）和一个具有两种类型（参与者内部）条件（0 和 25）的实验。

我想在 sumpdi 分数和条件下绘制平均任务响应，但我在描述置信区间时遇到困难。

我首先汇总数据：

 agg_forplot<- aggregate(Response ~ sumpdi+ Condition, df_forplot,
 FUN=mean)

然后我在 ggplot2 中创建一个绘图：

Plot_C<-ggplot(agg_forplot, aes(x=sumpdi, y=Response, colour=Condition))+ geom_smooth(method='lm', se=T, fill='light blue')+geom_point()+
     ylab("Mean % 'Tone Present' Responses")+ xlab('Summary PDI Score') + labs(color= 'Condition') +
      scale_color_manual(labels = c("No tone", "25%"), values= c("#5CC0C0","#FF88FF"))+  mytheme

Plot_C

geom_smooth() 中的“se”设置为汇总 PDI 分数=0 附近的置信区间生成负值。但是，显然不可能得到负百分比。

为什么会发生这种情况？我该如何纠正？我尝试添加参数

+ylim(0,1)

但这会完全从图表的这一部分中删除置信区间。

编辑这是 dput(agg_forplot) 的输出，用于显示我的数据：

structure(list(sumpdi = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 11), Condition = structure(c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L), levels = c("0", "25", "50", "75"), class = "factor"), 
    Response = c(0.0636323497219099, 0.0851942941465814, 0.0885739075300753, 
    0.0917113758765922, 0.0329839417565963, 0.0784596540124087, 
    0.102004605958778, 0.0945945945945946, 0.0762726196516151, 
    0.0422535211267606, 0.263157894736842, 0.276791358215455, 
    0.343362166227905, 0.314720315633304, 0.264764719482896, 
    0.23228019576997, 0.291160062315599, 0.238023716545795, 0.540540540540541, 
    0.36038961038961, 0.117647058823529, 0.533333333333333)), row.names = c(NA, 
-22L), class = "data.frame")

此后我还注意到，通过聚合 Pt_ID 来生成平均值可以防止负置信区间值（参见下图），但我不确定这是为什么：

agg_forplot<- aggregate(Response ~ sumpdi+ Condition+ Pt_ID, df_forplot, FUN=mean)

dput(agg_forplot) 的输出：

structure(list(sumpdi = c(5, 5, 6, 6, 4, 4, 3, 3, 6, 6, 0, 0, 
6, 6, 0, 0, 5, 5, 0, 0, 0, 0, 6, 6, 3, 3, 2, 2, 3, 3, 3, 3, 0, 
0, 5, 5, 0, 0, 3, 3, 8, 8, 2, 2, 2, 2, 4, 4, 7, 7, 3, 3, 0, 0, 
6, 6, 0, 0, 5, 5, 3, 3, 3, 3, 1, 1, 5, 5, 6, 6, 5, 5, 4, 4, 5, 
5, 0, 0, 1, 1, 2, 2, 8, 8, 2, 2, 11, 11, 4, 4, 5, 5, 9, 9, 5, 
5, 0, 0, 1, 1, 1, 1, 0, 0), Condition = structure(c(1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L), levels = c("0", "25", "50", "75"), class = "factor"), 
    Pt_ID = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 
    6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 
    12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 
    18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 
    24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 
    30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 
    36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 
    42L, 43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 
    48L, 49L, 49L, 50L, 50L, 51L, 51L, 52L, 52L), levels = c("1006", 
    "1007", "1013", "1014", "1017", "1020", "1021", "1022", "1024", 
    "1026", "1028", "1032", "1033", "1034", "1036", "1037", "1039", 
    "1040", "1041", "1042", "1043", "1045", "1046", "1047", "1051", 
    "1053", "1055", "1056", "1057", "1058", "1059", "1060", "1061", 
    "1063", "1064", "1065", "1066", "1069", "1071", "1072", "1073", 
    "1074", "1078", "1079", "1085", "1086", "1087", "1088", "1089", 
    "1090", "1092", "1093"), class = "factor"), Response = c(0.0882352941176471, 
    0.108108108108108, 0.230769230769231, 0.25, 0.117647058823529, 
    0.303030303030303, 0.0303030303030303, 0.24390243902439, 
    0.175675675675676, 0.472222222222222, 0.115942028985507, 
    0.166666666666667, 0.0289855072463768, 0.024390243902439, 
    0.0526315789473684, 0.0882352941176471, 0.0133333333333333, 
    0.307692307692308, 0.0571428571428571, 0.189189189189189, 
    0, 0.0571428571428571, 0.114285714285714, 0.27027027027027, 
    0.0571428571428571, 0.354838709677419, 0.0294117647058824, 
    0.129032258064516, 0.0571428571428571, 0.515151515151515, 
    0.0579710144927536, 0.0294117647058823, 0.027027027027027, 
    0.846153846153846, 0.0714285714285714, 0.263157894736842, 
    0.145161290322581, 0.131578947368421, 0.150684931506849, 
    0.255813953488372, 0.174603174603175, 0.333333333333333, 
    0.222222222222222, 0.34375, 0.0422535211267606, 0.0769230769230769, 
    0.0135135135135135, 0.114285714285714, 0.0945945945945946, 
    0.540540540540541, 0.0657894736842105, 0.315789473684211, 
    0.0273972602739726, 0.266666666666667, 0.0675675675675676, 
    0.233333333333333, 0.0410958904109589, 0.142857142857143, 
    0.0138888888888889, 0.166666666666667, 0.0833333333333333, 
    0.2, 0.228571428571429, 0.205882352941176, 0.0441176470588235, 
    0.0857142857142857, 0.028169014084507, 0.375, 0.0163934426229508, 
    0.193548387096774, 0.391304347826087, 0.5, 0, 0.194444444444444, 
    0.0625, 0.333333333333333, 0.0307692307692308, 0.457142857142857, 
    0.128571428571429, 0.205128205128205, 0.0933333333333333, 
    0.473684210526316, 0, 0.378378378378378, 0.0769230769230769, 
    0.542857142857143, 0.263157894736842, 0.533333333333333, 
    0, 0.36, 0.05, 0.3125, 0.0422535211267606, 0.117647058823529, 
    0, 0.257142857142857, 0.0151515151515152, 0.235294117647059, 
    0.157894736842105, 0.696969696969697, 0.0277777777777778, 
    0.424242424242424, 0.2, 0.378378378378378)), row.names = c(NA, 
-104L), class = "data.frame")

Answer 1

您面临的根本问题是您试图对比例数据进行线性回归。这是行不通的，因为误差被假定为高斯分布，因此模型拟合将具有无意义的置信区间。

如果您有原始的二进制数据，但您的数据似乎采用比例格式，那么最好的方法是通过逻辑回归。您可以使用beta回归，但据我所知，

betareg

包不允许置信区间。

另一种选择是将响应变量转换为逻辑量表，拟合模型，获得预测，然后反转转换：

library(tidyverse)

df <- agg_forplot %>%
  group_by(sumpdi, Condition) %>%
  summarize(Response = mean(Response), .groups = 'drop')

mod <- lm(Response ~ sumpdi*Condition, within(df, Response <- qlogis(Response)))
pred_df <- data.frame(sumpdi = rep(seq(0, 11, length = 200), 2),
                      Condition = rep(c('0', '25'), each = 200))

preds <- predict(mod, pred_df, se.fit = TRUE)

pred_df$Response <- plogis(preds$fit)
pred_df$upper <- plogis(preds$fit + 1.96 * preds$se.fit)
pred_df$lower <- plogis(preds$fit - 1.96 * preds$se.fit)

ggplot(df, aes(sumpdi, Response, colour = Condition, group = Condition)) + 
  geom_ribbon(data = pred_df, aes(ymax = upper, ymin = lower,
                                  fill = after_scale(alpha(colour, 0.2))),
              linetype = 0) +
  geom_line(data = pred_df) +
  geom_point() +
  xlab('Summary PDI Score') + 
  scale_color_manual(labels = c("No tone", "25%"), 
                     values= c("orangered","deepskyblue4")) + 
  scale_y_continuous("Mean % 'Tone Present' Responses", limits = c(0, 0.6)) +
  theme_classic()

ggplot 中 geom_smooth() 的负置信区间

问题描述投票：0回答：1

1个回答

最新问题

ggplot 中 geom_smooth() 的负置信区间

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1