如何计算标准误差和 CI 以在 R 中绘图

问题描述 投票:0回答:1

我首先根据项目状态计算不同人群中高中毕业的受访者的百分比。这段代码让我得到了这些百分比:

d_perc <- d %>% 
  group_by(group, levels, program_cat, highschool) %>% 
  summarize(n = n()) %>% 
  mutate(percent = n/sum(n)*100) %>% 
  select(-n)

接下来,我想另外计算这些百分比的误差项。计算 SE 和相应 95% CI 的最佳方法是什么? (我的最终目标是使用

geom_point()
geom_errorbar
将它们绘制在一起,尽管我已经有代码可以做到这一点。)

我尝试过类似的事情:

d_perc$se <- sqrt(d_perc$percent*(1-d_perc$percent)/d_perc$percent)

然后是类似

+ and - 1.96*d_perc$se
的内容以获得上限和下限估计。然而,当我尝试上述操作时,我只得到 se 列的一系列 NaN。

这里的数据(抱歉数据太大;我使用 head(100) 来获得更真实的按组百分比):

d_perc <- structure(list(highschool= structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L), levels = c("no", 
"yes"), class = "factor"), program_cat = structure(c(2L, 2L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L), levels = c("0", "1", "2"), class = "factor"), group = c("gender", 
"race", "cohort", "gender", "race", "cohort", "gender", "race", 
"cohort", "gender", "race", "cohort", "gender", "race", "cohort", 
"gender", "race", "cohort", "gender", "race", "cohort", "gender", 
"race", "cohort", "gender", "race", "cohort", "gender", "race", 
"cohort", "gender", "race", "cohort", "gender", "race", "cohort", 
"gender", "race", "cohort", "gender", "race", "cohort", "gender", 
"race", "cohort", "gender", "race", "cohort", "gender", "race", 
"cohort", "gender", "race", "cohort", "gender", "race", "cohort", 
"gender", "race", "cohort", "gender", "race", "cohort", "gender", 
"race", "cohort", "gender", "race", "cohort", "gender", "race", 
"cohort", "gender", "race", "cohort", "gender", "race", "cohort", 
"gender", "race", "cohort", "gender", "race", "cohort", "gender", 
"race", "cohort", "gender", "race", "cohort", "gender", "race", 
"cohort", "gender", "race", "cohort", "gender", "race", "cohort", 
"gender"), levels = structure(c(1L, 3L, 7L, 2L, 5L, 7L, 1L, 3L, 
6L, 2L, 4L, 6L, 1L, 5L, 7L, 1L, 3L, 7L, 1L, 3L, 6L, 1L, 3L, 6L, 
1L, 3L, 7L, 1L, 5L, 6L, 2L, 5L, 7L, 1L, 5L, 6L, 1L, 3L, 6L, 2L, 
3L, 7L, 1L, 3L, 6L, 1L, 4L, 6L, 1L, 5L, 6L, 1L, 5L, 6L, 1L, 4L, 
6L, 2L, 3L, 6L, 2L, 3L, 7L, 1L, 3L, 7L, 1L, 3L, 6L, 1L, 4L, 7L, 
1L, 4L, 7L, 1L, 3L, 7L, 1L, 3L, 7L, 1L, 4L, 7L, 1L, 3L, 7L, 1L, 
3L, 6L, 1L, 3L, 7L, 2L, 3L, 7L, 2L, 5L, 6L, 2L), levels = c("Female", 
"Male", "Black", "Hispanic", "White", "CohortA", "CohortB"), class = "factor")), row.names = c(NA, 
-100L), class = c("tbl_df", "tbl", "data.frame"))
r confidence-interval
1个回答
0
投票

正如评论中指出的,您的两个变量

group
levels
实际上代表三个不同的变量,应该转换为宽格式。我假设每三行代表对这些变量的一次观察。

要获得比例的上限和下限置信区间,最简单的方法可能就是使用

prop.test
,这样您就可以通过连续性校正获得正确的二项式置信区间。

library(tidyverse)

d_perc <- d %>% 
  mutate(id = (seq(nrow(.)) - 1) %/% 3) %>%
  pivot_wider(names_from = group, values_from = levels, 
              id_cols = c(id, program_cat, highschool)) %>%
  group_by(program_cat, gender, race, cohort) %>% 
  mutate(lower = prop.test(table(highschool))$conf[1],
         upper = prop.test(table(highschool))$conf[2],
         prop  = prop.test(table(highschool))$est) %>%
  ungroup() %>%
  filter(complete.cases(.))

这为我们提供了一个适合绘图的数据框:

d_perc
#> # A tibble: 33 x 9
#>       id program_cat highschool gender race     cohort   lower upper  prop
#>    <dbl> <fct>       <fct>      <fct>  <fct>    <fct>    <dbl> <dbl> <dbl>
#>  1     0 1           no         Female Black    CohortB 0.310  1     1    
#>  2     1 1           no         Male   White    CohortB 0.0546 1     1    
#>  3     2 0           no         Female Black    CohortA 0.299  0.989 0.8  
#>  4     3 1           no         Male   Hispanic CohortA 0.0546 1     1    
#>  5     4 0           yes        Female White    CohortB 0      0.945 0    
#>  6     5 1           no         Female Black    CohortB 0.310  1     1    
#>  7     6 1           no         Female Black    CohortA 0.198  1     1    
#>  8     7 0           no         Female Black    CohortA 0.299  0.989 0.8  
#>  9     8 0           no         Female Black    CohortB 0.299  0.989 0.8  
#> 10     9 0           yes        Female White    CohortA 0.0177 0.875 0.333
#> # i 23 more rows
#> # i Use `print(n = ...)` to see more rows

我们可以像这样使用 ggplot:

ggplot(d_perc, aes(cohort, prop, color = program_cat)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2,
                position = position_dodge(width = 0.6)) +
  geom_point(position = position_dodge(width = 0.6), size = 3) +
  facet_grid(gender~race, switch = "y") +
  scale_y_continuous(labels = scales::percent) +
  theme_bw(base_size = 16)

请注意,由于每个类别的样本量较小,因此没有足够的数据点来创建实际的置信区间,有些是 100% 或 0%,看起来有点奇怪 - 这是小示例数据集的限制,而不是方法。

© www.soinside.com 2019 - 2024. All rights reserved.