为什么用 `ggplot2` 在直方图上叠加的累积线不正确?

问题描述 投票:0回答:1

我想用 `ggplot` 在直方图上叠加某一列的累积份额曲线,但线上的百分比不正确。

事实上,您可以看到红线从 95% 开始,而第一个 bin 的柱子却更低,只有 82% 左右。

# Reproducible sample data: a data.frame with a single numeric column `col`
# and 100 rows (row.names = c(NA, -100L)).
df <- structure(list(col = c(1.42221064814815, 0.709669201387851, 0.00864583333333333,  3.35221946759356, 0.0138087731489429, 0.101736111111111, 0.000459247684037244,  0.0291767592590164, 0.171842569443915, 0.171538472222509, 0.0708449074074074,  0.0234837962962963, 0.25262748842714, 0.386477071758774, 125.055696030094,  0.0696409606492078, 0.0938078703703704, 0.192905092592593, 0.0031709722208756,  0.227335300925705, 0.0134506134247338, 0.040787037037037, 0.266623020834393,  0.00225040509193032, 0.473669131944577, 0.130830208333554, 3.61516203703704,  0.130288240741248, 0.536915474536794, 0.00138538194475351, 0.0113888888888889,  3.26379307870236, 0.12810640046166, 0.392849537037589, 0.71517319444429,  0.112205289351167, 0.431413553241226, 0.0178086342579789, 2.69385361110999,  0.220277777777778, 0.00206320601756926, 0.0808217592592593, 0.13211086805496,  1.90881438657365, 2.04585710648033, 0.845706018518518, 0.0741087962962963,  0.428182499999249, 0.00403622685207261, 0.0592311111120162, 0.0682201851849203,  1.24485666666594, 0.0189236111111111, 0.0453356481481481, 7.11538414351918,  0.0155092592592593, 0.0541087962962963, 0.0759213078711872, 0.00378994212934264,  0.00767912037118717, 0.0622061574072749, 22.5055494907416, 0.0707319328713196,  0.0851041666666667, 0.285934664353176, 0.0116175694432524, 0.709232141204454,  1.05187328703701, 0.0052125925929458, 0.112268171296627, 0.0400231481481481,  0.0341140393526466, 0.225503703703483, 0.0834027777777778, 0.929739918981989,  0.403400393517481, 0.0825652893522271, 0.458994571759745, 0.07600548611195,  0.0985681712958548, 0.0385900578703041, 0.359117986110074, 0.922757222221957,  186.031066087962, 2.39154376157456, 0.499594907407407, 0.0130671296296296,  2.86927083333333, 0.00584490740740741, 0.619270625001302, 0.0142964004642434,  0.0854832175925926, 1.39854887731373, 1.51077546296296, 0.00819540509195239,  0.750400266203063, 233.781311967594, 0.340315266204653, 0.879955011573103,  2.82027777777778)), row.names = 
c(NA, -100L), class = "data.frame")

# Question's reproducible example.
# The plotting package is named ggplot2; `library(ggplot)` would fail before
# anything is drawn. dplyr is attached for the `%>%` pipe.
library(ggplot2)
library(dplyr)

df %>%
  ggplot(aes(x = col)) +
  # Bars: cumulative share of observations per bin. `breaks` alone defines the
  # bins (it overrides `binwidth`), so the redundant `binwidth = 1` is dropped.
  geom_histogram(
    aes(y = after_stat(cumsum(count / sum(count)))),
    breaks = 0:max(df$col, na.rm = TRUE),
    fill = "blue", color = "black"
  ) +
  # Red line: left exactly as posted, because it is the subject of the
  # question. No binning arguments are given here, so this layer falls back to
  # the default number of bins and does not share the bars' bins.
  geom_line(
    stat = "bin",
    aes(y = after_stat(cumsum(count / sum(count)))),
    color = "red"
  ) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 1)) +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  # A single y scale: an earlier `scale_y_continuous(labels = scales::percent)`
  # was replaced by this one anyway, so only the effective scale is kept.
  scale_y_continuous(breaks = seq(0, 1, by = 0.1), labels = scales::percent)

r ggplot2 histogram cumulative-sum
1个回答
3
投票

问题在于:对 `geom_line`,您使用的是默认的 bin 数量(= 30),也就是说 `binwidth` 按 `diff(range(x)) / 30` 计算;而对 `geom_histogram`,您设置了 `binwidth = 1`。

如果您想得到相同的 `count`,就必须对两个图层使用相同的分箱方式。

另外请注意,对于您的情况,无需设置 `breaks`。您可以改用 `boundary=` 或 `center=` 来设置各个 bin 的起始位置。对于 `geom_line` 来说,情况稍微复杂一些。在这里,我使用 `stage` 在应用 `stat` 之后移动线条的 `x` 位置。但您也可以使用 `position = position_nudge(x = -.5)` 来达到相同的结果。

library(ggplot2)

# Both layers use the same binning (binwidth = 1, boundary = 0), so the bars
# and the line are built from the same per-bin counts.
ggplot(data = df, mapping = aes(x = col)) +
  # Bars: running share of observations up to each bin.
  geom_histogram(
    mapping = aes(y = after_stat(cumsum(count / sum(count)))),
    boundary = 0,
    binwidth = 1,
    color = "black",
    fill = "blue"
  ) +
  # Line: same statistic; `stage()` subtracts 0.5 from the x value produced by
  # the stat, moving every point half a bin to the left.
  geom_line(
    mapping = aes(
      x = stage(start = col, after_stat = x - 0.5),
      y = after_stat(cumsum(count / sum(count)))
    ),
    stat = "bin",
    boundary = 0,
    binwidth = 1,
    color = "red"
  ) +
  # Axis ticks: every 10% on y (shown as percentages), every unit on x.
  scale_y_continuous(
    labels = scales::percent,
    breaks = seq(from = 0, to = 1, by = 0.1)
  ) +
  scale_x_continuous(breaks = seq(from = 0, to = 10, by = 1)) +
  # Zoom to the first ten bins without discarding any data.
  coord_cartesian(ylim = c(0, 1), xlim = c(0, 10))

编辑:如果您希望这条线从 (0, 0) 开始,那么最简单的方法是改用 `geom_freqpoly`,它默认会延伸这条线:

library(ggplot2)
# `warn.conflicts` is written out in full instead of relying on partial
# matching of `warn`.
library(dplyr, warn.conflicts = FALSE)

# Variant of the cumulative histogram whose red line starts at (0, 0).
df |>
  ggplot(aes(x = col)) +
  # Bars: cumulative share of observations, unit-wide bins starting at 0.
  geom_histogram(
    aes(y = after_stat(cumsum(count / sum(count)))),
    binwidth = 1, fill = "blue", color = "black",
    boundary = 0
  ) +
  # Line: same binning as the bars. `stage()` adds 0.5 to the x value computed
  # by the stat, shifting every point half a bin to the right.
  # NOTE(review): geom_freqpoly is expected to pad the line with an empty bin
  # at each end; after the shift the leading padded point should sit at x = 0,
  # which is what makes the line start at (0, 0) -- confirm against the
  # ggplot2 `pad` documentation.
  geom_freqpoly(
    aes(
      x = stage(col, after_stat = x + .5),
      y = after_stat(cumsum(count / sum(count)))
    ),
    binwidth = 1, color = "red",
    boundary = 0
  ) +
  # Zoom to the first ten bins without dropping data from the statistics.
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 1)) +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent
  )

© www.soinside.com 2019 - 2024. All rights reserved.