我想在直方图的 x 轴上设置宽限制,以便与同一文档中以前的绘图兼容。我认为我应该能够在
scale_x_continuous()
中设置限制,因为我想要的限制比数据范围更宽。然而,这样做时,即使把上限设得高得离谱,仍然会有数据被剔除(censor),并伴随一条关于“缺失值”(而不是“超出限制”)的警告消息。我可以用 coord_cartesian(xlim = c(-3, 5))
解决我的问题以获得我想要的东西,但我想了解我所看到的使用 scale_x_continuous(limits = c(-3, 5))
是否正在做它应该做的事情。我认为警告消息可以改进,并且 scale_x_continuous()
中关于 oob 的帮助表明 oob 可能已经为我解决了问题,但似乎不允许我使用 oob 参数!
下面是一个我认为能够完整说明问题的可复现示例(reprex)。
library(tidyverse)
# Example data for the reproducible example: a tibble with 80 rows (see
# row.names) holding a numeric `scores` column and a character `grp` column
# whose values are "NHS" (listed first) and "HS". It is written as a
# structure() literal so the example needs no external file.
tibScores <- structure(list(scores = c(1.3221868551862, 0.641571901750246,
0.679997504675969, 1.69269563506049, 0.0863798463356552, 1.17140778928003,
0.575259325846973, 1.76102607380206, 0.816982263242756, 1.36699542376874,
0.652688919189357, -0.0556963218639883, 0.447405816980878, 1.66074084421999,
2.1616205500135, 0.536966252715301, 0.863735587473018, -0.0838092754807627,
1.44139911944675, 0.28123124435063, 0.726886572797169, 1.28139850105958,
-1.06200317504591, 1.33947497571541, 1.25321267994842, 2.26719146654814,
0.0391905158098528, -0.325762556819256, 1.32308861983067, 0.67539246225335,
3.15411661921818, 2.06629001625931, 0.565898473951559, 1.53933053768839,
0.391723502639786, 8.20432000704363e-05, 2.06594609314108, 1.30682345061009,
0.605298461881756, 1.58497799066082, 1.68990269481407, 0.476040346123826,
0.844876986409903, -0.164609368672023, 2.20658953135608, 0.998615148039355,
1.15569174851264, 2.02756244449021, 2.47305476162932, 0.728964653094446,
1.48997692828698, 4.38774920562831, 2.00346155661968, 1.49193102247697,
2.28013856141223, 2.76792989289636, 3.26522037161048, 1.7030572330355,
1.58105144823292, 2.54872692975434, 2.91883589732548, 2.01003029310646,
2.6215879604812, 1.9739800810684, 2.35983308929714, 1.65115537149953,
0.895557980659816, 2.67904664720373, 1.61291641056518, 1.80793828366117,
3.14825826246364, 2.66897888275127, 1.49863234127131, 0.37011916381626,
2.67629185248689, 3.39374742622388, 2.52608956918892, 0.336324241851188,
1.87099745872943, 4.10493845361578),
# Group label for each score, in the same row order as `scores`.
grp = c("NHS", "NHS", "NHS",
"NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS",
"NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS",
"NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS",
"NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS",
"NHS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS",
"HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS",
"HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS",
"HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS")),
# Compact row-name form for 80 automatic row names; the class vector makes
# the object a tibble.
row.names = c(NA, -80L),
class = c("tbl_df", "tbl", "data.frame"))
# Baseline plot: overlaid density histograms of `scores`, one per `grp`,
# using ggplot2's default x-axis limits.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  )
### For comparison with a preceding plot the x axis should run from -3 to 5,
### so the first attempt asks for a break at every integer in that range.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  ) +
  scale_x_continuous(name = "Scores", breaks = -3:5)
### Breaks alone do not set the axis limits, so the limits are now given
### explicitly on the scale as well.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  ) +
  scale_x_continuous(
    name = "Scores",
    limits = c(-3, 5),
    breaks = -3:5
  )
### but now:
# Warning message:
# Removed 4 rows containing missing values (`geom_bar()`).
### Why four points censored out? There are no missing values so that message is misleading
### and it must be about the limits.
### so I sleuth around and find https://stackoverflow.com/questions/32505298/explain-ggplot2-warning-removed-k-rows-containing-missing-values
### and this gets me what I want but I still don't understand the censoring when I use scale_x_continuous with limits.
### Workaround: put the limits on the coordinate system instead of the scale.
### The author reports that this produces the desired plot.
### (A stray, unbalanced `limits = c(-3, 5))` fragment that followed this call
### was removed: it was not part of any expression and made the script
### unparseable.)
ggplot(data = tibScores,
       aes(x = scores, fill = grp)) +
  geom_histogram(aes(y = after_stat(density)),
                 position = "identity",
                 colour = "black",
                 alpha = .8,
                 bins = 30) +
  scale_x_continuous("Scores",
                     breaks = -3:5) +
  coord_cartesian(xlim = c(-3, 5))
# Starting point again: density histograms of `scores` by `grp`, drawn on top
# of each other with the default x-axis limits.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  )
### The x axis should span -3 to 5 to match a preceding plot; try setting
### integer breaks over that range.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  ) +
  scale_x_continuous(name = "Scores", breaks = -3:5)
### Breaks do not determine the limits, so add explicit limits to the scale.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  ) +
  scale_x_continuous(
    name = "Scores",
    limits = c(-3, 5),
    breaks = -3:5
  )
### but that gets me:
# Warning message:
# Removed 4 rows containing missing values (`geom_bar()`).
### pushing the limits up to c(-3, 8) still gets the censoring
### there are no missing values so to me the warning is misleading
### and I really can't understand the behaviour
### I also discover oob (which is in the scale_x_continuous help, I'd just never used it)
# Attempt to keep out-of-bounds values via the scale's `oob` argument.
# NOTE: with tidyverse attached, the bare name `keep` resolves to
# purrr::keep rather than scales::oob_keep. It is intentionally left
# unchanged here because the error it triggers is what this example
# demonstrates.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  ) +
  scale_x_continuous(
    name = "Scores",
    limits = c(-3, 5),
    breaks = -3:5,
    oob = keep
  )
### gives me:
# Error in `oob()`:
# ℹ In index: 1.
# Caused by error:
# ! `.p()` must return a single `TRUE` or `FALSE`, not NULL.
# Run `rlang::last_trace()` to see where the error occurred.
### and I'm sorry, but the help and its links there don't help me!
### but I do find coord_cartesian(limits = ...)
# Same histogram, with the x range set on the coordinate system via
# coord_cartesian() rather than through the scale's `limits`.
ggplot(tibScores, aes(scores, fill = grp)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    position = "identity",
    alpha = 0.8,
    colour = "black"
  ) +
  scale_x_continuous(name = "Scores", breaks = -3:5) +
  coord_cartesian(xlim = c(-3, 5))
### and that works but I think I am still baffled by
我希望有人能为我澄清这些问题。先谢谢了(TIA),Chris
首先,使用
oob = keep
的问题是它应该是oob = scales::oob_keep
。您正在将 purrr::keep
传递给 oob=
参数。因此你会得到一个错误。
其次,通过
coord_cartesian
设置限制时的区别在于:它不会更改数据,只是把视图缩放到所需范围;而通过标度(scale)设置限制则会丢弃数据。
第三,我同意文档和/或警告可能会得到改进。因此,请随意提出改进作为功能请求或......。
一般问题是,对于最外面的条形,
xmin/xmax
值超出了数据范围,即最外面的箱以数据的最大/最小值为中心。虽然默认限制考虑了这一点,但在设置限制时常常会忘记这一事实。如果是直方图,最外面的条会被丢弃。
通过一个更简单的示例(下面的代码)可以清楚地看到这一点。左上角显示带有默认限制的直方图,其中我添加了两条垂直线来标出数据范围。这表明最外面的条形超出了数据范围。因此,当把限制固定为数据范围时,这些条形会被删除,如右上角所示。除了通过
coord_cartesian
设置限制之外,还可以使用 oob_keep
来修复此问题,如左下角所示。或者作为另一种选择,您可以考虑使用 boundary=
将条形与分箱(bin)的边界对齐。但请注意,这会改变分箱方式。
关于您的真实数据:这里的问题有点特殊,因为所有数据、甚至直方图最外面的条形显然都落在您设置的限制之内。然而,正如我已经说过的,当您通过标度(scale)设置限制时,您就会改变数据。在您的情况下,并没有任何数据被删除,反而是添加了数据(:( 我之前也不知道这一点),即扩展限制后,一些不可见的零高度条形被添加到数据和图中。即使它们不可见,我之前所说的也同样适用于您的情况:最外面的条形会被丢弃。
为了使其可见,我使用
after_stat
和 after_scale
使不可见的条形可见:
# Make the normally invisible zero-density bins visible so the effect of the
# scale limits can be seen:
#   * y:    bins whose density is 0 are drawn with an arbitrary height of 0.5;
#   * fill: those same bins get an NA fill after scaling, so they are drawn
#           without a fill colour.
# The red vertical lines mark x = -3 and x = 5, the limits used in `pp1`.
# `if_else()` comes from dplyr, so dplyr (e.g. via tidyverse) must be attached.
pp <- ggplot(
  data = tibScores,
  aes(x = scores, fill = grp)
) +
  geom_histogram(
    aes(
      y = after_stat(if_else(density > 0, density, .5)),
      fill = stage(grp, after_scale = if_else(density > 0, fill, NA_character_))
    ),
    position = "identity",
    colour = "black",
    alpha = .8,
    bins = 30
  ) +
  geom_vline(xintercept = c(-3, 5), color = "red")

# Same plot with the limits fixed on the x scale.
pp1 <- pp +
  scale_x_continuous("Scores",
    breaks = -3:5,
    limits = c(-3, 5)
  )

# Stack both versions in one column for comparison. wrap_plots() is
# namespace-qualified because patchwork is not attached before this point.
list(pp, pp1) |>
  patchwork::wrap_plots(ncol = 1)
代码
# Minimal illustration of how fixed scale limits interact with histogram bins.
library(ggplot2)
library(patchwork)

# 100 uniform draws on [0, 1]; the seed makes the example reproducible.
set.seed(1)
dat <- data.frame(x = runif(100))

# Shared base layer: red vertical lines at x = 0 and x = 1.
p <- ggplot(dat, aes(x)) +
  geom_vline(xintercept = c(0, 1), colour = "red")

# Default histogram with default limits.
p0 <- p + geom_histogram() + labs(subtitle = "Default")

# Limits fixed on the scale.
p1 <- p0 +
  scale_x_continuous(limits = c(0, 1)) +
  labs(subtitle = "Fixing the limits will censor\nor drop outermost bars")

# Same limits, with out-of-bounds values kept.
p2 <- p0 +
  scale_x_continuous(limits = c(0, 1), oob = scales::oob_keep) +
  labs(subtitle = "oob_keep will keep them")

# Alternative: set a bin boundary at 0.
p3 <- p +
  geom_histogram(boundary = 0) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(subtitle = "boundary = 0: Do not center the bars")

# Arrange the four variants in one figure.
wrap_plots(list(p0, p1, p2, p3))