我有三个散点图,用于比较三种癌症类型的两个数字连续列。两列的每一行都属于一种癌症类型。
这里是一小部分数据:
structure(list(cancer_type = c("Renal Cell Carcinoma", "Melanoma",
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma",
"Melanoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma",
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma",
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma",
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma",
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Renal Cell Carcinoma",
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma",
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma",
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma",
"Urothelial Carcinoma", "Urothelial Carcinoma"), Model1 = c(0.144175127148628,
0.145591989159584, 0.0984272509813309, 0.0906868129968643, 0.28544145822525,
0.138114541769028, 0.091837003827095, 0.0904595032334328, 0.211963757872581,
0.163982316851616, 0.0935302376747131, 0.127466395497322, 0.117602989077568,
0.18533518910408, 0.0753359571099281, 0.157020777463913, 0.211388036608696,
0.0914847329258919, 0.177859485149384, 0.137649402022362, 0.240238919854164,
0.10163140296936, 0.128856286406517, 0.1811293810606, 0.145569115877151,
0.108640238642693, 0.157251104712486, 0.141889616847038, 0.0737133473157883,
0.140953287482262, 0.196182891726494, 0.135421812534332, 0.174105599522591,
0.0961336940526962, 0.0573264583945274, 0.0880825147032738),
Model2 = c(0.314525783061981, 0.343217849731445, 0.315391361713409,
0.353350460529327, 0.562197327613831, 0.292534917593002,
0.616392850875854, 0.284660279750824, 0.532478809356689,
0.341239869594574, 0.35737070441246, 0.31985279917717, 0.619661331176758,
0.224026560783386, 0.268743008375168, 0.344117254018784,
0.542939126491547, 0.267527014017105, 0.2816281914711, 0.443801760673523,
0.552633106708527, 0.387285768985748, 0.186705753207207,
0.234086975455284, 0.287418365478516, 0.564366817474365,
0.392496168613434, 0.642540812492371, 0.688632488250732,
0.430574655532837, 0.360769122838974, 0.58690744638443, 0.510010659694672,
0.559859037399292, 0.665197253227234, 0.460800766944885),
stage = c("STAGE1", "STAGE1", "STAGE2", "STAGE4", "STAGE4",
"STAGE4", "STAGE3", "STAGE3", "STAGE1", "STAGE2", "STAGE2",
"STAGE1", "STAGE1", "STAGE3", "STAGE4", "STAGE1", "STAGE1",
"STAGE1", "STAGE1", "STAGE3", "STAGE2", "STAGE4", "STAGE4",
"STAGE1", "STAGE1", "STAGE1", "STAGE2", "STAGE4", "STAGE3",
"STAGE3", "STAGE2", "STAGE1", "STAGE1", "STAGE4", "STAGE3",
"STAGE2")), class = "data.frame", row.names = c("04d83340b8bd",
"122T", "1c2a5ac94492", "1d209304d988", "212T", "24ab7fecc92e",
"356T", "379fe8924c51", "39T", "3ec4d3fc8bd1", "3f78044299b5",
"4260f878a482", "430T", "43b757f285d8", "49c4c0e12e32", "55cc6edfad7f",
"62T", "689be0421d3c", "8237266761ca", "85d99ff60fa1", "9T",
"a4d25b70d77c", "a74ac0179106", "ac07fd7297c8", "c0f7a7b642cd",
"SAM3cb94b0d5297", "SAM47fc46c3d6be", "SAM4b0175e8db6e", "SAM4b7ea015fd9e",
"SAM553c3c35b847", "SAM560f23d6a3ad", "SAM5c139c5c1c4f", "SAM5cc2d9036053",
"SAM5cfa1699bdb7", "SAM5d989c86255e", "SAM6157c8f38b72"))
正如您在每个图中看到的那样,有一条水平线和一条垂直线。我当然可以随意调整那些线的位置。
共有三种颜色:黄色、蓝色和灰色。我需要每个象限中每种颜色的点数。
例如,在黑色素瘤(Melanoma)图中,右下象限只有 1 个黄点;左上象限有 2 个黄点、1 个蓝点和 1 个灰点。我的真实数据中有非常多的点,这只是一个小例子。
我需要每个图中每个象限的计数。我该怎么做?
这是绘图的代码,您可以根据需要调整它:
#' Faceted scatter plot comparing two model scores per cancer type
#'
#' Plots `1 - Model_2` on the x axis against `Model_1` on the y axis, fills the
#' points by `stage`, facets by `cancer_type`, and draws one dashed vertical
#' and one dashed horizontal reference line.
#'
#' @param data Data frame holding the two model columns plus the `stage` and
#'   `cancer_type` columns used for fill and faceting.
#' @param Model_1 Name (string) of the column plotted on the y axis.
#' @param Model_2 Name (string) of the column whose complement (`1 - value`)
#'   is plotted on the x axis.
#' @param x Position of the dashed vertical line.
#' @param y Position of the dashed horizontal line.
#' @return A ggplot object.
scatterplot_for_models <- function(data, Model_1, Model_2, x, y) {
  # The former self-referential defaults (`data = data`, ...) could never be
  # used: omitting an argument raised "promise already under evaluation", so
  # every argument is simply required now.
  # `.data[[name]]` resolves the column inside the data ggplot2 is plotting;
  # indexing the outer `data` object from within aes() bypasses data masking.
  ggplot(data, aes(1 - .data[[Model_2]], .data[[Model_1]], fill = stage)) +
    geom_point(size = 4, pch = 21) +
    theme_classic() +
    facet_wrap(. ~ cancer_type) +
    # four stages but only three colours: the last two stages share grey
    scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
    xlab("Model 2") +
    ylab("Model 1") +
    geom_hline(yintercept = y, linetype = 2) +
    geom_vline(xintercept = x, linetype = 2)
}
只要您不尝试在 ggplot 内部完成这件事,就相当简单:事先做好数据汇总,然后把汇总结果用于绘图。在这种情况下,我认为最直接的方法是先定义标签的绘制坐标,然后计算每个象限的点数(您需要决定恰好落在分界线上的点——即“平局”(tie)的点——归入哪个象限)。
library(tidyverse)
## cut-off values for the two dashed reference lines (assumed to be known)
y_cut <- .16
x_cut <- .5
## store the plotted coordinates as plain columns so that the later steps can
## refer to them simply as x and y
data <- mutate(data, x = 1 - Model2, y = Model1)
## Label positions: every label sits halfway between a cut-off line and the
## most extreme data value on that side of the line.
label_pos <- summarise(
  data,
  x_right = mean(c(x_cut, max(x))),
  x_left = mean(c(x_cut, min(x))),
  y_top = mean(c(y_cut, max(y))),
  y_bottom = mean(c(y_cut, min(y)))
)
## One row per quadrant, numbered clockwise from the top right:
## top right, bottom right, bottom left, top left.
label_coord <- with(
  label_pos,
  data.frame(
    x_lab = c(x_right, x_right, x_left, x_left),
    y_lab = c(y_top, y_bottom, y_bottom, y_top)
  )
)
## Count the points per cancer type and quadrant, then attach the label
## coordinates of each quadrant.
## Quadrant ids follow the row order of `label_coord` (clockwise from the top
## right). Fixing the factor levels keeps quadrants without any point in the
## table (n = 0) instead of dropping them.
quad_levels <- paste0("quad", 1:4)
data_count <- data %>%
  mutate(
    ## points lying exactly on a cut-off line go to the upper / right side;
    ## with strict inequalities on both sides they ended up as NA
    quad = case_when(
      x >= x_cut & y >= y_cut ~ "quad1",
      x >= x_cut & y < y_cut ~ "quad2",
      x < x_cut & y < y_cut ~ "quad3",
      x < x_cut & y >= y_cut ~ "quad4"
    ),
    ## convert both grouping columns to factors so that count() can keep
    ## empty combinations
    cancer_type = as.factor(cancer_type),
    quad = factor(quad, levels = quad_levels)
  ) %>%
  ## rows with a missing coordinate belong to no quadrant
  filter(!is.na(quad)) %>%
  count(cancer_type, quad, .drop = FALSE)
## Attach the coordinates by quadrant id rather than by row position, so the
## result does not depend on the number of cancer types or on the row order.
data_annot <- data_count %>%
  left_join(
    mutate(label_coord, quad = factor(quad_levels, levels = quad_levels)),
    by = "quad"
  )
## Plot: the points come from `data`, the per-quadrant counts from
## `data_annot`; all data preparation has already been done beforehand.
ggplot(mapping = aes(x = x, y = Model1)) +
  geom_point(data = data, mapping = aes(fill = stage), shape = 21, size = 4) +
  ## the counts are ordinary text labels placed at the precomputed positions
  geom_text(
    data = data_annot,
    mapping = aes(x = x_lab, y = y_lab, label = paste("n=", n))
  ) +
  geom_hline(yintercept = y_cut, linetype = 2) +
  geom_vline(xintercept = x_cut, linetype = 2) +
  facet_wrap(. ~ cancer_type) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  labs(x = "Model 2", y = "Model 1") +
  theme_classic()
您可以访问 ggplot 为渲染实际图形而处理过的数据:从 ggplot_build(p) 的输出中提取 data 即可,其中 p 是之前一次 ggplot 调用的结果。例子:
## first element of the `data` list of the built plot `p`
## (presumably the computed data of the first layer -- confirm in ggplot2 docs)
ggplot_build(p)$data[[1]]
此数据很容易制成表格,例如使用 {dplyr},使用您的示例数据:
## cut-off positions of the vertical / horizontal dashed lines
x_intercept <- .5
y_intercept <- .16
## build the plot and keep it as `p`, so that the data ggplot computed for it
## can be inspected afterwards
p <- df |>
  ggplot(aes(1 - Model2, Model1, fill = stage)) +
  geom_point(size = 4, pch = 21) +
  theme_classic() +
  facet_wrap(~cancer_type) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  ## the x axis shows 1 - Model2 and the y axis shows Model1; the two labels
  ## were swapped before
  xlab("Model 2") +
  ylab("Model 1") +
  geom_hline(yintercept = y_intercept, linetype = 2) +
  geom_vline(xintercept = x_intercept, linetype = 2)
p ## display the plot
制表ggplot数据:
library(dplyr)
## Tabulate the points of the built plot: one row per combination of facet
## panel, fill colour and side of each cut-off line.
count(
  ggplot_build(p)$data[[1]],
  PANEL, ## index of the facet_wrap panel (~ cancer_type)
  fill, ## fill colour (~ stage)
  x > x_intercept,
  y > y_intercept
)
输出(截断):
+ PANEL fill x > x_intercept y > y_intercept n
1 1 #56B4E9 FALSE TRUE 1
2 1 #999999 FALSE FALSE 1
3 1 #999999 FALSE TRUE 1
4 1 #E69F00 FALSE FALSE 1
可以用 case_when() 定义每个点落入的象限,然后用 count() 按 stage 和 cancer_type 分组计数:
## cut-offs, expressed on the plotted axes (x axis = 1 - Model2, y axis = Model1)
x <- 0.5
y <- 0.16
## classify every point by the quadrant it occupies on the plot
## (u/l = upper/lower, l/r = left/right), then count the points per stage,
## cancer type and quadrant
df %>%
  mutate(q = case_when(
    ## compare 1 - Model2, i.e. what the plot actually shows on the x axis;
    ## comparing the raw Model2 mirrors left and right
    1 - Model2 < x & Model1 > y ~ "ul",
    1 - Model2 >= x & Model1 > y ~ "ur",
    1 - Model2 < x & Model1 <= y ~ "ll",
    1 - Model2 >= x & Model1 <= y ~ "lr"
  )) %>%
  count(stage, cancer_type, q)
这是另一个使用 ggrepel 的建议。注意:df 是原始数据框,df1 是仅用于标注的数据框。
工作原理:
1. 创建仅用于标注的数据框。请注意,我们必须去除所有重复项!
library(tidyverse)
## Label data: one row per cancer type, quadrant and stage. Each row keeps the
## coordinates of the first point of its group (used to place the label) and
## carries the number of points in that group.
df1 <- df %>%
  mutate(
    ## helper_x / helper_y encode on which side of each cut-off a point lies
    helper_x = ifelse(Model2 <= 0.5, "a", "b"),
    helper_y = ifelse(Model1 <= 0.16, "a", "b")
  ) %>%
  ## number of points sharing cancer type, quadrant and stage
  add_count(cancer_type, helper_x, helper_y, stage) %>%
  mutate(label = paste(stage, "N=", n)) %>%
  ## keep a single representative row per group, i.e. drop the duplicates
  group_by(cancer_type, helper_x, helper_y, stage) %>%
  slice(1) %>%
  ## return an ungrouped frame so later verbs are not silently applied per group
  ungroup()
2. 绘图并添加标签:
library(tidyverse)
library(ggrepel)
## Scatter plot of all points from `df`, with one repelled label per cancer
## type, quadrant and stage taken from `df1`.
ggplot(df, aes(x = 1 - Model2, y = Model1, colour = stage)) +
  geom_point(size = 4, alpha = 0.8) +
  geom_vline(xintercept = 0.5, linetype = "longdash") +
  geom_hline(yintercept = 0.16, linetype = "longdash") +
  ## labels inherit x, y and colour from the plot-level mapping
  geom_label_repel(
    data = df1,
    mapping = aes(label = label),
    box.padding = 0.35,
    point.padding = 0.5,
    segment.color = "grey50",
    show.legend = FALSE
  ) +
  facet_wrap(. ~ cancer_type) +
  scale_colour_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  theme_classic() +
  xlab("Model2")