计算散点图中标签的数量

问题描述 投票:0回答:4

我有三个散点图,用于比较三种癌症类型的两个数字连续列。两列的每一行都属于一种癌症类型。

这里是一小部分数据:

# dput()-style dump of the example data: a data.frame with 36 rows, the
# columns cancer_type, Model1, Model2 and stage, and sample IDs as row names.
# The expression is not assigned here; the snippets further down refer to this
# data frame as `data` or `df`, so assign the result before running them.
structure(list(cancer_type = c("Renal Cell Carcinoma", "Melanoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Melanoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Renal Cell Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma"), Model1 = c(0.144175127148628, 
0.145591989159584, 0.0984272509813309, 0.0906868129968643, 0.28544145822525, 
0.138114541769028, 0.091837003827095, 0.0904595032334328, 0.211963757872581, 
0.163982316851616, 0.0935302376747131, 0.127466395497322, 0.117602989077568, 
0.18533518910408, 0.0753359571099281, 0.157020777463913, 0.211388036608696, 
0.0914847329258919, 0.177859485149384, 0.137649402022362, 0.240238919854164, 
0.10163140296936, 0.128856286406517, 0.1811293810606, 0.145569115877151, 
0.108640238642693, 0.157251104712486, 0.141889616847038, 0.0737133473157883, 
0.140953287482262, 0.196182891726494, 0.135421812534332, 0.174105599522591, 
0.0961336940526962, 0.0573264583945274, 0.0880825147032738), 
    Model2 = c(0.314525783061981, 0.343217849731445, 0.315391361713409, 
    0.353350460529327, 0.562197327613831, 0.292534917593002, 
    0.616392850875854, 0.284660279750824, 0.532478809356689, 
    0.341239869594574, 0.35737070441246, 0.31985279917717, 0.619661331176758, 
    0.224026560783386, 0.268743008375168, 0.344117254018784, 
    0.542939126491547, 0.267527014017105, 0.2816281914711, 0.443801760673523, 
    0.552633106708527, 0.387285768985748, 0.186705753207207, 
    0.234086975455284, 0.287418365478516, 0.564366817474365, 
    0.392496168613434, 0.642540812492371, 0.688632488250732, 
    0.430574655532837, 0.360769122838974, 0.58690744638443, 0.510010659694672, 
    0.559859037399292, 0.665197253227234, 0.460800766944885), 
    stage = c("STAGE1", "STAGE1", "STAGE2", "STAGE4", "STAGE4", 
    "STAGE4", "STAGE3", "STAGE3", "STAGE1", "STAGE2", "STAGE2", 
    "STAGE1", "STAGE1", "STAGE3", "STAGE4", "STAGE1", "STAGE1", 
    "STAGE1", "STAGE1", "STAGE3", "STAGE2", "STAGE4", "STAGE4", 
    "STAGE1", "STAGE1", "STAGE1", "STAGE2", "STAGE4", "STAGE3", 
    "STAGE3", "STAGE2", "STAGE1", "STAGE1", "STAGE4", "STAGE3", 
    "STAGE2")), class = "data.frame", row.names = c("04d83340b8bd", 
"122T", "1c2a5ac94492", "1d209304d988", "212T", "24ab7fecc92e", 
"356T", "379fe8924c51", "39T", "3ec4d3fc8bd1", "3f78044299b5", 
"4260f878a482", "430T", "43b757f285d8", "49c4c0e12e32", "55cc6edfad7f", 
"62T", "689be0421d3c", "8237266761ca", "85d99ff60fa1", "9T", 
"a4d25b70d77c", "a74ac0179106", "ac07fd7297c8", "c0f7a7b642cd", 
"SAM3cb94b0d5297", "SAM47fc46c3d6be", "SAM4b0175e8db6e", "SAM4b7ea015fd9e", 
"SAM553c3c35b847", "SAM560f23d6a3ad", "SAM5c139c5c1c4f", "SAM5cc2d9036053", 
"SAM5cfa1699bdb7", "SAM5d989c86255e", "SAM6157c8f38b72"))

正如您在每个图中看到的那样,有一条水平线和一条垂直线。我当然可以随意调整那些线的位置。

共有三种颜色:黄色、蓝色和灰色。我需要每个象限中每种颜色的点数。

例如,在黑色素瘤(Melanoma)图中,右下象限只有 1 个黄点;左上象限有 2 个黄点、1 个蓝点和 1 个灰点。我的真实数据中有非常多的点,这只是一个小例子。

我需要每个图中每个象限的点数。我该怎么做?

这是绘图的代码,您可以根据需要调整它:

#' Faceted scatter plot comparing two model scores per cancer type
#'
#' Draws `1 - <Model_2 column>` on the x axis against `<Model_1 column>` on
#' the y axis, one panel per `cancer_type`, with points filled by `stage` and
#' dashed reference lines at `x` (vertical) and `y` (horizontal).
#'
#' @param data A data frame containing the columns `cancer_type`, `stage` and
#'   the two columns named by `Model_1` and `Model_2`.
#' @param Model_1 Column name (string) of the score shown on the y axis.
#' @param Model_2 Column name (string) of the score shown, as `1 - value`, on
#'   the x axis.
#' @param x Position of the vertical dashed line.
#' @param y Position of the horizontal dashed line.
#' @return A ggplot object.
scatterplot_for_models <- function(data, Model_1, Model_2, x, y) {
  # The previous defaults (`data = data`, `x = x`, ...) referred to themselves
  # and failed with "promise already under evaluation" whenever an argument
  # was omitted, so every argument is now simply required.
  stopifnot(
    is.data.frame(data),
    is.character(Model_1), length(Model_1) == 1L,
    is.character(Model_2), length(Model_2) == 1L,
    all(c(Model_1, Model_2, "cancer_type", "stage") %in% names(data)),
    is.numeric(x), is.numeric(y)
  )

  # `.data[[name]]` looks the column up in the layer data instead of capturing
  # an external vector inside aes().
  ggplot(data, aes(1 - .data[[Model_2]], .data[[Model_1]], fill = stage)) +
    geom_point(size = 4, pch = 21) +
    theme_classic() +
    facet_wrap(. ~ cancer_type) +
    scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
    xlab("Model 2") +
    ylab("Model 1") +
    geom_hline(yintercept = y, linetype = 2) +
    geom_vline(xintercept = x, linetype = 2)
}
r ggplot2 plot scatter-plot
4个回答
3
投票

只要您不尝试在 ggplot 内部完成计数,这就相当简单。先对数据做一些汇总,然后把汇总结果用于绘图。在这种情况下,我认为最直接的方法是先确定各象限标签的绘制坐标,然后计算每个象限中的点数(您需要决定恰好落在分界线上的点,即“并列”的点,应归入哪个象限)。

library(tidyverse)

## Cut-off values that define the dashed lines (adjust as needed).
y_cut <- 0.16
x_cut <- 0.5

## Add the plotted coordinates once, so counting and plotting always agree.
data <- data %>%
  mutate(
    x = 1 - Model2,
    y = Model1
  )

## Quadrants are numbered clockwise from top right (quad1) to top left (quad4).
## Fixing the levels up front keeps a quadrant in the result even when no
## point falls into it.
quad_levels <- paste0("quad", 1:4)

## Label positions: the midpoint between each cut-off and the data range.
label_pos <-
  data %>%
  summarise(
    x_right = mean(c(x_cut, max(x))),
    x_left = mean(c(x_cut, min(x))),
    y_top = mean(c(y_cut, max(y))),
    y_bottom = mean(c(y_cut, min(y)))
  )

## One row per quadrant, keyed by `quad` so the labels can be joined by name
## rather than by row position.
label_coord <- tibble(
  quad = factor(quad_levels, levels = quad_levels),
  x_lab = c(
    label_pos$x_right, label_pos$x_right,
    label_pos$x_left, label_pos$x_left
  ),
  y_lab = c(
    label_pos$y_top, label_pos$y_bottom,
    label_pos$y_bottom, label_pos$y_top
  )
)

## Count the points per cancer type and quadrant.
## Points lying exactly on a cut-off are assigned to the right / top side
## (`>=`); change the comparisons if ties should go the other way.
data_count <- data %>%
  mutate(
    quad = case_when(
      x >= x_cut & y >= y_cut ~ "quad1",
      x >= x_cut & y < y_cut ~ "quad2",
      x < x_cut & y < y_cut ~ "quad3",
      x < x_cut & y >= y_cut ~ "quad4"
    ),
    ## convert both groups to factors so .drop = FALSE keeps empty cells
    cancer_type = as.factor(cancer_type),
    quad = factor(quad, levels = quad_levels)
  ) %>%
  count(cancer_type, quad, .drop = FALSE)

## Attach the label coordinates by quadrant; this works for any number of
## cancer types and does not depend on row order.
data_annot <- data_count %>%
  left_join(label_coord, by = "quad")

## The data transformation is done; the plot only has to draw it.
ggplot(mapping = aes(x, Model1)) +
  geom_point(data = data, aes(fill = stage), size = 4, pch = 21) +
  ## the counts are plain text labels placed at the quadrant centres
  geom_text(
    data = data_annot,
    mapping = aes(x_lab, y_lab, label = paste("n=", n))
  ) +
  facet_wrap(. ~ cancer_type) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  xlab("Model 2") +
  ylab("Model 1") +
  theme_classic() +
  geom_hline(yintercept = y_cut, linetype = 2) +
  geom_vline(xintercept = x_cut, linetype = 2)


2
投票

您可以访问 ggplot 为渲染实际图形而处理过的数据:从 `ggplot_build(p)` 的输出中提取 `data` 即可,其中 `p` 是之前一次 `ggplot` 调用的结果。例如:

## First element of the `data` list returned by ggplot_build(); per the
## ggplot2 documentation this list holds one data frame per layer of `p`.
ggplot_build(p)$data[[1]]

此数据很容易制成表格,例如使用 {dplyr},使用您的示例数据:

library(ggplot2)

## Positions of the dashed reference lines.
x_intercept <- 0.5
y_intercept <- 0.16

## Save the ggplot object as `p` so ggplot_build(p) can inspect it afterwards.
## geom_point() is the first layer, so its data is ggplot_build(p)$data[[1]].
p <- df |>
  ggplot(aes(1 - Model2, Model1, fill = stage)) +
  geom_point(size = 4, pch = 21) +
  theme_classic() +
  facet_wrap(~cancer_type) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  ## x shows 1 - Model2 and y shows Model1; the labels were previously swapped
  xlab("Model 2") +
  ylab("Model 1") +
  geom_hline(yintercept = y_intercept, linetype = 2) +
  geom_vline(xintercept = x_intercept, linetype = 2)
p ## display ggplot

制表ggplot数据:

library(dplyr)

## Tabulate the first data frame built for `p`: one row per combination of
## panel, fill colour and side of each reference line.
count(
  ggplot_build(p)$data[[1]],
  PANEL, ## the facet_wrap panel index (~ cancer_type)
  fill, ## the fill (~ stage)
  x > x_intercept,
  y > y_intercept
)

输出(截断):

+    PANEL    fill x > x_intercept y > y_intercept n
1      1 #56B4E9           FALSE            TRUE 1
2      1 #999999           FALSE           FALSE 1
3      1 #999999           FALSE            TRUE 1
4      1 #E69F00           FALSE           FALSE 1

1
投票

可以用 `case_when()` 定义每个点落入的象限,然后用 `count()` 按 `stage` 和 `cancer_type` 分组计数:
library(dplyr)

## Cut-offs: `x` is measured on the plotted x axis (1 - Model2), `y` on Model1.
x <- 0.5
y <- 0.16

## The plot draws 1 - Model2 on the x axis, so left/right must be decided on
## that value; comparing raw Model2 mirrors the quadrant labels.
## Points exactly on a cut-off go to the right (x) and to the lower (y) side.
df %>%
  mutate(q = case_when(
    1 - Model2 < x & Model1 > y ~ "ul",
    1 - Model2 >= x & Model1 > y ~ "ur",
    1 - Model2 < x & Model1 <= y ~ "ll",
    1 - Model2 >= x & Model1 <= y ~ "lr"
  )) %>%
  count(stage, cancer_type, q)

0
投票

这是另一个使用 `ggrepel` 的建议。注意:`df` 是原始数据框,`df1` 是仅用于添加标签的数据框。

工作原理:

1。创建仅用于标记目的的数据框。请注意,我们必须消除所有重复项!

library(tidyverse)

## Cut-offs on the plotted axes: x is 1 - Model2, y is Model1.
x_cut <- 0.5
y_cut <- 0.16

## Labelling data: one row per cancer type, quadrant and stage. Each row keeps
## the coordinates of the first point of its group (where the label will be
## anchored) plus the number of points in that group.
df1 <- df %>%
  mutate(
    ## the quadrant is decided on the plotted x value so it stays correct
    ## when x_cut is changed away from 0.5
    helper_x = ifelse(1 - Model2 >= x_cut, "a", "b"),
    helper_y = ifelse(Model1 <= y_cut, "a", "b")
  ) %>%
  group_by(cancer_type, helper_x, helper_y) %>%
  arrange(.by_group = TRUE) %>%
  add_count(stage) %>%
  mutate(label = paste(stage, "N=", n, sep = " ")) %>%
  group_by(cancer_type, helper_x, helper_y, stage) %>%
  ## keep a single row per group so every label is drawn exactly once
  slice(1) %>%
  ## drop the grouping so later verbs on df1 are not silently group-wise
  ungroup()

2。绘图并添加标签:

library(tidyverse)
library(ggrepel)

## Scatter plot of all points from `df`, with one repelled label per
## cancer type / quadrant / stage taken from `df1`.
ggplot(df, aes(x = 1 - Model2, y = Model1, color = stage)) +
  ## layers, in drawing order
  geom_point(size = 4, alpha = 0.8) +
  geom_vline(xintercept = 0.5, linetype = "longdash") +
  geom_hline(yintercept = 0.16, linetype = "longdash") +
  geom_label_repel(
    mapping = aes(label = label),
    data = df1,
    box.padding = 0.35,
    point.padding = 0.5,
    segment.color = "grey50",
    show.legend = FALSE
  ) +
  ## panels, colours, axis title and theme
  facet_wrap(. ~ cancer_type) +
  scale_color_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  xlab("Model2") +
  theme_classic()

© www.soinside.com 2019 - 2024. All rights reserved.