通过匹配范围在 R 中查找表

问题描述 投票:0回答:4

我有一张桌子:

CHR POS 
10  4342 
20  100
22  5422

我有另一个查询:

CHR start end Gene
10  4000  5999 ABC1
20  50    200  JHT
22  5000  6000 KLO

期望的输出:

CHR POS 
10  4342  ABC1
20  100   JHT
22  5422  KLO

实际上表 1 中有 700,000 个条目和大约 60000 个基因。我需要在染色体上进行匹配,然后让 POS 位于表 2 的开始和结束之间,以添加一个带有基因名称的新列。

我试过了:

library(dplyr)

# create sample data
df1 <- data.frame(chromosome = c("chr1", "chr1", "chr2", "chr3"), position = c(100, 200, 300, 400))
df2 <- data.frame(chromosome = c("chr1", "chr2", "chr3"), start = c(50, 250, 350), end = c(150, 350, 450), gene = c("geneA", "geneB", "geneC"))

# perform left join
joined_df <- left_join(df1, df2, by = "chromosome")

# create new column indicating if each row lies within a gene
result_df <- joined_df %>%
              mutate(in_gene = if_else(position >= start & position <= end, gene, NA_character_))

# view result
result_df

但是向量太大,无法存储。

r lookup
4个回答
2
投票

使用

dplyr
1.1.0,我们可以使用
join_by
进行非相等连接

library(dplyr)
left_join(df1, df2, by = join_by(CHR, 
    closest(POS >= start), closest(POS <= end))) %>%
   select(-start, -end)

-输出

 CHR  POS Gene
1  10 4342 ABC1
2  20  100  JHT
3  22 5422  KLO

或与

data.table

library(data.table)
 setDT(df1)[df2, Gene := i.Gene, on = .(CHR, POS >= start, POS <= end)]

-输出

> df1
   CHR  POS Gene
1:  10 4342 ABC1
2:  20  100  JHT
3:  22 5422  KLO

数据

df1 <- structure(list(CHR = c(10L, 20L, 22L), POS = c(4342L, 100L, 5422L
)), class = "data.frame", row.names = c(NA, -3L))

df2 <- structure(list(CHR = c(10L, 20L, 22L), start = c(4000L, 50L, 
5000L), end = c(5999L, 200L, 6000L), Gene = c("ABC1", "JHT", 
"KLO")), class = "data.frame", row.names = c(NA, -3L))

2
投票

这里是 left_join 的变体:

library(dplyr)

df1 %>% 
  left_join(df2, by="CHR") %>% 
  filter(between(POS, start, end)) %>% 
  select(-c(start, end))

  CHR  POS Gene
1  10 4342 ABC1
2  20  100  JHT
3  22 5422  KLO

2
投票

你可以使用

GenomicRanges
来做这样的事情。请参阅下面开头注释掉的代码以进行安装。

GRanges
类是基因组位置和相关注释的容器。

函数

makeGRangesFromDataFrame
将采用data.frame作为输入并自动找到描述基因组范围的列(默认为
start
end
stop
)。

下面使用提供的附加示例数据。

# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")

# BiocManager::install("GenomicRanges")

library(GenomicRanges)

gr1 <- GRanges(seqnames = df1$chromosome,
               IRanges(start = df1$position, width = 1))
gr2 <- makeGRangesFromDataFrame(df2, keep.extra.columns = TRUE)

df1$gene <- NA
ovlp <- findOverlaps(gr1, gr2)
df1$gene[queryHits(ovlp)] <- gr2$gene[subjectHits(ovlp)]

df1

输出

  chromosome position  gene
1       chr1      100 geneA
2       chr1      200  <NA>
3       chr2      300 geneB
4       chr3      400 geneC

1
投票

考虑到这些人,

m1
#   CHR  POS
# 1  10 4342
# 2  20  100
# 3  22 5422
# 4  11   10

m2
#   CHR start  end Gene
# 1  10  4000 5999 ABC1
# 2  20    50  200  JHT
# 3  22  5000 6000  KLO
# 4  11  5000 6000  KLO

你可以这样做:

merge(m1, m2) |> {\(.) subset(., data.table::between(.$POS, .$start, .$end))}()
#   CHR  POS start  end Gene
# 1  10 4342  4000 5999 ABC1
# 2  20  100    50  200  JHT
# 3  22 5422  5000 6000  KLO

资料:

m1 <- structure(list(CHR = c(10L, 20L, 22L, 11L), POS = c(4342L, 100L, 
5422L, 10L)), class = "data.frame", row.names = c(NA, -4L))


m2 <- structure(list(CHR = c(10L, 20L, 22L, 11L), start = c(4000L, 
50L, 5000L, 5000L), end = c(5999L, 200L, 6000L, 6000L), Gene = c("ABC1", 
"JHT", "KLO", "KLO")), class = "data.frame", row.names = c(NA, 
-4L))
© www.soinside.com 2019 - 2024. All rights reserved.