如果 col1 中的值在 col2 中且 col2 中的值在 col1 中,则删除行

问题描述 投票:0回答:4

我使用的是基础 R(base R,请不要使用 tidyverse)。我在 R 中有以下数据框:

> geneA  
[1] "GNG7"  "GNG7"  "GNG7"  "GNG12" "GNG12" "GNG12" "GNG2"  "GNG2"  "GNG2"  
[10] "GNG5"  "GNG5"  "GNG5" 

> geneB  
 [1] "GNG12" "GNG5"  "GNG2"  "GNG7"  "GNG5"  "GNG2"  "GNG5"  "GNG12" "GNG7"  
[10] "GNG12" "GNG7"  "GNG2" 

(经过一些数据整理(data wrangling),创建了额外的列 GENE_PAIR)

> GNGdata
    geneA geneB  GENE_PAIR  
1  GNG7 GNG12 GNG7;GNG12  
2  GNG7  GNG5  GNG7;GNG5  
3  GNG7  GNG2  GNG7;GNG2  
4 GNG12  GNG7 GNG12;GNG7  
5 GNG12  GNG5 GNG12;GNG5  
6 GNG12  GNG2 GNG12;GNG2  
7  GNG2  GNG5  GNG2;GNG5  
8  GNG2 GNG12 GNG2;GNG12  
9  GNG2  GNG7  GNG2;GNG7  
10  GNG5 GNG12 GNG5;GNG12  
11  GNG5  GNG7  GNG5;GNG7  
12  GNG5  GNG2  GNG5;GNG2

如您所见,有重复的 GENE_PAIR(第 1 行和第 4 行、第 2 行和第 11 行等)。对于每组重复,我只想保留其中一行。例如,GNG7;GNG12 这一对已经存在,所以我想从新数据框中排除 GNG12;GNG7 这一对。

我期待这样的结果:

> GNGdata_filtered  
    geneA geneB  GENE_PAIR  
1  GNG7 GNG12 GNG7;GNG12  
2  GNG7  GNG5  GNG7;GNG5  
3  GNG7  GNG2  GNG7;GNG2  
4 GNG12  GNG5 GNG12;GNG5  
5 GNG12  GNG2 GNG12;GNG2  
6  GNG2  GNG5  GNG2;GNG5
r dataframe subset
4个回答
3
投票

使用 `pmax` 和 `pmin` 生成排序后的基因对,然后删除重复的行。

# Build an orientation-independent key for every row: the gene that sorts
# last is written first and the gene that sorts first is written second, so
# "A;B" and "B;A" both produce the same key.
GNGdata$geneAB <- with(GNGdata, {
  later_gene <- pmax(geneA, geneB)
  earlier_gene <- pmin(geneA, geneB)
  paste0(later_gene, ";", earlier_gene)
})

# duplicated() flags every repeat after the first occurrence of a key, so
# negating it keeps exactly one row per unordered pair.
GNGdata_filtered <- GNGdata[!duplicated(GNGdata[["geneAB"]]), ]

  geneA geneB  GENE_PAIR     geneAB
1  GNG7 GNG12 GNG7;GNG12 GNG7;GNG12
2  GNG7  GNG5  GNG7;GNG5  GNG7;GNG5
3  GNG7  GNG2  GNG7;GNG2  GNG7;GNG2
5 GNG12  GNG5 GNG12;GNG5 GNG5;GNG12
6 GNG12  GNG2 GNG12;GNG2 GNG2;GNG12
7  GNG2  GNG5  GNG2;GNG5  GNG5;GNG2

1
投票

类似于 @benson23 的答案,但在 `paste()` 函数中使用 `sort()`:

# Example data: every gene pair appears twice, once in each orientation.
dat <- tibble::tribble(
  ~geneA,  ~geneB,
  "GNG7",  "GNG12",
  "GNG7",  "GNG5",
  "GNG7",  "GNG2",
  "GNG12", "GNG7",
  "GNG12", "GNG5",
  "GNG12", "GNG2",
  "GNG2",  "GNG5",
  "GNG2",  "GNG12",
  "GNG2",  "GNG7",
  "GNG5",  "GNG12",
  "GNG5",  "GNG7",
  "GNG5",  "GNG2"
)

# Row by row, sort the two gene names and glue them together with ";".
# Sorting first means "A;B" and "B;A" yield the same key.
dat$gene_pair <- apply(
  dat[c("geneA", "geneB")],
  MARGIN = 1,
  FUN = function(genes) {
    paste(sort(genes), collapse = ";")
  }
)

# Keep only the first row for each key, then print the result.
dat <- dat[!duplicated(dat$gene_pair), ]
dat
#> # A tibble: 6 × 3
#>   geneA geneB gene_pair 
#>   <chr> <chr> <chr>     
#> 1 GNG7  GNG12 GNG12;GNG7
#> 2 GNG7  GNG5  GNG5;GNG7 
#> 3 GNG7  GNG2  GNG2;GNG7 
#> 4 GNG12 GNG5  GNG12;GNG5
#> 5 GNG12 GNG2  GNG12;GNG2
#> 6 GNG2  GNG5  GNG2;GNG5

由 reprex v2.0.2 创建于 2023-05-11


1
投票

您可以先用 `gtools::mixedsort` 对每个基因对内部排序,再创建一个标记重复基因对的索引(`idx`),然后用该索引对数据框取子集,只保留不重复的行:

# Split every "A;B" string into its individual gene names.
pair_genes <- strsplit(df$GENE_PAIR, ";")

# Re-join each pair with its genes in gtools::mixedsort() order, so that
# "A;B" and "B;A" map to the same canonical string.
canonical_pairs <- lapply(pair_genes, function(genes) {
  paste(gtools::mixedsort(genes), collapse = ";")
})

# TRUE for every row whose canonical pair already appeared in an earlier row.
idx <- duplicated(unlist(canonical_pairs))

# Keep the first occurrence of each pair.
df[!idx, ]

#  geneA geneB  GENE_PAIR
# 1  GNG7 GNG12 GNG7;GNG12
# 2  GNG7  GNG5  GNG7;GNG5
# 3  GNG7  GNG2  GNG7;GNG2
# 5 GNG12  GNG5 GNG12;GNG5
# 6 GNG12  GNG2 GNG12;GNG2
# 7  GNG2  GNG5  GNG2;GNG5

0
投票

使用 base R,我们可以像下面这样尝试 `order` + `unique`:

# Collapse reversed gene pairs (A;B vs. B;A) into a single row, base R only.
# Reading from the innermost call outwards:
#   1. c(t(df)) flattens df row by row, and c(t(row(df))) gives the matching
#      row index of every flattened value.
#   2. order(row index, value) sorts the values within each original row while
#      keeping the rows themselves in their original sequence.
#   3. matrix(..., ncol(df)) receives ncol(df) as its second positional
#      argument, i.e. the number of ROWS, so each column holds one sorted
#      pair; t() turns those columns back into rows.
#   4. unique() drops repeated rows; after sorting, A;B and B;A are identical.
#   5. as.data.frame() + setNames() restore the column names of df, and
#      transform() adds GENE_PAIR built from the sorted values.
# Every column of df takes part in the sort, so df is expected to hold only
# the two gene columns (geneA, geneB), which the last step refers to by name.
# Because each pair is sorted, geneA ends up holding the value that sorts
# first, so the original left/right orientation of a pair is not preserved.
transform(
    setNames(
        as.data.frame(
            unique(t(matrix(
                c(t(df))[order(c(t(row(df))), c(t(df)))],
                ncol(df)
            )))
        ),
        names(df)
    ),
    GENE_PAIR = paste0(geneA, ";", geneB)
)

这给

  geneA geneB  GENE_PAIR
1 GNG12  GNG7 GNG12;GNG7
2  GNG5  GNG7  GNG5;GNG7
3  GNG2  GNG7  GNG2;GNG7
4 GNG12  GNG5 GNG12;GNG5
5 GNG12  GNG2 GNG12;GNG2
6  GNG2  GNG5  GNG2;GNG5

如果你不介意使用 base R 之外的其他包,我们可以尝试使用 `igraph`,如下所示:

library(igraph)

# Treat every row of df as an undirected edge between two genes. In an
# undirected graph A-B and B-A are the same edge, so simplify() (which by
# default removes multiple edges and self-loops) leaves one edge per pair.
# NOTE(review): a row with geneA == geneB is a self-loop and would be dropped
# by simplify() as well -- confirm that is acceptable for your data.
# NOTE(review): newer igraph releases prefer as_data_frame() over
# get.data.frame(); the latter is kept here to match the original answer.
#
# The original snippet called set_names() and mutate(), which are not provided
# by igraph and fail unless magrittr/purrr and dplyr are also attached. Base
# setNames() and a plain column assignment do the same job with no extra
# packages (the %>% pipe itself is re-exported by igraph).
edges <- df %>%
  graph_from_data_frame(directed = FALSE) %>%
  simplify() %>%
  get.data.frame() %>%
  setNames(names(df))

# Paste all columns of each row together with ";" to rebuild GENE_PAIR.
edges$GENE_PAIR <- do.call(paste, c(edges, sep = ";"))
edges

这给

  geneA geneB  GENE_PAIR
1  GNG7 GNG12 GNG7;GNG12
2  GNG7  GNG2  GNG7;GNG2
3  GNG7  GNG5  GNG7;GNG5
4 GNG12  GNG2 GNG12;GNG2
5 GNG12  GNG5 GNG12;GNG5
6  GNG2  GNG5  GNG2;GNG5

数据

> dput(df)
structure(list(geneA = c("GNG7", "GNG7", "GNG7", "GNG12", "GNG12",
"GNG12", "GNG2", "GNG2", "GNG2", "GNG5", "GNG5", "GNG5"), geneB = c("GNG12",
"GNG5", "GNG2", "GNG7", "GNG5", "GNG2", "GNG5", "GNG12", "GNG7",
"GNG12", "GNG7", "GNG2")), row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12"), class = "data.frame")
© www.soinside.com 2019 - 2024. All rights reserved.