我使用的是基础 R(base R,请不要使用 tidyverse)。我在 R 中有以下数据框:
> geneA
[1] "GNG7" "GNG7" "GNG7" "GNG12" "GNG12" "GNG12" "GNG2" "GNG2" "GNG2"
[10] "GNG5" "GNG5" "GNG5"
> geneB
[1] "GNG12" "GNG5" "GNG2" "GNG7" "GNG5" "GNG2" "GNG5" "GNG12" "GNG7"
[10] "GNG12" "GNG7" "GNG2"
(经过一些数据整理,创建了额外的列 GENE_PAIR)
> GNGdata
geneA geneB GENE_PAIR
1 GNG7 GNG12 GNG7;GNG12
2 GNG7 GNG5 GNG7;GNG5
3 GNG7 GNG2 GNG7;GNG2
4 GNG12 GNG7 GNG12;GNG7
5 GNG12 GNG5 GNG12;GNG5
6 GNG12 GNG2 GNG12;GNG2
7 GNG2 GNG5 GNG2;GNG5
8 GNG2 GNG12 GNG2;GNG12
9 GNG2 GNG7 GNG2;GNG7
10 GNG5 GNG12 GNG5;GNG12
11 GNG5 GNG7 GNG5;GNG7
12 GNG5 GNG2 GNG5;GNG2
如您所见,有重复的 GENE_PAIR(1 和 4、2 和 11 等)。对于每组重复,我只想保留其中一对。例如,GNG7;GNG12 对已经存在,所以我想从新的数据框中排除 GNG12;GNG7 对。
我期待这样的结果:
> GNGdata_filtered
geneA geneB GENE_PAIR
1 GNG7 GNG12 GNG7;GNG12
2 GNG7 GNG5 GNG7;GNG5
3 GNG7 GNG2 GNG7;GNG2
4 GNG12 GNG5 GNG12;GNG5
5 GNG12 GNG2 GNG12;GNG2
6 GNG2 GNG5 GNG2;GNG5
使用 pmax 和 pmin 生成排序后的基因对,然后删除重复的行。
# Build an order-independent key for every row: the larger of the two gene
# names (by string comparison) always goes first, so "A;B" and "B;A" rows end
# up with the same key.
GNGdata$geneAB <- paste0(
  pmax(GNGdata$geneA, GNGdata$geneB),
  ";",
  pmin(GNGdata$geneA, GNGdata$geneB)
)
# Keep only the first row encountered for each key.
GNGdata_filtered <- GNGdata[!duplicated(GNGdata[["geneAB"]]), , drop = FALSE]
geneA geneB GENE_PAIR geneAB
1 GNG7 GNG12 GNG7;GNG12 GNG7;GNG12
2 GNG7 GNG5 GNG7;GNG5 GNG7;GNG5
3 GNG7 GNG2 GNG7;GNG2 GNG7;GNG2
5 GNG12 GNG5 GNG12;GNG5 GNG5;GNG12
6 GNG12 GNG2 GNG12;GNG2 GNG2;GNG12
7 GNG2 GNG5 GNG2;GNG5 GNG5;GNG2
类似于 @benson23 的答案,但在 paste() 函数中使用 sort():
# Example data: one row per ordered (geneA, geneB) combination.
dat <- tibble::tribble(
  ~geneA,  ~geneB,
  "GNG7",  "GNG12",
  "GNG7",  "GNG5",
  "GNG7",  "GNG2",
  "GNG12", "GNG7",
  "GNG12", "GNG5",
  "GNG12", "GNG2",
  "GNG2",  "GNG5",
  "GNG2",  "GNG12",
  "GNG2",  "GNG7",
  "GNG5",  "GNG12",
  "GNG5",  "GNG7",
  "GNG5",  "GNG2"
)

# Order-independent key: sort the two gene names of each row and join them
# with ";", so that A/B and B/A produce the same string.
dat$gene_pair <- mapply(
  function(a, b) paste(sort(c(a, b)), collapse = ";"),
  dat$geneA,
  dat$geneB,
  USE.NAMES = FALSE
)

# Keep only the first row seen for each key, then print the result.
dat <- dat[!duplicated(dat$gene_pair), ]
dat
#> # A tibble: 6 × 3
#> geneA geneB gene_pair
#> <chr> <chr> <chr>
#> 1 GNG7 GNG12 GNG12;GNG7
#> 2 GNG7 GNG5 GNG5;GNG7
#> 3 GNG7 GNG2 GNG2;GNG7
#> 4 GNG12 GNG5 GNG12;GNG5
#> 5 GNG12 GNG2 GNG12;GNG2
#> 6 GNG2 GNG5 GNG2;GNG5
创建于 2023-05-11 与 reprex v2.0.2
您可以先用 mixedsort 对每对基因排序,再创建一个标记重复基因对的索引(idx),然后用它选出数据框中不重复的行:
# Canonicalise each GENE_PAIR string: split it on ";", sort the pieces with
# gtools::mixedsort, and glue them back together with ";". Rows whose
# canonical string has already appeared are flagged TRUE in `idx`.
idx <- duplicated(vapply(
  strsplit(df$GENE_PAIR, ";"),
  function(genes) paste(gtools::mixedsort(genes), collapse = ";"),
  character(1)
))
# Keep only the rows that were not flagged as repeats.
df[!idx, ]
# geneA geneB GENE_PAIR
# 1 GNG7 GNG12 GNG7;GNG12
# 2 GNG7 GNG5 GNG7;GNG5
# 3 GNG7 GNG2 GNG7;GNG2
# 5 GNG12 GNG5 GNG12;GNG5
# 6 GNG12 GNG2 GNG12;GNG2
# 7 GNG2 GNG5 GNG2;GNG5
使用 base R,我们可以像下面这样尝试
order
+ unique
# local() keeps the intermediate objects out of the calling environment while
# still returning (and printing) the final data frame.
local({
  # Flatten the data frame row by row and record which row each value is from.
  values <- c(t(df))
  row_of_value <- c(t(row(df)))

  # Sort the values within each original row, then rebuild one row per pair.
  sorted_values <- values[order(row_of_value, values)]
  sorted_pairs <- t(matrix(sorted_values, nrow = ncol(df)))

  # Drop repeated pairs, restore the column names and add the combined key.
  result <- setNames(as.data.frame(unique(sorted_pairs)), names(df))
  transform(result, GENE_PAIR = paste0(geneA, ";", geneB))
})
结果如下:
geneA geneB GENE_PAIR
1 GNG12 GNG7 GNG12;GNG7
2 GNG5 GNG7 GNG5;GNG7
3 GNG2 GNG7 GNG2;GNG7
4 GNG12 GNG5 GNG12;GNG5
5 GNG12 GNG2 GNG12;GNG2
6 GNG2 GNG5 GNG2;GNG5
如果你不介意使用除base R之外的其他包,我们可以尝试如下
igraph
包
library(igraph)

# Treat every row of `df` as an undirected edge, so that A-B and B-A describe
# the same edge; simplify() then collapses the repeated edges and the edge
# list is converted back into a data frame.
# setNames() and transform() are base R, so igraph (which also supplies %>%)
# is the only package that has to be attached for this snippet to run.
# as_data_frame() is the current name of the deprecated get.data.frame(); it
# is namespace-qualified to avoid clashes with other packages' functions of
# the same name.
# Assumes `df` has exactly the two columns geneA and geneB.
df %>%
  graph_from_data_frame(directed = FALSE) %>%
  simplify() %>%
  igraph::as_data_frame(what = "edges") %>%
  setNames(names(df)) %>%
  transform(GENE_PAIR = paste0(geneA, ";", geneB))
结果如下:
geneA geneB GENE_PAIR
1 GNG7 GNG12 GNG7;GNG12
2 GNG7 GNG2 GNG7;GNG2
3 GNG7 GNG5 GNG7;GNG5
4 GNG12 GNG2 GNG12;GNG2
5 GNG12 GNG5 GNG12;GNG5
6 GNG2 GNG5 GNG2;GNG5
> dput(df)
structure(list(geneA = c("GNG7", "GNG7", "GNG7", "GNG12", "GNG12",
"GNG12", "GNG2", "GNG2", "GNG2", "GNG5", "GNG5", "GNG5"), geneB = c("GNG12",
"GNG5", "GNG2", "GNG7", "GNG5", "GNG2", "GNG5", "GNG12", "GNG7",
"GNG12", "GNG7", "GNG2")), row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12"), class = "data.frame")