我有一个非常大的 csv 文件,想在存在/缺失(presence-absence,0/1)数据中找出独特的基因(即只在一列中出现的基因),并统计它们的数量。我的数据如下:
# Example input: one row per gene (column A) and four 0/1 indicator
# columns (B-E), where 1 marks presence and 0 marks absence.
df <- data.frame(
  A = paste0("G", 1:10),
  B = c(1, 0, 1, 0, 1, 1, 1, 0, 0, 0),
  C = c(1, 0, 1, 0, 0, 0, 0, 1, 1, 0),
  D = c(1, 1, 0, 0, 0, 0, 0, 0, 0, 1),
  E = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
)
期望的输出如下。第一个是只包含独特基因的数据框:
# Expected result: only the genes whose indicator columns (B-E) contain
# exactly one 1.
df_uniq <- data.frame(
  A = paste0("G", 4:10),
  B = c(0, 1, 1, 1, 0, 0, 0),
  C = c(0, 0, 0, 0, 1, 1, 0),
  D = c(0, 0, 0, 0, 0, 0, 1),
  E = c(1, 0, 0, 0, 0, 0, 0)
)
感谢您的帮助,谢谢!
一个 dplyr 解决方案:
library(dplyr)

# Keep the rows whose indicator columns (every column except A) add up to
# exactly 1, i.e. the genes flagged in a single column.
df_uniq <- filter(df, rowSums(pick(-A)) == 1)
#      A B C D E
# 4   G4 0 0 0 1
# 5   G5 1 0 0 0
# 6   G6 1 0 0 0
# 7   G7 1 0 0 0
# 8   G8 0 1 0 0
# 9   G9 0 1 0 0
# 10 G10 0 0 1 0

# Column-wise totals of the retained rows: the number of unique genes
# per indicator column.
summarise(df_uniq, across(-A, sum))
#   B C D E
# 1 3 2 1 1
另一个 dplyr 解决方案:
library(dplyr)

# Keep only the genes present in exactly one column: the indicator columns
# (everything except A) must sum to 1. A bare `if_any(-A)` would instead
# keep every row that has at least one non-zero value. The result is
# assigned so that the count below works on this filtered frame.
df_uniq <- df %>%
  filter(rowSums(pick(-A)) == 1)

# Number of unique genes per indicator column (A, the first column, is
# dropped before summing).
colSums(df_uniq[-1])
# B C D E
# 3 2 1 1