你好,这里是一个数据框(data frame):
Groups Names COL1 COL2 COL3 COL4
1 G1 SP1 1 0.400 0.500 Sequence1
2 G1 SP1 1 0.004 0.005 Sequence2
3 G1 SP1 0 0.004 0.005 Sequence3
4 G1 SP2 0 0.400 0.005 Sequence123
5 G1 SP2 0 0.004 0.500 Sequence14
6 G1 SP3 0 0.005 0.006 Sequence15
7 G1 SP5 1 0.400 0.006 Sequence16
8 G1 SP6 1 0.008 0.002 Sequence20
10 G2 Sp1 0 0.004 0.005 Sequence17
11 G2 SP1 0 0.050 0.600 Sequence18
12 G2 SP1 0 0.400 0.600 Sequence3
13 G2 SP2 0 0.004 0.005 Sequence22
14 G2 SP2 0 0.004 0.005 Sequence23
15 G2 SP5 0 0.004 0.005 Sequence16
16 G2 SP6 0 0.003 0.002 Sequence21
17 G2 SP7 0 0.560 0.760 Sequence67
18 G3 SP5 0 0.87 0.767 Sequence16
我想添加一个新的列 COL5。
规则是:对于每个 Names,如果它的同一个 Sequence(COL4)出现在多个 Groups 中,则每个包含该 Sequence 的组计 1,COL5 填入这些组的总数;如果没有跨组共享,则填 0。
例如,先看 G1 中的 SP1:它的 Sequence3 既出现在 G1 中,也出现在 G2 中,所以我在第 3 行和第 12 行填入 2。
SP5 同理:它的 Sequence16 出现在 G1、G2 和 G3 中(这里的重复次数为 3),所以对应的行填入 3。
因此,对于整个数据框,我应该得到:
Groups Names COL1 COL2 COL3 COL4 COL5
1 G1 SP1 1 0.400 0.500 Sequence1 0
2 G1 SP1 1 0.004 0.005 Sequence2 0
3 G1 SP1 0 0.004 0.005 Sequence3 2
4 G1 SP2 0 0.400 0.005 Sequence123 0
5 G1 SP2 0 0.004 0.500 Sequence14 0
6 G1 SP3 0 0.005 0.006 Sequence15 0
7 G1 SP5 1 0.400 0.006 Sequence16 3
8 G1 SP6 1 0.008 0.002 Sequence20 0
10 G2 Sp1 0 0.004 0.005 Sequence17 0
11 G2 SP1 0 0.050 0.600 Sequence18 0
12 G2 SP1 0 0.400 0.600 Sequence3 2
13 G2 SP2 0 0.004 0.005 Sequence22 0
14 G2 SP2 0 0.004 0.005 Sequence23 0
15 G2 SP5 0 0.004 0.005 Sequence16 3
16 G2 SP6 0 0.003 0.002 Sequence21 0
17 G2 SP7 0 0.560 0.760 Sequence67 0
18 G3 SP5 0 0.87 0.767 Sequence16 3
下面是 dput(test_df) 的输出:
# Output of dput(test_df), assigned back to test_df so the example can be
# pasted and run as-is. Every column holds 17 values (one per row name) and
# each factor's levels are unique, as a valid factor requires.
test_df <- structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("G1", "G2", "G3"), class = "factor"),
Names = structure(c(2L, 2L, 2L, 3L, 3L, 4L, 5L, 6L, 1L, 2L,
2L, 3L, 3L, 5L, 6L, 7L, 5L), .Label = c("Sp1", "SP1", "SP2",
"SP3", "SP5", "SP6", "SP7"), class = "factor"), COL1 = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), COL2 = c(0.4, 0.004, 0.004, 0.4, 0.004, 0.005, 0.4, 0.008,
0.004, 0.05, 0.4, 0.004, 0.004, 0.004, 0.003, 0.56, 0.87), COL3 = c(0.5,
0.005, 0.005, 0.005, 0.5, 0.006, 0.006, 0.002, 0.005, 0.6,
0.6, 0.005, 0.005, 0.005, 0.002, 0.76, 0.767), COL4 = structure(c(1L,
8L, 13L, 2L, 3L, 4L, 5L, 9L, 6L, 7L, 13L, 11L, 12L, 5L, 10L,
14L, 5L), .Label = c("Sequence1", "Sequence123", "Sequence14",
"Sequence15", "Sequence16", "Sequence17", "Sequence18", "Sequence2",
"Sequence20", "Sequence21", "Sequence22", "Sequence23", "Sequence3",
"Sequence67"), class = "factor")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "10", "11", "12", "13", "14",
"15", "16", "17", "18"))
我们可以对每个 COL4 值(并按 Names 分组)计算不同 Groups 的数量;如果该数量大于 1,就把它赋给 COL5,否则赋 0。
library(dplyr)

# For every (COL4, Names) pair, count how many distinct Groups contain it.
# A count of 1 means the sequence is not shared across groups, so COL5 is 0;
# otherwise COL5 is the number of groups sharing it.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)
    if (n_groups > 1) n_groups else 0
  }) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup()
# Groups Names COL1 COL2 COL3 COL4 COL5
# <chr> <chr> <int> <dbl> <dbl> <chr> <dbl>
# 1 G1 SP1 1 0.4 0.5 Sequence1 0
# 2 G1 SP1 1 0.004 0.005 Sequence2 0
# 3 G1 SP1 0 0.004 0.005 Sequence3 2
# 4 G1 SP2 0 0.4 0.005 Sequence123 0
# 5 G1 SP2 0 0.004 0.5 Sequence14 0
# 6 G1 SP3 0 0.005 0.006 Sequence15 0
# 7 G1 SP5 1 0.4 0.006 Sequence16 3
# 8 G1 SP6 1 0.008 0.002 Sequence20 0
# 9 G2 Sp1 0 0.004 0.005 Sequence17 0
#10 G2 SP1 0 0.05 0.6 Sequence18 0
#11 G2 SP1 0 0.4 0.6 Sequence3 2
#12 G2 SP2 0 0.004 0.005 Sequence22 0
#13 G2 SP2 0 0.004 0.005 Sequence23 0
#14 G2 SP5 0 0.004 0.005 Sequence16 3
#15 G2 SP6 0 0.003 0.002 Sequence21 0
#16 G2 SP7 0 0.56 0.76 Sequence67 0
#17 G3 SP5 0 0.87 0.767 Sequence16 3
或者使用 data.table:
library(data.table)

# Add COL5 by reference: for every (COL4, Names) pair, the number of distinct
# Groups when that number exceeds 1, otherwise 0.
# Both branches return an integer (0L rather than 0) so that every group
# yields the same type for the new column.
setDT(test_df)[, COL5 := {
  n_groups <- uniqueN(Groups)
  if (n_groups > 1L) n_groups else 0L
}, by = .(COL4, Names)]
或者使用基础 R(base R):
# Base R version: count the distinct Groups within each (COL4, Names) pair,
# then turn a count of 1 (sequence not shared across groups) into 0.
test_df$COL5 <- with(test_df, {
  group_labels <- as.character(Groups)
  shared <- as.integer(
    ave(group_labels, COL4, Names, FUN = function(g) length(unique(g)))
  )
  replace(shared, shared == 1, 0)
})
我们还可以这样写(用计数乘以逻辑值,省去 if/else):
library(dplyr)

# Same result without if/else: multiplying the count by the logical
# (count > 1) zeroes it out whenever the sequence lives in a single group.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    # Compute the per-group count once and reuse it.
    n_groups <- n_distinct(Groups)
    n_groups * (n_groups > 1)
  }) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup()
或者使用 data.table:
library(data.table)

# Convert to a data.table in place, then add COL5 by reference: the number of
# distinct Groups per (COL4, Names) pair, zeroed when that number is 1.
setDT(test_df)
test_df[, COL5 := {
  n_groups <- uniqueN(Groups)
  n_groups * (n_groups > 1)
}, by = .(COL4, Names)]