在 R 中根据条件添加一列,表示共享元素出现的次数。

问题描述 投票:0回答:1

你好,这是一个数据框:

   Groups Names COL1  COL2  COL3        COL4
1      G1   SP1    1 0.400 0.500   Sequence1
2      G1   SP1    1 0.004 0.005   Sequence2
3      G1   SP1    0 0.004 0.005   Sequence3
4      G1   SP2    0 0.400 0.005 Sequence123
5      G1   SP2    0 0.004 0.500  Sequence14
6      G1   SP3    0 0.005 0.006  Sequence15
7      G1   SP5    1 0.400 0.006  Sequence16
8      G1   SP6    1 0.008 0.002  Sequence20
10     G2   Sp1    0 0.004 0.005  Sequence17
11     G2   SP1    0 0.050 0.600  Sequence18
12     G2   SP1    0 0.400 0.600   Sequence3
13     G2   SP2    0 0.004 0.005  Sequence22
14     G2   SP2    0 0.004 0.005  Sequence23
15     G2   SP5    0 0.004 0.005  Sequence16
16     G2   SP6    0 0.003 0.002  Sequence21
17     G2   SP7    0 0.560 0.760  Sequence67
18     G3   SP5    0 0.87  0.767  Sequence16

我想添加一个新列 COL5:对于每个 Groups 中的每个 Names,如果它的某个序列(COL4)也出现在其他组的同一 Names 下,就填入共享该序列的组数,否则填 0。例如,我们看一下 G1。

SP1 的 Sequence3 同时出现在 G1 和 G2 中,所以我在第 3 行和第 12 行填入 2。SP5 同理:它的 Sequence16 同时出现在 G1、G2 和 G3 中(这里的重复次数为 3)。

因此,对于整个数据框,我应该得到:

   Groups Names COL1  COL2  COL3        COL4 COL5
1      G1   SP1    1 0.400 0.500   Sequence1 0
2      G1   SP1    1 0.004 0.005   Sequence2 0
3      G1   SP1    0 0.004 0.005   Sequence3 2
4      G1   SP2    0 0.400 0.005 Sequence123 0
5      G1   SP2    0 0.004 0.500  Sequence14 0
6      G1   SP3    0 0.005 0.006  Sequence15 0
7      G1   SP5    1 0.400 0.006  Sequence16 3
8      G1   SP6    1 0.008 0.002  Sequence20 0
10     G2   Sp1    0 0.004 0.005  Sequence17 0
11     G2   SP1    0 0.050 0.600  Sequence18 0
12     G2   SP1    0 0.400 0.600   Sequence3 2
13     G2   SP2    0 0.004 0.005  Sequence22 0
14     G2   SP2    0 0.004 0.005  Sequence23 0
15     G2   SP5    0 0.004 0.005  Sequence16 3
16     G2   SP6    0 0.003 0.002  Sequence21 0
17     G2   SP7    0 0.560 0.760  Sequence67 0
18     G3   SP5    0 0.87  0.767  Sequence16 3

这里是 dput:

# Reproducible copy of the example data (repaired output of dput(test_df)).
# The hand-edited original listed only 16 COL4 codes for 17 rows and repeated
# the "SP5" and "Sequence16" factor levels, so it could not be rebuilt as-is.
# Row "18" (G3 / SP5) now carries COL4 code 5L, i.e. "Sequence16".
test_df <- structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("G1", "G2", "G3"), class = "factor"), 
    Names = structure(c(2L, 2L, 2L, 3L, 3L, 4L, 5L, 6L, 1L, 2L, 
    2L, 3L, 3L, 5L, 6L, 7L, 5L), .Label = c("Sp1", "SP1", "SP2", 
    "SP3", "SP5", "SP6", "SP7"), class = "factor"), COL1 = c(1L, 
    1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L), COL2 = c(0.4, 0.004, 0.004, 0.4, 0.004, 0.005, 0.4, 0.008, 
    0.004, 0.05, 0.4, 0.004, 0.004, 0.004, 0.003, 0.56, 0.87), COL3 = c(0.5, 
    0.005, 0.005, 0.005, 0.5, 0.006, 0.006, 0.002, 0.005, 0.6, 
    0.6, 0.005, 0.005, 0.005, 0.002, 0.76, 0.767), COL4 = structure(c(1L, 
    8L, 13L, 2L, 3L, 4L, 5L, 9L, 6L, 7L, 13L, 11L, 12L, 5L, 10L, 
    14L, 5L), .Label = c("Sequence1", "Sequence123", "Sequence14", 
    "Sequence15", "Sequence16", "Sequence17", "Sequence18", "Sequence2", 
    "Sequence20", "Sequence21", "Sequence22", "Sequence23", "Sequence3", 
    "Sequence67"), class = "factor")), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "10", "11", "12", "13", "14", 
"15", "16", "17", "18"))
r dataframe dplyr
1个回答
2
投票

我们可以对每个 COL4 值(并按 Names 分组)计算不同 Groups 的数量:如果该数量大于 1,就赋值为该数量,否则赋值为 0。

library(dplyr)
# For every (COL4, Names) pair, count the distinct Groups it occurs in.
# COL5 holds that count when the pair is shared by more than one group and
# 0 otherwise. ungroup() drops the grouping so later verbs on the result
# are not silently applied per (COL4, Names) group.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)
    if (n_groups > 1L) n_groups else 0
  }) %>%
  ungroup()


#   Groups Names  COL1  COL2  COL3 COL4         COL5
#   <chr>  <chr> <int> <dbl> <dbl> <chr>       <dbl>
# 1 G1     SP1       1 0.4   0.5   Sequence1       0
# 2 G1     SP1       1 0.004 0.005 Sequence2       0
# 3 G1     SP1       0 0.004 0.005 Sequence3       2
# 4 G1     SP2       0 0.4   0.005 Sequence123     0
# 5 G1     SP2       0 0.004 0.5   Sequence14      0
# 6 G1     SP3       0 0.005 0.006 Sequence15      0
# 7 G1     SP5       1 0.4   0.006 Sequence16      3
# 8 G1     SP6       1 0.008 0.002 Sequence20      0
# 9 G2     Sp1       0 0.004 0.005 Sequence17      0
#10 G2     SP1       0 0.05  0.6   Sequence18      0
#11 G2     SP1       0 0.4   0.6   Sequence3       2
#12 G2     SP2       0 0.004 0.005 Sequence22      0
#13 G2     SP2       0 0.004 0.005 Sequence23      0
#14 G2     SP5       0 0.004 0.005 Sequence16      3
#15 G2     SP6       0 0.003 0.002 Sequence21      0
#16 G2     SP7       0 0.56  0.76  Sequence67      0
#17 G3     SP5       0 0.87  0.767 Sequence16      3

或者在 data.table :

library(data.table)
# Convert test_df to a data.table by reference, then add COL5 by reference:
# the number of distinct Groups per (COL4, Names) pair when that number
# exceeds 1, and 0 otherwise.
setDT(test_df)
test_df[, COL5 := {
  n_groups <- uniqueN(Groups)
  if (n_groups > 1) n_groups else 0
}, by = .(COL4, Names)]

以及使用 base R:

# Base R version: ave() broadcasts, to every row, the number of distinct
# Groups found for that row's (COL4, Names) pair. Pairs seen in only one
# group (count of 1) are then reset to 0.
test_df$COL5 <- with(test_df, {
  n_groups <- as.integer(
    ave(as.character(Groups), COL4, Names,
        FUN = function(g) length(unique(g)))
  )
  replace(n_groups, n_groups == 1, 0)
})

0
投票

我们还可以做到

library(dplyr)
# Same result via arithmetic: multiplying the distinct-Groups count by the
# logical (count > 1) zeroes out pairs that occur in a single group.
# The count is computed once per group, and ungroup() drops the grouping so
# later verbs on the result are not applied per (COL4, Names) group.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)
    n_groups * (n_groups > 1L)
  }) %>%
  ungroup()

或与 data.table

library(data.table)
# Convert test_df to a data.table by reference, then add COL5 by reference:
# the distinct-Groups count per (COL4, Names) pair, multiplied by the
# logical (count > 1) so that single-group pairs become 0.
setDT(test_df)
test_df[, COL5 := {
  n_groups <- uniqueN(Groups)
  n_groups * (n_groups > 1)
}, by = .(COL4, Names)]
© www.soinside.com 2019 - 2024. All rights reserved.