
问题描述 投票:0回答:5



  X    Y       
 ---   ---
  5    10
  5    11
  11   12
  11   13
  13   3
  20   18
  17   18
  50   18
  20   21

我现在的方案确实一团糟。它使用带匹配函数的 fuzzy_join:该函数把 x、y 当作向量并在它们之间进行匹配;每次匹配都会生成一个由全部四个匹配数字组成的更大向量,再把它送回 fuzzy_join 继续匹配,如此循环,直到不再有新的匹配项为止。这种做法很快就会失控,在大约 4k 条记录时就不再响应了。整个初始配对表的规模会保持在 10 万条记录以内(< 100k records)。

在 base R 中你可以这样做:
r purrr hierarchy
#' Label every row with all values transitively linked to its first column.
#'
#' Two values are linked when they appear together in a row of `dat`; links
#' are followed repeatedly until the set of reachable values stops growing.
#'
#' @param dat A data frame (or matrix) whose first two columns hold the pairs.
#' @return A character vector with one element per row of `dat`: the sorted,
#'   comma-separated members of the group containing that row's first value.
relation <- function(dat) {
  .relation <- function(x) {
    # Everything paired with a member of `x`, in either column, plus `x` itself.
    k <- unique(sort(c(dat[dat[, 1] %in% x, 2], x, dat[dat[, 2] %in% x, 1])))
    # Stop once another pass adds nothing new; otherwise expand again.
    if (setequal(x, k)) {
      toString(k)
    } else {
      .relation(k)
    }
  }
  # vapply() pins the result to character, including for zero-row input.
  vapply(dat[, 1], .relation, character(1))
}

df$related <- relation(df)
df
#>    X  Y              related
#> 1  5 10 3, 5, 10, 11, 12, 13
#> 2  5 11 3, 5, 10, 11, 12, 13
#> 3 11 12 3, 5, 10, 11, 12, 13
#> 4 11 13 3, 5, 10, 11, 12, 13
#> 5 13  3 3, 5, 10, 11, 12, 13
#> 6 20 18   17, 18, 20, 21, 50
#> 7 17 18   17, 18, 20, 21, 50
#> 8 50 18   17, 18, 20, 21, 50
#> 9 20 21   17, 18, 20, 21, 50


# Treat each (X, Y) pair as an undirected edge (FALSE is the `directed`
# argument) and take the component membership of every vertex; the result is
# named by vertex.
a <- components(graph_from_data_frame(df, FALSE))$membership
# For each component id, join the names of its vertices into one string.
b <- tapply(names(a),a,toString)
# Find each row's component through its X value and attach that label.
df$related <- b[a[as.character(df$X)]]


注意:`sapply(dat[,1], ...)` 会把同一个组计算 4 次而不是 1 次。现在改用:
#' Add a `groups` column listing every value connected to each row.
#'
#' Two values are connected when they share a row of `dat`; connections are
#' followed transitively. Each group is resolved once and then reused for all
#' of its members, instead of being recomputed for every row.
#'
#' @param dat A data frame whose first two columns hold the pairs.
#' @return `dat` with an extra character column `groups` holding the sorted,
#'   comma-separated members of the group each row belongs to.
relation <- function(dat) {
  .relation <- function(x) {
    # Everything paired with a member of `x`, in either column, plus `x` itself.
    k <- unique(c(dat[dat[, 1] %in% x, 2], x, dat[dat[, 2] %in% x, 1]))
    # Stop once another pass adds nothing new; otherwise expand again.
    if (setequal(x, k)) {
      sort(k)
    } else {
      .relation(k)
    }
  }

  # Distinct first-column values that still need a group.
  d <- unique(dat[, 1])
  # Lookup table from value (as a name) to its group label.
  m <- setNames(character(length(d)), d)

  while (length(d) > 0) {
    s <- .relation(d[1])
    # Record the label for every member of the group in one assignment.
    m[as.character(s)] <- toString(s)
    # Members of this group are done; drop them from the work list.
    d <- d[!d %in% s]
  }

  dat$groups <- m[as.character(dat[, 1])]
  dat
}

 # Stack 100 copies of df row-wise to get a larger benchmark input.
 df1 <- do.call(rbind,rep(list(df),100))
 # Time both implementations on df1; unit = "relative" is passed through to
 # microbenchmark.
 # NOTE(review): group_pairs as posted reads columns `x`/`y`, while the
 # question's data uses `X`/`Y` -- confirm the column names match before
 # trusting the group_pairs timing.
 microbenchmark::microbenchmark(relation(df1), group_pairs(df1),unit = "relative")

 microbenchmark::microbenchmark(relation(df1), group_pairs(df1))
Unit: milliseconds
             expr      min        lq       mean    median       uq      max neval
    relation(df1)   1.0909   1.17175   1.499096   1.27145   1.6580   3.2062   100
 group_pairs(df1) 153.3965 173.54265 199.559206 190.62030 213.7964 424.8309   100



# Component membership of every vertex when each (X, Y) pair is an undirected
# edge. components() is used instead of its deprecated alias clusters().
clt <- components(graph_from_data_frame(df, directed = FALSE))$membership
# ave() gives each vertex the comma-separated names of its whole component;
# match() then picks the entry belonging to each row's X value.
within(
  df,
  ID <- ave(names(clt), clt, FUN = toString)[match(as.character(X), names(clt))]
)

   X  Y                   ID
1  5 10 5, 11, 13, 10, 12, 3
2  5 11 5, 11, 13, 10, 12, 3
3 11 12 5, 11, 13, 10, 12, 3
4 11 13 5, 11, 13, 10, 12, 3
5 13  3 5, 11, 13, 10, 12, 3
6 20 18   20, 17, 50, 18, 21
7 17 18   20, 17, 50, 18, 21
8 50 18   20, 17, 50, 18, 21
9 20 21   20, 17, 50, 18, 21




(累积) related_ids。这里
# Example pairs: one row per (X, Y) link between two ids.
df <- data.frame(
  X = c(5, 5, 11, 11, 13, 20, 17, 50, 20, 5, 1, 17),
  Y = c(10, 11, 12, 13, 3, 18, 18, 18, 21, 13, 2, 50)
)


# Build the accumulated set of related ids row by row.
# Rows are ordered by the larger member of each pair, so the running set can
# be carried from one row to the next.
df %>%
  arrange(pmax(X, Y)) %>%
  mutate(id = row_number()) %>%
  rowwise() %>%
  # Start each row with a list element holding just its own pair.
  mutate(related_ids = list(c(X, Y))) %>%
  ungroup() %>%
  # Merge a row into the running set when they overlap; otherwise restart the
  # running set from this row's pair.
  mutate(related_ids = accumulate(related_ids, ~if(any(.y %in% .x)){union(.x, .y)} else {.y})) %>%
  # The chain previously ended in a dangling pipe; as.data.frame() closes it
  # and matches the plain data-frame printout shown below.
  as.data.frame()
#>     X  Y id          related_ids
#> 1   1  2  1                 1, 2
#> 2   5 10  2                5, 10
#> 3   5 11  3            5, 10, 11
#> 4  11 12  4        5, 10, 11, 12
#> 5  11 13  5    5, 10, 11, 12, 13
#> 6  13  3  6 5, 10, 11, 12, 13, 3
#> 7   5 13  7 5, 10, 11, 12, 13, 3
#> 8  17 18  8               17, 18
#> 9  20 18  9           17, 18, 20
#> 10 20 21 10       17, 18, 20, 21
#> 11 50 18 11   17, 18, 20, 21, 50
#> 12 17 50 12   17, 18, 20, 21, 50

由 reprex 包于 2021 年 6 月 1 日创建


这远不如 Onyambu 的 base R 答案优雅,但我做了基准测试,它大约快 4 到 5 倍。其工作原理是:把每一行分配给一个组,并把该行的内容加入该组所有数字的集合;然后查找下一个尚未分配、且至少有一个成员在该集合中的行。一旦不再有匹配的行,就跳到下一个未分配的行。

#' Group rows of linked pairs and label each row with its group's members.
#'
#' Rows are assigned to a group one at a time: a row's two values join the
#' group's running set, and the next row taken is an unassigned one that
#' shares a value with that set. When no such row is left, a new group starts
#' at the first unassigned row.
#'
#' @param df A data frame of pairs. Columns `x` and `y` are used when both
#'   resolve via `$`; otherwise the first two columns are used.
#' @return `df` with a column `ID` holding the sorted, comma-separated members
#'   of the group each row belongs to.
group_pairs <- function(df) {
  stopifnot(is.data.frame(df), ncol(df) >= 2L)

  # Resolve the pair columns once. Falling back to position keeps inputs with
  # other column names (e.g. `X`/`Y`) from silently ending up one row per group.
  x <- df$x
  y <- df$y
  if (is.null(x) || is.null(y)) {
    x <- df[[1L]]
    y <- df[[2L]]
  }

  id <- numeric(nrow(df)) # 0 marks a row that has no group yet
  current_id <- 1
  row <- 1L
  current_set <- numeric()

  while (any(id == 0)) {
    id[row] <- current_id
    current_set <- unique(c(current_set, x[row], y[row]))

    unassigned <- id == 0
    next_rows <- which(unassigned & (x %in% current_set | y %in% current_set))

    if (length(next_rows) > 0L) {
      # Keep growing the current group.
      row <- next_rows[1L]
    } else {
      # Group exhausted: start a new one at the first unassigned row.
      current_id <- current_id + 1
      row <- which(unassigned)[1L]
      current_set <- numeric()
    }
  }

  # Build one label per group from the pair columns only, so extra columns in
  # `df` cannot leak into the label. Group ids are 1..k, so they index `labels`
  # by position.
  members <- split(c(x, y), rep(id, times = 2L))
  labels <- vapply(
    members,
    function(values) paste(sort(unique(values)), collapse = ", "),
    character(1)
  )
  df$ID <- unname(labels[id])
  df
}

#>    x  y                   ID
#> 1  5 10 3, 5, 10, 11, 12, 13
#> 2  5 11 3, 5, 10, 11, 12, 13
#> 3 11 12 3, 5, 10, 11, 12, 13
#> 4 11 13 3, 5, 10, 11, 12, 13
#> 5 13  3 3, 5, 10, 11, 12, 13
#> 6 20 18   17, 18, 20, 21, 50
#> 7 17 18   17, 18, 20, 21, 50
#> 8 50 18   17, 18, 20, 21, 50
#> 9 20 21   17, 18, 20, 21, 50

# Time both implementations on the same small data frame; the table below is
# the printed result.
microbenchmark::microbenchmark(relation(df), group_pairs(df))
#> Unit: milliseconds
#>             expr      min       lq     mean   median       uq      max neval cld
#>     relation(df) 4.535100 5.027551 5.737164 5.829652 6.256301 7.669001   100   b
#>  group_pairs(df) 1.022502 1.159601 1.398604 1.338501 1.458950 8.903800   100  a 


library(dplyr)
library(tidyr) # pivot_longer() below comes from tidyr

# First I created an id column to be able to group the observations with any
# duplicated values.
# NOTE(review): only the three preceding rows (after sorting) are compared, so
# links that sit further apart are presumably not detected -- verify on larger
# data.
df1 <- df %>%
  arrange(X, Y) %>%
  mutate(dup = ifelse(
    (X == lag(X, default = 0) | X == lag(Y, default = 0)) |
      (Y == lag(X, default = 0) | Y == lag(Y, default = 0)) |
      (X == lag(X, n = 2L, default = 0) | Y == lag(Y, n = 2L, default = 0)) |
      (X == lag(Y, n = 2L, default = 0) | Y == lag(X, n = 2L, default = 0)) |
      (X == lag(Y, n = 3L, default = 0) | Y == lag(X, n = 3L, default = 0)) |
      (X == lag(X, n = 3L, default = 0) | Y == lag(Y, n = 3L, default = 0)),
    1, 0
  )) %>%
  # A row that matches none of its predecessors starts a new id.
  mutate(id = cumsum(dup == 0)) %>%
  select(-dup)

df1 %>%
  group_by(id) %>%
  pivot_longer(c(X, Y), names_to = "Name", values_to = "Val") %>%
  arrange(Val) %>%
  # Drop repeated values within a group before building the label.
  mutate(dup = Val == lag(Val, default = 10000)) %>%
  filter(!dup) %>%
  mutate(across(Val, ~ paste(.x, collapse = "-"))) %>%
  select(-dup) %>%
  slice(2:n()) %>%
  select(-Name) %>%
  # Attach the per-group label back onto the original pairs.
  right_join(df1, by = "id") %>%
  group_by(Val, X, Y) %>%
  distinct() %>%
  select(-id) %>%
  relocate(X, Y)

#> # A tibble: 9 x 3
#> # Groups:   Val, X, Y [9]
#>       X     Y Val
#>   <int> <int> <chr>
#> 1     5    10 3-5-10-11-12-13
#> 2     5    11 3-5-10-11-12-13
#> 3    11    12 3-5-10-11-12-13
#> 4    11    13 3-5-10-11-12-13
#> 5    13     3 3-5-10-11-12-13
#> 6    17    18 17-18-20-21-50
#> 7    20    18 17-18-20-21-50
#> 8    20    21 17-18-20-21-50
#> 9    50    18 17-18-20-21-50

我还在 @AnilGoyal 精心构造的数据框上试了一下:
# A tibble: 12 x 3
# Groups:   Val, X, Y [12]
       X     Y Val            
   <dbl> <dbl> <chr>          
 1     1     2 1-2            
 2     5    10 3-5-10-11-12-13
 3     5    11 3-5-10-11-12-13
 4     5    13 3-5-10-11-12-13
 5    11    12 3-5-10-11-12-13
 6    11    13 3-5-10-11-12-13
 7    13     3 3-5-10-11-12-13
 8    17    18 17-18-20-21-50 
 9    17    50 17-18-20-21-50 
10    20    18 17-18-20-21-50 
11    20    21 17-18-20-21-50 
12    50    18 17-18-20-21-50 

