在 dplyr 中使用 OR 来连接任意一组列

问题描述 投票:0回答:1

在 SQL 中,可以用 OR 条件同时按多个列进行连接。我需要这种连接方式,以便从一张查找表中引入新列,而该查找表中有多个可能用于匹配的标识符列。

例如,通过列 `a` 或 `b` 连接 `x` 和 `y`:

# Left-hand table: every row carries both identifiers (`a` and `b`).
x <- tibble::tibble(
  a        = c(1, 2, 3, 4, 5),
  b        = c(10, 20, 30, 40, 50),
  old_info = paste0("old", 1:5)
)

# Lookup table: each row is identified by `a`, by `b`, or by neither.
y <- tibble::tibble(
  a        = c(1, 2, NA, NA, NA),
  b        = c(NA, NA, NA, 40, 50),
  new_info = paste0("new", 1:5)
)

这将是所需的输出。

#> # A tibble: 5 × 4
#>       a     b old_info new_info
#>   <dbl> <dbl> <chr>    <chr>   
#> 1     1    10 old1     new1    
#> 2     2    20 old2     new2    
#> 3     3    30 old3     NA      
#> 4     4    40 old4     new4    
#> 5     5    50 old5     new5    
r join dplyr
1个回答
0
投票

这满足了我的需求。

连接函数

#' Left join on any one of several key columns (an "OR" join)
#'
#' Joins `y` onto `x` once per column selected by `by`, each time matching on
#' that single column only, and then coalesces the per-key copies of every
#' `cols_y` column into a single column. For each row of `x`, the value kept is
#' the first non-missing one, trying the `by` columns in selection order.
#'
#' @param x Data frame to add columns to.
#' @param y Lookup data frame. It must contain every column selected by `by`.
#' @param by <tidy-select> Key columns, selected from `x`, tried in order.
#' @param cols_y <tidy-select> Columns of `y` to bring into `x`.
#' @param na_matches,relationship Passed unchanged to each
#'   `dplyr::left_join()` call.
#' @return `x` with one added column per `cols_y` column. The intermediate
#'   `<column>_<key>` columns created by the individual joins are removed.
multi_left_join <- function(x, y, by, cols_y,
                            na_matches = "never", relationship = NULL) {

  # Resolve both tidyselect specifications to plain column names once, so the
  # rest of the function can work with strings instead of injected symbols.
  by_names <- names(dplyr::select(x, {{ by }}))
  cols_y_names <- names(dplyr::select(y, {{ cols_y }}))

  # Joins -------------------------------------------------------------------

  # One left join per key column. The incoming columns get a "_<key>" suffix
  # so that the copies produced by different keys do not collide.
  for (key in by_names) {
    lookup <- y |>
      dplyr::select(dplyr::all_of(key), {{ cols_y }}) |>
      dplyr::rename_with(
        \(nm) paste0(nm, "_", key),
        .cols = -dplyr::all_of(key)
      )

    x <- dplyr::left_join(
      x,
      lookup,
      by = key,
      na_matches = na_matches,
      relationship = relationship
    )
  }

  # Coalesce joined columns -------------------------------------------------

  # Each requested column now exists once per key (e.g. new_info_a,
  # new_info_b). Collapse those copies into one unsuffixed column, then drop
  # the suffixed copies explicitly.
  for (col in cols_y_names) {
    versions <- paste0(col, "_", by_names)

    x <- x |>
      dplyr::mutate(!!col := dplyr::coalesce(!!!rlang::syms(versions))) |>
      dplyr::select(-dplyr::all_of(versions))
  }

  x
}

示例

# Left-hand table: every row carries both identifiers (`a` and `b`).
x <- tibble::tibble(
  a        = c(1, 2, 3, 4, 5),
  b        = c(10, 20, 30, 40, 50),
  old_info = paste0("old", 1:5)
)


# Lookup table: each row is identified by `a`, by `b`, or by neither.
y <- tibble::tibble(
  a        = c(1, 2, NA, NA, NA),
  b        = c(NA, NA, NA, 40, 50),
  new_info = paste0("new", 1:5)
)



# Bring `new_info` from y into x, using columns `a` and `b` as the join keys.
multi_left_join(
  x,
  y,
  by = c(a, b),
  cols_y = new_info
)
#> # A tibble: 5 × 4
#>       a     b old_info new_info
#>   <dbl> <dbl> <chr>    <chr>   
#> 1     1    10 old1     new1    
#> 2     2    20 old2     new2    
#> 3     3    30 old3     NA      
#> 4     4    40 old4     new4    
#> 5     5    50 old5     new5    
© www.soinside.com 2019 - 2024. All rights reserved.