在 R 中识别满足含 NA 值的复杂连接条件的记录

问题描述 投票:0回答:1

我有一个包含 3 列(`node1`、`node2`、`cluster`)的数据框,它是用 R 的 `linkcomm` 包中的 `getLinkCommunities` 函数提取出来的。下面是一个可复现的脚本。

# Edge list of link communities: each row is one link between two
# "element=choice" nodes, labelled with the cluster that link belongs to.
# Rows are listed cluster by cluster.
community <- data.frame(
  node1 = c(
    # cluster 1
    "st_sub_main_th=hira", "roo_main=2", "st_con_rt=sub-room",
    # cluster 2
    "roo_main=1", "roo_main=1", "roo_main=1",
    # cluster 3
    "st_sub_main_th=tsuma", "st_sub_main_th=tsuma", "st_sub_main_th=tsuma",
    # cluster 4
    "st_th=hira", "st_th=hira", "roo_main=1",
    # cluster 5
    "st_th=hira", "st_th=hira", "st_con_rt=main-room", "st_con_rt=main-room",
    "st_con_tr=terrace",
    # cluster 6
    "roo_main=4", "roo_main=4", "roo_main=4", "roo_main=4", "st_th=tsuma",
    "st_th=tsuma",
    # cluster 7
    "st_sub_main_th=hira", "st_sub_main_th=hira", "st_sub_main_th=hira",
    "st_sub_main_th=hira", "roo_main=2", "roo_main=2", "roo_main=2",
    "st_con_tr=direct", "st_con_tr=direct"
  ),
  node2 = c(
    # cluster 1
    "st_con_tr=terrace", "st_con_tr=terrace", "st_con_tr=terrace",
    # cluster 2
    "st_con_tr=direct", "st_con_rt=sub-room", "st_adsb=add",
    # cluster 3
    "st_con_rt=sub-room", "st_con_tr=terrace", "st_adsb=add",
    # cluster 4
    "roo_main=1", "st_con_rt=main-room", "st_con_rt=main-room",
    # cluster 5
    "st_con_tr=terrace", "st_adsb=add", "st_con_tr=terrace", "st_adsb=add",
    "st_adsb=add",
    # cluster 6
    "st_th=tsuma", "st_con_rt=main-room", "st_con_tr=terrace", "st_adsb=add",
    "st_con_rt=main-room", "st_adsb=add",
    # cluster 7
    "roo_main=2", "st_con_tr=direct", "st_con_rt=sub-room", "st_adsb=add",
    "st_con_tr=direct", "st_con_rt=sub-room", "st_adsb=add",
    "st_con_rt=sub-room", "st_adsb=add"
  ),
  # Number of links per cluster: 3, 3, 3, 3, 5, 6 and 9 (32 links in total)
  cluster = rep(c(1, 2, 3, 4, 5, 6, 7), times = c(3, 3, 3, 3, 5, 6, 9))
)

另一方面,我有一个原始数据框,用于关联分析。

# Raw records used for the association analysis: 20 rows, one column per
# element. Values are laid out five records per line; NA marks a missing value.
df <- data.frame(
  # Only records 7, 12 and 17 carry a value (0); every other record is NA
  isstilt = replace(rep(NA_real_, 20), c(7, 12, 17), 0),
  st_con_rt = c(
    "sub-room", "main-room", "sub-room", "sub-room", "sub-room",
    "sub-room", NA, "main-room", "main-room", "main-room",
    "main-room", NA, "sub-room", "sub-room", "main-room",
    "sub-room", NA, NA, "main-room", "sub-room"
  ),
  st_con_tr = c(
    "direct", "terrace", "direct", "direct", "direct",
    "direct", NA, "direct", "terrace", "direct",
    "terrace", NA, "terrace", "direct", "terrace",
    "terrace", NA, NA, "direct", "terrace"
  ),
  st_th = c(
    NA, "hira", NA, NA, NA,
    NA, NA, "tsuma", "tsuma", "tsuma",
    "tsuma", NA, NA, NA, "hira",
    NA, NA, NA, "hira", NA
  ),
  st_adsb = c(
    "add", "sub", "sub", "add", "add",
    "sub", NA, "add", "add", "sub",
    "add", NA, "add", "add", "add",
    "add", NA, NA, "add", "add"
  ),
  st_sub_main_th = c(
    "tsuma", NA, "hira", "hira", "hira",
    "other", NA, NA, NA, NA,
    NA, NA, "hira", "hira", NA,
    "tsuma", NA, NA, NA, "hira"
  ),
  st_sub2_main_th = c(
    NA, "hira", "tsuma", NA, NA,
    NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA,
    "hira", NA, NA, NA, NA
  ),
  roo_main = c(
    3, 1, 2, 2, 1,
    7, 1, 2, 4, 4,
    4, 7, 2, 2, 1,
    2, 7, 2, 1, 4
  )
)

`community` 的 `node1` 和 `node2` 两列中,等号左侧是 `df` 的某些列名,等号右侧是该列对应的取值。我想根据这些条件把 `cluster` 信息加入到 `df` 中,但这对我来说似乎很复杂。

我想基本流程可能是这样的:

  1. 将宽格式转换为长格式:把 `node1` 和 `node2` 合并到同一列 `rule` 中。

  2. 将 `rule` 拆分为两列(`element` 和 `choice`)。例如,`st_sub_main_th=hira` 将被拆分为 `st_sub_main_th` 和 `hira`。

  3. 比较两个数据框并把 `cluster` 添加到 `df`:对每个 `cluster`,按 `element` 检查它的所有 `choice`;如果某条记录符合该簇的所有条件,就把簇号添加到 `df`。目前我想不出该如何实现这一步。例如,`cluster=1` 包含以下 6 个条件。

    element         choice
    st_sub_main_th  hira
    st_con_tr       terrace
    roo_main        2
    st_con_tr       terrace
    st_con_rt       sub-room
    st_con_tr       terrace

我需要在 `df` 的相应行中,把 `1` 写入新增的 `cluster` 列。

第一个和第二个操作将这样完成:

## Reshape the edge list into one row per cluster with one column per element,
## holding the choice that cluster requires (NA = element not constrained).
## The long-format steps alone yield cluster/element/choice rows; distinct()
## and pivot_wider() are what produce the per-cluster table.
community <- community %>%
  ## Stack node1/node2 into a single `rule` column; the source column name
  ## is not needed afterwards
  pivot_longer(-cluster, names_to = "node", values_to = "rule") %>%
  select(-node) %>%
  ## e.g. "st_con_tr=terrace" -> element = "st_con_tr", choice = "terrace"
  separate_wider_delim(rule, "=", names = c("element", "choice")) %>%
  ## The same rule occurs on several links of a cluster; keep it once so that
  ## every (cluster, element) cell receives exactly one value
  distinct() %>%
  pivot_wider(names_from = element, values_from = choice) %>%
  ## NOTE(review): every element column is character at this point (including
  ## roo_main) — convert before joining against numeric columns if needed
  data.frame()

现在我得到了一个包含连接条件的表。

> community
  cluster st_sub_main_th st_con_tr roo_main st_con_rt st_adsb st_th
1       1           hira   terrace        2  sub-room    <NA>  <NA>
2       2           <NA>    direct        1  sub-room     add  <NA>
3       3          tsuma   terrace     <NA>  sub-room     add  <NA>
4       4           <NA>      <NA>        1 main-room    <NA>  hira
5       5           <NA>   terrace     <NA> main-room     add  hira
6       6           <NA>   terrace        4 main-room     add tsuma
7       7           hira    direct        2  sub-room     add  <NA>

我根据评论以及

上一篇文章
尝试了superkey方法。

## "Superkey" attempt: paste every column of a row into one string and look
## that string up among keys built the same way from the rule table `com`.
## NOTE(review): `com`, `df_com` and the `building` column are not defined in
## the snippets shown here — presumably the wide rule table and the original
## data with an id column; confirm.
## NOTE(review): paste() renders a missing value as the text "NA", while the
## `com` side rewrites NA to "" first, so the two key formats may never agree
## — confirm whether this is why every row comes back NA.
df %>% select(-building) %>% mutate(cluster = {
    match(
      ## key for each data row: all remaining columns, space-separated
      do.call(paste, .),
      ## keys for each rule row: NA replaced by "", `cluster` itself left out
      com %>% mutate(across(-cluster, ~ if_else(is.na(.x), "", .x))) %>%
        {do.call(paste, select(., -cluster))}
      )
    ## match() yields the position of the first matching rule row, NA if none
    }) %>% mutate(cluster=as.factor(cluster)) %>%
      ## re-attach `building` and move it to the front
      cbind(df_com %>% select(building)) %>% select(building, everything())

我也尝试了更简单的方法。

## Extract columns in data frame
## Collects the name of every column except `cluster`; these names are used
## as the join keys below.
## NOTE(review): `comunity` is presumably a typo for `community`, and `com`
## below presumably refers to the same wide rule table — confirm.
    col <- comunity %>% select(-cluster) %>% colnames()
    
    ## Join
    ## Every rule column is a join key. na_matches = "never" stops NA keys from
    ## matching anything; it does not make the join skip NA-valued columns, so
    ## a rule row with NA in any key column cannot match a record.
    df %>% left_join(com, by=setNames(col, col), na_matches="never")

第一种方法对所有记录都返回 `NA`;后一种方法找到了一些匹配项,但结果不符合预期。问题之一在于,`community` 数据框的某些行中包含 `NA`,但这并不意味着 `df` 中对应的 `element` 也应该是 `NA`。这里的 `NA` 表示在比较时应当忽略该列。

我很感激你的建议。

编辑

我想我可以通过这篇文章解决这个问题。

## For every cluster rule (one row of the wide `community` table), keep only
## the columns that carry a condition and inner-join them against `df`. A
## record is returned once for each cluster whose conditions it meets in full.
## NOTE(review): assumes `df` has a unique `building` id column; the sample
## `df` shown in this post does not define one — confirm.
community %>%
      ## Split into single row data frames; seq_len() stays empty for 0 rows
      split(seq_len(nrow(.))) %>%
      ## Drop the NA ("ignore this element") columns and inner join on the rest
      map(function(rule) {
        conditions <- select(rule, where(function(col) !anyNA(col)))
        ## Same keys a natural join would pick, stated explicitly
        inner_join(df, conditions, by = intersect(names(df), names(conditions)))
      }) %>%
      ## Stack the per-cluster matches into one data frame
      list_rbind() %>%
      ## Select columns
      select(building, cluster) %>%
      ## Right join with original data frame so unmatched records are kept
      right_join(df, by = "building")
r dataframe join
1个回答
0
投票

如果我理解正确,您希望 `community` data.frame 中保存的是需要匹配的特定规则。换句话说,即使 `df` 中的某条记录含有给定簇的 `community` 规则里没有的额外信息,它仍然应该被匹配并连接。

如果这是真的,你可以尝试一些不同的东西。对于

community
数据,删除重复项并添加一个附加列来指示该集群所需的匹配数。

此外,对于您的

df
数据,删除缺失值,并包含一个
row_number()
列来跟踪行并最终重新组合您的宽数据。

然后您可以按 `element` 和 `choice` 连接这两个数据源,再用 `filter` 只保留这样的结果:对于给定的 `cluster` 和行,匹配到的规则数等于该簇所需的规则数。输出中还会包含匹配到的具体取值。

请告诉我这是否达到了预期的结果。

library(tidyverse)

# Rule table in long form: one row per (cluster, element) with the required
# choice, plus `match` = the number of elements that cluster constrains.
com_key <- community %>%
  pivot_longer(cols = -cluster, names_to = "node", values_to = "rule") %>%
  separate(col = rule, into = c("element", "choice"), sep = "=") %>%
  # which of node1/node2 a rule came from is irrelevant
  select(cluster, element, choice) %>%
  group_by(cluster) %>%
  distinct(element, .keep_all = TRUE) %>%
  mutate(match = n())

# Records in long form: one row per non-missing (row number, element, choice).
# Values are cast to character so numeric and text columns can share a column.
df_key <- df %>%
  mutate(rn = row_number()) %>%
  pivot_longer(
    cols = -rn,
    names_to = "element",
    values_to = "choice",
    values_transform = as.character
  ) %>%
  filter(!is.na(choice))

# Keep a (record, cluster) pair only when the record hits as many rules as the
# cluster requires, then spread the matched choices back to one row per pair.
df_key %>%
  left_join(
    com_key,
    by = c("element", "choice"),
    relationship = "many-to-many"
  ) %>%
  # values that belong to no rule have no cluster
  filter(!is.na(cluster)) %>%
  group_by(rn, cluster) %>%
  filter(n() == match) %>%
  pivot_wider(
    id_cols = c(rn, cluster),
    names_from = element,
    values_from = choice
  ) %>%
  arrange(rn)

输出

      rn cluster st_con_rt st_th roo_main st_con_tr st_adsb st_sub_main_th
   <int>   <dbl> <chr>     <chr> <chr>    <chr>     <chr>   <chr>         
 1     2       4 main-room hira  1        NA        NA      NA            
 2     4       7 sub-room  NA    2        direct    add     hira          
 3     5       2 sub-room  NA    1        direct    add     NA            
 4     9       6 main-room tsuma 4        terrace   add     NA            
 5    11       6 main-room tsuma 4        terrace   add     NA            
 6    13       1 sub-room  NA    2        terrace   NA      hira          
 7    14       7 sub-room  NA    2        direct    add     hira          
 8    15       4 main-room hira  1        NA        NA      NA            
 9    15       5 main-room hira  NA       terrace   add     NA            
10    16       3 sub-room  NA    NA       terrace   add     tsuma         
11    19       4 main-room hira  1        NA        NA      NA  
© www.soinside.com 2019 - 2024. All rights reserved.