我有一个包含 3 列(node1、node2 和 cluster)的数据框,它们是用 R 的 linkcomm 包中的 getLinkCommunities 函数提取的。下面是一个可复现的脚本。
# Edge list of the link communities: each row is one edge between `node1` and
# `node2` together with the cluster number assigned to that edge.
# Every node label has the form "<column name>=<value>".
community <- data.frame(
node1 = c(
"st_sub_main_th=hira", "roo_main=2", "st_con_rt=sub-room", "roo_main=1", "roo_main=1",
"roo_main=1", "st_sub_main_th=tsuma", "st_sub_main_th=tsuma", "st_sub_main_th=tsuma",
"st_th=hira", "st_th=hira", "roo_main=1", "st_th=hira", "st_th=hira", "st_con_rt=main-room",
"st_con_rt=main-room", "st_con_tr=terrace", "roo_main=4", "roo_main=4", "roo_main=4",
"roo_main=4", "st_th=tsuma", "st_th=tsuma", "st_sub_main_th=hira", "st_sub_main_th=hira",
"st_sub_main_th=hira", "st_sub_main_th=hira", "roo_main=2", "roo_main=2", "roo_main=2",
"st_con_tr=direct", "st_con_tr=direct"),
node2 = c(
"st_con_tr=terrace", "st_con_tr=terrace", "st_con_tr=terrace", "st_con_tr=direct",
"st_con_rt=sub-room", "st_adsb=add", "st_con_rt=sub-room", "st_con_tr=terrace",
"st_adsb=add", "roo_main=1", "st_con_rt=main-room", "st_con_rt=main-room",
"st_con_tr=terrace", "st_adsb=add", "st_con_tr=terrace", "st_adsb=add", "st_adsb=add",
"st_th=tsuma", "st_con_rt=main-room", "st_con_tr=terrace", "st_adsb=add",
"st_con_rt=main-room", "st_adsb=add", "roo_main=2", "st_con_tr=direct",
"st_con_rt=sub-room", "st_adsb=add", "st_con_tr=direct", "st_con_rt=sub-room",
"st_adsb=add", "st_con_rt=sub-room", "st_adsb=add"
),
# One cluster id per edge (32 edges, clusters 1 to 7).
cluster = c(
1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
)
)
另一方面,我有一个原始数据框,用于关联分析。
# Original data frame used for the association analysis (20 rows).
# Its column names and values are what the "<column>=<value>" node labels in
# `community` refer to. All columns are character except `isstilt` and
# `roo_main`, which are numeric.
df <- data.frame(
isstilt = c(NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, NA, NA, NA),
st_con_rt = c(
"sub-room", "main-room", "sub-room", "sub-room", "sub-room", "sub-room", NA, "main-room",
"main-room", "main-room", "main-room", NA, "sub-room", "sub-room", "main-room", "sub-room",
NA, NA, "main-room", "sub-room"
),
st_con_tr = c(
"direct", "terrace", "direct", "direct", "direct", "direct", NA, "direct", "terrace", "direct",
"terrace", NA, "terrace", "direct", "terrace", "terrace", NA, NA, "direct", "terrace"
),
st_th = c(NA, "hira", NA, NA, NA, NA, NA, "tsuma", "tsuma", "tsuma", "tsuma", NA, NA, NA, "hira", NA, NA, NA, "hira", NA),
st_adsb = c("add", "sub", "sub", "add", "add", "sub", NA, "add", "add", "sub", "add", NA, "add", "add", "add", "add", NA, NA, "add", "add"),
st_sub_main_th = c("tsuma", NA, "hira", "hira", "hira", "other", NA, NA, NA, NA, NA, NA, "hira", "hira", NA, "tsuma", NA, NA, NA, "hira"),
st_sub2_main_th = c(
NA, "hira", "tsuma", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "hira", NA, NA, NA, NA
),
roo_main = c(3, 1, 2, 2, 1, 7, 1, 2, 4, 4, 4, 7, 2, 2, 1, 2, 7, 2, 1, 4)
)
community 中的 node1 和 node2 在等号左侧是 df 的某些列名,在等号右侧是该列的取值。我想根据这些条件将 cluster 信息添加到 df 中,但这对我来说似乎很复杂。
我想基本流程可能是这样的:
将宽格式转换为长格式:将 node1 和 node2 合并为一列 rule
将
rule
转换(拆分)为两列(element
和 choice
)
例如,st_sub_main_th=hira 将被拆分为 st_sub_main_th
和 hira
。
比较两个数据框并将 cluster 添加到 df:按 element 和 cluster 检查所有 choice,如果 df 的某一行满足该簇的所有条件,则将簇号添加到 df。目前我想不出该如何实现。
例如,cluster=1
包含以下6个条件。
element choice
st_sub_main_th hira
st_con_tr terrace
roo_main 2
st_con_tr terrace
st_con_rt sub-room
st_con_tr terrace
我需要在 df 中新增一列 cluster,并在相应的行中填入 1。
第一个和第二个操作将这样完成:
# Reshape the edge list to one rule per row: stack node1/node2 into a single
# `rule` column, drop the node1/node2 indicator, then split each
# "element=choice" label at the "=" sign.
community <- community %>%
  pivot_longer(cols = -cluster, names_to = "node", values_to = "rule") %>%
  select(-node) %>%
  separate_wider_delim(rule, delim = "=", names = c("element", "choice")) %>%
  data.frame()
现在我得到了一个包含加入条件的表。
> community
cluster st_sub_main_th st_con_tr roo_main st_con_rt st_adsb st_th
1 1 hira terrace 2 sub-room <NA> <NA>
2 2 <NA> direct 1 sub-room add <NA>
3 3 tsuma terrace <NA> sub-room add <NA>
4 4 <NA> <NA> 1 main-room <NA> hira
5 5 <NA> terrace <NA> main-room add hira
6 6 <NA> terrace 4 main-room add tsuma
7 7 hira direct 2 sub-room add <NA>
我根据评论以及
上一篇文章尝试了
superkey
方法。
## "Superkey" attempt: paste the columns of each row into one string and look
## that string up among the equally pasted rows of `com`.
## NOTE(review): `com`, `df_com` and the `building` column are not defined in
## the snippets shown here -- presumably they come from the asker's full script.
df %>% select(-building) %>% mutate(cluster = {
match(
## Key for df: every remaining column pasted together; NA becomes the text "NA".
do.call(paste, .),
## Key for com: NA replaced by "" in every non-cluster column, then pasted.
com %>% mutate(across(-cluster, ~ if_else(is.na(.x), "", .x))) %>%
{do.call(paste, select(., -cluster))}
)
## match() yields the row position within `com`, not the value of com$cluster.
}) %>% mutate(cluster=as.factor(cluster)) %>%
## Re-attach the identifier column and move it to the front.
cbind(df_com %>% select(building)) %>% select(building, everything())
我也尝试了更简单的方法。
## Names of the rule columns: every column of the rule table except `cluster`.
## `com` is assumed to be the wide rule table (one row per cluster); the
## original snippet misspelled it as `comunity`, which is not a defined object.
col <- com %>% select(-cluster) %>% colnames()
## Join df to the rule table on all rule columns at once.
## With na_matches = "never" an NA key never equals anything, so a rule row
## that has NA in any of these columns cannot match a row of df.
df %>% left_join(com, by = col, na_matches = "never")
第一种方法为所有记录返回
NA
,后一种方法找到了一些匹配项,但未按预期工作。
一个问题是,community 数据框在某些行中包含 NA,但这并不意味着 df 中对应的 element 也应该是 NA。这里的 NA 表示在比较过程中应忽略该列。
我很感激你的建议。
编辑
我想我可以通过这篇文章解决这个问题。
## Match every rule (one row of the wide `community` table) against df using
## only the columns for which that rule states a condition.
## NOTE(review): relies on a `building` identifier column in df, which the
## sample df shown earlier does not contain -- confirm against the full data.
community %>%
  ## Split into a list of single-row data frames, one per rule.
  ## seq_len() stays empty for a zero-row table, unlike seq(nrow(.)).
  split(seq_len(nrow(.))) %>%
  ## Per rule: drop the NA columns (NA means "ignore this column"), then
  ## inner-join the remaining conditions onto df.
  map(function(rule) {
    rule <- select(rule, where(function(column) !anyNA(column)))
    inner_join(df, rule, by = setdiff(names(rule), "cluster"))
  }) %>%
  ## Stack the per-rule matches into one data frame.
  list_rbind() %>%
  ## Keep only the row identifier and the matched cluster.
  select(building, cluster) %>%
  ## Re-attach to the original data; rows without any match get cluster = NA.
  right_join(df, by = "building")
如果我理解正确,您想使用
community
data.frame 来包含要匹配的特定规则。换句话说,如果您的 df
具有在给定集群的 community
中找不到的额外信息,那么它仍然应该匹配并加入。
如果这是真的,你可以尝试一些不同的东西。对于
community
数据,删除重复项并添加一个附加列来指示该集群所需的匹配数。
此外,对于您的
df
数据,删除缺失值,并包含一个 row_number()
列来跟踪行并最终重新组合您的宽数据。
然后您可以通过
element
和 choice
连接这两个数据源。然后,您可以 filter
并保留结果,对于给定的 cluster
和行,在所需规则中找到足够的匹配项。输出还将包括找到的匹配选项。
请告诉我这是否达到了预期的结果。
library(tidyverse)
# Rule table in long form: one row per (cluster, element) holding the required
# choice, plus `match` = number of distinct elements that cluster requires.
# Expects `community` in its original node1 / node2 / cluster (edge list) form.
com_key <- community %>%
  pivot_longer(-cluster, names_to = "node", values_to = "rule") %>%
  select(-node) %>%
  separate_wider_delim(rule, delim = "=", names = c("element", "choice")) %>%
  group_by(cluster) %>%
  # The same rule shows up on several edges; keep one row per element.
  distinct(element, .keep_all = TRUE) %>%
  mutate(match = n()) %>%
  ungroup()

# Data in long form: one row per (row number, element) with its non-missing
# value. Values are converted to character so they compare with the rule text.
df_key <- df %>%
  mutate(rn = row_number()) %>%
  pivot_longer(
    -rn,
    names_to = "element",
    values_to = "choice",
    values_transform = as.character
  ) %>%
  drop_na(choice)

# Keep only the (element, choice) pairs that occur in some rule, then retain a
# (row, cluster) pair when the number of matched pairs equals the number of
# conditions the cluster requires. Finally spread the matched pairs back to
# one row per (rn, cluster).
inner_join(
  df_key, com_key,
  by = c("element", "choice"),
  relationship = "many-to-many"
) %>%
  group_by(rn, cluster) %>%
  filter(n() == match) %>%
  ungroup() %>%
  pivot_wider(
    id_cols = c(rn, cluster),
    names_from = element,
    values_from = choice
  ) %>%
  arrange(rn)
输出
rn cluster st_con_rt st_th roo_main st_con_tr st_adsb st_sub_main_th
<int> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2 4 main-room hira 1 NA NA NA
2 4 7 sub-room NA 2 direct add hira
3 5 2 sub-room NA 1 direct add NA
4 9 6 main-room tsuma 4 terrace add NA
5 11 6 main-room tsuma 4 terrace add NA
6 13 1 sub-room NA 2 terrace NA hira
7 14 7 sub-room NA 2 direct add hira
8 15 4 main-room hira 1 NA NA NA
9 15 5 main-room hira NA terrace add NA
10 16 3 sub-room NA NA terrace add tsuma
11 19 4 main-room hira 1 NA NA NA