我正在尝试使用 NetworkD3 创建桑基图来绘制通过 A&E 部门的患者流程,并带有示例数据框:
`First_Contact <- c("UTC", "UTC", "111", "111")Second_Contact <- c(NA, "ED - ED RV", "UTC", "UTC")Third_Contact <- c(NA, NA, "ED - ED RV", "ED - ED RV")Final_Pathway_Outcome <- c("Discharged", "Discharged", "Discharged", "Discharged")
df <- data.frame(First_Contact, Second_Contact, Third_Contact, Final_Pathway_Outcome)
其中 NA 表示患者在出院前不需要进行进一步的患者接触,即第 1 行患者去了 UTC,然后出院)
我正在遵循 https://rpubs.com/droach/CPP526-codethrough 上的代码通过,使用下面的代码效果良好,但遇到两个问题:
当患者无需经过第二次第三次接触即可达到最终途径结果时,我的原始数据框的旅程为 UTC -> NA -> NA -> 出院。
filter(!is.na(target))
正在过滤掉以 NA 为目标的行,但在我的数据透视表中,我目前正在手动将源列中的 NA 替换为上一行中的适当目标。有没有办法更优雅地做到这一点?
我的数据集将比上面的示例大得多,并且此代码当前单独处理每一行,甚至是重复的行。有没有办法自动聚合这些并相应地调整值?
##Adding row numbers and pivoting data
links.df <- df %>%
mutate(row = row_number()) %>%
pivot_longer(cols= -row, names_to="column", values_to="source")
##Creating target column and specifying link order
links.df <- links.df %>%
mutate(column= match(column, names(trial))) %>%
group_by(row) %>%
mutate(target= lead(source, order_by= column)) %>%
filter(!is.na(target)) %>%
ungroup()
##Differentiating between areas in each contact
links.df <- links.df %>%
mutate(source = paste0(source, "", column)) %>%
mutate(target= paste0(target, "", column+1)) %>%
select(row, column, source, target)
##Extra modification to swap the value of "NAs" with the target from the previous row. Currently doing this manually
links.df[2,3] <- "UTC_2"links.df[5,3] <- "ED - ED RV_3"
##Creating data frame for nodes
nodes.df <- data.frame(name=unique(c(links.df$source, links.df$target)))nodes.df$label <- sub('_[0-9]*$', '', nodes.df$name)
##Providing instructions for Sankey Diagram (source and target ids)
links.df$source_id <- match(links.df$source, nodes.df$name) -1
links.df$target_id <- match(links.df$target, nodes.df$name) -1
links.df$value <- 1
##Plotting Sankey
sankeyNetwork(Links= links.df,Nodes = nodes.df,Source= 'source_id',Target= 'target_id',Value= 'value',NodeID= 'label',fontSize= 16,iterations=0)`
可重现代码:
library(dplyr)
library(tidyr)
library(networkD3)
First_Contact <- c("UTC", "UTC", "111", "111")
Second_Contact <- c(NA, "ED - ED RV", "UTC", "UTC")
Third_Contact <- c(NA, NA, "ED - ED RV", "ED - ED RV")
Final_Pathway_Outcome <- c("Discharged", "Discharged", "Discharged", "Discharged")
df <- data.frame(First_Contact, Second_Contact, Third_Contact, Final_Pathway_Outcome)
##Adding row numbers and pivoting data
links.df <- df %>%
mutate(row = row_number()) %>%
pivot_longer(cols= -row, names_to="column", values_to="source")
##Creating target column and specifying link order
links.df <- links.df %>%
mutate(column= match(column, names(df))) %>%
group_by(row) %>%
mutate(target= lead(source, order_by= column)) %>%
filter(!is.na(target)) %>%
ungroup()
##Differentiating between areas in each contact
links.df <- links.df %>%
mutate(source = paste0(source, "", column)) %>%
mutate(target= paste0(target, "", column+1)) %>%
select(row, column, source, target)
##Extra modification to swap the value of "NAs" with the target from the previous row. Currently doing this manually
links.df[2,3] <- "UTC_2"
links.df[5,3] <- "ED - ED RV_3"
nodes.df <-
data.frame(
name = unique(c(links.df$source, links.df$target)),
label = unique(c(links.df$source, links.df$target))
)
##Providing instructions for Sankey Diagram (source and target ids)
links.df$source_id <- match(links.df$source, nodes.df$name) -1
links.df$target_id <- match(links.df$target, nodes.df$name) -1
links.df$value <- 1
##Plotting Sankey
sankeyNetwork(Links= links.df,Nodes = nodes.df,Source= 'source_id',Target= 'target_id',Value= 'value',NodeID= 'label',fontSize= 16,iterations=0)
tidy::fill()
是用先前值(向上或向下)填充 data.frame 列中 NA
值的便捷方法。
dplyr::summarise()
可用于聚合重复链接并使用dplyr::n()
设置值
在填写
source
和 target
列之前,我还在 source
名称中添加了列号,以保持节点的顺序/位置。
library(dplyr)
library(tidyr)
library(networkD3)
First_Contact <- c("UTC", "UTC", "111", "111")
Second_Contact <- c(NA, "ED - ED RV", "UTC", "UTC")
Third_Contact <- c(NA, NA, "ED - ED RV", "ED - ED RV")
Final_Pathway_Outcome <- c("Discharged", "Discharged", "Discharged", "Discharged")
df <- data.frame(First_Contact, Second_Contact, Third_Contact, Final_Pathway_Outcome)
links.df <-
df %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = "column", values_to = "source") %>%
mutate(column = match(column, names(df))) %>%
mutate(source = ifelse(!is.na(source), paste0(source, "-", column), NA)) %>%
group_by(row) %>%
mutate(target = lead(source, order_by = column)) %>%
fill(source, .direction = "down") %>%
filter(!is.na(target)) %>%
ungroup() %>%
summarise(value = n(), .by = c("source", "target"))
nodes.df <- data.frame(name = unique(c(links.df$source, links.df$target)))
nodes.df <- mutate(nodes.df, label = sub("-[0-9]*$", "", name))
links.df$source_id <- match(links.df$source, nodes.df$name) -1
links.df$target_id <- match(links.df$target, nodes.df$name) -1
df
#> First_Contact Second_Contact Third_Contact Final_Pathway_Outcome
#> 1 UTC <NA> <NA> Discharged
#> 2 UTC ED - ED RV <NA> Discharged
#> 3 111 UTC ED - ED RV Discharged
#> 4 111 UTC ED - ED RV Discharged
links.df
#> # A tibble: 6 × 5
#> source target value source_id target_id
#> <chr> <chr> <int> <dbl> <dbl>
#> 1 UTC-1 Discharged-4 1 0 5
#> 2 UTC-1 ED - ED RV-2 1 0 1
#> 3 ED - ED RV-2 Discharged-4 1 1 5
#> 4 111-1 UTC-2 2 2 3
#> 5 UTC-2 ED - ED RV-3 2 3 4
#> 6 ED - ED RV-3 Discharged-4 2 4 5
nodes.df
#> name label
#> 1 UTC-1 UTC
#> 2 ED - ED RV-2 ED - ED RV
#> 3 111-1 111
#> 4 UTC-2 UTC
#> 5 ED - ED RV-3 ED - ED RV
#> 6 Discharged-4 Discharged
sankeyNetwork(
Links = links.df,
Nodes = nodes.df,
Source = 'source_id',
Target = 'target_id',
Value = 'value',
NodeID = 'label',
fontSize = 16,
iterations = 0,
sinksRight = FALSE
)