我有一个数据框 df,我想为它创建一个新列,并根据在 title 列中找到的关键字/字符串填入相应的字符串。
library(tidyverse)
# Example data, written row-wise: one row per record. `title` is the free-text
# column that the snippets below search for keywords.
df <- tibble::tribble(
  ~id, ~treatment_modalities, ~no_patients, ~year, ~title,
  36933814, "HIFU", 32, 2023,
  "Title with keyword1 and keyword2 inside of it.",
  36921141, "UAE", NA, 2022,
  "A second title with kword3 and keyword1 inside of it.",
  39601489, "UAE; RFA; HIFU", 152, 2023,
  "Here we have kword4 and nothing else to see.",
  36898335, "UAE; RFA", NA, 2023,
  "A title with kword4 and kword3 inside of it.",
  36432859, "UAE; HIFU", 15, 2023,
  "And one with keyword1, keyword2 and kword3 in it.",
  33447951, "UAE", 428, 2023,
  "This title does not contain a keyword."
)
我可以检测并写入找到的第一个关键字/字符串,但 case_when 在第一个匹配的条件处就会停止,不会再检查后面可能匹配的其他条件:
# Label each title with the FIRST keyword it contains. case_when() checks its
# conditions in order and keeps only the first match per row, so a title that
# contains several keywords still receives a single label.
df2 <- df %>%
  mutate(
    # Use the bare column name `title` rather than `df$title`: inside mutate()
    # the bare name refers to the data being piped in, whereas `df$title`
    # always reads the global object regardless of grouping or filtering.
    title_keyword = case_when(
      str_detect(title, regex("keyword1", ignore_case = TRUE)) ~ "k1",
      str_detect(title, regex("keyword2", ignore_case = TRUE)) ~ "k2",
      str_detect(title, regex("kword3", ignore_case = TRUE)) ~ "k3",
      str_detect(title, regex("kword4", ignore_case = TRUE)) ~ "k4",
      TRUE ~ NA_character_
    ),
    .after = year
  )
case_when 是不是不适合这个任务的函数?也许可以在 mutate 中以某种方式结合使用 if_else 和 paste?
预期输出为:
# Desired result: `title_keyword` lists the label of every keyword found in
# `title`, in order of appearance and separated by "; "; a title without any
# keyword gets NA.
tibble::tibble(
  id = c(36933814, 36921141, 39601489, 36898335, 36432859, 33447951),
  treatment_modalities = c(
    "HIFU", "UAE", "UAE; RFA; HIFU", "UAE; RFA", "UAE; HIFU", "UAE"
  ),
  no_patients = c(32, NA, 152, NA, 15, 428),
  year = c(2023, 2022, 2023, 2023, 2023, 2023),
  title_keyword = c("k1; k2", "k3; k1", "k4", "k4; k3", "k1; k2; k3", NA),
  title = c(
    "Title with keyword1 and keyword2 inside of it.",
    "A second title with kword3 and keyword1 inside of it.",
    "Here we have kword4 and nothing else to see.",
    # Row 4 must match the input data ("kword4 and kword3"), which is also the
    # order reflected by its label "k4; k3".
    "A title with kword4 and kword3 inside of it.",
    "And one with keyword1, keyword2 and kword3 in it.",
    "This title does not contain a keyword."
  )
)
感谢您的帮助!
您可以使用 stringr 包更简洁地完成此操作:先用 str_extract_all 提取所有关键字,再用 str_replace_all 把它们替换为对应的标签:
# Step 1: pull every keyword occurrence out of each title (case-insensitive).
# The result is a list with one character vector of matches per title.
keyword_hits <- str_extract_all(
  df$title,
  regex("keyword1+|keyword2+|kword3+|kword4+", ignore_case = TRUE)
)

# Step 2: join the matches of each title into one "; "-separated string.
# A title without any match becomes the empty string "".
joined_hits <- vapply(
  keyword_hits, paste, character(1),
  collapse = "; ", USE.NAMES = FALSE
)

# Step 3: swap each keyword for its short label. With a named vector,
# str_replace_all() treats the names as patterns and the values as
# replacements.
label_map <- regex(
  c(
    "keyword1" = "k1",
    "keyword2" = "k2",
    "kword3" = "k3",
    "kword4" = "k4"
  ),
  ignore_case = TRUE
)
df$title_keyword <- str_replace_all(joined_hits, label_map)
注意:正则表达式中的 + 只作用于它前面的那一个字符(这里是末尾的数字),表示该字符出现一次或多次,而不是整个关键字出现多次;例如 keyword1+ 也会匹配 keyword11。对于本例的数据,去掉 + 结果相同。
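输出(注意:不含任何关键字的标题得到的是空字符串 "",而不是 NA;如需 NA,可以再执行 df$title_keyword <- dplyr::na_if(df$title_keyword, "")):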
# A tibble: 6 × 6
id treatment_modalities no_patients year title title_keyword
<dbl> <chr> <dbl> <dbl> <chr> <chr>
1 36933814 HIFU 32 2023 Title with keyword1 and keyword2 inside of it. "k1; k2"
2 36921141 UAE NA 2022 A second title with kword3 and keyword1 inside of it. "k3; k1"
3 39601489 UAE; RFA; HIFU 152 2023 Here we have kword4 and nothing else to see. "k4"
4 36898335 UAE; RFA NA 2023 A title with kword4 and kword3 inside of it. "k4; k3"
5 36432859 UAE; HIFU 15 2023 And one with keyword1, keyword2 and kword3 in it. "k1; k2; k3"
6 33447951 UAE 428 2023 This title does not contain a keyword. ""
一种方法:
#' Paste the non-empty elements of a character vector
#'
#' Drops empty strings ("") from `x`, forwards the rest to [paste()], and
#' turns any empty result into `NA`.
#'
#' @param x A character vector; elements equal to "" are discarded.
#' @param ... Further arguments passed on to `paste()` (e.g. `collapse`).
#' @return The character vector returned by `paste()`, with every empty
#'   string replaced by `NA_character_`.
paste2 <- function(x, ...) {
  non_empty <- x[nzchar(x)]
  out <- paste(non_empty, ...)
  # When nothing is left to paste and `collapse` is given, paste() returns "";
  # report that as missing instead of as an empty label.
  out[!nzchar(out)] <- NA_character_
  out
}
# Build one helper column per keyword ("" when the keyword is absent), then
# glue the non-empty labels of each row together with paste2().
df2 <- df %>%
  # rowwise() so that c(key1, ..., key4) below holds the four labels of a
  # single row rather than four whole columns.
  rowwise() %>%
  mutate(
    key1 = if_else(
      str_detect(title, regex("keyword1", ignore_case = TRUE)), "k1", ""
    ),
    key2 = if_else(
      str_detect(title, regex("keyword2", ignore_case = TRUE)), "k2", ""
    ),
    key3 = if_else(
      str_detect(title, regex("kword3", ignore_case = TRUE)), "k3", ""
    ),
    key4 = if_else(
      str_detect(title, regex("kword4", ignore_case = TRUE)), "k4", ""
    ),
    # Only `collapse` matters when pasting a single vector, so `sep` is
    # omitted.
    new = paste2(c(key1, key2, key3, key4), collapse = ";")
  ) %>%
  # Remove the temporary helper columns (every column starting with "key").
  select(-starts_with("key")) %>%
  ungroup()
df2
# A tibble: 6 x 6
id treatment_modalities no_patients year title new
<dbl> <chr> <dbl> <dbl> <chr> <chr>
1 36933814 HIFU 32 2023 Title with keyword1 and keyword2 inside of it. k1;k2
2 36921141 UAE NA 2022 A second title with kword3 and keyword1 inside of it. k1;k3
3 39601489 UAE; RFA; HIFU 152 2023 Here we have kword4 and nothing else to see. k4
4 36898335 UAE; RFA NA 2023 A title with kword4 and kword3 inside of it. k3;k4
5 36432859 UAE; HIFU 15 2023 And one with keyword1, keyword2 and kword3 in it. k1;k2;k3
6 33447951 UAE 428 2023 This title does not contain a keyword. NA
您可以先为每个关键字分别创建一个标记列,然后再把这些列合并成一列:
# One helper column per keyword (label when present, NA otherwise), then the
# non-missing labels of each row are joined with ";".
df %>%
  mutate(
    k1 = if_else(
      str_detect(title, regex("keyword1", ignore_case = TRUE)),
      "k1", NA_character_
    ),
    k2 = if_else(
      str_detect(title, regex("keyword2", ignore_case = TRUE)),
      "k2", NA_character_
    ),
    k3 = if_else(
      str_detect(title, regex("kword3", ignore_case = TRUE)),
      "k3", NA_character_
    ),
    k4 = if_else(
      str_detect(title, regex("kword4", ignore_case = TRUE)),
      "k4", NA_character_
    )
  ) %>%
  # rowwise() so that c(k1, ..., k4) holds the labels of a single row.
  rowwise() %>%
  mutate(
    title_keyword = paste(na.omit(c(k1, k2, k3, k4)), collapse = ";")
  ) %>%
  # Drop the row-wise grouping so later verbs operate on whole columns again.
  ungroup() %>%
  # A row without any keyword yields "", which is reported as NA instead.
  mutate(title_keyword = na_if(title_keyword, ""))
更新
根据评论,这是一种不同的通用方法,可以避免写出这么多条件:
# Keywords to look for; the label of a keyword is "k" followed by its position
# in this vector.
keywords <- c("keyword1", "keyword2", "kword3", "kword4")
df |>
  mutate(
    title_keyword = map_chr(
      title,
      \(x) {
        # Positions of the keywords found in this title. Matching ignores
        # case, in line with the ignore_case = TRUE used in the question.
        hits <- which(str_detect(x, regex(keywords, ignore_case = TRUE)))
        str_c(str_c("k", hits), collapse = ";")
      }
    ),
    # A title without any keyword yields "", which is reported as NA instead.
    title_keyword = na_if(title_keyword, "")
  )
它是如何运作的
- 我们遍历 title 列;对于每个标题,使用 str_detect 测试所有关键字。这将返回一个逻辑值向量。which 给出匹配关键字的位置(例如 1、2、3 或 4,因为在这种情况下有 4 个关键字)。
- 使用 str_c 把前缀 "k" 连接到这些位置上,得到类似 c("k1", "k2") 的向量。
- 再用 str_c(collapse = ";") 把该向量合并为一个字符串,各项以 ; 分隔。
- 如果没有任何匹配,结果是空字符串 "",因此我们使用 na_if 将其转换为 NA。
输出
id treatment_modalities no_patients year title title~1
<dbl> <chr> <dbl> <dbl> <chr> <chr>
1 36933814 HIFU 32 2023 Title with keyword1 a~ k1;k2
2 36921141 UAE NA 2022 A second title with k~ k1;k3
3 39601489 UAE; RFA; HIFU 152 2023 Here we have kword4 a~ k4
4 36898335 UAE; RFA NA 2023 A title with kword4 a~ k3;k4
5 36432859 UAE; HIFU 15 2023 And one with keyword1~ k1;k2;~
6 33447951 UAE 428 2023 This title does not c~ NA
旧答案
与其有条件地设置 title_keyword 的值,为什么不直接提取您要查找的值:
# Extract the labels straight from the text: group 1 captures the leading "k",
# group 2 the trailing digit of every "kword<digit>" / "keyword<digit>" match.
df |>
  mutate(
    title_keyword = map_chr(
      # "(?:ey)?" accepts exactly "kword" or "keyword" (not "kyword" or
      # "keword"); matching ignores case, as in the question.
      str_match_all(title, regex("(k)(?:ey)?word(\\d)", ignore_case = TRUE)),
      # Columns 2 and 3 of the match matrix are the two capture groups. The
      # label is lower-cased so that e.g. "Keyword1" still gives "k1".
      \(m) str_to_lower(str_c(str_c(m[, 2], m[, 3]), collapse = ";"))
    ),
    # A title without any match yields "", which is reported as NA instead.
    title_keyword = na_if(title_keyword, "")
  )
该正则表达式同时匹配 kword 和 keyword,并提取 k 和末尾的数字(\\d)。然后把每个匹配的这两部分拼接起来,并以 ; 分隔合并成一个字符串。