你好!我的目标是比较两个字符向量:一个是 synonyms(同义词),另一个是 mixNames(混合制剂名称)。mixNames 中的字符串与 synonyms 中的字符串并不完全相同,因此需要做部分字符串匹配。我想从 synonyms 中找出那些看起来与 mixNames 中某个名称相似(即包含该名称)的元素。我尝试只用 tidyverse 来实现,但失败了;后来我找到了一个使用 base R 的解决方案。我知道一定有更好的方法,但我自己想不出来。...
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 3.6.1
#> Warning: package 'tidyr' was built under R version 3.6.1
#> Warning: package 'dplyr' was built under R version 3.6.1
# Brand-name synonyms to be screened. The original note here reads
# "Acetometaphin" -- presumably acetaminophen; confirm.
synonyms <- c(
  "Pediatrix", "Percocet-5", "Percocet-Demi", "Perdolan Mono",
  "Perfalgan", "Phenaphen", "Phenaphen W/Codeine", "Phenipirin",
  "Phogoglandin", "Pinex", "Piramin", "Pirinasol",
  "Plicet", "Polmofen", "Predimol", "Predualito",
  "Prodol", "Prontina", "Puernol", "Pulmofen",
  "Pyregesic-C"
)
# Mixture names that the synonyms are compared against.
mixNames <- c(
  "Liquiprin",
  "Midol Maximum Strength",
  "Midol PM Night Time Formula",
  "Midol Regular Strength",
  "Midol Teen Formula",
  "Naldegesic",
  "Ornex Severe Cold Formula",
  "Percocet",
  "Percogesic with Codeine",
  "Propacet"
)
尝试失败:
#####STUFF THAT DIDNT WORK!!!!
# cross2(
# .x = synonyms, .y = mixNames #lists - each list has 2 lists - each of those is an atomic vector of 1
# ) %>%
# map_dfc(lift(str_detect)) #lift - modifies function to take a list of arguments - works for nested lists
#this returns a df just like the apply
# mix_syn_lgl_df <- map_dfc(
# mixNames,
# ~ map_lgl(synonyms, str_detect, pattern = .x)
# )
# colnames(mix_syn_lgl_df) <- mixNames
#
# mix_syn_lgl_df$synonyms <- synonyms
这实际上有效:
# Remove from `synonyms` every entry that contains one of the `mixNames`.
# One column per mixture name, one row per synonym: TRUE where the mixture
# name (used as a regex pattern by str_detect) occurs in the synonym.
# vapply() fixes the type and length of each column up front; because
# `mixNames` is an unnamed character vector its values become the column names.
mix_syn_lgl_mat <- vapply(
  mixNames,
  function(x) {
    str_detect(string = synonyms, pattern = x)
  },
  logical(length(synonyms))
)
rownames(mix_syn_lgl_mat) <- synonyms # label each row with its synonym
# Same matrix plus a trailing column holding the number of TRUEs per row
# (cbind coerces the logicals to numbers).
mix_syn_lgl_mat2 <- cbind(mix_syn_lgl_mat, rowSums(mix_syn_lgl_mat))
# Synonyms with at least one hit. Index the row names rather than the matrix:
# subsetting the matrix itself drops to a plain vector when exactly one row
# matches, which loses the row names and would leave `badNames` empty.
badNames <- rownames(mix_syn_lgl_mat2)[
  mix_syn_lgl_mat2[, ncol(mix_syn_lgl_mat2)] > 0
]
# Keep only the synonyms that did not match any mixture name.
pureSyn <- synonyms[!(synonyms %in% badNames)]
由reprex package(v0.3.0)在2019-10-29创建
您似乎想要一个不含任何与 mixNames 重叠的值的 synonyms 向量。您可以对 synonyms 取子集来删除匹配项。这里用 str_c / paste 把 mixNames 以 "|" 折叠成一个字符串,从而得到一个包含所有 mixNames 的正则表达式模式。之后只需要做部分字符串匹配(即使用 str_detect 或 grepl)。
下面使用 stringr,写法更整洁一些:
# Keep the synonyms that match none of the mixNames (joined into one regex).
synonyms[str_detect(synonyms, str_c(mixNames, collapse = "|"), negate = TRUE)]
或者使用 base R 中的函数:
# Base R: drop every synonym matched by the combined mixNames regex.
synonyms[!grepl(paste(mixNames, collapse = "|"), synonyms)]
# OR let grep() return the non-matching values directly.
grep(paste(mixNames, collapse = "|"), synonyms, value = TRUE, invert = TRUE)
顺便提一下,如果您想了解其他字符串匹配方法,可以看看 stringdist 或其他字符串距离函数/包。
我喜欢为这类问题构建一个 %g% 运算符。下面的代码会创建一个 tibble,其中只包含 synonyms 中与 mixNames 的任何值都不匹配的药物:
library(tidyverse)
# Infix "grep any" operator: for each element of `x`, reports whether at
# least one element of `y` is found inside it.
#
# x: character vector to search in.
# y: character vector of patterns. They are joined with "|" into a single
#    regular expression, so regex metacharacters in `y` keep their meaning.
# Returns a logical vector the same length as `x`; matching ignores case.
`%g%` <- function(x, y) {
  # Drop NA and "" patterns: once collapsed they would become the literal
  # text "NA" or an empty alternative that matches every string.
  y <- y[!is.na(y) & nzchar(y)]
  if (length(y) == 0) {
    # Nothing to look for, so nothing matches (an empty regex matches all).
    return(rep(FALSE, length(x)))
  }
  z <- paste0(y, collapse = "|")
  grepl(z, x, ignore.case = TRUE)
}
# Rows of the synonym tibble that match none of the mixture names.
filter(tibble(syn = synonyms), !(syn %g% mixNames))
#> # A tibble: 19 x 1
#> syn
#> <chr>
#> 1 Pediatrix
#> 2 Perdolan Mono
#> 3 Perfalgan
#> 4 Phenaphen
#> 5 Phenaphen W/Codeine
#> 6 Phenipirin
#> 7 Phogoglandin
#> 8 Pinex
#> 9 Piramin
#> 10 Pirinasol
#> 11 Plicet
#> 12 Polmofen
#> 13 Predimol
#> 14 Predualito
#> 15 Prodol
#> 16 Prontina
#> 17 Puernol
#> 18 Pulmofen
#> 19 Pyregesic-C
类似地,只获取 synonyms 中与 mixNames 的某个值匹配的药物:
# Rows of the synonym tibble that match at least one mixture name.
filter(tibble(syn = synonyms), syn %g% mixNames)
#> # A tibble: 2 x 1
#> syn
#> <chr>
#> 1 Percocet-5
#> 2 Percocet-Demi
由reprex package(v0.3.0)在2019-10-29创建