检查非唯一字符的字符串模式

问题描述 投票:4回答:5

我有一个包含两列的数据框:idgradelist

gradelist列中的值包括长度不同的等级列表(由;分隔)。

这里是数据:

id <- seq(1,7)
gradelist <- c("a;b;b",
            "c;c",
            "d;d;d;f",
            "f;f;f;f;f;f",
            "a;a;a;a",
            "f;b;b;b;b;b;b;b",
            "c;c;d;d;a;a")

df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)

我需要添加另一个cloumn来检查每个标识的所有等级是否都是smae。

输出看起来像:

enter image description here

r string strsplit
5个回答
4
投票

我们可以提取字符并用n_distinct检查以发现不同元素的数量为1

library(dplyr)
library(purrr)
df %>% 
   mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), 
       ~ c("no", "yes")[1+(n_distinct(.x)==1)]))
#   id       gradelist same
#1  1           a;b;b   no
#2  2             c;c  yes
#3  3         d;d;d;f   no
#4  4     f;f;f;f;f;f  yes
#5  5         a;a;a;a  yes
#6  6 f;b;b;b;b;b;b;b   no
#7  7     c;c;d;d;a;a   no

或利用case_when

df %>% 
   mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
         case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))

或者另一个选择是'成绩单'上的separate_rows以扩展数据,找到n_distinct

library(tidyr)
df %>% 
    separate_rows(gradelist) %>%
    distinct %>% 
    group_by(id) %>% 
    summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>% 
    left_join(df)

2
投票

检查哪个字符排在首位,然后用空字符串替换所有出现的字符。如果什么都没有,那就意味着所有字符都是相同的。

sapply(df$gradelist, function(x) {
    nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE

1
投票
df$same <- factor(unlist(lapply(strsplit(df$g, ";"), function(x) 
                                    length(unique(x))))==1, labels=c("No", "Yes"))

df
#>   id       gradelist same
#> 1  1           a;b;b   No
#> 2  2             c;c  Yes
#> 3  3         d;d;d;f   No
#> 4  4     f;f;f;f;f;f  Yes
#> 5  5         a;a;a;a  Yes
#> 6  6 f;b;b;b;b;b;b;b   No
#> 7  7     c;c;d;d;a;a   No

1
投票

这里有一些基本的R解决方案。

  • 定义您的自定义函数f,即
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))

然后您可以在same列中添加

df$same <- f(df$gradelist)
  • 使用regmatches + sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))

诸如此类

> df
  id       gradelist same
1  1           a;b;b   no
2  2             c;c  yes
3  3         d;d;d;f   no
4  4     f;f;f;f;f;f  yes
5  5         a;a;a;a  yes
6  6 f;b;b;b;b;b;b;b   no
7  7     c;c;d;d;a;a   no

1
投票

尝试:

transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', gradelist)) + 1])

输出:

  id       gradelist same
1  1           a;b;b   No
2  2             c;c  Yes
3  3         d;d;d;f   No
4  4     f;f;f;f;f;f  Yes
5  5         a;a;a;a  Yes
6  6 f;b;b;b;b;b;b;b   No
7  7     c;c;d;d;a;a   No

您也可以按照以下方式进行strsplit方式:

transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])

基准

我们重复字符串几次。我们还重复df的行,以便最终得到超过100k的行,并分配@ThomasIsCoding使用的函数。

df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))

df <- df[rep(seq_len(nrow(df)), each = 15000), ]

f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))

[我们将transform用于所有base函数,以在mutate解和tidy 10次的情况下模拟microbenchmark的行为:

mBench <- microbenchmark::microbenchmark(

  akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
                                          ~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
  akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
                                            case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
  akrun3 = { df %>%
    separate_rows(gradelist) %>%
    distinct %>% 
    group_by(id) %>% 
    summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>% 
    left_join(df) },
  db = { transform(df, same = sapply(gradelist, function(x) { 
    nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
  `M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
  ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
  ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
  arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', df$gradelist)) + 1]) },
  arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },

  times = 10

)

结果:

Unit: seconds
            expr       min        lq      mean    median        uq       max neval
          akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420    10
          akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535    10
          akrun3  6.378463  7.190472  7.379439  7.373730  7.704365  8.321929    10
              db  3.738271  3.785858  3.935769  3.911479  3.926385  4.523876    10
             M--  3.551592  3.648720  3.723315  3.741075  3.798664  3.915588    10
 ThomasIsCoding1  4.453528  4.498858  4.702160  4.613088  4.823517  5.379984    10
 ThomasIsCoding2  3.368358  3.532593  3.752111  3.610664  3.773345  4.969414    10
    arg0naut91_1  1.638212  1.683986  1.699327  1.704614  1.716077  1.759059    10
    arg0naut91_2  3.665604  3.739662  3.774542  3.750144  3.774753  4.071887    10

情节:

enter image description here

© www.soinside.com 2019 - 2024. All rights reserved.