我有一个包含两列的数据框:id
和gradelist
。
gradelist
列中的值包括长度不同的等级列表(由;
分隔)。
这里是数据:
id <- seq(1,7)
gradelist <- c("a;b;b",
"c;c",
"d;d;d;f",
"f;f;f;f;f;f",
"a;a;a;a",
"f;b;b;b;b;b;b;b",
"c;c;d;d;a;a")
df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)
我需要添加另一个cloumn来检查每个标识的所有等级是否都是smae。
输出看起来像:
我们可以提取字符并用n_distinct
检查以发现不同元素的数量为1
library(dplyr)
library(purrr)
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)]))
# id gradelist same
#1 1 a;b;b no
#2 2 c;c yes
#3 3 d;d;d;f no
#4 4 f;f;f;f;f;f yes
#5 5 a;a;a;a yes
#6 6 f;b;b;b;b;b;b;b no
#7 7 c;c;d;d;a;a no
或利用case_when
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))
或者另一个选择是'成绩单'上的separate_rows
以扩展数据,找到n_distinct
library(tidyr)
df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df)
检查哪个字符排在首位,然后用空字符串替换所有出现的字符。如果什么都没有,那就意味着所有字符都是相同的。
sapply(df$gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE TRUE FALSE TRUE TRUE FALSE FALSE
df$same <- factor(unlist(lapply(strsplit(df$g, ";"), function(x)
length(unique(x))))==1, labels=c("No", "Yes"))
df
#> id gradelist same
#> 1 1 a;b;b No
#> 2 2 c;c Yes
#> 3 3 d;d;d;f No
#> 4 4 f;f;f;f;f;f Yes
#> 5 5 a;a;a;a Yes
#> 6 6 f;b;b;b;b;b;b;b No
#> 7 7 c;c;d;d;a;a No
这里有一些基本的R解决方案。
f
,即f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
然后您可以在same
列中添加
df$same <- f(df$gradelist)
regmatches
+ sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))
诸如此类
> df
id gradelist same
1 1 a;b;b no
2 2 c;c yes
3 3 d;d;d;f no
4 4 f;f;f;f;f;f yes
5 5 a;a;a;a yes
6 6 f;b;b;b;b;b;b;b no
7 7 c;c;d;d;a;a no
尝试:
transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', gradelist)) + 1])
输出:
id gradelist same
1 1 a;b;b No
2 2 c;c Yes
3 3 d;d;d;f No
4 4 f;f;f;f;f;f Yes
5 5 a;a;a;a Yes
6 6 f;b;b;b;b;b;b;b No
7 7 c;c;d;d;a;a No
您也可以按照以下方式进行strsplit
方式:
transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])
基准
我们重复字符串几次。我们还重复df
的行,以便最终得到超过100k的行,并分配@ThomasIsCoding使用的函数。
df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))
df <- df[rep(seq_len(nrow(df)), each = 15000), ]
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
[我们将transform
用于所有base
函数,以在mutate
解和tidy
10次的情况下模拟microbenchmark
的行为:
mBench <- microbenchmark::microbenchmark(
akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
akrun3 = { df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df) },
db = { transform(df, same = sapply(gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
`M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', df$gradelist)) + 1]) },
arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },
times = 10
)
结果:
Unit: seconds
expr min lq mean median uq max neval
akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420 10
akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535 10
akrun3 6.378463 7.190472 7.379439 7.373730 7.704365 8.321929 10
db 3.738271 3.785858 3.935769 3.911479 3.926385 4.523876 10
M-- 3.551592 3.648720 3.723315 3.741075 3.798664 3.915588 10
ThomasIsCoding1 4.453528 4.498858 4.702160 4.613088 4.823517 5.379984 10
ThomasIsCoding2 3.368358 3.532593 3.752111 3.610664 3.773345 4.969414 10
arg0naut91_1 1.638212 1.683986 1.699327 1.704614 1.716077 1.759059 10
arg0naut91_2 3.665604 3.739662 3.774542 3.750144 3.774753 4.071887 10
情节: