我有一个示例数据集,其中有一列的内容如下:
Candy
Sanitizer
Candy
Water
Cake
Candy
Ice Cream
Gum
Candy
Coffee
我想做的就是把这一列替换成只有两个水平的因子——“糖果”(Candy)和“非糖果”(Non-Candy)。我可以用 Python/Pandas 做到这一点,但一直没能找到基于 dplyr 的解决方案。谢谢!
在 dplyr 和 tidyr 中,可以这样写:
# Overwrite every value of `var` that is not "Candy" with "Not Candy";
# rows already equal to "Candy" are left untouched.
mutate(dat, var = replace(var, var != "Candy", "Not Candy"))
这种写法比 ifelse 方法快得多。
创建初始数据框的代码如下:
library(dplyr)
# Build the example data as a one-column tibble. The column is named `var`
# at construction time, which replaces the deprecated as_data_frame() call
# and the separate colnames() assignment.
dat <- tibble(
  var = c(
    "Candy", "Sanitizer", "Candy", "Water", "Cake",
    "Candy", "Ice Cream", "Gum", "Candy", "Coffee"
  )
)
# Three-way recode of `var`: "Candy" and "Water" keep their own label,
# every other value falls through to the catch-all branch.
mutate(
  dat,
  var = case_when(
    var == "Candy" ~ "Candy",
    var == "Water" ~ "Water",
    TRUE ~ "Neither-Water-Nor-Candy"
  )
)
假设您的数据框是 dat,并且您的列是 var。
levels<- 和列表的组合似乎提供了一种简单的解决方案:
library(microbenchmark)
set.seed(01239)

# Resample the example column to one million rows. The column is stored as an
# explicit factor because the `levels<-` approach timed below operates on
# factor levels, and data.frame() does not convert strings to factors by
# default (R >= 4.0).
smp <- data.frame(var = factor(sample(dat$var, 1e6, replace = TRUE)))

# Time three ways of collapsing `var` to Candy / Non-Candy, 50 repetitions
# each. The result is a 3 x 50 matrix with one row per approach.
timings <- replicate(50, {
  # Approach 1: rewrite the factor levels in place via `levels<-` with a list.
  # Each approach starts from a fresh copy of `smp` so the runs are independent.
  cop <- smp
  t0 <- get_nanotime()
  levs <- setdiff(levels(cop$var), "Candy")
  levels(cop$var) <- list(Candy = "Candy", "Non-Candy" = levs)
  t1 <- get_nanotime() - t0

  # Approach 2: dplyr::mutate() with ifelse(), wrapped in factor().
  cop <- smp
  t0 <- get_nanotime()
  cop <- cop %>%
    mutate(candy.flag = factor(ifelse(var == "Candy", "Candy", "Non-Candy")))
  t2 <- get_nanotime() - t0

  # Approach 3: build the factor directly from the logical comparison.
  # FALSE sorts before TRUE, hence the label order "Non-Candy", "Candy".
  cop <- smp
  t0 <- get_nanotime()
  cop$var <-
    factor(cop$var == "Candy", labels = c("Non-Candy", "Candy"))
  t3 <- get_nanotime() - t0

  c(levels = t1, dplyr = t2, direct = t3)
})

# Median time per approach (rows of `timings`), then the dplyr and direct
# medians expressed relative to the `levels<-` approach.
x <- apply(timings, 1, median)
x[2:3] / x[1]
# Example output (exact ratios vary by machine and run):
# dplyr direct
# 8.894303 4.962791
# assuming that all sweet things fall in one category
dat <- data.frame(
  var = c(
    "Candy", "Sanitizer", "Candy", "Water", "Cake",
    "Candy", "Ice Cream", "Gum", "Candy", "Coffee"
  )
)

# Lookup table: item name -> is it sweet? A named logical vector (rather than
# a list) so that indexing it yields a plain logical vector, not a list.
conditions <- c(
  "Candy" = TRUE, "Sanitizer" = FALSE, "Water" = FALSE,
  "Cake" = TRUE, "Ice Cream" = TRUE, "Gum" = TRUE, "Coffee" = FALSE
)

# Look each item up by name. as.character() guards against `var` being a
# factor, which would index by integer level code instead of by name.
# unname() drops the item names so `sweet` is an ordinary logical column.
# Items missing from `conditions` come back as NA.
dat %>% mutate(sweet = unname(conditions[as.character(var)]))
也可以使用 dplyr 中的 case_match:
此外,嵌套的 ifelse 也可以实现与 PhJ 提出的 case_when 方案相同的效果(不过,我确实更喜欢他那种写法的可读性)!
library(dplyr)
# Keep "Candy" as-is and map every other value to "Not Candy".
# `.default` is a named argument of case_match(), not a formula, so it must
# be supplied with `=` rather than `~`.
dat %>%
  mutate(
    var = case_match(
      var,
      "Candy" ~ var,
      .default = "Not Candy"
    )
  )