我在用:
# Reproducible example: three raw strings, one of which is not a number.
raw <- c("0", "13", "NULL")
data <- data.frame(raw)
# "NULL" cannot be parsed as a number, so as.numeric() yields NA for that row.
data$number <- as.numeric(as.character(data$raw))
data

# Bucket `number` into labelled ranges with nested ifelse() calls.
# The is.na() test sits in the innermost call; for an NA `number` the very
# first test (`number == 0`) is already NA, and ifelse() returns NA for an NA
# test, so the inner calls never get to decide the result for that row.
data$category <- with(
  data,
  ifelse(
    number == 0, "0",
    ifelse(
      number > 0 & number <= 7, "[1 ... 7]",
      ifelse(
        number > 7 & number <= 14, "[8 ... 14]",
        ifelse(
          number > 14 & number <= 31, "[15 ... 31]",
          ifelse(
            number > 31 & number <= 62, "[32 ... 62]",
            ifelse(
              number > 62, "63++",
              ifelse(is.na(number), "unknown", "unknown")
            )
          )
        )
      )
    )
  )
)
data
我原本以为原始值为 "NULL"(即 number 为 NA)的那一行会被归类为 "unknown",但实际得到的是:
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA <NA>
而不是我期望的:
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA unknown
有人可以告诉我为什么吗?目前的解决方法是运行:
# Workaround: overwrite the label wherever `number` is missing.
data$category <- replace(data$category, is.na(data$number), "unknown")
在上面的代码块之后。
我不确定这是否是您想要的方式,但使用 cut() 函数可以让代码更简洁。
# Bin `number` into right-closed intervals: (-Inf, 0], (0, 7], (7, 14], ...
# `breaks` holds the cut points (adjust as needed); `labels` names each bin
# in order. The result is a factor, and NA inputs stay NA.
# NOTE(review): the first two labels are written without spaces ("[1..7]"),
# unlike the remaining ones ("[15 ... 31]").
data$category <- cut(
  data$number,
  breaks = c(-Inf, 0, 7, 14, 31, 62, Inf),
  labels = c(
    "0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"
  ),
  right = TRUE
)
不幸的是,还需要下面这两行才能把 NA 转换为 "Unknown":
# A factor only accepts values that are already among its levels, so register
# "Unknown" as an extra level before assigning it.
levels(data$category) <- append(levels(data$category), "Unknown")
# Label every row whose `number` is missing.
data$category <- replace(data$category, is.na(data$number), "Unknown")
data
# raw number category
# 1 0 0 0
# 2 13 13 [8..14]
# 3 NULL NA Unknown
数据:
# Sample input: two numeric strings and one non-numeric placeholder.
raw <- c("0", "13", "NULL")
data <- data.frame(raw = raw)
# Add the parsed numeric column; "NULL" becomes NA.
data <- transform(data, number = as.numeric(as.character(raw)))
基准测试:
# Time four ways of turning `data$number` into range labels. Each named
# argument is one candidate expression; the names appear in the `expr`
# column of the results table.
microbenchmark::microbenchmark(
# cut: bin with cut(), then add an "Unknown" factor level for the NA rows.
cut = {data$category <- cut(data$number,
breaks=c(-Inf, 0, 7, 14, 31, 62, Inf),
labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"))
levels(data$category) <- c(levels(data$category), "Unknown")
data$category[is.na(data$number)] <- "Unknown"},
# findInt: build the labels from the break vector and index them with
# findInterval(). Note that the result is stored in `res`, not in
# `data$category`.
findInt = {vec<-c(0,7,14,31,62)
levels<-c(vec[1],sprintf("[%d ... %d]",(vec+1)[-length(vec)],vec[-1]),
paste0(vec[length(vec)]+1,"++"))
res<-levels[findInterval(data$number,vec,left.open=TRUE)+1]
res[is.na(res)]<-"unknown"},
# lapply: classify one element at a time with an if/else chain, testing for
# NA first. lapply() returns a list, so `category` becomes a list here.
lapply = {data$category <- lapply(data$number,function(x) {
if(is.na(x) || is.null(x)) "unknown"
else if(x == 0) "0"
else if(x > 0 & x <= 7) "[1 ... 7]"
else if(x > 7 & x <= 14) "[8 ... 14]"
else if(x > 14 & x <= 31) "[15 ... 31]"
else if(x > 31 & x <= 62) "[32 ... 62]"
else if(x > 62) "63++"
else "unknown"
})},
# ifelse: nested vectorised ifelse() with the NA test moved to the front;
# "???" is the fallback when no condition matches.
ifelse = {data$category <-
ifelse(is.na(data$number), "unknown",
ifelse(data$number == 0, "0",
ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
ifelse(data$number > 62, "63++", "???")))))))}
)
得到:
# Unit: microseconds
# expr min lq mean median uq max neval
# cut 132.207 139.4185 154.78149 144.9770 154.5925 283.043 100
# findInt 18.329 21.7850 26.58004 26.2915 28.8460 60.996 100
# lapply 14.122 15.6250 4269.73574 17.2770 18.7800 425198.055 100
# ifelse 81.728 84.8835 96.09675 88.9400 96.3010 193.503 100
也许您更愿意在 within() 中使用条件赋值,这样写更清晰。
# Assign a range label to every row by conditional sub-assignment inside
# within(), which evaluates the block with the columns of `data` in scope and
# writes the new `category` variable back as a column.
# Rows matching none of the conditions (e.g. negative numbers) keep NA.
data <- within(data, {
  # Start from an all-NA character vector with one entry per row, so the
  # column has a single type and the correct length from the outset.
  category <- rep(NA_character_, length(number))
  category[number == 0] <- "0"
  category[number > 0 & number <= 7] <- "[1 ... 7]"
  category[number > 7 & number <= 14] <- "[8 ... 14]"
  category[number > 14 & number <= 31] <- "[15 ... 31]"
  category[number > 31 & number <= 62] <- "[32 ... 62]"
  # Values above 62 get their own label (previously mislabelled "[32 ... 62]").
  category[number > 62] <- "63++"
  # Comparisons with NA give NA subscripts, which assign nothing above, so
  # missing values are labelled explicitly here.
  category[is.na(number)] <- "unknown"
})
> data
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA unknown
我不确定这是否适合您,但如果要继续使用当前的方法:请先将 data 重命名为 df,因为 R 中已经有一个名为 data 的函数。
# Replace every missing category with the "Unknown" label, then show it.
df$category <- replace(df$category, is.na(df$category), "Unknown")
df$category
如果把 is.na() 的判断移到最前面,您当前的代码就能正常工作:
# Same nested ifelse() chain, but with the NA test first: rows with a missing
# `number` are labelled before any comparison can turn the test into NA.
# "???" is the fallback for values matching no range (e.g. negatives).
data$category <- with(
  data,
  ifelse(
    is.na(number), "unknown",
    ifelse(
      number == 0, "0",
      ifelse(
        number > 0 & number <= 7, "[1 ... 7]",
        ifelse(
          number > 7 & number <= 14, "[8 ... 14]",
          ifelse(
            number > 14 & number <= 31, "[15 ... 31]",
            ifelse(
              number > 31 & number <= 62, "[32 ... 62]",
              ifelse(number > 62, "63++", "???")
            )
          )
        )
      )
    )
  )
)
> data
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA unknown
我用 lapply 代替嵌套的 ifelse 调用重写了这段代码,这样可读性更好一些。值得注意的变化是:首先测试 is.na() 并返回 "unknown",而不是把它放在最后作为默认分支。原因是:当 number 为 NA 时,您的第一个测试是 NA == 0,其结果是 NA,既不是 TRUE 也不是 FALSE;ifelse() 遇到 NA 条件会直接返回 NA,不会再继续后面的嵌套检查,因此永远到不了返回 "unknown" 的那一步。
# Classify each number with a plain if/else chain, one element at a time.
# vapply() is the type-checked sibling of lapply(): it returns a character
# vector (one label per row) instead of a list, so `category` becomes an
# ordinary character column rather than a list column.
data$category <- vapply(data$number, function(x) {
  # Handle missing input first; any comparison with NA would itself be NA
  # and make `if` fail. is.null() is tested before is.na() because
  # is.na(NULL) has length zero and cannot be used in `||`.
  if (is.null(x) || is.na(x)) {
    "unknown"
  } else if (x == 0) {
    "0"
  } else if (x > 0 && x <= 7) {
    "[1 ... 7]"
  } else if (x > 7 && x <= 14) {
    "[8 ... 14]"
  } else if (x > 14 && x <= 31) {
    "[15 ... 31]"
  } else if (x > 31 && x <= 62) {
    "[32 ... 62]"
  } else if (x > 62) {
    "63++"
  } else {
    # Reached only by values below zero.
    "unknown"
  }
}, character(1))
试试这个:
# Upper bound of each range; the first entry doubles as the stand-alone
# "0" bucket.
vec <- c(0, 7, 14, 31, 62)
# Labels: "0", then "[lo ... hi]" for each consecutive pair of bounds, then an
# open-ended label for everything above the last bound.
levels <- local({
  lower <- head(vec, -1) + 1
  upper <- tail(vec, -1)
  c(
    vec[1],
    sprintf("[%d ... %d]", lower, upper),
    paste0(tail(vec, 1) + 1, "++")
  )
})
# findInterval() with left.open = TRUE counts how many bounds lie strictly
# below each number; adding 1 turns that count into an index into `levels`.
res <- levels[findInterval(data$number, vec, left.open = TRUE) + 1]
# Missing numbers produce NA labels; name them explicitly.
res <- replace(res, is.na(res), "unknown")