使用ifelse将NA映射到未知

问题描述 投票:0回答:6

我在用:

# Reproduction of the problem: the third raw value is the string 'NULL',
# which cannot be parsed as a number.
raw <- c('0', '13', 'NULL')

data <- data.frame(raw)
# as.character() guards against `raw` being stored as a factor; the
# unparseable 'NULL' entry becomes NA in `number`.
data$number <- as.numeric(as.character(data$raw))

data

# Nested ifelse() that bins `number` into labelled ranges.
# For the NA row the very first test (NA == 0) is NA, and ifelse() returns NA
# wherever its test is NA, so that row never reaches the inner calls; the
# is.na() test placed last therefore has no effect.
data$category <- ifelse(data$number == 0, "0",
ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
ifelse(data$number > 62, "63++",
ifelse(is.na(data$number) == TRUE, "unknown",
"unknown")))))))

data

我原本以为 raw 为 "NULL"(即 number 为 NA)的条目会被映射为 "unknown",但实际得到的是:

   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA       <NA>

而不是我期望的:

   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA       unknown

有人可以告诉我为什么吗?目前的解决方法是运行:

# Patch the rows whose number is missing. is.na() never returns NA itself, so
# the logical mask can index directly without which().
data$category[is.na(data$number)] <- "unknown"

在上面的代码块之后。

r
6个回答
4
投票

我不确定这是否是您想要的方式,但使用 cut() 函数可以让代码简洁得多。

# Bin `number` into right-closed intervals (the default for cut()):
# (-Inf, 0], (0, 7], (7, 14], (14, 31], (31, 62], (62, Inf).
data$category <- cut(
  data$number,
  breaks = c(-Inf, 0, 7, 14, 31, 62, Inf), # adjust the cut points as needed
  labels = c(                              # one label per interval
    "0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"
  )
)

不幸的是,您需要这两行来将NA转换为"Unknown"

# Register "Unknown" as an extra factor level first, then use it for the rows
# whose number is missing.
levels(data$category) <- c(levels(data$category), "Unknown")
data$category <- replace(data$category, is.na(data$number), "Unknown")
data
#     raw number category
# 1    0      0        0
# 2   13     13  [8..14]
# 3 NULL     NA  Unknown

数据:

# Sample input: two numeric strings and one unparseable "NULL" entry.
raw <- c("0", "13", "NULL")

# One-column data frame; `number` is the numeric version of `raw`
# (the "NULL" entry becomes NA).
data <- data.frame(raw)
data[["number"]] <- as.numeric(as.character(data[["raw"]]))

基准测试:

# Time the four approaches from the answers against the same `data`.
# Each named argument is one expression to be timed.
microbenchmark::microbenchmark(
  # cut: bin with cut(), then add an "Unknown" level for the NA rows
  cut = {data$category <- cut(data$number, 
                              breaks=c(-Inf, 0, 7, 14, 31, 62, Inf), 
                              labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"))
  levels(data$category) <- c(levels(data$category), "Unknown")
  data$category[is.na(data$number)] <- "Unknown"},
  # findInt: look up labels via findInterval(); the result stays in `res`
  # and is not written back to data$category
  findInt = {vec<-c(0,7,14,31,62)
  levels<-c(vec[1],sprintf("[%d ... %d]",(vec+1)[-length(vec)],vec[-1]),
            paste0(vec[length(vec)]+1,"++"))
  res<-levels[findInterval(data$number,vec,left.open=TRUE)+1]
  res[is.na(res)]<-"unknown"},
  # lapply: element-wise if/else chain with the NA test first;
  # lapply() returns a list, so category is stored as a list here
  lapply = {data$category <- lapply(data$number,function(x) {
    if(is.na(x) || is.null(x)) "unknown"
    else if(x == 0) "0"
    else if(x > 0 & x <= 7) "[1 ... 7]"
    else if(x > 7 & x <= 14) "[8 ... 14]"
    else if(x > 14 & x <= 31) "[15 ... 31]"
    else if(x > 31 & x <= 62) "[32 ... 62]"
    else if(x > 62) "63++"
    else "unknown"
  })},
  # ifelse: nested ifelse() with the is.na() test moved to the front
  ifelse = {data$category <- 
    ifelse(is.na(data$number), "unknown", 
           ifelse(data$number == 0, "0",
                  ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
                         ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
                                ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
                                       ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
                                              ifelse(data$number > 62, "63++", "???")))))))}
                               )

得到:

# Unit: microseconds
#    expr     min       lq       mean   median       uq        max neval
#     cut 132.207 139.4185  154.78149 144.9770 154.5925    283.043   100
# findInt  18.329  21.7850   26.58004  26.2915  28.8460     60.996   100
#  lapply  14.122  15.6250 4269.73574  17.2770  18.7800 425198.055   100
#  ifelse  81.728  84.8835   96.09675  88.9400  96.3010    193.503   100

1
投票

也许您更愿意在 within() 中使用条件赋值,这样更清晰。

# Build `category` by successive conditional assignments inside within(),
# where the columns of `data` can be referenced by bare name.
data <- within(data, {
  # Start with one character NA per row so the column has a single type and
  # always matches the number of rows, even when trailing rows match no bin.
  category <- rep(NA_character_, length(number))
  category[number == 0] <- "0"
  category[number > 0 & number <= 7] <- "[1 ... 7]"
  category[number > 7 & number <= 14] <- "[8 ... 14]"
  category[number > 14 & number <= 31] <- "[15 ... 31]"
  category[number > 31 & number <= 62] <- "[32 ... 62]"
  # The open-ended top bin gets its own label.
  category[number > 62] <- "63++"
  # Rows with a missing number match none of the comparisons above.
  category[is.na(number)] <- "unknown"
})

> data
   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA    unknown

1
投票

我不确定这是否适合您;如果想沿用当前的方法,可以这样做(这里把 data 重命名为 df,因为 R 中已有一个名为 data 的函数):

# Relabel the entries of `category` that are still NA, then show the column.
df$category <- replace(df$category, is.na(df$category), "Unknown")
df$category

1
投票

如果您将is.na()移到开头,您当前的代码将起作用:

# Nested ifelse() with the missing-value test first, so NA rows are labelled
# before any comparison can turn them into NA. with() lets the columns of
# `data` be referenced by bare name.
data$category <- with(
  data,
  ifelse(
    is.na(number), "unknown",
    ifelse(
      number == 0, "0",
      ifelse(
        number > 0 & number <= 7, "[1 ... 7]",
        ifelse(
          number > 7 & number <= 14, "[8 ... 14]",
          ifelse(
            number > 14 & number <= 31, "[15 ... 31]",
            ifelse(
              number > 31 & number <= 62, "[32 ... 62]",
              # "???" marks values that match none of the ranges.
              ifelse(number > 62, "63++", "???")
            )
          )
        )
      )
    )
  )
)

> data
   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA    unknown

1
投票

我用 lapply 代替嵌套的 ifelse 调用重写了它,这样读起来更友好一些。值得注意的变化是:先用 is.na() 进行测试并返回 "unknown",而不是把它留作最后的默认检查。原因是,当传入 NA 时,您的第一个测试相当于 if (NA == 0),它返回的是 NA,既不是 TRUE 也不是 FALSE;因此结果就是 NA,后面用于返回 "unknown" 的检查根本不会被执行。

# Classify each number with a scalar if/else chain.
# vapply() is the type-checked sibling of lapply(): it returns a plain
# character vector, so `category` becomes an ordinary column rather than the
# list column that lapply() would produce.
data$category <- vapply(data$number, function(x) {
  # NA must be tested first: any comparison with NA is NA, which if() rejects.
  if (is.na(x)) {
    "unknown"
  } else if (x == 0) {
    "0"
  } else if (x > 0 && x <= 7) {
    "[1 ... 7]"
  } else if (x > 7 && x <= 14) {
    "[8 ... 14]"
  } else if (x > 14 && x <= 31) {
    "[15 ... 31]"
  } else if (x > 31 && x <= 62) {
    "[32 ... 62]"
  } else if (x > 62) {
    "63++"
  } else {
    # Only reached by values below zero.
    "unknown"
  }
}, character(1))

1
投票

试试这个:

# Upper bounds of the closed bins; anything above the last bound is open-ended.
vec <- c(0, 7, 14, 31, 62)
# Labels: the first bound itself, "[lo ... hi]" for each consecutive pair of
# bounds, and "<last bound + 1>++" for the open-ended bin.
levels <- c(
  vec[1],
  sprintf("[%d ... %d]", vec[-length(vec)] + 1, vec[-1]),
  paste0(vec[length(vec)] + 1, "++")
)
# With left.open = TRUE the intervals are (lo, hi], so findInterval() gives 0
# for values <= vec[1]; adding 1 turns the result into an index into `levels`.
res <- levels[findInterval(data$number, vec, left.open = TRUE) + 1]
# Missing inputs come back as NA; label them explicitly.
res[is.na(res)] <- "unknown"
© www.soinside.com 2019 - 2024. All rights reserved.