我有这样一个数据帧:
id subscriberid intName
1 1 1234567890 asdfsadf
2 2 3243245324 dfsafdf
3 3 4532453245 dasdfsd
可再现的例子如下:
# Literal definition of the example data frame: three rows with the columns
# id, subscriberid (numeric) and intName (character).
structure(list(id = 1:3, subscriberid = c(1234567890, 3243245324, 4532453245),
intName = c("asdfsadf", "dfsafdf", "dasdfsd")),
row.names = c(NA, 3L), class = "data.frame")
我还有一个 subid 向量;凡是 subid 中的值与 df$subscriberid 匹配的行,我需要把该 subscriberid 的第一位数字改为 9。
# IDs whose first digit should be changed to 9 when they occur in df$subscriberid.
subid = c(1234567890,2345345234)
我试过如下:
# NOTE(review): this attempt changes nothing. `==` is a comparison, so the
# result of sub() is compared with the column and then discarded; no value is
# ever assigned back to df. The loop variable `i` is not used in the body.
for (i in df$subscriberid) {
df$subscriberid == sub(substr(df$subscriberid,0,1),9,df$subscriberid)
}
我也尝试过 substr、gsub,以及它们与 ifelse 的各种组合,但都没有成功。期望的输出是:
id subscriberid intName
1 1 9234567890 asdfsadf <--- only the first digit is changed.
2 2 3243245324 dfsafdf
3 3 4532453245 dasdfsd
一种选择是使用 ifelse:
如果 subscriberid 存在于 subid 中,
就用 paste0 把 "9" 与该字符串从第二个字符开始的剩余部分拼接起来;
否则保持原值不变。
# Rows whose subscriberid occurs in subid get "9" pasted in front of the rest
# of the ID (characters from position 2 onwards); all other rows keep their
# value. The final `df` prints the result.
df$subscriberid <- with(df, ifelse(subscriberid %in% subid,
paste0("9",substring(subscriberid,2)), subscriberid))
df
# id subscriberid intName
#1 1 9234567890 asdfsadf
#2 2 3243245324 dfsafdf
#3 3 4532453245 dasdfsd
使用 substring 的好处是:只需要给出起始索引(这里是 2);结束位置的默认值是 1000000,足以覆盖绝大多数字符串。
我们也可以构造一个匹配这些 ID 的正则表达式,然后用 grepl 找出数据帧中匹配的行:
# Join all IDs in subid into one alternation, wrapped in word boundaries (\b).
regex <- paste0("\\b(", paste(subid, collapse="|"), ")\\b")
# Rows matching the pattern become "9" followed by characters 2..end of the
# ID; non-matching rows are left as they are.
df$subscriberid <- ifelse(grepl(regex, df$subscriberid),
paste0("9", substr(df$subscriberid, 2, nchar(df$subscriberid))),
df$subscriberid)
df
id subscriberid intName
1 1 9234567890 asdfsadf
2 2 3243245324 dfsafdf
3 3 4532453245 dasdfsd
也可以使用数学方法:好处是结果仍然是数值型,而且速度更快。
# Example data and the IDs whose first digit should become 9.
df <- structure(list(id = 1:3, subscriberid = c(1234567890, 3243245324, 4532453245),
intName = c("asdfsadf", "dfsafdf", "dasdfsd")),
row.names = c(NA, 3L), class = "data.frame")
subid <- c(1234567890,2345345234)
# Logical index of the rows whose subscriberid is listed in subid, and the
# matching values themselves.
idx <- df$subscriberid %in% subid
vals <- df[ idx, "subscriberid" ]
# Power of ten of the leading digit (9 for a 10-digit number).
digits <- floor( log10( vals ) )
## number of digits given by `floor( log10( vals) ) + 1`, but we want the first digit
# Scale so the leading digit is the integer part, replace that integer part
# with 9, then scale back. The value is only printed here, not assigned.
( ( vals / 10^digits ) + 9 - floor( vals / 10^digits ) ) * (10^digits)
# [1] 9234567890
这段代码先找出 data.frame 中与 subid 匹配的行的索引,
然后对这些数值取 log10 并向下取整,得到首位数字所在的数量级,
最后通过算术运算把首位数字替换为 9。
下面对各种方法进行基准测试:
library(microbenchmark)
# Time the four implementations on the same df / subid, 5 runs per expression.
microbenchmark(
ronak = { ronak( df, subid ) },
tim = { tim( df, subid ) },
tmfmnk = { tmfmnk( df, subid ) },
symbolix = { symbolix( df, subid ) },
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# ronak 186.143804 188.618750 214.151592 191.154106 196.399341 308.4420 5
# tim 442.385985 463.510154 526.814255 506.268620 541.829769 680.0767 5
# tmfmnk 236.423472 255.418334 295.652617 295.624544 329.901976 360.8948 5
# symbolix 5.510366 5.828804 8.166222 5.850937 5.942607 17.6984 5
并验证各方法得到的结果相同:
# Run every implementation once on the same input and compare the results
# pairwise with all.equal().
res_ronak <- ronak( df, subid )
res_tim <- tim( df, subid )
res_tmfmnk <- tmfmnk( df, subid )
res_symbolix <- symbolix( df, subid )
all.equal(res_ronak, res_tim)
# [1] TRUE
all.equal(res_tim, res_tmfmnk)
# [1] TRUE
# symbolix() works arithmetically and keeps the column numeric, whereas the
# string-based versions produce character, so convert before comparing.
res_symbolix$subscriberid <- as.character(res_symbolix$subscriberid)
all.equal(res_tmfmnk, res_symbolix)
# [1] TRUE
# Benchmark data: 1e5 IDs sampled from 1:100000000 with a fixed seed, and 10 of
# those IDs picked as the set to modify.
set.seed(1234)
df <- data.frame(
subscriberid = sample(1:100000000, size = 1e5)
)
subid <- sample( df$subscriberid, size = 10 )
# Replace the first character of every subscriberid found in `subid` with "9".
#
# df:    data frame with a `subscriberid` column.
# subid: vector of IDs to modify.
# Returns `df` with `subscriberid` rewritten; when at least one row matches,
# the column becomes character (ifelse() mixes the pasted strings with the
# untouched values).
ronak <- function(df, subid) {
  # Read the column explicitly instead of evaluating inside with(df, ...):
  # with() would let a data column that happens to be named `subid` shadow the
  # `subid` argument and silently match against the wrong set of IDs.
  ids <- df$subscriberid
  hit <- ids %in% subid
  # substring() only needs the start index; everything from the 2nd character
  # on is kept.
  df$subscriberid <- ifelse(hit, paste0("9", substring(ids, 2)), ids)
  df
}
# Regex-based variant: rows whose subscriberid matches any ID in `subid`
# (joined into one word-bounded alternation) get their first character
# replaced by "9"; other rows are left unchanged.
#
# df:    data frame with a `subscriberid` column.
# subid: vector of IDs to modify.
# Returns the modified data frame.
tim <- function(df, subid) {
  alternatives <- paste(subid, collapse="|")
  pattern <- paste0("\\b(", alternatives, ")\\b")
  ids <- df$subscriberid
  matched <- grepl(pattern, ids)
  # "9" followed by characters 2..end of each ID.
  with_nine <- paste0("9", substr(ids, 2, nchar(ids)))
  df$subscriberid <- ifelse(matched, with_nine, ids)
  df
}
# sub()-based variant: for rows whose subscriberid is in `subid`, the first
# character (regex ".") is replaced by "9"; other rows are left unchanged.
#
# df:    data frame with a `subscriberid` column.
# subid: vector of IDs to modify.
# Returns the modified data frame.
tmfmnk <- function(df, subid) {
  ids <- df$subscriberid
  in_subid <- ids %in% subid
  df$subscriberid <- ifelse(in_subid, sub(".", "9", ids), ids)
  df
}
# Arithmetic variant: set the leading digit of every subscriberid found in
# `subid` to 9 while keeping the column numeric.
#
# df:    data frame with a numeric `subscriberid` column.
# subid: vector of IDs to modify.
# Returns the modified data frame.
# NOTE(review): log10() is only meaningful for positive values, so this
# assumes the matched IDs are positive whole numbers.
symbolix <- function(df, subid) {
  idx <- df$subscriberid %in% subid
  vals <- df$subscriberid[idx]
  # Place value of the leading digit, e.g. 1e9 for a 10-digit ID.
  scale <- 10^floor(log10(vals))
  lead <- floor(vals / scale)
  # Add the difference between 9 and the current leading digit at that place
  # value. Every operand is a whole number, so the result is exact; dividing,
  # adding 9 and multiplying back can be off by one ulp (128 would become
  # 927.9999999999999 instead of 928).
  df$subscriberid[idx] <- vals + (9 - lead) * scale
  df
}
另一种略有不同的做法是使用 sub():
# For rows whose subscriberid is in subid, sub(".", "9", ...) replaces the
# first character with "9"; all other rows keep their value.
df$subscriberid <- ifelse(df$subscriberid %in% subid,
sub(".", "9", df$subscriberid), df$subscriberid)
id subscriberid intName
1 1 9234567890 asdfsadf
2 2 3243245324 dfsafdf
3 3 4532453245 dasdfsd
在这里,如果 subscriberid 与 subid 中的某个值匹配,就把该 subscriberid 的第一个字符替换为 9,否则保持不变。
使用 substring 的赋值形式(substring<-)会更简单:
# Overwrite the first character in place with the `substring<-` replacement
# function, touching only the matching rows.
# create a logical vector
i1 <- df1$subscriberid %in% subid
# convert the column to character class
df1$subscriberid <- as.character(df1$subscriberid)
#assign with substring<-
substring(df1$subscriberid[i1], 1, 1) <- '9'
df1
# id subscriberid intName
#1 1 9234567890 asdfsadf
#2 2 3243245324 dfsafdf
#3 3 4532453245 dasdfsd
下面在 @SymbolixAU 的基准测试数据上加入本方法一并比较(其他函数取自他的回答):
# `substring<-` variant: convert subscriberid to character and overwrite the
# first character with "9" for every row whose ID is found in `subid`.
#
# df:    data frame with a `subscriberid` column.
# subid: vector of IDs to modify.
# Returns the modified data frame (subscriberid is always character).
akrun <- function(df, subid) {
  # Match before converting, so the comparison uses the original values.
  i1 <- df$subscriberid %in% subid
  df$subscriberid <- as.character(df$subscriberid)
  substring(df$subscriberid[i1], 1, 1) <- "9"
  # Return the data frame explicitly: a function ending on a replacement
  # assignment returns only the assigned value ("9"), not the updated df.
  df
}
# Benchmark setup: 1e5 IDs drawn with a fixed seed, 10 of them used as subid.
set.seed(1234)
df <- data.frame(subscriberid = sample(1:100000000, size = 1e5))
subid <- sample(df$subscriberid, size = 10)

library(microbenchmark)

# Five timed runs of every implementation on identical inputs.
microbenchmark(
  ronak = { ronak(df, subid) },
  tim = { tim(df, subid) },
  tmfmnk = { tmfmnk(df, subid) },
  symbolix = { symbolix(df, subid) },
  akrun = { akrun(df, subid) },
  times = 5
)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# ronak 105.073716 128.279151 140.993520 138.241632 154.89092 178.48218 5 b
# tim 224.610660 246.959505 263.138679 264.685503 284.93632 294.50141 5 c
# tmfmnk 119.734979 134.949406 138.735054 135.888113 142.91750 160.18527 5 b
# symbolix 2.487283 3.238862 8.429718 3.540119 10.80669 22.07564 5 a
# akrun 29.530330 33.431953 41.649046 34.772512 36.91314 73.59730 5 a
# Example data frame `df1` used by the `substring<-` snippet in this answer.
df1 <- structure(list(id = 1:3, subscriberid = c(1234567890, 3243245324, 4532453245),
intName = c("asdfsadf", "dfsafdf", "dasdfsd")),
row.names = c(NA, 3L), class = "data.frame")