我有一个包含3列的数据框,我想根据其他列中给出的值创建第4列。new_rank列的创建规则是:每个用户的排名都从1开始,当某一行的matric_1大于15且matric_2大于20时,该用户后续行的排名值增加1。
我觉得我需要在R中使用cumsum函数,但我不知道该如何处理ifelse条件。数据框的代码如下:
# Example data: one row per observation, grouped by user. `new_rank` holds the
# expected result that the answers below should reproduce.
df <- data.frame(
  user_id  = rep(c("a", "b", "c", "d"), times = c(4, 3, 5, 4)),
  matric_1 = c(10, 23, 4, 5, 17, 5, 40, 1, 2, 18, 19, 5, 18, 2, 19, 2),
  matric_2 = c(10, 25, 10, 13, 21, 10, 7, 3, 4, 22, 21, 4, 23, 4, 21, 4),
  new_rank = c(1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 3, 1, 2, 2, 3)
)
User_id matric_1 matric_2 new_rank
a 10% 10% 1
a 23% 25% 1
a 4% 10% 2
a 5% 13% 2
b 17% 21% 1
b 5% 10% 2
b 40% 7% 2
c 1% 3% 1
c 2% 4% 1
c 18% 22% 1
c 19% 21% 2
c 5% 4% 3
d 18% 23% 1
d 2% 4% 2
d 19% 21% 2
d 2% 4% 3
按'user_id'分组后,对逻辑向量的cumsum取lag,从而创建'new_rank1'列:
library(dplyr)
# Within each user, keep a running count of the rows that exceed both
# thresholds, shift that count down by one row (0 for the user's first row),
# and add the starting rank of 1. A qualifying row therefore raises the rank
# of the rows that follow it, not its own.
df %>%
  group_by(user_id) %>%
  mutate(
    new_rank1 = 1 + lag(cumsum(matric_1 > 15 & matric_2 > 20), default = 0L)
  )
# A tibble: 16 x 5
# Groups: user_id [4]
# user_id matric_1 matric_2 new_rank new_rank1
# <fctr> <dbl> <dbl> <dbl> <dbl>
# 1 a 10.0 10.0 1.00 1.00
# 2 a 23.0 25.0 1.00 1.00
# 3 a 4.00 10.0 2.00 2.00
# 4 a 5.00 13.0 2.00 2.00
# 5 b 17.0 21.0 1.00 1.00
# 6 b 5.00 10.0 2.00 2.00
# 7 b 40.0 7.00 2.00 2.00
# 8 c 1.00 3.00 1.00 1.00
# 9 c 2.00 4.00 1.00 1.00
#10 c 18.0 22.0 1.00 1.00
#11 c 19.0 21.0 2.00 2.00
#12 c 5.00 4.00 3.00 3.00
#13 d 18.0 23.0 1.00 1.00
#14 d 2.00 4.00 2.00 2.00
#15 d 19.0 21.0 2.00 2.00
#16 d 2.00 4.00 3.00 3.00
基于@akrun的解决方案,但使用data.table包:
library('data.table')
# Turn df into a data.table so that `:=` can add the new column.
setDT(df)
# Per user: running count of rows exceeding both thresholds, lagged by one row
# (0 for the user's first row), plus the starting rank of 1.
df[
  ,
  rank := 1 + shift(
    cumsum(matric_1 > 15 & matric_2 > 20),
    fill = 0L,
    type = "lag"
  ),
  by = user_id
]
df
# user_id matric_1 matric_2 new_rank rank
# 1: a 10 10 1 1
# 2: a 23 25 1 1
# 3: a 4 10 2 2
# 4: a 5 13 2 2
# 5: b 17 21 1 1
# 6: b 5 10 2 2
# 7: b 40 7 2 2
# 8: c 1 3 1 1
# 9: c 2 4 1 1
# 10: c 18 22 1 1
# 11: c 19 21 2 2
# 12: c 5 4 3 3
# 13: d 18 23 1 1
# 14: d 2 4 2 2
# 15: d 19 21 2 2
# 16: d 2 4 3 3
数据:
# Sample data used by the answer above; `new_rank` is the expected output.
df <- data.frame(
  user_id = c(
    "a", "a", "a", "a",
    "b", "b", "b",
    "c", "c", "c", "c", "c",
    "d", "d", "d", "d"
  ),
  matric_1 = c(10, 23, 4, 5, 17, 5, 40, 1, 2, 18, 19, 5, 18, 2, 19, 2),
  matric_2 = c(10, 25, 10, 13, 21, 10, 7, 3, 4, 22, 21, 4, 23, 4, 21, 4),
  new_rank = c(1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 3, 1, 2, 2, 3)
)