我有一个如下格式的大型数据集。我想 a) 找出在 X1 到 X10 之间任意位置出现值序列 1 - 1 - >1 - >1 的 ID/行;b) 生成一个新变量(“event”),用于标识该序列的起始列,取值为 X1,...,X10。
# Example data: 8 IDs and 8 value columns (X1..X8), each value drawn at random
# from 1:4. No seed is set, so the values differ between runs.
my_df <- data.frame(
  ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
  # `replace` is spelled out instead of relying on partial matching of `rep`.
  replicate(8, sample(1:4, 8, replace = TRUE))
)
对于 a),我先将 >1 的值替换为 2,再把 X1 到 X10 的值粘贴成一个字符串,然后筛选出包含序列 1 - 1 - 2 - 2 的行。对于 b),我使用嵌套的 ifelse() 创建了变量“event”,用来识别序列的起始位置。这种写法仅适用于 8 列。对于列数更多的数据集,有没有更高效的做法?
我非常感谢任何指点!
# Keep the rows of my_df that contain the run 1, 1, >1, >1 in X1..X8 and record
# in `event` the column at which that run starts.
df_seq <- my_df %>%
  # Collapse every value above 1 to 2 so the run can be matched as fixed text.
  # across() replaces the deprecated mutate_at() + funs() combination.
  mutate(across(starts_with("X"), function(x) ifelse(x > 1, 2, x))) %>%
  # Join the recoded values into one string per row ("1 - 1 - 2 - 2 - ...");
  # paste() inserts the default single-space separator around each "-".
  mutate(
    seq = paste(
      X1, "-", X2, "-", X3, "-", X4, "-", X5, "-", X6, "-", X7, "-", X8
    )
  ) %>%
  # Drop the rows in which the run does not occur anywhere.
  filter(grepl("1 - 1 - 2 - 2", seq)) %>%
  # Test the start positions from left to right so the earliest one wins.
  # Every remaining row contains the run, so a row matching none of X1..X4
  # must start at X5, the last position possible with 8 columns.
  mutate(
    event = ifelse(
      X1 == 1 & X2 == 1 & X3 == 2 & X4 == 2, "X1",
      ifelse(
        X2 == 1 & X3 == 1 & X4 == 2 & X5 == 2, "X2",
        ifelse(
          X3 == 1 & X4 == 1 & X5 == 2 & X6 == 2, "X3",
          ifelse(X4 == 1 & X5 == 1 & X6 == 2 & X7 == 2, "X4", "X5")
        )
      )
    )
  )
你可以这样做:
library(dplyr)
library(tidyr)
library(stringr)
# For every row of df, record in `event` the column at which the run
# 1, 1, >1, >1 first starts (NA when the row does not contain the run).
df |>
  # Concatenate the X columns into one string per row. With sep = "" the
  # character position equals the column index, provided every value is a
  # single digit and the columns are X1, X2, ... in order (assumed here --
  # confirm for other data).
  unite(seq, starts_with("X"), sep = "", remove = FALSE) |>
  # str_locate() reports only the first match and yields NA when there is
  # none, so rows containing the run more than once still get their first
  # start. str_c() propagates NA, so unmatched rows get NA instead of "XNA".
  mutate(event = str_c("X", str_locate(seq, "11[2-9][2-9]")[, "start"]))
输出示例:
ID seq X1 X2 X3 X4 X5 X6 X7 X8 event
1 a 13341322 1 3 3 4 1 3 2 2 <NA>
2 b 44141221 4 4 1 4 1 2 2 1 <NA>
3 c 14144333 1 4 1 4 4 3 3 3 <NA>
4 d 22414321 2 2 4 1 4 3 2 1 <NA>
5 e 21241341 2 1 2 4 1 3 4 1 <NA>
6 f 13411443 1 3 4 1 1 4 4 3 X4
7 g 31433142 3 1 4 3 3 1 4 2 <NA>
8 h 32114343 3 2 1 1 4 3 4 3 X3
使用数据:
> dput(df)
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
X1 = c(1L, 4L, 1L, 2L, 2L, 1L, 3L, 3L), X2 = c(3L, 4L, 4L,
2L, 1L, 3L, 1L, 2L), X3 = c(3L, 1L, 1L, 4L, 2L, 4L, 4L, 1L
), X4 = c(4L, 4L, 4L, 1L, 4L, 1L, 3L, 1L), X5 = c(1L, 1L,
4L, 4L, 1L, 1L, 3L, 4L), X6 = c(3L, 2L, 3L, 3L, 3L, 4L, 1L,
3L), X7 = c(2L, 2L, 3L, 2L, 4L, 4L, 4L, 4L), X8 = c(2L, 1L,
3L, 1L, 1L, 3L, 2L, 3L)), class = "data.frame", row.names = c(NA,
-8L))