我有一个数据集,其值包括缺失值。目的是创建一个向量change
,该向量指示自上一个有效值起的变化。
以下是一些数据:
test <- data.frame(resp = c(9, NA, NA, 11, NA, NA, 6, 16, NA, 12, 0, 0, 0, 0, 0, NA, 0, 11, NA, NA, NA, NA, NA, NA, 14, NA, 23, NA, NA, 16, 16))
想法如下:
0
值1
-1
,如果前一个已经为负。 所以上面的数据的结果看起来像这样:
resp change
1 9 0
2 NA NA
3 NA NA
4 11 1
5 NA NA
6 NA NA
7 6 -1
8 16 1
9 NA NA
10 12 -1
11 0 -2
12 0 0
13 0 0
14 0 0
15 0 0
16 NA NA
17 0 0
18 11 1
19 NA NA
20 NA NA
21 NA NA
22 NA NA
23 NA NA
24 NA NA
25 14 2
我尝试了一个for循环,它以某种方式起作用,但是我觉得这是一堆凌乱的代码,而且速度很慢。有更好的解决方案(例如purrr)吗?
for (i in 2:nrow(test)) {
test$change[i] <- 0
test$change[i] <- case_when(
test$resp[i] > last(test$resp[which(!is.na(test$resp[1:i-1]))]) & last(test$change[which(!is.na(test$resp[2:i-1]))]) >= 0 ~ test$change[i] + last(test$change[which(!is.na(test$resp[1:i-1]))]) + 1,
test$resp[i] > last(test$resp[which(!is.na(test$resp[1:i-1]))]) & last(test$change[which(!is.na(test$resp[2:i-1]))]) <= 0 ~ test$change[i] + 1,
test$resp[i] < last(test$resp[which(!is.na(test$resp[1:i-1]))]) & last(test$change[which(!is.na(test$resp[2:i-1]))]) <= 0 ~ test$change[i] + last(test$change[which(!is.na(test$resp[1:i-1]))]) - 1,
test$resp[i] < last(test$resp[which(!is.na(test$resp[1:i-1]))]) & last(test$change[which(!is.na(test$resp[2:i-1]))]) >= 0 ~ test$change[i]- 1,
TRUE ~ test$change[i])
test$change[i] <- if_else(is.na(test$resp[i]), NA_real_, test$change[i])
}
最终,这应该应用于具有> 30个变量和> 100000行的数据集。
这将重复您的结果,但它始终使用0表示始终不变(如您的描述),而不是不适用。它基本上使用fill
和lag
来创建包含用last
和which
创建的值的列,然后使用case_when
来填充change
列。
如果要在NA
列中使用0
而不是change
,请将~ 0
的第一子句中的case_when
更改为~ NA_real_
。如果您确实希望像示例中那样混合使用0
和NA
,请说明何时使用它们。
library(tidyverse)
test <- data.frame(resp = c(9, NA, NA, 11, NA, NA, 6, 16, NA, 12, 0, 0, 0, 0, 0, NA, 0, 11, NA, NA, NA, NA, NA, NA, 14, NA, 23, NA, NA, 16, 16))
test %>% mutate(filled=resp) %>%
fill(filled) %>%
mutate(change_sign=sign(filled-lag(filled, default=filled[1])),
lag_filled_change = lag(if_else(change_sign==0, NA_real_, change_sign), default=0)) %>%
fill(lag_filled_change) %>%
mutate(change = case_when(
change_sign==0 ~ 0,
change_sign==1 & lag_filled_change<=0 ~ 1,
change_sign==1 & lag_filled_change >0 ~ lag_filled_change+1,
change_sign==-1& lag_filled_change>=0 ~ -1,
change_sign==-1& lag_filled_change <0 ~ lag_filled_change-1
)) %>%
select(resp, change)
#> resp change
#> 1 9 0
#> 2 NA 0
#> 3 NA 0
#> 4 11 1
#> 5 NA 0
#> 6 NA 0
#> 7 6 -1
#> 8 16 1
#> 9 NA 0
#> 10 12 -1
#> 11 0 -2
#> 12 0 0
#> 13 0 0
#> 14 0 0
#> 15 0 0
#> 16 NA 0
#> 17 0 0
#> 18 11 1
#> 19 NA 0
#> 20 NA 0
#> 21 NA 0
#> 22 NA 0
#> 23 NA 0
#> 24 NA 0
#> 25 14 2
#> 26 NA 0
#> 27 23 2
#> 28 NA 0
#> 29 NA 0
#> 30 16 -1
#> 31 16 0
由reprex package(v0.3.0)在2020-01-15创建
library(tidyverse)
library(zoo)
# example data
test <- data.frame(resp = c(9, NA, NA, 11, NA, NA, 6, 16, NA, 12, 0, 0, 0, 0, 0, NA, 0, 11, NA, NA, NA, NA, NA, NA, 14))
# add an id for each row
test = test %>% mutate(id = row_number())
test %>%
na.omit() %>% # exclude rows with NAs
mutate(flag = case_when(resp == lag(resp, default = first(resp)) ~ 0,
resp > lag(resp, default = first(resp)) ~ 1,
resp < lag(resp, default = first(resp)) ~ -1)) %>% # check relationship between current and previous value
mutate(g = cumsum(flag != lag(flag, default = first(flag)))) %>% # create a grouping based on change in flag column
group_by(g) %>% # for each group
mutate(change = ifelse(flag != 0, flag * row_number(), flag)) %>% # calculate the change column
ungroup() %>% # forget the grouping
select(id, change) %>% # keep useful columns
right_join(test, by="id") %>% # join back to get NA rows in the right place
select(resp, change) # keep useful columns
因此,您会得到:
# resp change
# 1 9 0
# 2 NA NA
# 3 NA NA
# 4 11 1
# 5 NA NA
# 6 NA NA
# 7 6 -1
# 8 16 1
# 9 NA NA
# 10 12 -1
# 11 0 -2
# 12 0 0
# 13 0 0
# 14 0 0
# 15 0 0
# 16 NA NA
# 17 0 0
# 18 11 1
# 19 NA NA
# 20 NA NA
# 21 NA NA
# 22 NA NA
# 23 NA NA
# 24 NA NA
# 25 14 2