我有一个 df 数据框,如下所示。
df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
X1 = c(1,2,3,4,2,2,3,4,4,4),
X2 = c(1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23),
X3 = c(0,0,0,0,0,0,0,0,0,0),
X4 = c(1,1,1,1,1,1,1,1,1,1),
X5 = c(0,0,0,0,0,0,0,0,0,0),
X6 = c(0,0,0,0,0,0,0,0,0,0),
X7 = c(0,0,0,0,0,0,0,0,0,0),
X8 = c(1,1,0,2,8,0,1,3,4,5),
X9 = c(5,2,3,5,7,6,1,3,1,1),
X10 = c(1,2,1,4,0,6,7,5,5,6),
X11 = c(4,0,6,7,8,0,0,0,7,6),
X12 = c(0,1,0,0,0,6,5,4,0,0),
X13 = c(1,0,3,4,3,2,1,7,8,7),
X14 = c(1,2,NA,4,5,7,8,NA,8,5),
X15 = c(2,6,NA,6,5,NA,3,NA,NA,3))
> df
id X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
1 1 1 1.23 0 1 0 0 0 1 5 1 4 0 1 1 2
2 2 2 1.23 0 1 0 0 0 1 2 2 0 1 0 2 6
3 3 3 1.23 0 1 0 0 0 0 3 1 6 0 3 NA NA
4 4 4 1.23 0 1 0 0 0 2 5 4 7 0 4 4 6
5 5 2 1.23 0 1 0 0 0 8 7 0 8 0 3 5 5
6 6 2 1.23 0 1 0 0 0 0 6 6 0 6 2 7 NA
7 7 3 1.23 0 1 0 0 0 1 1 7 0 5 1 8 3
8 8 4 1.23 0 1 0 0 0 3 3 5 0 4 7 NA NA
9 9 4 1.23 0 1 0 0 0 4 1 5 7 0 8 8 NA
10 10 4 1.23 0 1 0 0 0 5 1 6 6 0 7 5 3
对于每个 id,从第
X5
列开始到X7
或X8
,都有一个由 3 或 4 个0
组成的块。在这个块之后,我需要获取数值直到下一个0
。
我想要的输出是:
> df_new
id Gamma1 Gamma2 Gamma3 Gamma4
1 1 1 5 1 4
2 2 1 2 2 NA
3 3 3 1 6 NA
4 4 2 5 4 7
5 5 8 7 NA NA
6 6 6 6 NA NA
7 7 1 1 7 NA
8 8 3 3 5 NA
9 9 4 1 5 7
10 10 5 1 6 6
鉴于条件始终从列位置 5 开始,请使用
pivot_longer
和连续的 pivot_wider
来构造所需的数据框。
library(dplyr)
library(tidyr)
df %>%
pivot_longer(-id) %>%
mutate(cons_id = row_number() > 4 & value == 0,
cons_id = consecutive_id(cons_id) == 3, .by = id) %>%
filter(cons_id) %>%
mutate(name = paste0("Gamma", row_number()), .by = id) %>%
pivot_wider(id_cols=id)
# A tibble: 10 × 5
id Gamma1 Gamma2 Gamma3 Gamma4
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 5 1 4
2 2 1 2 2 NA
3 3 3 1 6 NA
4 4 2 5 4 7
5 5 8 7 NA NA
6 6 6 6 NA NA
7 7 1 1 7 NA
8 8 3 3 5 NA
9 9 4 1 5 7
10 10 5 1 6 6
如果后面没有 0,这可能会出错,但它适用于您的输入:
mat = df[-(1:5)] |> as.matrix()
result = apply(mat, 1, \(x) {
# drop leading 0s
x = tail(x, -(match(TRUE, x != 0) - 1))
# keep until first 0
x = head(x, match(TRUE, x == 0) - 1)
unname(x)
})
## pad the lengths
n = max(lengths(result))
result = lapply(result, \(x) {length(x) = n; x})
## put in a data frame and label
result = do.call(rbind, result)
colnames(result) = paste0("Gamma", 1:ncol(result))
result = cbind(df["id"], result)
result
# id Gamma1 Gamma2 Gamma3 Gamma4
# 1 1 1 5 1 4
# 2 2 1 2 2 NA
# 3 3 3 1 6 NA
# 4 4 2 5 4 7
# 5 5 8 7 NA NA
# 6 6 6 6 NA NA
# 7 7 1 1 7 NA
# 8 8 3 3 5 NA
# 9 9 4 1 5 7
# 10 10 5 1 6 6
不可否认,它有点笨重,但它确实有效:
library(dplyr, warn.conflicts = F)
library(tidyr)
df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
X1 = c(1,2,3,4,2,2,3,4,4,4),
X2 = c(1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23),
X3 = c(0,0,0,0,0,0,0,0,0,0),
X4 = c(1,1,1,1,1,1,1,1,1,1),
X5 = c(0,0,0,0,0,0,0,0,0,0),
X6 = c(0,0,0,0,0,0,0,0,0,0),
X7 = c(0,0,0,0,0,0,0,0,0,0),
X8 = c(1,1,0,2,8,0,1,3,4,5),
X9 = c(5,2,3,5,7,6,1,3,1,1),
X10 = c(1,2,1,4,0,6,7,5,5,6),
X11 = c(4,0,6,7,8,0,0,0,7,6),
X12 = c(0,1,0,0,0,6,5,4,0,0),
X13 = c(1,0,3,4,3,2,1,7,8,7),
X14 = c(1,2,NA,4,5,7,8,NA,8,5),
X15 = c(2,6,NA,6,5,NA,3,NA,NA,3))
df %>%
# select relevant cols
dplyr::select(id, X5:ncol(df)) %>%
# reshape to long
tidyr::pivot_longer(2:ncol(.)) %>%
dplyr::group_by(id) %>%
# create "cut" by getting first zero after a non-zero
dplyr::mutate(first_zero = (value == 0 & value < dplyr::lag(value, default = F))) %>%
dplyr::mutate(cut = cumsum(first_zero)) %>%
# only keep non-zero values up until first zero thereafter
dplyr::filter(cut < 1 & value != 0) %>%
# create group index
dplyr::mutate(group = as.factor(dplyr::row_number())) %>%
# fill up missings because of differing group sizes
tidyr::complete(id, group) %>%
dplyr::mutate(group = paste0("Gamma", 1:4)) %>%
# clean and reshape back
dplyr::select(-first_zero, -cut, -name) %>%
tidyr::pivot_wider(names_from = group, values_from = value) %>%
dplyr::ungroup()
#> # A tibble: 10 × 5
#> id Gamma1 Gamma2 Gamma3 Gamma4
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 5 1 4
#> 2 2 1 2 2 NA
#> 3 3 3 1 6 NA
#> 4 4 2 5 4 7
#> 5 5 8 7 NA NA
#> 6 6 6 6 NA NA
#> 7 7 1 1 7 NA
#> 8 8 3 3 5 NA
#> 9 9 4 1 5 7
#> 10 10 5 1 6 6
由 reprex 包于 2023 年 9 月 14 日创建(v2.0.1)