如何从R中的某些值块中获取数值

问题描述 投票:0回答:3

我有一个 df 数据框,如下所示。

df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
                 X1 = c(1,2,3,4,2,2,3,4,4,4),
                 X2 = c(1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23),
                 X3 = c(0,0,0,0,0,0,0,0,0,0),
                 X4 = c(1,1,1,1,1,1,1,1,1,1),
                 X5 = c(0,0,0,0,0,0,0,0,0,0),
                 X6 = c(0,0,0,0,0,0,0,0,0,0),
                 X7 = c(0,0,0,0,0,0,0,0,0,0),
                 X8 = c(1,1,0,2,8,0,1,3,4,5),
                 X9 = c(5,2,3,5,7,6,1,3,1,1),
                 X10 = c(1,2,1,4,0,6,7,5,5,6),
                 X11 = c(4,0,6,7,8,0,0,0,7,6),
                 X12 = c(0,1,0,0,0,6,5,4,0,0),
                 X13 = c(1,0,3,4,3,2,1,7,8,7),
                 X14 = c(1,2,NA,4,5,7,8,NA,8,5),
                 X15 = c(2,6,NA,6,5,NA,3,NA,NA,3))

> df
   id X1   X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
1   1  1 1.23  0  1  0  0  0  1  5   1   4   0   1   1   2
2   2  2 1.23  0  1  0  0  0  1  2   2   0   1   0   2   6
3   3  3 1.23  0  1  0  0  0  0  3   1   6   0   3  NA  NA
4   4  4 1.23  0  1  0  0  0  2  5   4   7   0   4   4   6
5   5  2 1.23  0  1  0  0  0  8  7   0   8   0   3   5   5
6   6  2 1.23  0  1  0  0  0  0  6   6   0   6   2   7  NA
7   7  3 1.23  0  1  0  0  0  1  1   7   0   5   1   8   3
8   8  4 1.23  0  1  0  0  0  3  3   5   0   4   7  NA  NA
9   9  4 1.23  0  1  0  0  0  4  1   5   7   0   8   8  NA
10 10  4 1.23  0  1  0  0  0  5  1   6   6   0   7   5   3

对于每个 id,从第

X5
列开始到
X7
X8
,都有一个由 3 或 4 个
0
组成的块。在这个块之后,我需要获取数值直到下一个
0

我想要的输出是:

> df_new
   id Gamma1 Gamma2 Gamma3 Gamma4
1   1      1      5      1      4
2   2      1      2      2     NA
3   3      3      1      6     NA
4   4      2      5      4      7
5   5      8      7     NA     NA
6   6      6      6     NA     NA
7   7      1      1      7     NA
8   8      3      3      5     NA
9   9      4      1      5      7
10 10      5      1      6      6
r filter extract
3个回答
0
投票

鉴于条件始终从列位置 5 开始,请使用

pivot_longer
和连续的
pivot_wider
来构造所需的数据框。

library(dplyr)
library(tidyr)

df %>% 
  pivot_longer(-id) %>% 
  mutate(cons_id = row_number() > 4 & value == 0, 
         cons_id = consecutive_id(cons_id) == 3, .by = id) %>% 
  filter(cons_id) %>% 
  mutate(name = paste0("Gamma", row_number()), .by = id) %>% 
  pivot_wider(id_cols=id)
# A tibble: 10 × 5
      id Gamma1 Gamma2 Gamma3 Gamma4
   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1     1      1      5      1      4
 2     2      1      2      2     NA
 3     3      3      1      6     NA
 4     4      2      5      4      7
 5     5      8      7     NA     NA
 6     6      6      6     NA     NA
 7     7      1      1      7     NA
 8     8      3      3      5     NA
 9     9      4      1      5      7
10    10      5      1      6      6

0
投票

如果后面没有 0,这可能会出错,但它适用于您的输入:

mat = df[-(1:5)] |> as.matrix()

result = apply(mat, 1, \(x) {
  # drop leading 0s
  x = tail(x, -(match(TRUE, x != 0) - 1))
  # keep until first 0
  x = head(x, match(TRUE, x == 0) - 1)
  unname(x)
})

## pad the lengths
n = max(lengths(result))
result = lapply(result, \(x) {length(x) = n; x})

## put in a data frame and label
result = do.call(rbind, result)
colnames(result) = paste0("Gamma", 1:ncol(result))
result = cbind(df["id"], result)
result
#    id Gamma1 Gamma2 Gamma3 Gamma4
# 1   1      1      5      1      4
# 2   2      1      2      2     NA
# 3   3      3      1      6     NA
# 4   4      2      5      4      7
# 5   5      8      7     NA     NA
# 6   6      6      6     NA     NA
# 7   7      1      1      7     NA
# 8   8      3      3      5     NA
# 9   9      4      1      5      7
# 10 10      5      1      6      6

0
投票

不可否认,它有点笨重,但它确实有效:

library(dplyr, warn.conflicts = F)
library(tidyr)

df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
                 X1 = c(1,2,3,4,2,2,3,4,4,4),
                 X2 = c(1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23,1.23),
                 X3 = c(0,0,0,0,0,0,0,0,0,0),
                 X4 = c(1,1,1,1,1,1,1,1,1,1),
                 X5 = c(0,0,0,0,0,0,0,0,0,0),
                 X6 = c(0,0,0,0,0,0,0,0,0,0),
                 X7 = c(0,0,0,0,0,0,0,0,0,0),
                 X8 = c(1,1,0,2,8,0,1,3,4,5),
                 X9 = c(5,2,3,5,7,6,1,3,1,1),
                 X10 = c(1,2,1,4,0,6,7,5,5,6),
                 X11 = c(4,0,6,7,8,0,0,0,7,6),
                 X12 = c(0,1,0,0,0,6,5,4,0,0),
                 X13 = c(1,0,3,4,3,2,1,7,8,7),
                 X14 = c(1,2,NA,4,5,7,8,NA,8,5),
                 X15 = c(2,6,NA,6,5,NA,3,NA,NA,3))

df %>% 
  # select relevant cols
  dplyr::select(id, X5:ncol(df)) %>% 
  # reshape to long
  tidyr::pivot_longer(2:ncol(.)) %>% 
  dplyr::group_by(id) %>% 
  # create "cut" by getting first zero after a non-zero
  dplyr::mutate(first_zero = (value == 0 & value < dplyr::lag(value, default = F))) %>% 
  dplyr::mutate(cut = cumsum(first_zero)) %>% 
  # only keep non-zero values up until first zero thereafter
  dplyr::filter(cut < 1 & value != 0) %>% 
  # create group index
  dplyr::mutate(group = as.factor(dplyr::row_number())) %>% 
  # fill up missings because of differing group sizes
  tidyr::complete(id, group) %>% 
  dplyr::mutate(group = paste0("Gamma", 1:4)) %>% 
  # clean and reshape back
  dplyr::select(-first_zero, -cut, -name) %>% 
  tidyr::pivot_wider(names_from = group, values_from = value) %>% 
  dplyr::ungroup()

#> # A tibble: 10 × 5
#>       id Gamma1 Gamma2 Gamma3 Gamma4
#>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1     1      1      5      1      4
#>  2     2      1      2      2     NA
#>  3     3      3      1      6     NA
#>  4     4      2      5      4      7
#>  5     5      8      7     NA     NA
#>  6     6      6      6     NA     NA
#>  7     7      1      1      7     NA
#>  8     8      3      3      5     NA
#>  9     9      4      1      5      7
#> 10    10      5      1      6      6

reprex 包于 2023 年 9 月 14 日创建(v2.0.1)

© www.soinside.com 2019 - 2024. All rights reserved.