根据 r 中数据帧中各列的条件结转前一行的数据

问题描述 投票:0回答:1

数据集

我有一个数据框(

issue_termi_episode
),采样如下:

# Sample of the conflict-episode data used throughout this post: 15 rows,
# one per conflict episode, stored as a tibble-classed data frame.
issue_termi_episode <- structure(
  list(
    # Episode identifier; every row has a distinct value.
    new_conflictep_id = c(20504, 20505, 20506, 20507, 20508, 20902, 20903, 20904, 22003, 22101, 22102, 22103, 22104, 22105, 22202),
    # Grouping key: rows sharing a conflict_id are processed together.
    conflict_id = c(205, 205, 205, 205, 205, 209, 209, 209, 220, 221, 221, 221, 221, 221, 222),
    location = c("Iran", "Iran", "Iran", "Iran", "Iran", "Philippines", "Philippines", "Philippines", "Paraguay", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)"),
    incompatibility = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2),
    conflict = c("Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan", "Philippines", "Philippines", "Philippines", "Paraguay", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma): Karen", "Myanmar (Burma)"),
    conflictepisode = c(4, 5, 6, 7, 8, 2, 3, 4, 3, 1, 2, 3, 4, 5, 2),
    # Contains one missing value (row 8).
    outcome = c(5, 5, 5, 5, 5, 2, 2, NA, 4, 5, 5, 5, 2, 5, 5),
    version = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3),
    intensity_level = c(1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1),
    # Stored as character, unlike the other code-like columns.
    region = c("2", "2", "2", "2", "2", "3", "3", "3", "5", "3", "3", "3", "3", "3", "3"),
    # Start and end year of each episode; the gap between one row's
    # first_year_active and the previous row's last_year_active is what the
    # question's "5 years or less" condition is measured on.
    first_year_active = c(1990, 1993, 1996, 2016, 2018, 1989, 1997, 1999, 1989, 1989, 1994, 1997, 2000, 2013, 1990),
    last_year_active = c(1990, 1993, 1996, 2016, 2018, 1995, 1997, 2020, 1989, 1992, 1995, 1998, 2011, 2013, 1992),
    # Binary (0/1) issue indicators. Row 5 (new_conflictep_id 20508) is 0 in
    # all five columns; it is the row the question wants filled from row 4.
    issue_territory = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1),
    issue_statestruc = c(1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    issue_gov = c(1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1),
    issue_polrights = c(1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    issue_distrib = c(1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1)
  ),
  # Compact encoding of automatic row names 1..15.
  row.names = c(NA, -15L),
  # Tibble class vector, so the object prints as "A tibble: 15 x 17".
  class = c("tbl_df", "tbl", "data.frame")
)
issue_termi_episode
# A tibble: 15 × 17
   new_conflictep_id conflict_id location        incompatibility conflict   conflictepisode outcome version intensity_level region first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
               <dbl>       <dbl> <chr>                     <dbl> <chr>                <dbl>   <dbl>   <dbl>           <dbl> <chr>              <dbl>            <dbl>           <dbl>            <dbl>     <dbl>           <dbl>         <dbl>
 1             20504         205 Iran                          1 Iran: Kur…               4       5       3               1 2                   1990             1990               1                1         1               1             1
 2             20505         205 Iran                          1 Iran: Kur…               5       5       3               1 2                   1993             1993               1                1         0               1             0
 3             20506         205 Iran                          1 Iran: Kur…               6       5       3               1 2                   1996             1996               1                1         1               0             0
 4             20507         205 Iran                          1 Iran: Kur…               7       5       3               1 2                   2016             2016               1                1         1               1             1
 5             20508         205 Iran                          1 Iran: Kur…               8       5       3               1 2                   2018             2018               0                0         0               0             0
 6             20902         209 Philippines                   2 Philippin…               2       2       3               2 3                   1989             1995               0                1         1               1             1
 7             20903         209 Philippines                   2 Philippin…               3       2       3               1 3                   1997             1997               0                1         1               1             1
 8             20904         209 Philippines                   2 Philippin…               4      NA       3               1 3                   1999             2020               0                1         1               1             1
 9             22003         220 Paraguay                      2 Paraguay                 3       4       3               1 5                   1989             1989               0                1         1               1             0
10             22101         221 Myanmar (Burma)               1 Myanmar (…               1       5       3               2 3                   1989             1992               1                1         1               1             1
11             22102         221 Myanmar (Burma)               1 Myanmar (…               2       5       3               1 3                   1994             1995               1                1         0               1             1
12             22103         221 Myanmar (Burma)               1 Myanmar (…               3       5       3               1 3                   1997             1998               1                1         1               1             1
13             22104         221 Myanmar (Burma)               1 Myanmar (…               4       2       3               1 3                   2000             2011               1                1         1               1             1
14             22105         221 Myanmar (Burma)               1 Myanmar (…               5       5       3               1 3                   2013             2013               1                1         0               1             0
15             22202         222 Myanmar (Burma)               2 Myanmar (…               2       5       3               1 3                   1990             1992               1                1         1               1             1

最后 5 个变量(形式为 `issue_*`)是二元变量(取值为 `0` 或 `1`)。
new_conflictep_id
对于每一行都是唯一的,并且
conflict_id
是一个分组变量,通常对于多行来说是相同的,但有时也只是对于一行而言。

目标

按 `conflict_id` 分组后,我想将所有 `issue_*` 变量(取值为 `0` 或 `1`)的数据“结转到”下一行,但前提是

  1. 下一行的
    first_year_active
    与上一行的
    last_year_active
    之间的差距为5年或更小;
  2. 下一行的所有 `issue_*` 变量都等于 `0`。

我想要的输出将更新示例的第 5 行,其中

new_conflictep_id

 = 
20508
...

  new_conflictep_id conflict_id ... first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
1             20504         205 ...              1990             1990               1                1         1               1             1
2             20505         205 ...              1993             1993               1                1         0               1             0
3             20506         205 ...              1996             1996               1                1         1               0             0
4             20507         205 ...              2016             2016               1                1         1               1             1
#                           ^^^                                   ^^^^
5             20508         205 ...              2018             2018               0                0         0               0             0
#                           ^^^                  ^^^^                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6             20902         209 ...              1989             1995               0                1         1               1             1
...即用上一行中的 `1` 填充所有 `issue_*` 变量。

  new_conflictep_id conflict_id ... first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
1             20504         205 ...              1990             1990               1                1         1               1             1
2             20505         205 ...              1993             1993               1                1         0               1             0
3             20506         205 ...              1996             1996               1                1         1               0             0
4             20507         205 ...              2016             2016               1                1         1               1             1
#                                                                                    ↓                ↓         ↓               ↓             ↓
5             20508         205 ...              2018             2018               1                1         1               1             1
#                                                                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6             20902         209 ...              1989             1995               0                1         1               1             1
尝试

我用

dplyr

尝试了不同的解决方案。第一:

library(dplyr)
# First attempt: within each conflict_id group (sorted by start year), replace
# an issue_* value with the previous row's value when the year gap is at most
# 5 and the "all zero" test passes.
#
# NOTE(review): this does not implement the stated goal, for two reasons that
# are visible in the code itself:
#   * all(.x == 0) is evaluated on one whole issue_* column of the current
#     group, not across the five issue_* values of a single row;
#   * the lag() default of first_year_active[1] + 10 gives the first row of
#     each group a gap of -10, which satisfies `<= 5`, so the first row is not
#     protected and receives lag(.x), i.e. NA, whenever that column is all 0
#     within the group.
issue_termi_episode <- issue_termi_episode %>%
  arrange(conflict_id, first_year_active) %>%
  group_by(conflict_id) %>%
  mutate(
    across(
      starts_with("issue_"),
      ~ ifelse(
        first_year_active -
          lag(last_year_active, default = first_year_active[1] + 10) <= 5 &
          all(.x == 0),
        lag(.x),
        .x
      )
    )
  )
从我的数据内容来看,对于一组

conflict_id

中的第一行,不需要进行任何更改(无论如何这是不可能的,因为没有数据可以从上面结转) 。因此,我将 
+ 10
 添加到 
lag()
 中的默认参数中,因此该条件永远不会应用于第一行。
这段代码只是在一些随机位置产生了 `NA`:这些 `NA` 既没有出现在我想根据条件定位的那些行中,也没有(如我所期望的那样)同时出现在给定行的全部 5 个 `issue_*` 变量中。

然后我尝试了一种解决方法,首先创建一个新变量,如果满足我的条件,则填充特定值(

333

)(稍后使用该变量来传递数据):

# Second attempt: mark rows to be filled with a sentinel value (333) in a
# helper column `row_fill`, then, after ungrouping, copy each issue_* value
# from the previous row wherever the sentinel is set.
#
# NOTE(review): without rowwise(), c_across() collects the issue_* values of
# the whole conflict_id group, so all(...) yields one result per group instead
# of one per row. On the sample data no group is entirely 0, so row_fill is 0
# everywhere. The `+ 10` default also produces a gap of -10 (which is <= 5) on
# the first row of every group.
issue_termi_episode <- issue_termi_episode %>%
  arrange(conflict_id, first_year_active) %>%
  group_by(conflict_id) %>%
  mutate(
    row_fill = ifelse(
      first_year_active -
        lag(last_year_active, default = first_year_active[1] + 10) <= 5 &
        all(c_across(starts_with("issue_")) == 0),
      333,
      0
    )
  ) %>%
  ungroup() %>%
  mutate(
    across(
      starts_with("issue_"),
      ~ ifelse(row_fill == 333, lag(.x), .x)
    )
  )
但是,在新创建的 `row_fill` 中,填入的只有 `0`。这使我得出结论:我的条件没有被识别。单独测试“所有 `issue_*` 变量 `= 0`”这一条件时,`333` 只被填入了少数行,而不是所有应满足该条件的行。我想不明白这是为什么。另一个条件(以相同方式测试)工作正常。

# Isolated test of the "all issue_* values are 0" condition: writes 333 into
# `row_fill` where the condition holds and 0 elsewhere.
#
# NOTE(review): without rowwise(), c_across() spans every row of the current
# conflict_id group, so the condition is a single value per group; row_fill is
# therefore 333 only for groups whose issue_* values are all 0.
issue_termi_episode <- issue_termi_episode %>%
  arrange(conflict_id, first_year_active) %>%
  group_by(conflict_id) %>%
  mutate(
    row_fill = if_else(
      all(c_across(starts_with("issue_")) == 0),
      333,
      0
    )
  ) %>%
  ungroup()
我非常感谢任何帮助,无论是采用另一种方法的完整解决方案,还是仅仅帮我调试我的条件。

r dataframe if-statement dplyr mutate
1个回答
0
投票
解决方案

这是一个 `tidyverse` 解决方案,我在评论中提出过,并且您已确认可行。它先将相关的 `0` 转换为 `NA`,再用 `fill()` 以上方的数据填充它们。

# Carry issue_* values forward within each conflict.
#
# A row is overwritten only when (a) all of its issue_* values are 0 and
# (b) its first_year_active lies within `year_threshold` years of the previous
# row's last_year_active in the same conflict_id group. Such rows have their
# 0s turned into NA and are then filled downwards from the row above.
#
# Note: fill() fills every NA in the issue_* columns, so an NA that was
# already present in those columns would be filled as well.

library(dplyr)
library(tidyr)  # For filling.

# ...
# Code to generate 'issue_termi_episode'.
# ...

# Set the "closeness" threshold.
year_threshold <- 5

issue_termi_episode %>%
  # Sort groups in chronological order.
  arrange(conflict_id, first_year_active) %>%
  # Prepare for filling within groups.
  group_by(conflict_id) %>%
  mutate(
    # Flag rows with only 0s.
    all_zero = if_all(starts_with("issue_"), ~ . == 0),
    # Flag rows whose year is "close" to the prior. lag() yields NA for the
    # first row of each group, so is_close is NA there.
    is_close =
      abs(first_year_active - lag(last_year_active)) <= year_threshold,
    # Flag rows that are both, and should thus be overwritten. An NA in
    # is_close (first row of a group) is mapped to FALSE.
    needs_fill = if_else(is.na(is_close), FALSE, all_zero & is_close),
    # Replace their 0s with NAs.
    across(starts_with("issue_"), ~ if_else(needs_fill, NA_real_, .))
  ) %>%
  # Fill downwards with the prior values.
  fill(starts_with("issue_")) %>%
  # Clean up from preparations.
  ungroup() %>%
  select(!c(all_zero, is_close, needs_fill))
结果

给定像您的示例这样的

issue_termi_episode

数据集...

# Sample data: 15 conflict episodes as a tibble-classed data frame, with
# identifiers, descriptive columns, active years and five 0/1 issue_* columns.
issue_termi_episode <- structure(
  list(
    new_conflictep_id = c(
      20504, 20505, 20506, 20507, 20508, 20902, 20903, 20904, 22003, 22101,
      22102, 22103, 22104, 22105, 22202
    ),
    conflict_id = c(
      205, 205, 205, 205, 205, 209, 209, 209, 220, 221, 221, 221, 221, 221, 222
    ),
    location = c(
      "Iran", "Iran", "Iran", "Iran", "Iran",
      "Philippines", "Philippines", "Philippines",
      "Paraguay",
      "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)",
      "Myanmar (Burma)", "Myanmar (Burma)", "Myanmar (Burma)"
    ),
    incompatibility = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2),
    conflict = c(
      "Iran: Kurdistan", "Iran: Kurdistan", "Iran: Kurdistan",
      "Iran: Kurdistan", "Iran: Kurdistan",
      "Philippines", "Philippines", "Philippines",
      "Paraguay",
      "Myanmar (Burma): Karen", "Myanmar (Burma): Karen",
      "Myanmar (Burma): Karen", "Myanmar (Burma): Karen",
      "Myanmar (Burma): Karen",
      "Myanmar (Burma)"
    ),
    conflictepisode = c(4, 5, 6, 7, 8, 2, 3, 4, 3, 1, 2, 3, 4, 5, 2),
    outcome = c(5, 5, 5, 5, 5, 2, 2, NA, 4, 5, 5, 5, 2, 5, 5),
    version = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3),
    intensity_level = c(1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1),
    region = c(
      "2", "2", "2", "2", "2", "3", "3", "3", "5", "3", "3", "3", "3", "3", "3"
    ),
    first_year_active = c(
      1990, 1993, 1996, 2016, 2018, 1989, 1997, 1999, 1989, 1989,
      1994, 1997, 2000, 2013, 1990
    ),
    last_year_active = c(
      1990, 1993, 1996, 2016, 2018, 1995, 1997, 2020, 1989, 1992,
      1995, 1998, 2011, 2013, 1992
    ),
    # Row 5 is 0 in all five issue_* columns below.
    issue_territory = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1),
    issue_statestruc = c(1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    issue_gov = c(1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1),
    issue_polrights = c(1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    issue_distrib = c(1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1)
  ),
  # Automatic row names 1..15 in compact form.
  row.names = c(NA, -15L),
  class = c("tbl_df", "tbl", "data.frame")
)
...这应该会产生您想要的结果。

# A tibble: 15 × 17
   new_conflictep_id conflict_id location        incompatibility conflict   conflictepisode outcome version intensity_level region first_year_active last_year_active issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
               <dbl>       <dbl> <chr>                     <dbl> <chr>                <dbl>   <dbl>   <dbl>           <dbl> <chr>              <dbl>            <dbl>           <dbl>            <dbl>     <dbl>           <dbl>         <dbl>
 1             20504         205 Iran                          1 Iran: Kur…               4       5       3               1 2                   1990             1990               1                1         1               1             1
 2             20505         205 Iran                          1 Iran: Kur…               5       5       3               1 2                   1993             1993               1                1         0               1             0
 3             20506         205 Iran                          1 Iran: Kur…               6       5       3               1 2                   1996             1996               1                1         1               0             0
 4             20507         205 Iran                          1 Iran: Kur…               7       5       3               1 2                   2016             2016               1                1         1               1             1
 5             20508         205 Iran                          1 Iran: Kur…               8       5       3               1 2                   2018             2018               1                1         1               1             1
 6             20902         209 Philippines                   2 Philippin…               2       2       3               2 3                   1989             1995               0                1         1               1             1
 7             20903         209 Philippines                   2 Philippin…               3       2       3               1 3                   1997             1997               0                1         1               1             1
 8             20904         209 Philippines                   2 Philippin…               4      NA       3               1 3                   1999             2020               0                1         1               1             1
 9             22003         220 Paraguay                      2 Paraguay                 3       4       3               1 5                   1989             1989               0                1         1               1             0
10             22101         221 Myanmar (Burma)               1 Myanmar (…               1       5       3               2 3                   1989             1992               1                1         1               1             1
11             22102         221 Myanmar (Burma)               1 Myanmar (…               2       5       3               1 3                   1994             1995               1                1         0               1             1
12             22103         221 Myanmar (Burma)               1 Myanmar (…               3       5       3               1 3                   1997             1998               1                1         1               1             1
13             22104         221 Myanmar (Burma)               1 Myanmar (…               4       2       3               1 3                   2000             2011               1                1         1               1             1
14             22105         221 Myanmar (Burma)               1 Myanmar (…               5       5       3               1 3                   2013             2013               1                1         0               1             0
15             22202         222 Myanmar (Burma)               2 Myanmar (…               2       5       3               1 3                   1990             1992               1                1         1               1             1

注意第 `5` 行的所有 `issue_*` 列是如何用第 `4` 行的值填充的:

# Excerpt: continue the pipeline above, keeping only rows 4 and 5 and only the
# *_id and issue_* columns, to inspect the filled row.
# ... %>%
  slice(4:5) %>%
  select(ends_with("_id"), starts_with("issue_"))

# A tibble: 2 × 7
  new_conflictep_id conflict_id issue_territory issue_statestruc issue_gov issue_polrights issue_distrib
              <dbl>       <dbl>           <dbl>            <dbl>     <dbl>           <dbl>         <dbl>
1             20507         205               1                1         1               1             1
2             20508         205               1                1         1               1             1

注意

如果一行全为 `0` 的记录位于其组的最顶部,则它保持不变。这在概念上是可取的,因为我们没有可以用来填充它的“先前”数据。

这是通过 `lag()` 实现的,它在顶部用“缺失值”(即 `NA`)填充。这会使 `is_close` 为 `NA`...

# Excerpt from the mutate() call of the solution; the caret marker points at
# the lag() call, which returns NA for the first row of each group.
# ...
    # Flag rows whose year is "close" to the prior.
    is_close = abs(first_year_active - lag(last_year_active)) <= year_threshold,
    #                                  ^^^^^^^^^^^^^^^^^^^^^
# ...
...因此 `needs_fill` 为 `FALSE`:

# Excerpt from the mutate() call of the solution; the caret marker points at
# the is.na() test that maps an NA in is_close to FALSE.
# ...
    # Flag rows that are both, and should thus be overwritten.
    needs_fill = if_else(is.na(is_close), FALSE, all_zero & is_close)
    #                    ^^^^^^^^^^^^^^^
# ...
    
© www.soinside.com 2019 - 2024. All rights reserved.