使用dplyr链中的if_else / summary添加成千上万列?

问题描述 投票:0回答:3

我拥有的数据包括各州的观察结果(即各州的快餐公司)。在同一年对同一国家集团的多次观察是很常见的。每行包含一个值(即新快餐特许经营的数量)。我将这些数据汇总到各州组年中,然后需要创建一个二分指标,用于确定每个州组对是否都达到了每年的特定特许经营门槛。之后,我需要将数据汇总到州/年份的水平,并创建一个二分指标,以衡量any个州/州对是否在该年通过了阈值。

我正在使用dplyr来执行此操作,下面的代码运行完美。但是,我很难对不同的阈值(25个专营权,50个专营权等)进行编码,并且希望找到一种解决方案,在该解决方案中,我可以为众多阈值创建变量-例如所有25:1000之间的所有数字。是否有一种简单且编程的方式来做到这一点?我已经尝试过dplyr链中的循环,但是在如何使用原则名称添加新变量时遇到了麻烦(名称应包括阈值,以便在以后的代码中轻松引用)。感谢您的帮助!

注意:请随意编辑标题/问题以使其更清晰。

x <- data.frame("state" = c(rep("mi",12),
                            rep("tx",12)),
                "group" = c(rep("grp1",6),rep("grp2",6),
                            rep("grp3",6),rep("grp4",6)), 
                "year"  = c(rep(1990,3),rep(1991,3),rep(1992,3),rep(1993,3),
                            rep(1990,3),rep(1991,3),rep(1992,3),rep(1993,3)),
                "value" = c(seq(20,1200, by = 100),
                            seq(20,2400, by = 200)))

x_agg <- x %>%
  group_by(state, group, year) %>%
  summarise(value_tot = sum(value)) %>%
  mutate(val20   = ifelse(value_tot >= 20,   yes = 1, no = 0),
         val50   = ifelse(value_tot >= 50,   yes = 1, no = 0),
         val100  = ifelse(value_tot >= 100,  yes = 1, no = 0),
         val250  = ifelse(value_tot >= 250,  yes = 1, no = 0),
         val500  = ifelse(value_tot >= 500,  yes = 1, no = 0),
         val750  = ifelse(value_tot >= 750,  yes = 1, no = 0),
         val1000 = ifelse(value_tot >= 1000, yes = 1, no = 0)) %>%
  ungroup() %>%
  group_by(state, year) %>%
  summarise(val20   = as.numeric(any(val20 == 1)),
            val50   = as.numeric(any(val50 == 1)),
            val100  = as.numeric(any(val100 == 1)),
            val250  = as.numeric(any(val250 == 1)),
            val500  = as.numeric(any(val500 == 1)),
            val750  = as.numeric(any(val750 == 1)),
            val1000 = as.numeric(any(val1000 == 1)),) %>%
  ungroup()
r dplyr data.table tidyverse data-cleaning
3个回答
1
投票

我想在此处使用倍数表上的lapplyjoin。在treshold中定义列。

library("dplyr")

x <- data.frame("state" = c(rep("mi",12),
                            rep("tx",12)),
                "group" = c(rep("grp1",6),rep("grp2",6),
                            rep("grp3",6),rep("grp4",6)), 
                "year"  = c(rep(1990,3),rep(1991,3),rep(1992,3),rep(1993,3),
                            rep(1990,3),rep(1991,3),rep(1992,3),rep(1993,3)),
                "value" = c(seq(20,1200, by = 100),
                            seq(20,2400, by = 200)))

treshold <- c(20, 50, 100, 250, 500, 750, 1000)

lapply(as.list(treshold), function(tres){

  name <- paste0("val", tres)

  x %>% 
    group_by(state, group, year) %>% 
    summarise(value_tot = sum(value)) %>%
    mutate(!!name := as.integer(value_tot >= tres)) %>% 
    ungroup() %>% 
    group_by(state, year) %>%
    summarise(!!name := as.numeric(any(!!sym(name) == 1)))

}) %>% Reduce(function(d1, d2) full_join(d1, d2, by = c("state", "year")), .)

1
投票

您可以使用lapply创建功能列表,并使用mutate_at将所有功能都应用。我从您提供的x开始。只需将seq_val更改为要测试的数字顺序即可。

seq_val <- seq(1000, 10000, by = 1000)
val_funs <- lapply(seq_val, function(x) (function(a) as.integer(a >= x)))
names(val_funs) <- paste0("val", seq_val)

agg1 <- x %>%
  group_by(state, group, year) %>%
  summarise(value_tot = sum(value)) %>%
  ungroup() %>%
  mutate_at(
    "value_tot", 
    val_funs
  )

[agg1输出:

# A tibble: 8 x 14
  state group  year value_tot val1000 val2000 val3000 val4000 val5000 val6000
  <fct> <fct> <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 mi    grp1   1990       360       0       0       0       0       0       0
2 mi    grp1   1991      1260       1       0       0       0       0       0
3 mi    grp2   1992      2160       1       1       0       0       0       0
4 mi    grp2   1993      3060       1       1       1       0       0       0
5 tx    grp3   1990       660       0       0       0       0       0       0
6 tx    grp3   1991      2460       1       1       0       0       0       0
7 tx    grp4   1992      4260       1       1       1       1       0       0
8 tx    grp4   1993      6060       1       1       1       1       1       1
# … with 4 more variables: val7000 <dbl>, val8000 <dbl>, val9000 <dbl>,
#   val10000 <dbl>

然后将summarise_atany一起

agg1 %>%
  group_by(state, year) %>%
  summarise_at(
    vars(matches("val[^u]")),
    function(x) as.numeric(any(x == 1))
  )

输出:

# A tibble: 8 x 12
# Groups:   state [2]
  state  year val1000 val2000 val3000 val4000 val5000 val6000 val7000 val8000
  <fct> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 mi     1990       0       0       0       0       0       0       0       0
2 mi     1991       1       0       0       0       0       0       0       0
3 mi     1992       1       1       0       0       0       0       0       0
4 mi     1993       1       1       1       0       0       0       0       0
5 tx     1990       0       0       0       0       0       0       0       0
6 tx     1991       1       1       0       0       0       0       0       0
7 tx     1992       1       1       1       1       0       0       0       0
8 tx     1993       1       1       1       1       1       1       0       0
# … with 2 more variables: val9000 <dbl>, val10000 <dbl>

0
投票
valueExceeds <- function(df, n){
    variableName <- paste0("val", n)
    df %>%
        group_by(state, group, year) %>%
        summarise(value_tot = sum(value)) %>%
        mutate(!!variableName := as.integer(value_tot >= n))
}

x %>%
    valueExceeds(20)

输出此

  state group  year value_tot val20
  <fct> <fct> <dbl>     <dbl> <int>
1 mi    grp1   1990       360     1
2 mi    grp1   1991      1260     1
3 mi    grp2   1992      2160     1
4 mi    grp2   1993      3060     1
5 tx    grp3   1990       660     1
6 tx    grp3   1991      2460     1
7 tx    grp4   1992      4260     1
8 tx    grp4   1993      6060     1
© www.soinside.com 2019 - 2024. All rights reserved.