我想使用tidyverse对多列进行计算。我知道如何对单个用户(即单独一列)执行此操作,但是我需要对1000个以上的用户(也就是同样多的列)执行此操作。
但是,我对tidyverse和tibble的使用不太熟悉,不过我之前已经在该平台上获得过帮助(确切的代码与下面的有所不同,但我已将其精简为核心问题)。
数据集包含一年中的所有小时(8760个值,即365天、每天24小时),以及多个用户的值。
对于每位用户,我需要把某个特定时间范围内(例如,从00:00到03:00之间的所有值)的正值求和,再从中减去03:00到05:00之间所有值的总和(这些值可正可负)。总共有1000多个用户。
library(tidyverse)
library(lubridate)
# Build a reproducible toy data set: one row per hour of 2016 and, for each of
# three users, one column of uniform random values between -1 and 1.
set.seed(4)
time_index <- seq(
  as.POSIXct("2016-01-01 00:00"),
  as.POSIXct("2016-12-31 23:00"),
  by = "hour"
)
# The draw order matters: user1, user2 and user3 consume the RNG stream in turn.
user1 <- runif(length(time_index), -1, 1)
user2 <- runif(length(time_index), -1, 1)
user3 <- runif(length(time_index), -1, 1)
example <- data.frame(
  time_index = time_index,
  user1 = user1,
  user2 = user2,
  user3 = user3
)
单个列(用户)的代码是:
# Daily result for ONE user (column `user1`): the sum of the positive values in
# hours 0-2 ("block_1") minus the sum of all values in hours 3-5 ("block_2").
df_intermediate <- example %>%
  mutate(
    date = as_date(time_index),
    hour = hour(time_index),
    # Label the two hour windows of interest; every other hour becomes NA.
    hour_block = case_when(
      between(hour, 0, 2) ~ "block_1",
      between(hour, 3, 5) ~ "block_2",
      TRUE ~ NA_character_
    )
  ) %>%
  # Keep only the rows that fall into one of the two windows.
  filter(!is.na(hour_block)) %>%
  group_by(date, hour_block) %>%
  nest() %>%
  ungroup() %>%
  mutate(
    # The nested tibbles hold the columns `user1`, `user2`, `user3`; there is
    # no column called `user`, and `$` on a tibble does not partial-match, so
    # the user column has to be named exactly.
    intermediate_result = if_else(
      hour_block == "block_1",
      map_dbl(data, ~ sum(.x$user1[.x$user1 > 0])),
      map_dbl(data, ~ sum(.x$user1))
    )
  ) %>%
  group_by(date) %>%
  summarise(
    # Relies on the block_1 row coming before the block_2 row within each
    # date, which holds here because the input is in time order.
    final_result = first(intermediate_result) - last(intermediate_result)
  )
这将为单个用户提供以下结果:
# Print the daily result; the lines starting with `#>` are the captured output.
df_intermediate
#> # A tibble: 366 x 2
#> date final_result
#> <date> <dbl>
#> 1 2016-01-01 0.469
#> 2 2016-01-02 0.189
#> 3 2016-01-03 -1.32
我无法将其扩展到多个用户。我看过使用mutate_at,或者编写自己的函数再放入mutate_at中,但是我不知道如何同时处理该条件(第一个时间块“block_1”中只应计入正值)和大量的列。那么如何对多列而不是仅对一列执行mutate计算呢?
这是一种方法,可以匹配您的部分结果。这些步骤当然可以链接在一起以避免中间数据帧。
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
# Recreate the question's toy data: an hourly timestamp for every hour of 2016
# plus three user columns of uniform random values between -1 and 1.
set.seed(4)
time_index <- seq(
  as.POSIXct("2016-01-01 00:00"),
  as.POSIXct("2016-12-31 23:00"),
  by = "hour"
)
# Keep the draw order user1 -> user2 -> user3 so the seed yields the same data.
user1 <- runif(length(time_index), -1, 1)
user2 <- runif(length(time_index), -1, 1)
user3 <- runif(length(time_index), -1, 1)
example <- data.frame(
  time_index = time_index,
  user1 = user1,
  user2 = user2,
  user3 = user3
)
# Annotate every hourly row with its calendar date, its hour of day, and the
# hour window it belongs to: "block_1" for hours 0-2, "block_2" for hours 3-5,
# and NA for every other hour.
step1 <- mutate(
  example,
  date = as_date(time_index),
  hour = hour(time_index),
  hour_block = case_when(
    hour %in% 0:2 ~ "block_1",
    hour %in% 3:5 ~ "block_2",
    TRUE ~ NA_character_
  )
)
# Reshape to one row per (timestamp, user), then reduce each (date, user) pair
# to a single number: the positive block_1 values summed, minus all block_2
# values summed.
step2 <- step1 %>%
  # Rows outside the two hour windows play no part in the calculation.
  filter(!is.na(hour_block)) %>%
  pivot_longer(
    cols = starts_with("user"),
    names_to = "user_id",
    values_to = "value"
  ) %>%
  group_by(date, user_id) %>%
  # summarise() peels off only the last grouping level, so the result is still
  # grouped by `date`.
  summarise(
    final_result =
      sum(value[hour_block == "block_1" & value > 0]) -
      sum(value[hour_block == "block_2"])
  )
# Spread the users back into columns: one row per date, one column per user.
# step2 is still grouped by `date` after its summarise(); drop that leftover
# grouping so the final table is a plain tibble and later verbs are not
# silently applied once per date.
step3 <- step2 %>%
  ungroup() %>%
  pivot_wider(names_from = user_id, values_from = final_result)
# Print the result; the lines starting with `#>` are the captured output.
step3
#> # A tibble: 366 x 4
#> date user1 user2 user3
#> <date> <dbl> <dbl> <dbl>
#> 1 2016-01-01 0.469 2.25 0.662
#> 2 2016-01-02 0.189 0.345 4.33
#> 3 2016-01-03 -1.32 0.375 0.931
#> 4 2016-01-04 0.746 1.21 2.05
#> 5 2016-01-05 0.362 1.42 -0.578
#> 6 2016-01-06 1.55 -1.12 1.79
#> 7 2016-01-07 -1.22 1.07 -0.896
#> 8 2016-01-08 0.873 1.41 -0.640
#> 9 2016-01-09 -0.0262 1.85 0.930
#> 10 2016-01-10 -0.953 0.666 0.624
#> # … with 356 more rows
由reprex软件包(v0.3.0)于2020-05-20创建