我正在构建一个面板数据集,目前进展顺利,但有一个问题我无法解决:需要在另一个数据框中创建一些变量。
我很确定需要用到for循环,但找不到适用于这种特定情况的解决方案。
我有这两个数据框:
# Raw observations: one row per scored instance. A name may have several
# observations on the same day (orange has three on day 2).
name <- rep(c("apple", "orange"), times = c(3, 5))
day <- c(1, 8, 9, 0, 2, 2, 2, 7)
score <- c(7, 7, 8, 1, 5, 8, 4, 4)
df1 <- data.frame(name = name, day = day, score = score)
以及:
# Empty panel skeleton: for each name, one row per day 0..10, with every
# statistic column initialised to 0.
name1 <- rep(c("apple", "orange"), each = 11)
day1 <- rep(as.numeric(0:10), times = 2)
volume_day <- numeric(length(name1))
volume_day_cum <- numeric(length(name1))
avg_score_day <- numeric(length(name1))
avg_score_cum <- numeric(length(name1))
var_day <- numeric(length(name1))
var_cum <- numeric(length(name1))
df2 <- data.frame(
  name1, day1,
  volume_day, volume_day_cum,
  avg_score_day, avg_score_cum,
  var_day, var_cum
)
我有一个名称-日期级别的面板数据集。因此,df1中的实例(每行给出一个分数)需要按名称和日期匹配后编码到df2中。如果没有匹配项,则保留0即可。我需要的是实例的数量、每天的平均得分和方差,以及这三个变量的累积值。结果数据框应如下所示:
# Expected panel, written out literally. Each vector holds the eleven apple
# rows (days 0..10) followed by the eleven orange rows (days 0..10).
# NOTE(review): the fractional values look truncated to two decimals
# (e.g. 5.66 for 17 / 3) -- confirm before comparing with exact results.
volume_day <- c(
  0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, # apple
  1, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0  # orange
)
volume_day_cum <- c(
  0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, # apple
  1, 1, 4, 4, 4, 4, 4, 5, 5, 5, 5  # orange
)
avg_score_day <- c(
  0, 7, 0, 0, 0, 0, 0, 0, 7, 8, 0,   # apple
  1, 0, 5.66, 0, 0, 0, 0, 4, 0, 0, 0 # orange
)
avg_score_cum <- c(
  0, 7, 7, 7, 7, 7, 7, 7, 7, 7.33, 7.33,           # apple
  1, 1, 4.5, 4.5, 4.5, 4.5, 4.5, 4.4, 4.4, 4.4, 4.4 # orange
)
var_day <- c(
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   # apple
  0, 0, 2.88, 0, 0, 0, 0, 0, 0, 0, 0 # orange
)
var_cum <- c(
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0.22, 0.22,                  # apple
  0, 0, 6.25, 6.25, 6.25, 6.25, 6.25, 5.04, 5.04, 5.04, 5.04 # orange
)
resultdata <- data.frame(
  name1, day1,
  volume_day, volume_day_cum,
  avg_score_day, avg_score_cum,
  var_day, var_cum
)
我对R和编码一般还比较陌生。如果我对问题的描述不够充分,请告诉我。希望有人可以在这里帮助我。
`df1` 和 `resultdata` 之间存在一些不一致之处,但下面是我的一个尝试:
library(dplyr)

# Build the name-day panel from `df1` on the (name, day) grid of `df2`:
# per-day count, mean and variance of `score`, plus the running
# (cumulative) count, mean and variance within each name.
#
# Both variances are population variances (divide by n, not n - 1), so a
# day with a single observation gets 0 rather than NA, and the running
# variance is taken over all observations seen so far (not over the
# zero-filled daily totals).
df1 %>%
  group_by(name, day) %>%
  summarize(
    volume_day = as.numeric(n()),
    var_day = mean((score - mean(score))^2),
    avg_score_day = mean(score),
    # Daily sum of squares feeds the running variance below. It has to be
    # computed before `score` is overwritten by its daily sum.
    score_sq = sum(score^2),
    score = sum(score)
  ) %>%
  ungroup() %>%
  # Bring in every (name, day) cell of the panel; cells without any
  # observation arrive as NA and are zero-filled in the next step.
  full_join(select(df2, name = name1, day = day1), by = c("name", "day")) %>%
  arrange(name, day) %>%
  group_by(name) %>%
  mutate(
    volume_day = coalesce(volume_day, 0),
    var_day = coalesce(var_day, 0),
    avg_score_day = coalesce(avg_score_day, 0),
    score = coalesce(score, 0),
    score_sq = coalesce(score_sq, 0)
  ) %>%
  mutate(
    volume_day_cum = cumsum(volume_day),
    # Guard on the running count so that days before the first
    # observation yield 0 instead of 0 / 0.
    avg_score_cum = if_else(
      volume_day_cum == 0, 0, cumsum(score) / volume_day_cum
    ),
    # Running population variance: E[x^2] - E[x]^2 over all observations
    # so far. pmax() clamps tiny negative values from rounding error.
    var_cum = if_else(
      volume_day_cum == 0, 0,
      pmax(cumsum(score_sq) / volume_day_cum - avg_score_cum^2, 0)
    )
  ) %>%
  # Drop the helper column so only the requested statistics are shown.
  select(-score_sq) %>%
  print(n = 99)
# Expected output (values as printed to 3 significant digits):
# # A tibble: 22 x 9
# # Groups: name [2]
# name day volume_day var_day avg_score_day score volume_day_cum avg_score_cum var_cum
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 apple 0 0 0 0 0 0 0 0
# 2 apple 1 1 0 7 7 1 7 0
# 3 apple 2 0 0 0 0 1 7 0
# 4 apple 3 0 0 0 0 1 7 0
# 5 apple 4 0 0 0 0 1 7 0
# 6 apple 5 0 0 0 0 1 7 0
# 7 apple 6 0 0 0 0 1 7 0
# 8 apple 7 0 0 0 0 1 7 0
# 9 apple 8 1 0 7 7 2 7 0
# 10 apple 9 1 0 8 8 3 7.33 0.222
# 11 apple 10 0 0 0 0 3 7.33 0.222
# 12 orange 0 1 0 1 1 1 1 0
# 13 orange 1 0 0 0 0 1 1 0
# 14 orange 2 3 2.89 5.67 17 4 4.5 6.25
# 15 orange 3 0 0 0 0 4 4.5 6.25
# 16 orange 4 0 0 0 0 4 4.5 6.25
# 17 orange 5 0 0 0 0 4 4.5 6.25
# 18 orange 6 0 0 0 0 4 4.5 6.25
# 19 orange 7 1 0 4 4 5 4.4 5.04
# 20 orange 8 0 0 0 0 5 4.4 5.04
# 21 orange 9 0 0 0 0 5 4.4 5.04
# 22 orange 10 0 0 0 0 5 4.4 5.04