我有一个包含约 64,000 行的数据框,其结构如下
# Minimal example of the data frame's structure: one row per record with
# fiscal year, work area code, project code and total hours.
df <- data.frame(
  FY = rep(c(2018, 2019, 2020), each = 2),
  WorkArea = c("03218", "03219", "07589", "07589", "01125", "01126"),
  ProjectCode = c(12, 12, 13, 11, 12, 12),
  TotalHours = c(180, 250, 125, 300, 450, 750)
)
我正在尝试为每年/工作区域/项目代码的每个分组创建一个大的汇总统计表:
我想除了最后一项之外,其余的我都已经完成了……
# Build one summary row per FY / WorkArea / ProjectCode combination.
df_table <- df %>%
  group_by(FY, WorkArea, ProjectCode) %>%
  summarise(
    opCount = n(),                      # number of rows in the group
    Hours = sum(TotalHours),
    AvgHours = Hours / opCount,         # reuses the two columns defined above
    medHours = median(TotalHours),
    Bottom25 = quantile(TotalHours, 0.25),
    Top25 = quantile(TotalHours, 0.75),
    # Number of rows whose hours are strictly above the group's 80th percentile.
    OutlierCount = sum(TotalHours > quantile(TotalHours, 0.8)),
    # Return an ungrouped table; by default summarise() only drops the last
    # grouping variable and would leave the result grouped by FY and WorkArea.
    .groups = "drop"
  )
但我不知道如何有条件地对大于 `quantile(TotalHours, 0.8)` 的小时数进行求和。
您可以在自定义的汇总函数中使用 `quantile()`,再用 `aggregate()` 按分组调用该函数:
# Summarise a numeric vector: element count, sum, the statistics returned by
# summary(), and the count and sum of the values strictly above the 80th
# percentile. Returns a single named numeric vector.
fn <- function(x) {
  above <- x[x > quantile(x, 0.8)]
  c(
    n = length(x),
    total = sum(x),
    summary(x),
    n80 = length(above),
    total80 = sum(above)
  )
}
# Apply fn to TotalHours within each WorkArea / ProjectCode combination and
# show the first rows of the result.
res <- aggregate(
  TotalHours ~ WorkArea + ProjectCode,
  data = df1,
  FUN = fn
)
head(res)
WorkArea ProjectCode TotalHours.n TotalHours.total TotalHours.Min.
1 03218 11 3.0000 1746.0000 398.0000
2 03219 11 3.0000 1330.0000 245.0000
3 07589 11 3.0000 1530.0000 397.0000
4 01125 11 3.0000 1317.0000 123.0000
5 01126 11 3.0000 944.0000 257.0000
6 03218 12 3.0000 1028.0000 123.0000
TotalHours.1st Qu. TotalHours.Median TotalHours.Mean TotalHours.3rd Qu.
1 405.5000 413.0000 582.0000 674.0000
2 375.0000 505.0000 443.3333 542.5000
3 405.0000 413.0000 510.0000 566.5000
4 285.0000 447.0000 439.0000 597.0000
5 276.5000 296.0000 314.6667 343.5000
6 184.0000 245.0000 342.6667 452.5000
TotalHours.Max. TotalHours.n80 TotalHours.total80
1 935.0000 1.0000 935.0000
2 580.0000 1.0000 580.0000
3 720.0000 1.0000 720.0000
4 747.0000 1.0000 747.0000
5 391.0000 1.0000 391.0000
6 660.0000 1.0000 660.0000
数据:
> dput(df1)
structure(list(WorkArea = structure(c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L, 5L), levels = c("03218", "03219", "07589",
"01125", "01126"), class = "factor"), FY = c(2018, 2018, 2018,
2018, 2018, 2019, 2019, 2019, 2019, 2019, 2020, 2020, 2020, 2020,
2020, 2018, 2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019,
2020, 2020, 2020, 2020, 2020, 2018, 2018, 2018, 2018, 2018, 2019,
2019, 2019, 2019, 2019, 2020, 2020, 2020, 2020, 2020), ProjectCode = c(12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), TotalHours = c(660L,
420L, 252L, 173L, 327L, 245L, 733L, 148L, 227L, 402L, 123L, 938L,
455L, 700L, 264L, 721L, 631L, 509L, 981L, 978L, 998L, 396L, 700L,
382L, 720L, 616L, 311L, 959L, 971L, 358L, 413L, 580L, 397L, 123L,
257L, 398L, 505L, 413L, 747L, 391L, 935L, 245L, 720L, 447L, 296L
)), out.attrs = list(dim = c(WorkArea = 5L, FY = 3L, ProjectCode = 3L
), dimnames = list(WorkArea = c("WorkArea=03218", "WorkArea=03219",
"WorkArea=07589", "WorkArea=01125", "WorkArea=01126"), FY = c("FY=2018",
"FY=2019", "FY=2020"), ProjectCode = c("ProjectCode=12", "ProjectCode=13",
"ProjectCode=11"))), row.names = c(NA, -45L), class = "data.frame")