我们必须计算数据框中 100 列的百分位数。在下面的示例中,需要百分位数的列名称是
pctile_columns
。接收百分位数的标准是 (1) 该列不是 NA
,并且 (1) min_pg
列是 >= 12
。我们正在努力获取正确的百分位数集:
temp_df = structure(list(group_var = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
min_pg = c(11, 15, 19, 7, 5, 34, 32, 27, 24, 18, 13, 10),
stat1 = c(0.35, 0.32, 0.27, NA, NA, 0.42, 0.45, 0.47, 0.33, NA, 0.24, 0.39)),
row.names = c(NA, -12L), class = "data.frame")
pctile_columns <- c('stat1')
library(dplyr)
temp_output <- temp_df %>%
group_by(group_var) %>%
mutate(across(.cols = all_of(pctile_columns),
.fns = ~ if_else(is.na(.) | min_pg < 12, as.numeric(NA),
rank(., ties.method = "max")),
.names = "{.col}__rank")) %>%
mutate(across(.cols = all_of(pctile_columns),
.fns = ~ if_else(is.na(.) | min_pg < 12, as.numeric(NA),
round((rank(., ties.method = "max") - 1) / (n() - 1) * 100, 0)),
.names = "{.col}__pctile"))
# Groups: group_var [1]
group_var min_pg stat1 stat1__rank stat1__pctile
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 11 0.35 NA NA
2 1 15 0.32 3 18
3 1 19 0.27 2 9
4 1 7 NA NA NA
5 1 5 NA NA NA
6 1 34 0.42 7 55
7 1 32 0.45 8 64
8 1 27 0.47 9 73
9 1 24 0.33 4 27
10 1 18 NA NA NA
11 1 13 0.24 1 0
12 1 10 0.39 NA NA
此输出的问题在于,排名从 1 到 9,而它们应该从 1 到 7。即使具有
stat1
的 min_pg < 12
值已正确分配为 NA
值,但在计算所有其他行的排名时,这些 stat1
值仍会被考虑到 rank
方程中。在这种情况下,正确的排名集应该是 1-7,因为有 7 个指标满足 stat1
接收排名/百分位数的标准。
我们如何修改代码以根据我们的标准正确计算排名/百分位数?
您可以编写
statfun
并在 by
中使用它。
> statfun <- \(x, stat) {
+ rk <- \(x, m, z=12) rank(replace(x, m < z, NA), 'keep', 'max') ## rank fun
+ pctl <- \(x) round((x - 1L)/length(na.omit(x) - 1L)*100) ## perc fun
+ o <- lapply(stat, \(s) {
+ r <- with(x, rk(get(s), x$min_pg))
+ p <- pctl(r)
+ data.frame(r, p) |> setNames(paste(s, c('rank', 'percentile'), sep='_'))
+ })
+ cbind(x, o)
+ }
> by(temp_df, ~group_var, statfun, stat=c('stat1', 'stat2')) |> do.call(what='rbind')
group_var min_pg stat1 stat2 stat1_rank stat1_percentile stat2_rank stat2_percentile
1.1 1 11 0.35 NA NA NA NA NA
1.2 1 15 0.32 0.45 3 29 3 29
1.3 1 19 0.27 0.89 2 14 5 14
1.4 1 7 NA NA NA NA NA NA
1.5 1 5 NA 0.27 NA NA NA NA
1.6 1 34 0.42 0.63 5 57 4 57
1.7 1 32 0.45 NA 6 71 NA 71
1.8 1 27 0.47 0.24 7 86 1 86
1.9 1 24 0.33 NA 4 43 NA 43
1.10 1 18 NA 0.27 NA NA 2 NA
1.11 1 13 0.24 NA 1 0 NA 0
1.12 1 10 0.39 0.43 NA NA NA NA
2.13 2 11 0.35 0.42 NA NA NA NA
2.14 2 12 0.31 NA 2 29 NA 29
2.15 2 13 0.27 0.47 1 14 5 14
2.16 2 6 NA 0.45 NA NA NA NA
2.17 2 5 NA 0.39 NA NA NA NA
2.18 2 31 0.43 0.45 3 57 4 57
2.19 2 22 0.45 0.35 5 71 3 71
2.20 2 29 0.45 0.27 5 86 1 86
2.21 2 24 0.63 0.31 6 43 2 43
2.22 2 11 NA 0.35 NA NA NA NA
2.23 2 11 0.27 0.32 NA 0 NA 0
2.24 2 9 0.89 0.33 NA NA NA NA
数据:
> dput(temp_df)
structure(list(group_var = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), min_pg = c(11L, 15L, 19L, 7L, 5L, 34L, 32L, 27L, 24L, 18L,
13L, 10L, 11L, 12L, 13L, 6L, 5L, 31L, 22L, 29L, 24L, 11L, 11L,
9L), stat1 = c(0.35, 0.32, 0.27, NA, NA, 0.42, 0.45, 0.47, 0.33,
NA, 0.24, 0.39, 0.35, 0.31, 0.27, NA, NA, 0.43, 0.45, 0.45, 0.63,
NA, 0.27, 0.89), stat2 = c(NA, 0.45, 0.89, NA, 0.27, 0.63, NA,
0.24, NA, 0.27, NA, 0.43, 0.42, NA, 0.47, 0.45, 0.39, 0.45, 0.35,
0.27, 0.31, 0.35, 0.32, 0.33)), class = "data.frame", row.names = c(NA,
-24L))
怎么样:
## helper function:
rank_special <- \(xs, reject = FALSE){
xs[reject] <- NA
list(rank = rank(xs, ties.method = 'max', na.last = 'keep'),
pctile = round(findInterval(xs, quantile(xs, 1:100 * .01, na.rm = TRUE)))
)
}
library(dplyr)
pctile_columns <- c('stat1')
LB <- 12 ## lower bound (set values below to NA)
temp_df %>%
mutate(across(.cols = all_of(pctile_columns),
.fns = list(rank = ~ rank_special(.x, reject = min_pg < LB )$rank,
ptile = ~ rank_special(.x, reject = min_pg < LB )$ptile
),
.names = '{.col}_{.fn}'
),
.by = group_var
)
## group_var min_pg stat1 stat1_rank stat1_pctile
## 1 1 11 0.35 NA NA
## 2 1 15 0.32 3 33
## 3 1 19 0.27 2 16
## 4 1 7 NA NA NA
## 5 1 5 NA NA NA
## 6 1 34 0.42 5 66
## 7 1 32 0.45 6 83
## 8 1 27 0.47 7 100
## 9 1 24 0.33 4 50
## 10 1 18 NA NA NA
## 11 1 13 0.24 1 0
## 12 1 10 0.39 NA NA
(注意调整百分位数计算以适应是否考虑 NA)