考虑若干组(`gp`),每组有 n 个结果(`ngp`)。如何在每组中选取(子集化)给定数量的结果(`nesgp`),使这些结果在最小值和最大值(两者都必须包括在内)之间尽可能均匀地间隔,并将选中的结果记录在新列 `selec` 中?`selec` 列中其余位置应显示为 `NA`,而不是出现重复值。
> print(dat, n=56)
# A tibble: 56 x 4
gp result ngp nesgp
<chr> <dbl> <dbl> <dbl>
1 CA 1.64 24 15
2 CA 1.69 24 15
3 CA 1.71 24 15
4 CA 1.74 24 15
5 CA 1.78 24 15
6 CA 1.82 24 15
7 CA 1.86 24 15
8 CA 1.9 24 15
9 CA 1.94 24 15
10 CA 1.98 24 15
11 CA 2.6 24 15
12 CA 2.65 24 15
13 CA 2.71 24 15
14 CA 2.76 24 15
15 CA 2.83 24 15
16 CA 2.89 24 15
17 CA 2.94 24 15
18 CA 3 24 15
19 CA 3.22 24 15
20 CA 3.42 24 15
21 CA 3.47 24 15
22 CA 3.68 24 15
23 CA 3.85 24 15
24 CA 4.38 24 15
25 ASAT 9 20 12
26 ASAT 11 20 12
27 ASAT 51 20 12
28 ASAT 61 20 12
29 ASAT 69 20 12
30 ASAT 78 20 12
31 ASAT 89 20 12
32 ASAT 102 20 12
33 ASAT 111 20 12
34 ASAT 120 20 12
35 ASAT 146 20 12
36 ASAT 163 20 12
37 ASAT 189 20 12
38 ASAT 208 20 12
39 ASAT 218 20 12
40 ASAT 304 20 12
41 ASAT 332 20 12
42 ASAT 345 20 12
43 ASAT 362 20 12
44 ASAT 402 20 12
45 ORO 0.56 12 8
46 ORO 0.7 12 8
47 ORO 0.77 12 8
48 ORO 0.78 12 8
49 ORO 0.82 12 8
50 ORO 0.82 12 8
51 ORO 0.92 12 8
52 ORO 0.94 12 8
53 ORO 1.16 12 8
54 ORO 1.46 12 8
55 ORO 1.54 12 8
56 ORO 1.77 12 8
数据
# Example data: three groups (`gp`) of measurements (`result`). `ngp` holds the
# number of rows in each group and `nesgp` the number of results to select
# from it. Constant columns are generated with `rep()`; the object keeps the
# tibble class vector and compact row names of the original `dput()` output.
dat <- structure(
  list(
    gp = rep(c("CA", "ASAT", "ORO"), times = c(24, 20, 12)),
    result = c(
      # CA
      1.64, 1.69, 1.71, 1.74, 1.78, 1.82, 1.86, 1.9, 1.94, 1.98, 2.6, 2.65,
      2.71, 2.76, 2.83, 2.89, 2.94, 3, 3.22, 3.42, 3.47, 3.68, 3.85, 4.38,
      # ASAT
      9, 11, 51, 61, 69, 78, 89, 102, 111, 120,
      146, 163, 189, 208, 218, 304, 332, 345, 362, 402,
      # ORO
      0.56, 0.7, 0.77, 0.78, 0.82, 0.82, 0.92, 0.94, 1.16, 1.46, 1.54, 1.77
    ),
    ngp = rep(c(24, 20, 12), times = c(24, 20, 12)),
    nesgp = rep(c(15, 12, 8), times = c(24, 20, 12))
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -56L)
)
感谢您的帮助。
以下解决方案使用了一个辅助函数 `fun`,也许还有更简单的方法可以做到这一点。该函数先用 `seq` 在最小值和最大值之间生成 `n` 个等间距的参考值,并使用 `findInterval` 来查看这些值落在 `x` 中的位置。然后,在 `for` 循环中,检查每个参考值到其所在区间两个端点的距离,并将距离较近的那个端点赋给返回值 `y`。
suppressPackageStartupMessages(
library(tidyverse)
)
#' Pick `n` observed values spread as evenly as possible over the range of `x`
#'
#' Builds `n` equally spaced reference points between the minimum and the
#' maximum of `x`. The first and last returned values are the minimum and the
#' maximum themselves; every interior reference point is replaced by the
#' observed value of `x` closest to it (the upper neighbour wins ties).
#'
#' @param x Numeric vector of observed values.
#' @param n Number of values to return.
#' @param na.rm Passed on to `min()` and `max()`. `sort()` drops `NA`s from
#'   `x` regardless of this flag.
#' @return A numeric vector of length `n` (for `n >= 1`). Values can repeat
#'   when several reference points share the same nearest observation.
fun <- function(x, n, na.rm = FALSE) {
  xmin <- min(x, na.rm = na.rm)
  xmax <- max(x, na.rm = na.rm)
  ref <- seq(xmin, xmax, length.out = n)
  x <- sort(x)

  # findInterval() gives, for each reference point, the index of the largest
  # observation that is <= that point, i.e. x[lower] <= ref < x[lower + 1].
  lower <- findInterval(ref, x)
  # The other end of the same interval is the *next observation*, not the
  # interval of the next reference point. Clamp so the index stays valid when
  # a reference point coincides with the maximum.
  upper <- pmin(lower + 1L, length(x))

  y <- numeric(n)
  y[1L] <- xmin
  y[n] <- xmax
  for (i in seq_len(n)[-c(1L, n)]) {
    below <- x[lower[i]]
    above <- x[upper[i]]
    y[i] <- if (abs(ref[i] - below) < abs(ref[i] - above)) below else above
  }
  y
}
# Select the requested number of values per group. `n` has to be the number of
# values to keep (`nesgp`), not the group size (`ngp`): asking for as many
# values as there are rows returns repeated observations whenever the data are
# not evenly spaced.
dat %>%
  reframe(selec = fun(result, first(nesgp)), .by = gp)
# The result has the columns `gp` and `selec` and one row per requested value:
# 15 (CA) + 12 (ASAT) + 8 (ORO) = 35 rows.
创建于 2024-02-04,使用 reprex v2.0.2
我不确定你所说的“尽可能均匀地间隔”具体是什么意思,但我写了一个示例:对点进行大量随机抽样,并从中找出相邻点间距的方差最小的那一组。这对你来说可能是一个不错的起点:
# Random-search sketch: for every group, draw many random subsets that always
# contain the group's minimum and maximum, keep the subset whose consecutive
# gaps have the smallest variance, flag its rows in `dat$selec`, and plot it.
# No seed is set, so the chosen subset differs between runs.

# One panel per group; keep the previous layout so it can be restored below.
old_par <- par(mfrow = c(length(unique(dat$gp)), 1))
dat$selec <- NA_real_

groups <- unique(dat$gp)
for (gp in groups) {
  in_group <- dat$gp == gp
  x <- dat$result[in_group]
  minmax_x <- range(x)
  possible_xs <- x[!(x %in% minmax_x)]

  # Candidate subsets of random size (3 up to all interior points), each
  # combined with the two extremes. `simplify = FALSE` guarantees a list even
  # if every draw happens to have the same length.
  # NOTE(review): assumes each group has more than 3 interior points; with
  # fewer, `3:length(possible_xs)` / `sample()` do not behave as intended.
  r <- replicate(
    20000,
    sort(c(
      minmax_x,
      sample(possible_xs, size = sample(3:length(possible_xs), 1))
    )),
    simplify = FALSE
  )

  # Score every candidate by the variance of its consecutive gaps.
  spreads <- vapply(r, function(obj) var(diff(obj)), numeric(1))
  best <- r[[which.min(spreads)]]

  # Flag only rows of the current group; matching on `result` alone would
  # also mark equal values that belong to other groups.
  dat$selec[in_group & dat$result %in% best] <- 1

  # Visualize: all observations as points, the chosen subset as vertical lines.
  plot(
    x, rep(1, length(x)),
    yaxt = "n", ylab = "", xlab = "result",
    main = paste(gp, ", spread =", round(var(diff(best)), 5))
  )
  abline(v = best)
}

# Restore the plotting layout that was active before this script ran.
par(old_par)