我正在尝试从数据库的两列中提取随机样本(工作时间和就诊患者的相对数量),然后我想逐步计算平均值。我的意思是,前两个样本之间的平均值,然后是我刚刚计算的平均值与第三个样本之间的平均值……等等。
这可能吗?有那个功能吗?
谢谢大家的帮助。
L.
这就是我提取样本的方式。
library(dplyr)
set.seed(2020)
obs <- rnorm(10, mean = 0, sd = 1)
time <- rnorm(10, mean = 0.5, sd = 1)
rdf <- data.frame(obs, time)
sample_n(rdf, 1)
p <- replicate(100, expr = (sample_n(rdf, 1) + sample_n(rdf, 1))/2)
一个选项是使用 for 循环并确定您想要的样本数。例如,如果我们想要获取 5 个样本并逐步计算均值,我们可以执行一个循环,从第一个样本开始并迭代选择下一个样本。然后计算前一个平均值和下一个样本之间的平均值:
set.seed(2020)
obs <- rnorm(10, mean = 0, sd = 1)
time <- rnorm(10, mean = 0.5, sd = 1)
rdf <- data.frame(obs, time)
nsamp <- 5 # number of samples
mean_vect <- numeric(nsamp) # create a vector to store the means
mean_vect[1] <- mean(sample_n(rdf, 1)$obs) # mean of first sample as starting point
# start calculations to fifth sample iteratively
for (i in 2:nsamp) {
# select the next sample
next_samp <- sample_n(rdf, 1)
# calculate the mean between the previous mean and the next sample
mean_vect[i] <- mean(c(mean_vect[i-1], next_samp$obs))
}
# print the means
print(mean_vect)
[1] -1.13040590 -0.20491620 0.04831609 0.08284144 0.40170747
您可以定义一个递归函数(一个调用自身的函数)。
f <- function(S, R, i=1, cm=NULL, res=NULL, ...) {
S <- rbind(cm, rdf[sample.int(nrow(rdf), 1), ])
cm <- colMeans(S)
res <- rbind(res, cm)
return(if (i < R) {
f(S, R=R, i=i + 1, cm=cm, res=res) ## also `Recall(.)` instead of `f(.)`
} else {
`rownames<-`(as.data.frame(res), NULL)
})
}
set.seed(42)
f(rdf[sample.int(nrow(rdf), 1), ], R=10)
# obs time
# 1 0.376972125 -0.35312282
# 2 -1.209781097 0.01180847
# 3 -0.416404486 -0.17065718
# 4 0.671363430 -0.97981606
# 5 0.394365109 -0.21075628
# 6 -0.368020398 -0.04117009
# 7 -0.033236012 0.68404454
# 8 0.042065388 0.62117402
# 9 0.209518756 0.13402560
# 10 -0.009929495 -1.20236950
你可能必须增加你的C堆栈大小.
但您也可以使用
for
循环。
R <- 10
res1 <- matrix(nrow=0, ncol=2)
set.seed(42)
for (i in seq_len(R - 1)) {
if (nrow(res1) == 0) {
res1 <- rdf[sample.int(nrow(rdf), 1), ]
}
S <- rdf[sample.int(nrow(rdf), 1), ]
res1 <- rbind(res1, colMeans(rbind(res1[nrow(res1), ], S)))
}
res1
# obs time
# 1 0.376972125 -0.35312282
# 2 -1.209781097 0.01180847
# 3 -0.416404486 -0.17065718
# 4 0.671363430 -0.97981606
# 5 0.394365109 -0.21075628
# 6 -0.368020398 -0.04117009
# 7 -0.033236012 0.68404454
# 8 0.042065388 0.62117402
# 9 0.209518756 0.13402560
# 10 -0.009929495 -1.20236950
这里是两个版本的快速基准测试(R=2K),递归速度几乎是原来的两倍。
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# recursive 577.0595 582.0189 587.3052 586.9783 592.4281 597.8778 3 a
# for-loop 991.4360 993.7170 997.2436 995.9980 1000.1473 1004.2966 3 b
资料:
rdf <- structure(list(obs = c(0.376972124936433, 0.301548373935665,
-1.0980231706536, -1.13040590360378, -2.79653431987176, 0.720573498411587,
0.93912102300901, -0.229377746707471, 1.75913134696347, 0.117366786802848
), time = c(-0.353122822287008, 1.40925918161821, 1.69637295955276,
0.128416096258652, 0.376739766712564, 2.30004311672545, 2.20399587729432,
-2.53876460529759, -1.78897494991878, 0.558303494992923)), class = "data.frame", row.names = c(NA,
-10L))
另一种方法(使用您的示例数据
rdf
):
mean_of_random_pair(xs)
抽取一组中的两个随机项 xs
并计算它们的平均值:mean_of_random_pair <- function(xs){
xs |> sample(size = 2) |> mean(na.rm = TRUE)
}
cumulative_mean
计算总均值X作为现有X和新项目x的均值:cumulative_mean <- function(xs){
xs |> Reduce(f = \(X, x) mean(c(X, x)),
accumulate = TRUE
)
}
将以上功能链接到管道中并在场景中运行
runs
次rdf$obs
:
runs = 100
1:runs |>
Map(f = \(i) mean_of_random_pair(rdf$obs)) |>
cumulative_mean()
output(迭代平均的序列):
[1] 1.1000858 0.8557774 0.3041130 0.4262881 -0.4658256
# ...
检查输出(对于 n = 5000 次模拟运行):
runs = 5e3
set.seed(4711)
densities <-
list(obs = 'obs', time = 'time') |>
map(\(var){
1:runs |>
Map(f = \(i) mean_of_random_pair(rdf[[var]])) |>
cumulative_mean() |>
density()
})
densities$time |> plot(col = 'blue', ylim = c(0, 1), xlim = c(-3, 3), main = 'foo')
densities$obs |> lines(col = 'red')
非常感谢大家的帮助!!
我的问题现在解决了:D