我有以下假设数据:
# Example data: one row per village, identified by district and village
# number, with a 0/1 status.
district <- c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,3,3)
village <- c(1,2,3,4,1,2,3,4,5,1,2,3,4,5,6,7)
status <- c(1,0,1,0,1,1,1,0,0,1,1,1,1,0,0,0)
datei <- data.table(district, village, status)
我想在每个地区内部随机重排(shuffle)状态,并将结果放入新的列中。我知道如何使用以下代码完成一次:
# Shuffle status within each district (the third argument of `[` is `by`)
# and store the result in the new column randomstat.
datei[, randomstat := sample(status), district]
现在,我想将状态随机重排 1000 次,并将每次的结果分别放入新列中。我尝试了以下代码:
n <- 1000
# Attempt that fails: `district` is passed inside list() rather than as the
# grouping (`by`) argument of `[`.
datei[, paste0("randomstat", 1:n) := replicate(n, list(sample(status), district))]
但是失败了。有人可以帮我弄这个吗?谢谢你。
您的代码中有一个错误:`district` 被放在了 `list()` 里面,即 `list(sample(status), district)`,这是不正确的;它应该作为分组(`by`)参数放在 `[` 的第三个位置。
您有以下选择:
- 在 `replicate` 中设置 `simplify = FALSE`,例如:
datei[, paste0("randomstat", 1:n) := replicate(n, sample(status), simplify = FALSE), district]
- 用 `list()` 包裹 `sample(status)`:
# Each replicate() iteration returns a one-element list holding one shuffle of
# the district's status; grouping is done by the trailing `district` argument.
datei[, paste0("randomstat", 1:n) := replicate(n, list(sample(status))), district]
或者您可以编写自己的重排(shuffle)函数,以下是生成 10 列的示例:
n <- 10

#' Shuffle `status` within each `district`
#'
#' @param status Vector of values to shuffle.
#' @param district Grouping vector of the same length as `status`; values are
#'   only permuted among rows that share the same `district`.
#' @return A vector of the same length and type as `status`, in the original
#'   row order, where each district's values are a random permutation of that
#'   district's original values. Rows whose `district` is `NA` are left as-is.
reshuffle <- function(status, district) {
  shuffled <- status
  # Work on row positions so results land back on the rows they came from,
  # even when `district` is not sorted.
  for (rows in split(seq_along(status), district)) {
    # Permute via sample.int(): sample(x) on a single number >= 1 would draw
    # from 1:x instead of returning x, which breaks one-row districts.
    shuffled[rows] <- status[rows[sample.int(length(rows))]]
  }
  shuffled
}
# Add n new columns, each an independent within-district shuffle of status.
# No `by` is needed here because reshuffle() does the grouping itself.
datei[, (paste0("randomstat", 1:n)) := lapply(1:n, \(i) reshuffle(status, district))]
datei
district village status randomstat1 randomstat2 randomstat3 randomstat4 randomstat5 randomstat6 randomstat7 randomstat8 randomstat9 randomstat10
1: 1 1 1 1 1 1 1 1 0 1 1 1 1
2: 1 2 0 0 0 0 0 0 0 0 0 0 0
3: 1 3 1 1 0 0 1 1 1 0 0 0 1
4: 1 4 0 0 1 1 0 0 1 1 1 1 0
5: 2 1 1 1 1 1 0 0 1 1 1 1 1
6: 2 2 1 0 0 1 0 1 0 0 1 0 0
7: 2 3 1 1 1 0 1 1 0 1 1 1 1
8: 2 4 0 0 1 0 1 1 1 1 0 1 0
9: 2 5 0 1 0 1 1 0 1 0 0 0 1
10: 3 1 1 0 0 0 1 1 0 1 1 1 1
11: 3 2 1 1 0 0 1 1 1 1 0 1 1
12: 3 3 1 0 0 0 0 1 1 1 1 1 0
13: 3 4 1 1 1 1 0 0 1 1 0 0 0
14: 3 5 0 0 1 1 1 0 0 0 1 0 1
15: 3 6 0 1 1 1 0 1 1 0 1 1 1
16: 3 7 0 1 1 1 1 0 0 0 0 0 0
如果性能是一个问题,可以使用 `Rfast` 包提供的一些向量化方案:
library(Rfast)
n <- 1e3
# Benchmark five ways of adding n shuffled-status columns. `setup` re-copies
# datei before each run because `:=` modifies dt in place.
microbenchmark::microbenchmark(
# n separate sample() calls per district, kept as a list (simplify = FALSE).
replicate = dt[,paste0("randomstat", 1:n) := replicate(n, sample(status), FALSE), district],
# User-defined reshuffle(), called n times on the whole table (no `by`).
reshuffle = dt[,paste0("randomstat", 1:n) := lapply(1:n, \(i) reshuffle(status, district))],
# Per district: build an .N x n matrix of repeated status and pass it to
# Rfast's colShuffle (presumably shuffles each column independently).
colShuffle = dt[,paste0("randomstat", 1:n) := as.data.frame(colShuffle(matrix(rep(status, n), .N, n))), district],
# Per district: column-wise ranks of uniform draws serve as random
# permutation indices into status.
colRanks = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n))], .N, n)), district],
# Whole table at once (no `by`): the district index from match() is added to
# the uniform draws before ranking.
# NOTE(review): appears to rely on each district's rows being contiguous - confirm.
colRanksAll = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n) + match(district, unique(district)))], .N, n))],
setup = {dt <- copy(datei)}
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> replicate 15.1496 17.55985 21.687518 20.56755 24.95855 37.4373 100
#> reshuffle 120.1781 135.82605 153.436604 146.41200 160.21505 242.8556 100
#> colShuffle 4.5537 4.94215 6.118132 5.11265 5.82665 19.3036 100
#> colRanks 5.7100 6.08095 7.730242 6.40530 8.20650 19.7137 100
#> colRanksAll 5.0109 5.35780 7.171746 5.59560 8.67885 17.7488 100
对更大的数据集进行计时:
set.seed(2037213561)
# Larger test table: districts 1..1000, each repeated a random number of times
# drawn from 1..10 with replacement.
datei <- data.table(district = rep(1:1e3, sample(10, 1e3, 1)))[
# village: running row number within each district; status: a uniform draw
# integer-divided by 0.5, i.e. 0 or 1.
,`:=`(village = rowid(district), status = runif(.N)%/%0.5)
]
# Same five approaches timed on the larger table, 10 runs each. `setup`
# re-copies datei before each run because `:=` modifies dt in place.
microbenchmark::microbenchmark(
# n separate sample() calls per district, kept as a list (simplify = FALSE).
replicate = dt[,paste0("randomstat", 1:n) := replicate(n, sample(status), FALSE), district],
# User-defined reshuffle(), called n times on the whole table (no `by`).
reshuffle = dt[,paste0("randomstat", 1:n) := lapply(1:n, \(i) reshuffle(status, district))],
# Per district: Rfast's colShuffle on an .N x n matrix of repeated status.
colShuffle = dt[,paste0("randomstat", 1:n) := as.data.frame(colShuffle(matrix(rep(status, n), .N, n))), district],
# Per district: column-wise ranks of uniform draws used as permutation indices.
colRanks = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n))], .N, n)), district],
# Whole table at once (no `by`): district index added to the uniform draws
# before ranking.
# NOTE(review): appears to rely on each district's rows being contiguous - confirm.
colRanksAll = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n) + match(district, unique(district)))], .N, n))],
setup = {dt <- copy(datei)},
times = 10
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> replicate 5602.0342 6649.0791 7237.3350 7318.1749 7641.8868 8434.8158 10
#> reshuffle 7348.9950 8237.6315 9946.4001 10728.3559 11320.5539 11680.5648 10
#> colShuffle 1521.4851 1578.3194 2024.9663 2181.7678 2312.6861 2368.4342 10
#> colRanks 2063.0544 2633.7768 2784.5394 2868.1073 3097.8065 3232.0499 10
#> colRanksAll 374.6916 403.4064 467.1745 478.0322 521.1787 580.5176 10